diff options
Diffstat (limited to 'intern/cycles')
462 files changed, 38094 insertions, 20566 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index 31fa15a2042..c3305ac3dd8 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -9,6 +9,7 @@ endif() # External Libraries include(cmake/external_libs.cmake) +include(cmake/macros.cmake) # Build Flags # todo: this code could be refactored a bit to avoid duplication @@ -21,7 +22,11 @@ if(WITH_CYCLES_NATIVE_ONLY) add_definitions( -DWITH_KERNEL_NATIVE ) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") + + if(NOT MSVC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") + set(CYCLES_KERNEL_FLAGS "-march=native") + endif() elseif(NOT WITH_CPU_SSE) set(CXX_HAS_SSE FALSE) set(CXX_HAS_AVX FALSE) @@ -40,57 +45,113 @@ elseif(WIN32 AND MSVC) set(CYCLES_AVX2_ARCH_FLAGS "/arch:SSE2") endif() + # Unlike GCC/clang we still use fast math, because there is no fine + # grained control and the speedup we get here is too big to ignore. + set(CYCLES_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") + # there is no /arch:SSE3, but intrinsics are available anyway if(CMAKE_CL_64) - set(CYCLES_SSE2_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") - set(CYCLES_SSE3_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") - set(CYCLES_SSE41_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") - set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") - set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") + set(CYCLES_SSE2_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}") + set(CYCLES_SSE3_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}") + set(CYCLES_SSE41_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}") + set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}") + set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}") else() - set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") - set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") - set(CYCLES_SSE41_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") - set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") - set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") + set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}") + set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}") + set(CYCLES_SSE41_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}") + set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}") + set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}") endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CYCLES_KERNEL_FLAGS}") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Ox") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Ox") set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /Ox") -elseif(CMAKE_COMPILER_IS_GNUCC) +elseif(CMAKE_COMPILER_IS_GNUCC OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) check_cxx_compiler_flag(-msse CXX_HAS_SSE) check_cxx_compiler_flag(-mavx CXX_HAS_AVX) check_cxx_compiler_flag(-mavx2 CXX_HAS_AVX2) - if(CXX_HAS_SSE) - set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2 -mfpmath=sse") - set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse") - set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse") - endif() - if(CXX_HAS_AVX) - set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mfpmath=sse") + + # Assume no signal trapping for better code generation. + set(CYCLES_KERNEL_FLAGS "-fno-trapping-math") + # Avoid overhead of setting errno for NaNs. + set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -fno-math-errno") + # Let compiler optimize 0.0 - x without worrying about signed zeros. + set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -fno-signed-zeros") + + if(CMAKE_COMPILER_IS_GNUCC) + # Assume no signal trapping for better code generation. + set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -fno-signaling-nans") + # Assume a fixed rounding mode for better constant folding. + set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -fno-rounding-math") endif() - if(CXX_HAS_AVX2) - set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c -mfpmath=sse") + + if(CXX_HAS_SSE) + if(CMAKE_COMPILER_IS_GNUCC) + set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -mfpmath=sse") + endif() + + set(CYCLES_SSE2_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -msse -msse2") + set(CYCLES_SSE3_KERNEL_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS} -msse3 -mssse3") + set(CYCLES_SSE41_KERNEL_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS} -msse4.1") + if(CXX_HAS_AVX) + set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS} -mavx") + endif() + if(CXX_HAS_AVX2) + set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS} -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c") + endif() endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -fno-finite-math-only") -elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") - check_cxx_compiler_flag(-msse CXX_HAS_SSE) - check_cxx_compiler_flag(-mavx CXX_HAS_AVX) - check_cxx_compiler_flag(-mavx2 CXX_HAS_AVX2) + + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CYCLES_KERNEL_FLAGS}") +elseif(WIN32 AND CMAKE_CXX_COMPILER_ID MATCHES "Intel") + check_cxx_compiler_flag(/QxSSE2 CXX_HAS_SSE) + check_cxx_compiler_flag(/arch:AVX CXX_HAS_AVX) + check_cxx_compiler_flag(/QxCORE-AVX2 CXX_HAS_AVX2) + if(CXX_HAS_SSE) - set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2") - set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3") - set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1") + set(CYCLES_SSE2_KERNEL_FLAGS "/QxSSE2") + set(CYCLES_SSE3_KERNEL_FLAGS "/QxSSSE3") + set(CYCLES_SSE41_KERNEL_FLAGS "/QxSSE4.1") + + if(CXX_HAS_AVX) + set(CYCLES_AVX_KERNEL_FLAGS "/arch:AVX") + endif() + + if(CXX_HAS_AVX2) + set(CYCLES_AVX2_KERNEL_FLAGS "/QxCORE-AVX2") + endif() endif() - if(CXX_HAS_AVX) - set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx") +elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel") + if(APPLE) + # ICC does not support SSE2 flag on MacOSX + check_cxx_compiler_flag(-xssse3 CXX_HAS_SSE) + else() + check_cxx_compiler_flag(-xsse2 CXX_HAS_SSE) endif() - if(CXX_HAS_AVX2) - set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c") + + check_cxx_compiler_flag(-xavx CXX_HAS_AVX) + check_cxx_compiler_flag(-xcore-avx2 CXX_HAS_AVX2) + + if(CXX_HAS_SSE) + if(APPLE) + # ICC does not support SSE2 flag on MacOSX + set(CYCLES_SSE2_KERNEL_FLAGS "-xssse3") + else() + set(CYCLES_SSE2_KERNEL_FLAGS "-xsse2") + endif() + + set(CYCLES_SSE3_KERNEL_FLAGS "-xssse3") + set(CYCLES_SSE41_KERNEL_FLAGS "-xsse4.1") + + if(CXX_HAS_AVX) + set(CYCLES_AVX_KERNEL_FLAGS "-xavx") + endif() + + if(CXX_HAS_AVX2) + set(CYCLES_AVX2_KERNEL_FLAGS "-xcore-avx2") + endif() endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -fno-finite-math-only") endif() if(CXX_HAS_SSE) @@ -154,8 +215,13 @@ if(WITH_CYCLES_OPENSUBDIV) ) endif() -set(WITH_CYCLES_DEVICE_OPENCL TRUE) -set(WITH_CYCLES_DEVICE_CUDA TRUE) +if(WITH_CYCLES_STANDALONE) + set(WITH_CYCLES_DEVICE_OPENCL TRUE) + set(WITH_CYCLES_DEVICE_CUDA TRUE) + # Experimental and unfinished. + set(WITH_CYCLES_NETWORK FALSE) +endif() +# TODO(sergey): Consider removing it, only causes confusion in interface. set(WITH_CYCLES_DEVICE_MULTI TRUE) if(CYCLES_STANDALONE_REPOSITORY) @@ -186,7 +252,7 @@ endif() # Logging capabilities using GLog library. if(WITH_CYCLES_LOGGING) add_definitions(-DWITH_CYCLES_LOGGING) - add_definitions(-DGOOGLE_GLOG_DLL_DECL=) + add_definitions(${GLOG_DEFINES}) add_definitions(-DCYCLES_GFLAGS_NAMESPACE=${GFLAGS_NAMESPACE}) include_directories( SYSTEM @@ -200,6 +266,10 @@ if(WITH_CYCLES_DEBUG) add_definitions(-DWITH_CYCLES_DEBUG) endif() +if(NOT OPENIMAGEIO_PUGIXML_FOUND) + add_definitions(-DWITH_SYSTEM_PUGIXML) +endif() + include_directories( SYSTEM ${BOOST_INCLUDE_DIR} @@ -226,6 +296,24 @@ if(CMAKE_COMPILER_IS_GNUCXX) unset(_has_no_error_unused_macros) endif() +if(WITH_CYCLES_CUDA_BINARIES AND (NOT WITH_CYCLES_CUBIN_COMPILER)) + if(MSVC) + set(MAX_MSVC 1800) + if(${CUDA_VERSION} EQUAL "8.0") + set(MAX_MSVC 1900) + elseif(${CUDA_VERSION} EQUAL "9.0") + set(MAX_MSVC 1910) + elseif(${CUDA_VERSION} EQUAL "9.1") + set(MAX_MSVC 1911) + endif() + if(NOT MSVC_VERSION LESS ${MAX_MSVC}) + message(STATUS "nvcc not supported for this compiler version, using cycles_cubin_cc instead.") + set(WITH_CYCLES_CUBIN_COMPILER ON) + endif() + unset(MAX_MSVC) + endif() +endif() + # Subdirectories @@ -238,7 +326,7 @@ if(WITH_CYCLES_NETWORK) add_definitions(-DWITH_NETWORK) endif() -if(WITH_CYCLES_STANDALONE OR WITH_CYCLES_NETWORK) +if(WITH_CYCLES_STANDALONE OR WITH_CYCLES_NETWORK OR WITH_CYCLES_CUBIN_COMPILER) add_subdirectory(app) endif() diff --git a/intern/cycles/app/CMakeLists.txt b/intern/cycles/app/CMakeLists.txt index 8cd499b7ca6..d1f86a5fe7d 100644 --- a/intern/cycles/app/CMakeLists.txt +++ b/intern/cycles/app/CMakeLists.txt @@ -1,14 +1,6 @@ set(INC - . - ../bvh - ../device - ../graph - ../kernel - ../kernel/svm - ../render - ../subd - ../util + .. ) set(INC_SYS ) @@ -43,18 +35,15 @@ if(WITH_CYCLES_OSL) list(APPEND LIBRARIES cycles_kernel_osl) endif() -if(CYCLES_STANDALONE_REPOSITORY) - if(WITH_CYCLES_LOGGING) - list(APPEND LIBRARIES - ${GLOG_LIBRARIES} - ${GFLAGS_LIBRARIES} - ) - endif() -else() +if(NOT CYCLES_STANDALONE_REPOSITORY) list(APPEND LIBRARIES bf_intern_glew_mx bf_intern_guardedalloc) - if(WITH_CYCLES_LOGGING) - list(APPEND LIBRARIES extern_glog extern_gflags) - endif() +endif() + +if(WITH_CYCLES_LOGGING) + list(APPEND LIBRARIES + ${GLOG_LIBRARIES} + ${GFLAGS_LIBRARIES} + ) endif() if(WITH_CYCLES_STANDALONE AND WITH_CYCLES_STANDALONE_GUI) @@ -131,3 +120,32 @@ if(WITH_CYCLES_NETWORK) endif() unset(SRC) endif() + +if(WITH_CYCLES_CUBIN_COMPILER) + # 32 bit windows is special, nvrtc is not supported on x86, so even + # though we are building 32 bit blender a 64 bit cubin_cc will have + # to be build to compile the cubins. + if(MSVC AND NOT CMAKE_CL_64) + message("Building with CUDA not supported on 32 bit, skipped") + set(WITH_CYCLES_CUDA_BINARIES OFF) + else() + set(SRC + cycles_cubin_cc.cpp + ) + set(INC + ../../../extern/cuew/include + ) + add_executable(cycles_cubin_cc ${SRC}) + include_directories(${INC}) + target_link_libraries(cycles_cubin_cc + extern_cuew + ${OPENIMAGEIO_LIBRARIES} + ${PLATFORM_LINKLIBS} + ) + if(NOT CYCLES_STANDALONE_REPOSITORY) + target_link_libraries(cycles_cubin_cc bf_intern_guardedalloc) + endif() + unset(SRC) + unset(INC) + endif() +endif() diff --git a/intern/cycles/app/cycles_cubin_cc.cpp b/intern/cycles/app/cycles_cubin_cc.cpp new file mode 100644 index 00000000000..3c83bf2dae3 --- /dev/null +++ b/intern/cycles/app/cycles_cubin_cc.cpp @@ -0,0 +1,296 @@ +/* + * Copyright 2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <stdio.h> +#include <stdint.h> + +#include <string> +#include <vector> + +#include <OpenImageIO/argparse.h> +#include <OpenImageIO/filesystem.h> + +#include "cuew.h" + +#ifdef _MSC_VER +# include <Windows.h> +#endif + +using std::string; +using std::vector; + +namespace std { + template<typename T> + std::string to_string(const T &n) { + std::ostringstream s; + s << n; + return s.str(); + } +} + +class CompilationSettings +{ +public: + CompilationSettings() + : target_arch(0), + bits(64), + verbose(false), + fast_math(false) + {} + + string cuda_toolkit_dir; + string input_file; + string output_file; + string ptx_file; + vector<string> defines; + vector<string> includes; + int target_arch; + int bits; + bool verbose; + bool fast_math; +}; + +bool compile_cuda(CompilationSettings &settings) +{ + const char* headers[] = {"stdlib.h" , "float.h", "math.h", "stdio.h"}; + const char* header_content[] = {"\n", "\n", "\n", "\n"}; + + printf("Building %s\n", settings.input_file.c_str()); + + string code; + if(!OIIO::Filesystem::read_text_file(settings.input_file, code)) { + fprintf(stderr, "Error: unable to read %s\n", settings.input_file.c_str()); + return false; + } + + vector<string> options; + for(size_t i = 0; i < settings.includes.size(); i++) { + options.push_back("-I" + settings.includes[i]); + } + + for(size_t i = 0; i < settings.defines.size(); i++) { + options.push_back("-D" + settings.defines[i]); + } + options.push_back("-D__KERNEL_CUDA_VERSION__=" + std::to_string(cuewNvrtcVersion())); + options.push_back("-arch=compute_" + std::to_string(settings.target_arch)); + options.push_back("--device-as-default-execution-space"); + if(settings.fast_math) + options.push_back("--use_fast_math"); + + nvrtcProgram prog; + nvrtcResult result = nvrtcCreateProgram(&prog, + code.c_str(), // buffer + NULL, // name + sizeof(headers) / sizeof(void*), // numHeaders + header_content, // headers + headers); // includeNames + + if(result != NVRTC_SUCCESS) { + fprintf(stderr, "Error: nvrtcCreateProgram failed (%x)\n\n", result); + return false; + } + + /* Tranfer options to a classic C array. */ + vector<const char*> opts(options.size()); + for(size_t i = 0; i < options.size(); i++) { + opts[i] = options[i].c_str(); + } + + result = nvrtcCompileProgram(prog, options.size(), &opts[0]); + + if(result != NVRTC_SUCCESS) { + fprintf(stderr, "Error: nvrtcCompileProgram failed (%x)\n\n", result); + + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + + vector<char> log(log_size); + nvrtcGetProgramLog(prog, &log[0]); + fprintf(stderr, "%s\n", &log[0]); + + return false; + } + + /* Retrieve the ptx code. */ + size_t ptx_size; + result = nvrtcGetPTXSize(prog, &ptx_size); + if(result != NVRTC_SUCCESS) { + fprintf(stderr, "Error: nvrtcGetPTXSize failed (%x)\n\n", result); + return false; + } + + vector<char> ptx_code(ptx_size); + result = nvrtcGetPTX(prog, &ptx_code[0]); + if(result != NVRTC_SUCCESS) { + fprintf(stderr, "Error: nvrtcGetPTX failed (%x)\n\n", result); + return false; + } + + /* Write a file in the temp folder with the ptx code. */ + settings.ptx_file = OIIO::Filesystem::temp_directory_path() + "/" + OIIO::Filesystem::unique_path(); + FILE * f= fopen(settings.ptx_file.c_str(), "wb"); + fwrite(&ptx_code[0], 1, ptx_size, f); + fclose(f); + + return true; +} + +bool link_ptxas(CompilationSettings &settings) +{ + string cudapath = ""; + if(settings.cuda_toolkit_dir.size()) + cudapath = settings.cuda_toolkit_dir + "/bin/"; + + string ptx = "\"" +cudapath + "ptxas\" " + settings.ptx_file + + " -o " + settings.output_file + + " --gpu-name sm_" + std::to_string(settings.target_arch) + + " -m" + std::to_string(settings.bits); + + if (settings.verbose) + { + ptx += " --verbose"; + printf("%s\n", ptx.c_str()); + } + + int pxresult = system(ptx.c_str()); + if(pxresult) { + fprintf(stderr, "Error: ptxas failed (%x)\n\n", pxresult); + return false; + } + + if(!OIIO::Filesystem::remove(settings.ptx_file)) { + fprintf(stderr, "Error: removing %s\n\n", settings.ptx_file.c_str()); + } + + return true; +} + +bool init(CompilationSettings &settings) +{ +#ifdef _MSC_VER + if(settings.cuda_toolkit_dir.size()) { + SetDllDirectory((settings.cuda_toolkit_dir + "/bin").c_str()); + } +#endif + + int cuewresult = cuewInit(CUEW_INIT_NVRTC); + if(cuewresult != CUEW_SUCCESS) { + fprintf(stderr, "Error: cuew init fialed (0x%x)\n\n", cuewresult); + return false; + } + + if(cuewNvrtcVersion() < 80) { + fprintf(stderr, "Error: only cuda 8 and higher is supported, %d\n\n", cuewCompilerVersion()); + return false; + } + + if(!nvrtcCreateProgram) { + fprintf(stderr, "Error: nvrtcCreateProgram not resolved\n"); + return false; + } + + if(!nvrtcCompileProgram) { + fprintf(stderr, "Error: nvrtcCompileProgram not resolved\n"); + return false; + } + + if(!nvrtcGetProgramLogSize) { + fprintf(stderr, "Error: nvrtcGetProgramLogSize not resolved\n"); + return false; + } + + if(!nvrtcGetProgramLog) { + fprintf(stderr, "Error: nvrtcGetProgramLog not resolved\n"); + return false; + } + + if(!nvrtcGetPTXSize) { + fprintf(stderr, "Error: nvrtcGetPTXSize not resolved\n"); + return false; + } + + if(!nvrtcGetPTX) { + fprintf(stderr, "Error: nvrtcGetPTX not resolved\n"); + return false; + } + + return true; +} + +bool parse_parameters(int argc, const char **argv, CompilationSettings &settings) +{ + OIIO::ArgParse ap; + ap.options("Usage: cycles_cubin_cc [options]", + "-target %d", &settings.target_arch, "target shader model", + "-m %d", &settings.bits, "Cuda architecture bits", + "-i %s", &settings.input_file, "Input source filename", + "-o %s", &settings.output_file, "Output cubin filename", + "-I %L", &settings.includes, "Add additional includepath", + "-D %L", &settings.defines, "Add additional defines", + "-v", &settings.verbose, "Use verbose logging", + "--use_fast_math", &settings.fast_math, "Use fast math", + "-cuda-toolkit-dir %s", &settings.cuda_toolkit_dir, "path to the cuda toolkit binary directory", + NULL); + + if(ap.parse(argc, argv) < 0) { + fprintf(stderr, "%s\n", ap.geterror().c_str()); + ap.usage(); + return false; + } + + if(!settings.output_file.size()) { + fprintf(stderr, "Error: Output file not set(-o), required\n\n"); + return false; + } + + if(!settings.input_file.size()) { + fprintf(stderr, "Error: Input file not set(-i, required\n\n"); + return false; + } + + if(!settings.target_arch) { + fprintf(stderr, "Error: target shader model not set (-target), required\n\n"); + return false; + } + + return true; +} + +int main(int argc, const char **argv) +{ + CompilationSettings settings; + + if(!parse_parameters(argc, argv, settings)) { + fprintf(stderr, "Error: invalid parameters, exiting\n"); + exit(EXIT_FAILURE); + } + + if(!init(settings)) { + fprintf(stderr, "Error: initialization error, exiting\n"); + exit(EXIT_FAILURE); + } + + if(!compile_cuda(settings)) { + fprintf(stderr, "Error: compilation error, exiting\n"); + exit(EXIT_FAILURE); + } + + if(!link_ptxas(settings)) { + exit(EXIT_FAILURE); + } + + return 0; +} diff --git a/intern/cycles/app/cycles_server.cpp b/intern/cycles/app/cycles_server.cpp index 4ef9cd070bb..e65b9d769e4 100644 --- a/intern/cycles/app/cycles_server.cpp +++ b/intern/cycles/app/cycles_server.cpp @@ -16,15 +16,15 @@ #include <stdio.h> -#include "device.h" - -#include "util_args.h" -#include "util_foreach.h" -#include "util_path.h" -#include "util_stats.h" -#include "util_string.h" -#include "util_task.h" -#include "util_logging.h" +#include "device/device.h" + +#include "util/util_args.h" +#include "util/util_foreach.h" +#include "util/util_path.h" +#include "util/util_stats.h" +#include "util/util_string.h" +#include "util/util_task.h" +#include "util/util_logging.h" using namespace ccl; diff --git a/intern/cycles/app/cycles_standalone.cpp b/intern/cycles/app/cycles_standalone.cpp index 9816d614a7c..c682744f5fa 100644 --- a/intern/cycles/app/cycles_standalone.cpp +++ b/intern/cycles/app/cycles_standalone.cpp @@ -16,29 +16,29 @@ #include <stdio.h> -#include "buffers.h" -#include "camera.h" -#include "device.h" -#include "scene.h" -#include "session.h" -#include "integrator.h" - -#include "util_args.h" -#include "util_foreach.h" -#include "util_function.h" -#include "util_logging.h" -#include "util_path.h" -#include "util_progress.h" -#include "util_string.h" -#include "util_time.h" -#include "util_transform.h" -#include "util_version.h" +#include "render/buffers.h" +#include "render/camera.h" +#include "device/device.h" +#include "render/scene.h" +#include "render/session.h" +#include "render/integrator.h" + +#include "util/util_args.h" +#include "util/util_foreach.h" +#include "util/util_function.h" +#include "util/util_logging.h" +#include "util/util_path.h" +#include "util/util_progress.h" +#include "util/util_string.h" +#include "util/util_time.h" +#include "util/util_transform.h" +#include "util/util_version.h" #ifdef WITH_CYCLES_STANDALONE_GUI -#include "util_view.h" +#include "util/util_view.h" #endif -#include "cycles_xml.h" +#include "app/cycles_xml.h" CCL_NAMESPACE_BEGIN @@ -51,6 +51,7 @@ struct Options { SessionParams session_params; bool quiet; bool show_help, interactive, pause; + string output_path; } options; static void session_print(const string& str) @@ -86,6 +87,34 @@ static void session_print_status() session_print(status); } +static bool write_render(const uchar *pixels, int w, int h, int channels) +{ + string msg = string_printf("Writing image %s", options.output_path.c_str()); + session_print(msg); + + ImageOutput *out = ImageOutput::create(options.output_path); + if(!out) { + return false; + } + + ImageSpec spec(w, h, channels, TypeDesc::UINT8); + if(!out->open(options.output_path, spec)) { + return false; + } + + /* conversion for different top/bottom convention */ + out->write_image(TypeDesc::UINT8, + pixels + (h - 1) * w * channels, + AutoStride, + -w * channels, + AutoStride); + + out->close(); + delete out; + + return true; +} + static BufferParams& session_buffer_params() { static BufferParams buffer_params; @@ -97,27 +126,9 @@ static BufferParams& session_buffer_params() return buffer_params; } -static void session_init() -{ - options.session = new Session(options.session_params); - options.session->reset(session_buffer_params(), options.session_params.samples); - options.session->scene = options.scene; - - if(options.session_params.background && !options.quiet) - options.session->progress.set_update_callback(function_bind(&session_print_status)); -#ifdef WITH_CYCLES_STANDALONE_GUI - else - options.session->progress.set_update_callback(function_bind(&view_redraw)); -#endif - - options.session->start(); - - options.scene = NULL; -} - static void scene_init() { - options.scene = new Scene(options.scene_params, options.session_params.device); + options.scene = new Scene(options.scene_params, options.session->device); /* Read XML */ xml_read_file(options.scene, options.filepath.c_str()); @@ -136,16 +147,32 @@ static void scene_init() options.scene->camera->compute_auto_viewplane(); } +static void session_init() +{ + options.session_params.write_render_cb = write_render; + options.session = new Session(options.session_params); + + if(options.session_params.background && !options.quiet) + options.session->progress.set_update_callback(function_bind(&session_print_status)); +#ifdef WITH_CYCLES_STANDALONE_GUI + else + options.session->progress.set_update_callback(function_bind(&view_redraw)); +#endif + + /* load scene */ + scene_init(); + options.session->scene = options.scene; + + options.session->reset(session_buffer_params(), options.session_params.samples); + options.session->start(); +} + static void session_exit() { if(options.session) { delete options.session; options.session = NULL; } - if(options.scene) { - delete options.scene; - options.scene = NULL; - } if(options.session_params.background && !options.quiet) { session_print("Finished Rendering."); @@ -367,7 +394,7 @@ static void options_parse(int argc, const char **argv) "--background", &options.session_params.background, "Render in background, without user interface", "--quiet", &options.quiet, "In background mode, don't print progress messages", "--samples %d", &options.session_params.samples, "Number of samples to render", - "--output %s", &options.session_params.output_path, "File path to write output image", + "--output %s", &options.output_path, "File path to write output image", "--threads %d", &options.session_params.threads, "CPU Rendering Threads", "--width %d", &options.width, "Window width in pixel", "--height %d", &options.height, "Window height in pixel", @@ -430,7 +457,6 @@ static void options_parse(int argc, const char **argv) /* find matching device */ DeviceType device_type = Device::type_from_string(devicename.c_str()); vector<DeviceInfo>& devices = Device::available_devices(); - DeviceInfo device_info; bool device_available = false; foreach(DeviceInfo& device, devices) { @@ -467,9 +493,6 @@ static void options_parse(int argc, const char **argv) /* For smoother Viewport */ options.session_params.start_resolution = 64; - - /* load scene */ - scene_init(); } CCL_NAMESPACE_END diff --git a/intern/cycles/app/cycles_xml.cpp b/intern/cycles/app/cycles_xml.cpp index 35a30ae683f..a46955322e3 100644 --- a/intern/cycles/app/cycles_xml.cpp +++ b/intern/cycles/app/cycles_xml.cpp @@ -20,31 +20,31 @@ #include <algorithm> #include <iterator> -#include "node_xml.h" - -#include "background.h" -#include "camera.h" -#include "film.h" -#include "graph.h" -#include "integrator.h" -#include "light.h" -#include "mesh.h" -#include "nodes.h" -#include "object.h" -#include "osl.h" -#include "shader.h" -#include "scene.h" - -#include "subd_patch.h" -#include "subd_split.h" - -#include "util_debug.h" -#include "util_foreach.h" -#include "util_path.h" -#include "util_transform.h" -#include "util_xml.h" - -#include "cycles_xml.h" +#include "graph/node_xml.h" + +#include "render/background.h" +#include "render/camera.h" +#include "render/film.h" +#include "render/graph.h" +#include "render/integrator.h" +#include "render/light.h" +#include "render/mesh.h" +#include "render/nodes.h" +#include "render/object.h" +#include "render/osl.h" +#include "render/shader.h" +#include "render/scene.h" + +#include "subd/subd_patch.h" +#include "subd/subd_split.h" + +#include "util/util_foreach.h" +#include "util/util_path.h" +#include "util/util_projection.h" +#include "util/util_transform.h" +#include "util/util_xml.h" + +#include "app/cycles_xml.h" CCL_NAMESPACE_BEGIN @@ -70,9 +70,9 @@ struct XMLReadState : public XMLReader { /* Attribute Reading */ -static bool xml_read_int(int *value, pugi::xml_node node, const char *name) +static bool xml_read_int(int *value, xml_node node, const char *name) { - pugi::xml_attribute attr = node.attribute(name); + xml_attribute attr = node.attribute(name); if(attr) { *value = atoi(attr.value()); @@ -82,9 +82,9 @@ static bool xml_read_int(int *value, pugi::xml_node node, const char *name) return false; } -static bool xml_read_int_array(vector<int>& value, pugi::xml_node node, const char *name) +static bool xml_read_int_array(vector<int>& value, xml_node node, const char *name) { - pugi::xml_attribute attr = node.attribute(name); + xml_attribute attr = node.attribute(name); if(attr) { vector<string> tokens; @@ -99,9 +99,9 @@ static bool xml_read_int_array(vector<int>& value, pugi::xml_node node, const ch return false; } -static bool xml_read_float(float *value, pugi::xml_node node, const char *name) +static bool xml_read_float(float *value, xml_node node, const char *name) { - pugi::xml_attribute attr = node.attribute(name); + xml_attribute attr = node.attribute(name); if(attr) { *value = (float)atof(attr.value()); @@ -111,9 +111,9 @@ static bool xml_read_float(float *value, pugi::xml_node node, const char *name) return false; } -static bool xml_read_float_array(vector<float>& value, pugi::xml_node node, const char *name) +static bool xml_read_float_array(vector<float>& value, xml_node node, const char *name) { - pugi::xml_attribute attr = node.attribute(name); + xml_attribute attr = node.attribute(name); if(attr) { vector<string> tokens; @@ -128,7 +128,7 @@ static bool xml_read_float_array(vector<float>& value, pugi::xml_node node, cons return false; } -static bool xml_read_float3(float3 *value, pugi::xml_node node, const char *name) +static bool xml_read_float3(float3 *value, xml_node node, const char *name) { vector<float> array; @@ -140,7 +140,7 @@ static bool xml_read_float3(float3 *value, pugi::xml_node node, const char *name return false; } -static bool xml_read_float3_array(vector<float3>& value, pugi::xml_node node, const char *name) +static bool xml_read_float3_array(vector<float3>& value, xml_node node, const char *name) { vector<float> array; @@ -154,7 +154,7 @@ static bool xml_read_float3_array(vector<float3>& value, pugi::xml_node node, co return false; } -static bool xml_read_float4(float4 *value, pugi::xml_node node, const char *name) +static bool xml_read_float4(float4 *value, xml_node node, const char *name) { vector<float> array; @@ -166,9 +166,9 @@ static bool xml_read_float4(float4 *value, pugi::xml_node node, const char *name return false; } -static bool xml_read_string(string *str, pugi::xml_node node, const char *name) +static bool xml_read_string(string *str, xml_node node, const char *name) { - pugi::xml_attribute attr = node.attribute(name); + xml_attribute attr = node.attribute(name); if(attr) { *str = attr.value(); @@ -178,9 +178,9 @@ static bool xml_read_string(string *str, pugi::xml_node node, const char *name) return false; } -static bool xml_equal_string(pugi::xml_node node, const char *name, const char *value) +static bool xml_equal_string(xml_node node, const char *name, const char *value) { - pugi::xml_attribute attr = node.attribute(name); + xml_attribute attr = node.attribute(name); if(attr) return string_iequals(attr.value(), value); @@ -190,7 +190,7 @@ static bool xml_equal_string(pugi::xml_node node, const char *name, const char * /* Camera */ -static void xml_read_camera(XMLReadState& state, pugi::xml_node node) +static void xml_read_camera(XMLReadState& state, xml_node node) { Camera *cam = state.scene->camera; @@ -205,12 +205,12 @@ static void xml_read_camera(XMLReadState& state, pugi::xml_node node) cam->matrix = state.tfm; cam->need_update = true; - cam->update(); + cam->update(state.scene); } /* Shader */ -static void xml_read_shader_graph(XMLReadState& state, Shader *shader, pugi::xml_node graph_node) +static void xml_read_shader_graph(XMLReadState& state, Shader *shader, xml_node graph_node) { xml_read_node(state, shader, graph_node); @@ -220,7 +220,7 @@ static void xml_read_shader_graph(XMLReadState& state, Shader *shader, pugi::xml XMLReader graph_reader; graph_reader.node_map[ustring("output")] = graph->output(); - for(pugi::xml_node node = graph_node.first_child(); node; node = node.next_sibling()) { + for(xml_node node = graph_node.first_child(); node; node = node.next_sibling()) { ustring node_name(node.name()); if(node_name == "connect") { @@ -349,7 +349,7 @@ static void xml_read_shader_graph(XMLReadState& state, Shader *shader, pugi::xml shader->tag_update(state.scene); } -static void xml_read_shader(XMLReadState& state, pugi::xml_node node) +static void xml_read_shader(XMLReadState& state, xml_node node) { Shader *shader = new Shader(); xml_read_shader_graph(state, shader, node); @@ -358,7 +358,7 @@ static void xml_read_shader(XMLReadState& state, pugi::xml_node node) /* Background */ -static void xml_read_background(XMLReadState& state, pugi::xml_node node) +static void xml_read_background(XMLReadState& state, xml_node node) { /* Background Settings */ xml_read_node(state, state.scene->background, node); @@ -385,7 +385,7 @@ static Mesh *xml_add_mesh(Scene *scene, const Transform& tfm) return mesh; } -static void xml_read_mesh(const XMLReadState& state, pugi::xml_node node) +static void xml_read_mesh(const XMLReadState& state, xml_node node) { /* add mesh */ Mesh *mesh = xml_add_mesh(state.scene, state.tfm); @@ -516,7 +516,7 @@ static void xml_read_mesh(const XMLReadState& state, pugi::xml_node node) xml_read_float(&sdparams.dicing_rate, node, "dicing_rate"); sdparams.dicing_rate = std::max(0.1f, sdparams.dicing_rate); - state.scene->camera->update(); + state.scene->camera->update(state.scene); sdparams.camera = state.scene->camera; sdparams.objecttoworld = state.tfm; } @@ -531,7 +531,7 @@ static void xml_read_mesh(const XMLReadState& state, pugi::xml_node node) /* Light */ -static void xml_read_light(XMLReadState& state, pugi::xml_node node) +static void xml_read_light(XMLReadState& state, xml_node node) { Light *light = new Light(); @@ -543,12 +543,14 @@ static void xml_read_light(XMLReadState& state, pugi::xml_node node) /* Transform */ -static void xml_read_transform(pugi::xml_node node, Transform& tfm) +static void xml_read_transform(xml_node node, Transform& tfm) { if(node.attribute("matrix")) { vector<float> matrix; - if(xml_read_float_array(matrix, node, "matrix") && matrix.size() == 16) - tfm = tfm * transform_transpose((*(Transform*)&matrix[0])); + if(xml_read_float_array(matrix, node, "matrix") && matrix.size() == 16) { + ProjectionTransform projection = *(ProjectionTransform*)&matrix[0]; + tfm = tfm * projection_to_transform(projection_transpose(projection)); + } } if(node.attribute("translate")) { @@ -572,7 +574,7 @@ static void xml_read_transform(pugi::xml_node node, Transform& tfm) /* State */ -static void xml_read_state(XMLReadState& state, pugi::xml_node node) +static void xml_read_state(XMLReadState& state, xml_node node) { /* read shader */ string shadername; @@ -605,9 +607,9 @@ static void xml_read_state(XMLReadState& state, pugi::xml_node node) static void xml_read_include(XMLReadState& state, const string& src); -static void xml_read_scene(XMLReadState& state, pugi::xml_node scene_node) +static void xml_read_scene(XMLReadState& state, xml_node scene_node) { - for(pugi::xml_node node = scene_node.first_child(); node; node = node.next_sibling()) { + for(xml_node node = scene_node.first_child(); node; node = node.next_sibling()) { if(string_iequals(node.name(), "film")) { xml_read_node(state, state.scene->film, node); } @@ -657,8 +659,8 @@ static void xml_read_scene(XMLReadState& state, pugi::xml_node scene_node) static void xml_read_include(XMLReadState& state, const string& src) { /* open XML document */ - pugi::xml_document doc; - pugi::xml_parse_result parse_result; + xml_document doc; + xml_parse_result parse_result; string path = path_join(state.base, src); parse_result = doc.load_file(path.c_str()); @@ -667,7 +669,7 @@ static void xml_read_include(XMLReadState& state, const string& src) XMLReadState substate = state; substate.base = path_dirname(path); - pugi::xml_node cycles = doc.child("cycles"); + xml_node cycles = doc.child("cycles"); xml_read_scene(substate, cycles); } else { diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt index b57502b3b14..ae4977aaed0 100644 --- a/intern/cycles/blender/CMakeLists.txt +++ b/intern/cycles/blender/CMakeLists.txt @@ -1,12 +1,6 @@ set(INC - ../graph - ../render - ../device - ../kernel - ../kernel/svm - ../util - ../subd + .. ../../glew-mx ../../guardedalloc ../../mikktspace diff --git a/intern/cycles/blender/addon/__init__.py b/intern/cycles/blender/addon/__init__.py index 235d19e91e8..a2d6262fb20 100644 --- a/intern/cycles/blender/addon/__init__.py +++ b/intern/cycles/blender/addon/__init__.py @@ -102,12 +102,21 @@ class CyclesRender(bpy.types.RenderEngine): else: self.report({'ERROR'}, "OSL support disabled in this build.") + def update_render_passes(self, scene, srl): + engine.register_passes(self, scene, srl) + def engine_exit(): engine.exit() +classes = ( + CyclesRender, +) + + def register(): + from bpy.utils import register_class from . import ui from . import properties from . import presets @@ -122,12 +131,15 @@ def register(): properties.register() ui.register() presets.register() - bpy.utils.register_module(__name__) + + for cls in classes: + register_class(cls) bpy.app.handlers.version_update.append(version_update.do_versions) def unregister(): + from bpy.utils import unregister_class from . import ui from . import properties from . import presets @@ -138,4 +150,6 @@ def unregister(): ui.unregister() properties.unregister() presets.unregister() - bpy.utils.unregister_module(__name__) + + for cls in classes: + unregister_class(cls) diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py index c8c9ef58c52..1f97eff9bd0 100644 --- a/intern/cycles/blender/addon/engine.py +++ b/intern/cycles/blender/addon/engine.py @@ -50,6 +50,24 @@ def _workaround_buggy_drivers(): _cycles.opencl_disable() +def _configure_argument_parser(): + import argparse + parser = argparse.ArgumentParser(description="Cycles Addon argument parser") + parser.add_argument("--cycles-resumable-num-chunks", + help="Number of chunks to split sample range into", + default=None) + parser.add_argument("--cycles-resumable-current-chunk", + help="Current chunk of samples range to render", + default=None) + parser.add_argument("--cycles-resumable-start-chunk", + help="Start chunk to render", + default=None) + parser.add_argument("--cycles-resumable-end-chunk", + help="End chunk to render", + default=None) + return parser + + def _parse_command_line(): import sys @@ -57,25 +75,22 @@ def _parse_command_line(): if "--" not in argv: return - argv = argv[argv.index("--") + 1:] - - num_resumable_chunks = None - current_resumable_chunk = None - - # TODO(sergey): Add some nice error prints if argument is not used properly. - idx = 0 - while idx < len(argv) - 1: - arg = argv[idx] - if arg == '--cycles-resumable-num-chunks': - num_resumable_chunks = int(argv[idx + 1]) - elif arg == '--cycles-resumable-current-chunk': - current_resumable_chunk = int(argv[idx + 1]) - idx += 1 + parser = _configure_argument_parser() + args, unknown = parser.parse_known_args(argv[argv.index("--") + 1:]) - if num_resumable_chunks is not None and current_resumable_chunk is not None: - import _cycles - _cycles.set_resumable_chunks(num_resumable_chunks, - current_resumable_chunk) + if args.cycles_resumable_num_chunks is not None: + if args.cycles_resumable_current_chunk is not None: + import _cycles + _cycles.set_resumable_chunk( + int(args.cycles_resumable_num_chunks), + int(args.cycles_resumable_current_chunk)) + elif args.cycles_resumable_start_chunk is not None and \ + args.cycles_resumable_end_chunk: + import _cycles + _cycles.set_resumable_chunk_range( + int(args.cycles_resumable_num_chunks), + int(args.cycles_resumable_start_chunk), + int(args.cycles_resumable_end_chunk)) def init(): @@ -190,3 +205,52 @@ def with_network(): def system_info(): import _cycles return _cycles.system_info() + +def register_passes(engine, scene, srl): + engine.register_pass(scene, srl, "Combined", 4, "RGBA", 'COLOR') + + if srl.use_pass_z: engine.register_pass(scene, srl, "Depth", 1, "Z", 'VALUE') + if srl.use_pass_mist: engine.register_pass(scene, srl, "Mist", 1, "Z", 'VALUE') + if srl.use_pass_normal: engine.register_pass(scene, srl, "Normal", 3, "XYZ", 'VECTOR') + if srl.use_pass_vector: engine.register_pass(scene, srl, "Vector", 4, "XYZW", 'VECTOR') + if srl.use_pass_uv: engine.register_pass(scene, srl, "UV", 3, "UVA", 'VECTOR') + if srl.use_pass_object_index: engine.register_pass(scene, srl, "IndexOB", 1, "X", 'VALUE') + if srl.use_pass_material_index: engine.register_pass(scene, srl, "IndexMA", 1, "X", 'VALUE') + if srl.use_pass_shadow: engine.register_pass(scene, srl, "Shadow", 3, "RGB", 'COLOR') + if srl.use_pass_ambient_occlusion: engine.register_pass(scene, srl, "AO", 3, "RGB", 'COLOR') + if srl.use_pass_diffuse_direct: engine.register_pass(scene, srl, "DiffDir", 3, "RGB", 'COLOR') + if srl.use_pass_diffuse_indirect: engine.register_pass(scene, srl, "DiffInd", 3, "RGB", 'COLOR') + if srl.use_pass_diffuse_color: engine.register_pass(scene, srl, "DiffCol", 3, "RGB", 'COLOR') + if srl.use_pass_glossy_direct: engine.register_pass(scene, srl, "GlossDir", 3, "RGB", 'COLOR') + if srl.use_pass_glossy_indirect: engine.register_pass(scene, srl, "GlossInd", 3, "RGB", 'COLOR') + if srl.use_pass_glossy_color: engine.register_pass(scene, srl, "GlossCol", 3, "RGB", 'COLOR') + if srl.use_pass_transmission_direct: engine.register_pass(scene, srl, "TransDir", 3, "RGB", 'COLOR') + if srl.use_pass_transmission_indirect: engine.register_pass(scene, srl, "TransInd", 3, "RGB", 'COLOR') + if srl.use_pass_transmission_color: engine.register_pass(scene, srl, "TransCol", 3, "RGB", 'COLOR') + if srl.use_pass_subsurface_direct: engine.register_pass(scene, srl, "SubsurfaceDir", 3, "RGB", 'COLOR') + if srl.use_pass_subsurface_indirect: engine.register_pass(scene, srl, "SubsurfaceInd", 3, "RGB", 'COLOR') + if srl.use_pass_subsurface_color: engine.register_pass(scene, srl, "SubsurfaceCol", 3, "RGB", 'COLOR') + if srl.use_pass_emit: engine.register_pass(scene, srl, "Emit", 3, "RGB", 'COLOR') + if srl.use_pass_environment: engine.register_pass(scene, srl, "Env", 3, "RGB", 'COLOR') + + crl = srl.cycles + if crl.pass_debug_render_time: engine.register_pass(scene, srl, "Debug Render Time", 1, "X", 'VALUE') + if crl.pass_debug_bvh_traversed_nodes: engine.register_pass(scene, srl, "Debug BVH Traversed Nodes", 1, "X", 'VALUE') + if crl.pass_debug_bvh_traversed_instances: engine.register_pass(scene, srl, "Debug BVH Traversed Instances", 1, "X", 'VALUE') + if crl.pass_debug_bvh_intersections: engine.register_pass(scene, srl, "Debug BVH Intersections", 1, "X", 'VALUE') + if crl.pass_debug_ray_bounces: engine.register_pass(scene, srl, "Debug Ray Bounces", 1, "X", 'VALUE') + if crl.use_pass_volume_direct: engine.register_pass(scene, srl, "VolumeDir", 3, "RGB", 'COLOR') + if crl.use_pass_volume_indirect: engine.register_pass(scene, srl, "VolumeInd", 3, "RGB", 'COLOR') + + cscene = scene.cycles + if crl.use_denoising and crl.denoising_store_passes and not cscene.use_progressive_refine: + engine.register_pass(scene, srl, "Denoising Normal", 3, "XYZ", 'VECTOR') + engine.register_pass(scene, srl, "Denoising Normal Variance", 3, "XYZ", 'VECTOR') + engine.register_pass(scene, srl, "Denoising Albedo", 3, "RGB", 'COLOR') + engine.register_pass(scene, srl, "Denoising Albedo Variance", 3, "RGB", 'COLOR') + engine.register_pass(scene, srl, "Denoising Depth", 1, "Z", 'VALUE') + engine.register_pass(scene, srl, "Denoising Depth Variance", 1, "Z", 'VALUE') + engine.register_pass(scene, srl, "Denoising Shadow A", 3, "XYV", 'VECTOR') + engine.register_pass(scene, srl, "Denoising Shadow B", 3, "XYV", 'VECTOR') + engine.register_pass(scene, srl, "Denoising Image", 3, "RGB", 'COLOR') + engine.register_pass(scene, srl, "Denoising Image Variance", 3, "RGB", 'COLOR') diff --git a/intern/cycles/blender/addon/presets.py b/intern/cycles/blender/addon/presets.py index f97b51b629d..17efb00abdb 100644 --- a/intern/cycles/blender/addon/presets.py +++ b/intern/cycles/blender/addon/presets.py @@ -32,14 +32,11 @@ class AddPresetIntegrator(AddPresetBase, Operator): preset_values = [ "cycles.max_bounces", - "cycles.min_bounces", "cycles.diffuse_bounces", "cycles.glossy_bounces", "cycles.transmission_bounces", "cycles.volume_bounces", - "cycles.transparent_min_bounces", "cycles.transparent_max_bounces", - "cycles.use_transparent_shadows", "cycles.caustics_reflective", "cycles.caustics_refractive", "cycles.blur_glossy" @@ -82,12 +79,23 @@ class AddPresetSampling(AddPresetBase, Operator): preset_subdir = "cycles/sampling" +classes = ( + AddPresetIntegrator, + AddPresetSampling, +) + + def register(): - pass + from bpy.utils import register_class + for cls in classes: + register_class(cls) def unregister(): - pass + from bpy.utils import unregister_class + for cls in classes: + unregister_class(cls) + if __name__ == "__main__": register() diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 5c51f9afc28..8dbd80f3747 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -42,9 +42,14 @@ enum_feature_set = ( ) enum_displacement_methods = ( - ('BUMP', "Bump", "Bump mapping to simulate the appearance of displacement"), - ('TRUE', "True", "Use true displacement only, requires fine subdivision"), - ('BOTH', "Both", "Combination of displacement and bump mapping"), + ('BUMP', "Bump Only", "Bump mapping to simulate the appearance of displacement"), + ('DISPLACEMENT', "Displacement Only", "Use true displacement of surface only, requires fine subdivision"), + ('BOTH', "Displacement and Bump", "Combination of true displacement and bump mapping for finer detail"), + ) + +enum_bvh_layouts = ( + ('BVH2', "BVH2", "", 1), + ('BVH4', "BVH4", "", 2), ) enum_bvh_types = ( @@ -205,13 +210,13 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): name="AA Samples", description="Number of antialiasing samples to render for each pixel", min=1, max=2097151, - default=4, + default=128, ) cls.preview_aa_samples = IntProperty( name="AA Samples", description="Number of antialiasing samples to render in the viewport, unlimited if 0", min=0, max=2097151, - default=4, + default=32, ) cls.diffuse_samples = IntProperty( name="Diffuse Samples", @@ -308,17 +313,9 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): description="Adaptively blur glossy shaders after blurry bounces, " "to reduce noise at the cost of accuracy", min=0.0, max=10.0, - default=0.0, + default=1.0, ) - cls.min_bounces = IntProperty( - name="Min Bounces", - description="Minimum number of bounces, setting this lower " - "than the maximum enables probabilistic path " - "termination (faster but noisier)", - min=0, max=1024, - default=3, - ) cls.max_bounces = IntProperty( name="Max Bounces", description="Total maximum number of bounces", @@ -351,26 +348,12 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): default=0, ) - cls.transparent_min_bounces = IntProperty( - name="Transparent Min Bounces", - description="Minimum number of transparent bounces, setting " - "this lower than the maximum enables " - "probabilistic path termination (faster but " - "noisier)", - min=0, max=1024, - default=8, - ) cls.transparent_max_bounces = IntProperty( name="Transparent Max Bounces", description="Maximum number of transparent bounces", min=0, max=1024, default=8, ) - cls.use_transparent_shadows = BoolProperty( - name="Transparent Shadows", - description="Use transparency of surfaces for rendering shadows", - default=True, - ) cls.volume_step_size = FloatProperty( name="Step Size", @@ -410,6 +393,23 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): default=12, ) + cls.dicing_camera = PointerProperty( + name="Dicing Camera", + description="Camera to use as reference point when subdividing geometry, useful to avoid crawling " + "artifacts in animations when the scene camera is moving", + type=bpy.types.Object, + poll=lambda self, obj: obj.type == 'CAMERA', + ) + cls.offscreen_dicing_scale = FloatProperty( + name="Offscreen Dicing Scale", + description="Multiplier for dicing rate of geometry outside of the camera view. The dicing rate " + "of objects is gradually increased the further they are outside the camera view. " + "Lower values provide higher quality reflections and shadows for off screen objects, " + "while higher values use less memory", + min=1.0, soft_max=25.0, + default=4.0, + ) + cls.film_exposure = FloatProperty( name="Exposure", description="Image brightness scale", @@ -418,9 +418,20 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): ) cls.film_transparent = BoolProperty( name="Transparent", - description="World background is transparent with premultiplied alpha", + description="World background is transparent, for compositing the render over another background", default=False, ) + cls.film_transparent_glass = BoolProperty( + name="Transparent Glass", + description="Render transmissive surfaces as transparent, for compositing glass over another background", + default=False, + ) + cls.film_transparent_roughness = FloatProperty( + name="Transparent Roughness Threshold", + description="For transparent transmission, keep surfaces with roughness above the threshold opaque", + min=0.0, max=1.0, + default=0.1, + ) # Really annoyingly, we have to keep it around for a few releases, # otherwise forward compatibility breaks in really bad manner: CRASH! @@ -475,7 +486,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): "higher values will be scaled down to avoid too " "much noise and slow convergence at the cost of accuracy", min=0.0, max=1e8, - default=0.0, + default=10.0, ) cls.debug_tile_size = IntProperty( @@ -560,6 +571,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): ('SHADOW', "Shadow", ""), ('NORMAL', "Normal", ""), ('UV', "UV", ""), + ('ROUGHNESS', "Roughness", ""), ('EMIT', "Emit", ""), ('ENVIRONMENT', "Environment", ""), ('DIFFUSE', "Diffuse", ""), @@ -664,9 +676,15 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): cls.debug_use_cpu_sse41 = BoolProperty(name="SSE41", default=True) cls.debug_use_cpu_sse3 = BoolProperty(name="SSE3", default=True) cls.debug_use_cpu_sse2 = BoolProperty(name="SSE2", default=True) - cls.debug_use_qbvh = BoolProperty(name="QBVH", default=True) + cls.debug_bvh_layout = EnumProperty( + name="BVH Layout", + items=enum_bvh_layouts, + default='BVH4', + ) + cls.debug_use_cpu_split_kernel = BoolProperty(name="Split Kernel", default=False) cls.debug_use_cuda_adaptive_compile = BoolProperty(name="Adaptive Compile", default=False) + cls.debug_use_cuda_split_kernel = BoolProperty(name="Split Kernel", default=False) cls.debug_opencl_kernel_type = EnumProperty( name="OpenCL Kernel Type", @@ -693,8 +711,17 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): update=devices_update_callback ) + cls.debug_opencl_kernel_single_program = BoolProperty( + name="Single Program", + default=True, + update=devices_update_callback, + ) + cls.debug_use_opencl_debug = BoolProperty(name="Debug OpenCL", default=False) + cls.debug_opencl_mem_limit = IntProperty(name="Memory limit", default=0, + description="Artificial limit on OpenCL memory usage in MB (0 to disable limit)") + @classmethod def unregister(cls): del bpy.types.Scene.cycles @@ -852,7 +879,7 @@ class CyclesMaterialSettings(bpy.types.PropertyGroup): name="Displacement Method", description="Method to use for the displacement", items=enum_displacement_methods, - default='BUMP', + default='DISPLACEMENT', ) @classmethod @@ -1062,7 +1089,7 @@ class CyclesObjectSettings(bpy.types.PropertyGroup): cls.motion_steps = IntProperty( name="Motion Steps", - description="Control accuracy of deformation motion blur, more steps gives more memory usage (actual number of steps is 2^(steps - 1))", + description="Control accuracy of motion blur, more steps gives more memory usage (actual number of steps is 2^(steps - 1))", min=1, soft_max=8, default=1, ) @@ -1092,6 +1119,21 @@ class CyclesObjectSettings(bpy.types.PropertyGroup): default=1.0, ) + cls.is_shadow_catcher = BoolProperty( + name="Shadow Catcher", + description="Only render shadows on this object, for compositing renders into real footage", + default=False, + ) + + cls.is_holdout = BoolProperty( + name="Holdout", + description="Render objects as a holdout or matte, creating a " + "hole in the image with zero alpha, to fill out in " + "compositing with real footange or another render", + default=False, + ) + + @classmethod def unregister(cls): del bpy.types.Object.cycles @@ -1156,6 +1198,143 @@ class CyclesCurveRenderSettings(bpy.types.PropertyGroup): def unregister(cls): del bpy.types.Scene.cycles_curves +def update_render_passes(self, context): + scene = context.scene + rd = scene.render + rl = rd.layers.active + rl.update_render_passes() + +class CyclesRenderLayerSettings(bpy.types.PropertyGroup): + @classmethod + def register(cls): + bpy.types.SceneRenderLayer.cycles = PointerProperty( + name="Cycles SceneRenderLayer Settings", + description="Cycles SceneRenderLayer Settings", + type=cls, + ) + cls.pass_debug_bvh_traversed_nodes = BoolProperty( + name="Debug BVH Traversed Nodes", + description="Store Debug BVH Traversed Nodes pass", + default=False, + update=update_render_passes, + ) + cls.pass_debug_bvh_traversed_instances = BoolProperty( + name="Debug BVH Traversed Instances", + description="Store Debug BVH Traversed Instances pass", + default=False, + update=update_render_passes, + ) + cls.pass_debug_bvh_intersections = BoolProperty( + name="Debug BVH Intersections", + description="Store Debug BVH Intersections", + default=False, + update=update_render_passes, + ) + cls.pass_debug_ray_bounces = BoolProperty( + name="Debug Ray Bounces", + description="Store Debug Ray Bounces pass", + default=False, + update=update_render_passes, + ) + cls.pass_debug_render_time = BoolProperty( + name="Debug Render Time", + description="Render time in milliseconds per sample and pixel", + default=False, + update=update_render_passes, + ) + cls.use_pass_volume_direct = BoolProperty( + name="Volume Direct", + description="Deliver direct volumetric scattering pass", + default=False, + update=update_render_passes, + ) + cls.use_pass_volume_indirect = BoolProperty( + name="Volume Indirect", + description="Deliver indirect volumetric scattering pass", + default=False, + update=update_render_passes, + ) + + cls.use_denoising = BoolProperty( + name="Use Denoising", + description="Denoise the rendered image", + default=False, + update=update_render_passes, + ) + cls.denoising_diffuse_direct = BoolProperty( + name="Diffuse Direct", + description="Denoise the direct diffuse lighting", + default=True, + ) + cls.denoising_diffuse_indirect = BoolProperty( + name="Diffuse Indirect", + description="Denoise the indirect diffuse lighting", + default=True, + ) + cls.denoising_glossy_direct = BoolProperty( + name="Glossy Direct", + description="Denoise the direct glossy lighting", + default=True, + ) + cls.denoising_glossy_indirect = BoolProperty( + name="Glossy Indirect", + description="Denoise the indirect glossy lighting", + default=True, + ) + cls.denoising_transmission_direct = BoolProperty( + name="Transmission Direct", + description="Denoise the direct transmission lighting", + default=True, + ) + cls.denoising_transmission_indirect = BoolProperty( + name="Transmission Indirect", + description="Denoise the indirect transmission lighting", + default=True, + ) + cls.denoising_subsurface_direct = BoolProperty( + name="Subsurface Direct", + description="Denoise the direct subsurface lighting", + default=True, + ) + cls.denoising_subsurface_indirect = BoolProperty( + name="Subsurface Indirect", + description="Denoise the indirect subsurface lighting", + default=True, + ) + cls.denoising_strength = FloatProperty( + name="Denoising Strength", + description="Controls neighbor pixel weighting for the denoising filter (lower values preserve more detail, but aren't as smooth)", + min=0.0, max=1.0, + default=0.5, + ) + cls.denoising_feature_strength = FloatProperty( + name="Denoising Feature Strength", + description="Controls removal of noisy image feature passes (lower values preserve more detail, but aren't as smooth)", + min=0.0, max=1.0, + default=0.5, + ) + cls.denoising_radius = IntProperty( + name="Denoising Radius", + description="Size of the image area that's used to denoise a pixel (higher values are smoother, but might lose detail and are slower)", + min=1, max=25, + default=8, + ) + cls.denoising_relative_pca = BoolProperty( + name="Relative filter", + description="When removing pixels that don't carry information, use a relative threshold instead of an absolute one (can help to reduce artifacts, but might cause detail loss around edges)", + default=False, + ) + cls.denoising_store_passes = BoolProperty( + name="Store denoising passes", + description="Store the denoising feature passes and the noisy image", + default=False, + update=update_render_passes, + ) + + @classmethod + def unregister(cls): + del bpy.types.SceneRenderLayer.cycles + class CyclesCurveSettings(bpy.types.PropertyGroup): @classmethod @@ -1230,35 +1409,54 @@ class CyclesPreferences(bpy.types.AddonPreferences): devices = bpy.props.CollectionProperty(type=CyclesDeviceSettings) - def get_devices(self): - import _cycles - # Layout of the device tuples: (Name, Type, Persistent ID) - device_list = _cycles.available_devices() + def find_existing_device_entry(self, device): + for device_entry in self.devices: + if device_entry.id == device[2] and device_entry.type == device[1]: + return device_entry + return None - cuda_devices = [] - opencl_devices = [] + + def update_device_entries(self, device_list): for device in device_list: - if not device[1] in {'CUDA', 'OPENCL'}: + if not device[1] in {'CUDA', 'OPENCL', 'CPU'}: continue - - entry = None # Try to find existing Device entry - for dev in self.devices: - if dev.id == device[2] and dev.type == device[1]: - entry = dev - break - # Create new entry if no existing one was found + entry = self.find_existing_device_entry(device) if not entry: + # Create new entry if no existing one was found entry = self.devices.add() entry.id = device[2] entry.name = device[0] entry.type = device[1] + entry.use = entry.type != 'CPU' + elif entry.name != device[0]: + # Update name in case it changed + entry.name = device[0] + - # Sort entries into lists + def get_devices(self): + import _cycles + # Layout of the device tuples: (Name, Type, Persistent ID) + device_list = _cycles.available_devices() + # Make sure device entries are up to date and not referenced before + # we know we don't add new devices. This way we guarantee to not + # hold pointers to a resized array. + self.update_device_entries(device_list) + # Sort entries into lists + cuda_devices = [] + opencl_devices = [] + cpu_devices = [] + for device in device_list: + entry = self.find_existing_device_entry(device) if entry.type == 'CUDA': cuda_devices.append(entry) elif entry.type == 'OPENCL': opencl_devices.append(entry) + elif entry.type == 'CPU': + cpu_devices.append(entry) + # Extend all GPU devices with CPU. + cuda_devices.extend(cpu_devices) + opencl_devices.extend(cpu_devices) return cuda_devices, opencl_devices @@ -1287,14 +1485,14 @@ class CyclesPreferences(bpy.types.AddonPreferences): row = layout.row() if self.compute_device_type == 'CUDA' and cuda_devices: - col = row.column(align=True) + box = row.box() for device in cuda_devices: - col.prop(device, "use", text=device.name, toggle=True) + box.prop(device, "use", text=device.name) if self.compute_device_type == 'OPENCL' and opencl_devices: - col = row.column(align=True) + box = row.box() for device in opencl_devices: - col.prop(device, "use", text=device.name, toggle=True) + box.prop(device, "use", text=device.name) def draw(self, context): @@ -1314,6 +1512,7 @@ def register(): bpy.utils.register_class(CyclesCurveSettings) bpy.utils.register_class(CyclesDeviceSettings) bpy.utils.register_class(CyclesPreferences) + bpy.utils.register_class(CyclesRenderLayerSettings) def unregister(): @@ -1329,3 +1528,4 @@ def unregister(): bpy.utils.unregister_class(CyclesCurveSettings) bpy.utils.unregister_class(CyclesDeviceSettings) bpy.utils.unregister_class(CyclesPreferences) + bpy.utils.unregister_class(CyclesRenderLayerSettings) diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 44af5f7efed..707f8756f6f 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -78,7 +78,7 @@ def use_cuda(context): def use_branched_path(context): cscene = context.scene.cycles - return (cscene.progressive == 'BRANCHED_PATH' and not use_opencl(context)) + return (cscene.progressive == 'BRANCHED_PATH') def use_sample_all_lights(context): @@ -86,12 +86,10 @@ def use_sample_all_lights(context): return cscene.sample_all_lights_direct or cscene.sample_all_lights_indirect -def show_device_selection(context): - type = get_device_type(context) - if type == 'NETWORK': +def show_device_active(context): + cscene = context.scene.cycles + if cscene.device != 'GPU': return True - if not type in {'CUDA', 'OPENCL'}: - return False return context.user_preferences.addons[__package__].preferences.has_active_device() @@ -141,7 +139,7 @@ def draw_samples_info(layout, context): (ao * aa, ml * aa, sss * aa, vol * aa)) -class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel): +class CYCLES_RENDER_PT_sampling(CyclesButtonsPanel, Panel): bl_label = "Sampling" bl_options = {'DEFAULT_CLOSED'} @@ -158,7 +156,6 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel): row = layout.row() sub = row.row() - sub.active = get_device_type(context) != 'OPENCL' or use_cpu(context) sub.prop(cscene, "progressive", text="") row.prop(cscene, "use_square_samples") @@ -186,9 +183,6 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel): sub.label(text="AA Samples:") sub.prop(cscene, "aa_samples", text="Render") sub.prop(cscene, "preview_aa_samples", text="Preview") - sub.separator() - sub.prop(cscene, "sample_all_lights_direct") - sub.prop(cscene, "sample_all_lights_indirect") col = split.column() sub = col.column(align=True) @@ -205,8 +199,11 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel): sub.prop(cscene, "subsurface_samples", text="Subsurface") sub.prop(cscene, "volume_samples", text="Volume") - if not (use_opencl(context) and cscene.feature_set != 'EXPERIMENTAL'): - layout.row().prop(cscene, "sampling_pattern", text="Pattern") + col = layout.column(align=True) + col.prop(cscene, "sample_all_lights_direct") + col.prop(cscene, "sample_all_lights_indirect") + + layout.row().prop(cscene, "sampling_pattern", text="Pattern") for rl in scene.render.layers: if rl.samples > 0: @@ -217,7 +214,7 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel): draw_samples_info(layout, context) -class CyclesRender_PT_geometry(CyclesButtonsPanel, Panel): +class CYCLES_RENDER_PT_geometry(CyclesButtonsPanel, Panel): bl_label = "Geometry" bl_options = {'DEFAULT_CLOSED'} @@ -228,31 +225,32 @@ class CyclesRender_PT_geometry(CyclesButtonsPanel, Panel): cscene = scene.cycles ccscene = scene.cycles_curves + row = layout.row() + row.label("Volume Sampling:") + row = layout.row() + row.prop(cscene, "volume_step_size") + row.prop(cscene, "volume_max_steps") + + layout.separator() + if cscene.feature_set == 'EXPERIMENTAL': + layout.label("Subdivision Rate:") split = layout.split() col = split.column() - sub = col.column(align=True) - sub.label("Volume Sampling:") - sub.prop(cscene, "volume_step_size") - sub.prop(cscene, "volume_max_steps") + sub.prop(cscene, "dicing_rate", text="Render") + sub.prop(cscene, "preview_dicing_rate", text="Preview") col = split.column() + col.prop(cscene, "offscreen_dicing_scale", text="Offscreen Scale") + col.prop(cscene, "max_subdivisions") - sub = col.column(align=True) - sub.label("Subdivision Rate:") - sub.prop(cscene, "dicing_rate", text="Render") - sub.prop(cscene, "preview_dicing_rate", text="Preview") - sub.separator() - sub.prop(cscene, "max_subdivisions") - else: - row = layout.row() - row.label("Volume Sampling:") - row = layout.row() - row.prop(cscene, "volume_step_size") - row.prop(cscene, "volume_max_steps") + layout.prop(cscene, "dicing_camera") + + layout.separator() + layout.label("Hair:") layout.prop(ccscene, "use_curves", text="Use Hair") col = layout.column() col.active = ccscene.use_curves @@ -270,10 +268,10 @@ class CyclesRender_PT_geometry(CyclesButtonsPanel, Panel): row = col.row() row.prop(ccscene, "minimum_width", text="Min Pixels") - row.prop(ccscene, "maximum_width", text="Max Ext.") + row.prop(ccscene, "maximum_width", text="Max Extension") -class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel): +class CYCLES_RENDER_PT_light_paths(CyclesButtonsPanel, Panel): bl_label = "Light Paths" bl_options = {'DEFAULT_CLOSED'} @@ -295,8 +293,6 @@ class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel): sub = col.column(align=True) sub.label("Transparency:") sub.prop(cscene, "transparent_max_bounces", text="Max") - sub.prop(cscene, "transparent_min_bounces", text="Min") - sub.prop(cscene, "use_transparent_shadows", text="Shadows") col.separator() @@ -309,7 +305,6 @@ class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel): sub = col.column(align=True) sub.label(text="Bounces:") sub.prop(cscene, "max_bounces", text="Max") - sub.prop(cscene, "min_bounces", text="Min") sub = col.column(align=True) sub.prop(cscene, "diffuse_bounces", text="Diffuse") @@ -318,7 +313,7 @@ class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel): sub.prop(cscene, "volume_bounces", text="Volume") -class CyclesRender_PT_motion_blur(CyclesButtonsPanel, Panel): +class CYCLES_RENDER_PT_motion_blur(CyclesButtonsPanel, Panel): bl_label = "Motion Blur" bl_options = {'DEFAULT_CLOSED'} @@ -359,7 +354,7 @@ class CyclesRender_PT_motion_blur(CyclesButtonsPanel, Panel): row.prop(cscene, "rolling_shutter_duration") -class CyclesRender_PT_film(CyclesButtonsPanel, Panel): +class CYCLES_RENDER_PT_film(CyclesButtonsPanel, Panel): bl_label = "Film" def draw(self, context): @@ -372,16 +367,23 @@ class CyclesRender_PT_film(CyclesButtonsPanel, Panel): col = split.column() col.prop(cscene, "film_exposure") - col.prop(cscene, "film_transparent") - - col = split.column() + col.separator() sub = col.column(align=True) sub.prop(cscene, "pixel_filter_type", text="") if cscene.pixel_filter_type != 'BOX': sub.prop(cscene, "filter_width", text="Width") + col = split.column() + col.prop(cscene, "film_transparent") + sub = col.row() + sub.prop(cscene, "film_transparent_glass", text="Transparent Glass") + sub.active = cscene.film_transparent + sub = col.row() + sub.prop(cscene, "film_transparent_roughness", text="Roughness Threshold") + sub.active = cscene.film_transparent and cscene.film_transparent_glass + -class CyclesRender_PT_performance(CyclesButtonsPanel, Panel): +class CYCLES_RENDER_PT_performance(CyclesButtonsPanel, Panel): bl_label = "Performance" bl_options = {'DEFAULT_CLOSED'} @@ -402,6 +404,8 @@ class CyclesRender_PT_performance(CyclesButtonsPanel, Panel): sub.enabled = rd.threads_mode == 'FIXED' sub.prop(rd, "threads") + col.separator() + sub = col.column(align=True) sub.label(text="Tiles:") sub.prop(cscene, "tile_order", text="") @@ -409,21 +413,17 @@ class CyclesRender_PT_performance(CyclesButtonsPanel, Panel): sub.prop(rd, "tile_x", text="X") sub.prop(rd, "tile_y", text="Y") - sub.prop(cscene, "use_progressive_refine") - - subsub = sub.column(align=True) - subsub.prop(rd, "use_save_buffers") - - col = split.column(align=True) - - col.label(text="Viewport:") - col.prop(cscene, "debug_bvh_type", text="") - col.separator() - col.prop(cscene, "preview_start_resolution") + subsub = sub.column() + subsub.active = not rd.use_save_buffers + for rl in rd.layers: + if rl.cycles.use_denoising: + subsub.active = False + subsub.prop(cscene, "use_progressive_refine") - col.separator() + col = split.column() col.label(text="Final Render:") + col.prop(rd, "use_save_buffers") col.prop(rd, "use_persistent_data", text="Persistent Images") col.separator() @@ -436,13 +436,20 @@ class CyclesRender_PT_performance(CyclesButtonsPanel, Panel): row.active = not cscene.debug_use_spatial_splits row.prop(cscene, "debug_bvh_time_steps") + col = layout.column() + col.label(text="Viewport Resolution:") + split = col.split() + split.prop(rd, "preview_pixel_size", text="") + split.prop(cscene, "preview_start_resolution") + -class CyclesRender_PT_layer_options(CyclesButtonsPanel, Panel): +class CYCLES_RENDER_PT_layer_options(CyclesButtonsPanel, Panel): bl_label = "Layer" bl_context = "render_layer" def draw(self, context): layout = self.layout + with_freestyle = bpy.app.build_options.freestyle scene = context.scene rd = scene.render @@ -471,19 +478,26 @@ class CyclesRender_PT_layer_options(CyclesButtonsPanel, Panel): col.prop(rl, "use_ao", "Use AO") col.prop(rl, "use_solid", "Use Surfaces") col.prop(rl, "use_strand", "Use Hair") + if with_freestyle: + row = col.row() + row.prop(rl, "use_freestyle", "Use Freestyle") + row.active = rd.use_freestyle -class CyclesRender_PT_layer_passes(CyclesButtonsPanel, Panel): +class CYCLES_RENDER_PT_layer_passes(CyclesButtonsPanel, Panel): bl_label = "Passes" bl_context = "render_layer" bl_options = {'DEFAULT_CLOSED'} def draw(self, context): + import _cycles + layout = self.layout scene = context.scene rd = scene.render rl = rd.layers.active + crl = rl.cycles split = layout.split() @@ -525,16 +539,31 @@ class CyclesRender_PT_layer_passes(CyclesButtonsPanel, Panel): row.prop(rl, "use_pass_subsurface_direct", text="Direct", toggle=True) row.prop(rl, "use_pass_subsurface_indirect", text="Indirect", toggle=True) row.prop(rl, "use_pass_subsurface_color", text="Color", toggle=True) + col.label(text="Volume:") + row = col.row(align=True) + row.prop(crl, "use_pass_volume_direct", text="Direct", toggle=True) + row.prop(crl, "use_pass_volume_indirect", text="Indirect", toggle=True) col.separator() col.prop(rl, "use_pass_emit", text="Emission") col.prop(rl, "use_pass_environment") - if hasattr(rd, "debug_pass_type"): - layout.prop(rd, "debug_pass_type") + if context.scene.cycles.feature_set == 'EXPERIMENTAL': + col.separator() + sub = col.column() + sub.active = crl.use_denoising + sub.prop(crl, "denoising_store_passes", text="Denoising") + + col = layout.column() + col.prop(crl, "pass_debug_render_time") + if _cycles.with_cycles_debug: + col.prop(crl, "pass_debug_bvh_traversed_nodes") + col.prop(crl, "pass_debug_bvh_traversed_instances") + col.prop(crl, "pass_debug_bvh_intersections") + col.prop(crl, "pass_debug_ray_bounces") -class CyclesRender_PT_views(CyclesButtonsPanel, Panel): +class CYCLES_RENDER_PT_views(CyclesButtonsPanel, Panel): bl_label = "Views" bl_context = "render_layer" bl_options = {'DEFAULT_CLOSED'} @@ -577,7 +606,71 @@ class CyclesRender_PT_views(CyclesButtonsPanel, Panel): row.prop(rv, "camera_suffix", text="") -class Cycles_PT_post_processing(CyclesButtonsPanel, Panel): +class CYCLES_RENDER_PT_denoising(CyclesButtonsPanel, Panel): + bl_label = "Denoising" + bl_context = "render_layer" + bl_options = {'DEFAULT_CLOSED'} + + def draw_header(self, context): + rd = context.scene.render + rl = rd.layers.active + crl = rl.cycles + cscene = context.scene.cycles + layout = self.layout + + layout.prop(crl, "use_denoising", text="") + + def draw(self, context): + layout = self.layout + + scene = context.scene + cscene = scene.cycles + rd = scene.render + rl = rd.layers.active + crl = rl.cycles + + layout.active = crl.use_denoising + + split = layout.split() + + col = split.column() + sub = col.column(align=True) + sub.prop(crl, "denoising_radius", text="Radius") + sub.prop(crl, "denoising_strength", slider=True, text="Strength") + + col = split.column() + sub = col.column(align=True) + sub.prop(crl, "denoising_feature_strength", slider=True, text="Feature Strength") + sub.prop(crl, "denoising_relative_pca") + + layout.separator() + + row = layout.row() + row.label(text="Diffuse:") + sub = row.row(align=True) + sub.prop(crl, "denoising_diffuse_direct", text="Direct", toggle=True) + sub.prop(crl, "denoising_diffuse_indirect", text="Indirect", toggle=True) + + row = layout.row() + row.label(text="Glossy:") + sub = row.row(align=True) + sub.prop(crl, "denoising_glossy_direct", text="Direct", toggle=True) + sub.prop(crl, "denoising_glossy_indirect", text="Indirect", toggle=True) + + row = layout.row() + row.label(text="Transmission:") + sub = row.row(align=True) + sub.prop(crl, "denoising_transmission_direct", text="Direct", toggle=True) + sub.prop(crl, "denoising_transmission_indirect", text="Indirect", toggle=True) + + row = layout.row() + row.label(text="Subsurface:") + sub = row.row(align=True) + sub.prop(crl, "denoising_subsurface_direct", text="Direct", toggle=True) + sub.prop(crl, "denoising_subsurface_indirect", text="Indirect", toggle=True) + + +class CYCLES_PT_post_processing(CyclesButtonsPanel, Panel): bl_label = "Post Processing" bl_options = {'DEFAULT_CLOSED'} @@ -596,7 +689,7 @@ class Cycles_PT_post_processing(CyclesButtonsPanel, Panel): col.prop(rd, "dither_intensity", text="Dither", slider=True) -class CyclesCamera_PT_dof(CyclesButtonsPanel, Panel): +class CYCLES_CAMERA_PT_dof(CyclesButtonsPanel, Panel): bl_label = "Depth of Field" bl_context = "data" @@ -647,7 +740,7 @@ class CyclesCamera_PT_dof(CyclesButtonsPanel, Panel): sub.prop(ccam, "aperture_ratio", text="Ratio") -class Cycles_PT_context_material(CyclesButtonsPanel, Panel): +class CYCLES_PT_context_material(CyclesButtonsPanel, Panel): bl_label = "" bl_context = "material" bl_options = {'HIDE_HEADER'} @@ -707,7 +800,7 @@ class Cycles_PT_context_material(CyclesButtonsPanel, Panel): split.separator() -class CyclesObject_PT_motion_blur(CyclesButtonsPanel, Panel): +class CYCLES_OBJECT_PT_motion_blur(CyclesButtonsPanel, Panel): bl_label = "Motion Blur" bl_context = "object" bl_options = {'DEFAULT_CLOSED'} @@ -716,7 +809,7 @@ class CyclesObject_PT_motion_blur(CyclesButtonsPanel, Panel): def poll(cls, context): ob = context.object if CyclesButtonsPanel.poll(context) and ob: - if ob.type in {'MESH', 'CURVE', 'CURVE', 'SURFACE', 'FONT', 'META'}: + if ob.type in {'MESH', 'CURVE', 'CURVE', 'SURFACE', 'FONT', 'META', 'CAMERA'}: return True if ob.dupli_type == 'GROUP' and ob.dupli_group: return True @@ -748,14 +841,12 @@ class CyclesObject_PT_motion_blur(CyclesButtonsPanel, Panel): layout.active = (rd.use_motion_blur and cob.use_motion_blur) row = layout.row() - row.prop(cob, "use_deform_motion", text="Deformation") - - sub = row.row() - sub.active = cob.use_deform_motion - sub.prop(cob, "motion_steps", text="Steps") + if ob.type != 'CAMERA': + row.prop(cob, "use_deform_motion", text="Deformation") + row.prop(cob, "motion_steps", text="Steps") -class CyclesObject_PT_cycles_settings(CyclesButtonsPanel, Panel): +class CYCLES_OBJECT_PT_cycles_settings(CyclesButtonsPanel, Panel): bl_label = "Cycles Settings" bl_context = "object" bl_options = {'DEFAULT_CLOSED'} @@ -788,6 +879,10 @@ class CyclesObject_PT_cycles_settings(CyclesButtonsPanel, Panel): if ob.type != 'LAMP': flow.prop(visibility, "shadow") + row = layout.row() + row.prop(cob, "is_shadow_catcher") + row.prop(cob, "is_holdout") + col = layout.column() col.label(text="Performance:") row = col.row() @@ -862,7 +957,7 @@ def panel_node_draw(layout, id_data, output_type, input_name): return True -class CyclesLamp_PT_preview(CyclesButtonsPanel, Panel): +class CYCLES_LAMP_PT_preview(CyclesButtonsPanel, Panel): bl_label = "Preview" bl_context = "data" bl_options = {'DEFAULT_CLOSED'} @@ -878,7 +973,7 @@ class CyclesLamp_PT_preview(CyclesButtonsPanel, Panel): self.layout.template_preview(context.lamp) -class CyclesLamp_PT_lamp(CyclesButtonsPanel, Panel): +class CYCLES_LAMP_PT_lamp(CyclesButtonsPanel, Panel): bl_label = "Lamp" bl_context = "data" @@ -932,7 +1027,7 @@ class CyclesLamp_PT_lamp(CyclesButtonsPanel, Panel): layout.label(text="Not supported, interpreted as sun lamp") -class CyclesLamp_PT_nodes(CyclesButtonsPanel, Panel): +class CYCLES_LAMP_PT_nodes(CyclesButtonsPanel, Panel): bl_label = "Nodes" bl_context = "data" @@ -950,7 +1045,7 @@ class CyclesLamp_PT_nodes(CyclesButtonsPanel, Panel): layout.prop(lamp, "color") -class CyclesLamp_PT_spot(CyclesButtonsPanel, Panel): +class CYCLES_LAMP_PT_spot(CyclesButtonsPanel, Panel): bl_label = "Spot Shape" bl_context = "data" @@ -975,7 +1070,7 @@ class CyclesLamp_PT_spot(CyclesButtonsPanel, Panel): col.prop(lamp, "show_cone") -class CyclesWorld_PT_preview(CyclesButtonsPanel, Panel): +class CYCLES_WORLD_PT_preview(CyclesButtonsPanel, Panel): bl_label = "Preview" bl_context = "world" bl_options = {'DEFAULT_CLOSED'} @@ -988,7 +1083,7 @@ class CyclesWorld_PT_preview(CyclesButtonsPanel, Panel): self.layout.template_preview(context.world) -class CyclesWorld_PT_surface(CyclesButtonsPanel, Panel): +class CYCLES_WORLD_PT_surface(CyclesButtonsPanel, Panel): bl_label = "Surface" bl_context = "world" @@ -1005,7 +1100,7 @@ class CyclesWorld_PT_surface(CyclesButtonsPanel, Panel): layout.prop(world, "horizon_color", text="Color") -class CyclesWorld_PT_volume(CyclesButtonsPanel, Panel): +class CYCLES_WORLD_PT_volume(CyclesButtonsPanel, Panel): bl_label = "Volume" bl_context = "world" bl_options = {'DEFAULT_CLOSED'} @@ -1022,7 +1117,7 @@ class CyclesWorld_PT_volume(CyclesButtonsPanel, Panel): panel_node_draw(layout, world, 'OUTPUT_WORLD', 'Volume') -class CyclesWorld_PT_ambient_occlusion(CyclesButtonsPanel, Panel): +class CYCLES_WORLD_PT_ambient_occlusion(CyclesButtonsPanel, Panel): bl_label = "Ambient Occlusion" bl_context = "world" @@ -1047,7 +1142,7 @@ class CyclesWorld_PT_ambient_occlusion(CyclesButtonsPanel, Panel): row.prop(light, "distance", text="Distance") -class CyclesWorld_PT_mist(CyclesButtonsPanel, Panel): +class CYCLES_WORLD_PT_mist(CyclesButtonsPanel, Panel): bl_label = "Mist Pass" bl_context = "world" bl_options = {'DEFAULT_CLOSED'} @@ -1074,7 +1169,7 @@ class CyclesWorld_PT_mist(CyclesButtonsPanel, Panel): layout.prop(world.mist_settings, "falloff") -class CyclesWorld_PT_ray_visibility(CyclesButtonsPanel, Panel): +class CYCLES_WORLD_PT_ray_visibility(CyclesButtonsPanel, Panel): bl_label = "Ray Visibility" bl_context = "world" bl_options = {'DEFAULT_CLOSED'} @@ -1098,7 +1193,7 @@ class CyclesWorld_PT_ray_visibility(CyclesButtonsPanel, Panel): flow.prop(visibility, "scatter") -class CyclesWorld_PT_settings(CyclesButtonsPanel, Panel): +class CYCLES_WORLD_PT_settings(CyclesButtonsPanel, Panel): bl_label = "Settings" bl_context = "world" bl_options = {'DEFAULT_CLOSED'} @@ -1135,11 +1230,11 @@ class CyclesWorld_PT_settings(CyclesButtonsPanel, Panel): sub = col.column() sub.active = use_cpu(context) sub.prop(cworld, "volume_sampling", text="") - sub.prop(cworld, "volume_interpolation", text="") + col.prop(cworld, "volume_interpolation", text="") col.prop(cworld, "homogeneous_volume", text="Homogeneous") -class CyclesMaterial_PT_preview(CyclesButtonsPanel, Panel): +class CYCLES_MATERIAL_PT_preview(CyclesButtonsPanel, Panel): bl_label = "Preview" bl_context = "material" bl_options = {'DEFAULT_CLOSED'} @@ -1152,7 +1247,7 @@ class CyclesMaterial_PT_preview(CyclesButtonsPanel, Panel): self.layout.template_preview(context.material) -class CyclesMaterial_PT_surface(CyclesButtonsPanel, Panel): +class CYCLES_MATERIAL_PT_surface(CyclesButtonsPanel, Panel): bl_label = "Surface" bl_context = "material" @@ -1168,7 +1263,7 @@ class CyclesMaterial_PT_surface(CyclesButtonsPanel, Panel): layout.prop(mat, "diffuse_color") -class CyclesMaterial_PT_volume(CyclesButtonsPanel, Panel): +class CYCLES_MATERIAL_PT_volume(CyclesButtonsPanel, Panel): bl_label = "Volume" bl_context = "material" bl_options = {'DEFAULT_CLOSED'} @@ -1187,7 +1282,7 @@ class CyclesMaterial_PT_volume(CyclesButtonsPanel, Panel): panel_node_draw(layout, mat, 'OUTPUT_MATERIAL', 'Volume') -class CyclesMaterial_PT_displacement(CyclesButtonsPanel, Panel): +class CYCLES_MATERIAL_PT_displacement(CyclesButtonsPanel, Panel): bl_label = "Displacement" bl_context = "material" @@ -1203,10 +1298,9 @@ class CyclesMaterial_PT_displacement(CyclesButtonsPanel, Panel): panel_node_draw(layout, mat, 'OUTPUT_MATERIAL', 'Displacement') -class CyclesMaterial_PT_settings(CyclesButtonsPanel, Panel): +class CYCLES_MATERIAL_PT_settings(CyclesButtonsPanel, Panel): bl_label = "Settings" bl_context = "material" - bl_options = {'DEFAULT_CLOSED'} @classmethod def poll(cls, context): @@ -1224,41 +1318,53 @@ class CyclesMaterial_PT_settings(CyclesButtonsPanel, Panel): col.prop(cmat, "sample_as_light", text="Multiple Importance") col.prop(cmat, "use_transparent_shadow") - if context.scene.cycles.feature_set == 'EXPERIMENTAL': - col.separator() - col.label(text="Displacement:") - col.prop(cmat, "displacement_method", text="") + col.separator() + col.label(text="Geometry:") + col.prop(cmat, "displacement_method", text="") col = split.column() col.label(text="Volume:") sub = col.column() sub.active = use_cpu(context) sub.prop(cmat, "volume_sampling", text="") - sub.prop(cmat, "volume_interpolation", text="") + col.prop(cmat, "volume_interpolation", text="") col.prop(cmat, "homogeneous_volume", text="Homogeneous") - layout.separator() + col.separator() + col.prop(mat, "pass_index") + + +class CYCLES_MATERIAL_PT_viewport(CyclesButtonsPanel, Panel): + bl_label = "Viewport" + bl_context = "material" + bl_options = {'DEFAULT_CLOSED'} + + @classmethod + def poll(cls, context): + return context.material and CyclesButtonsPanel.poll(context) + + def draw(self, context): + mat = context.material + + layout = self.layout split = layout.split() col = split.column(align=True) - col.label("Viewport Color:") + col.label("Color:") col.prop(mat, "diffuse_color", text="") col.prop(mat, "alpha") col.separator() - col.label("Viewport Alpha:") + col.label("Alpha:") col.prop(mat.game_settings, "alpha_blend", text="") col = split.column(align=True) - col.label("Viewport Specular:") + col.label("Specular:") col.prop(mat, "specular_color", text="") col.prop(mat, "specular_hardness", text="Hardness") - col.separator() - col.prop(mat, "pass_index") - -class CyclesTexture_PT_context(CyclesButtonsPanel, Panel): +class CYCLES_TEXTURE_PT_context(CyclesButtonsPanel, Panel): bl_label = "" bl_context = "texture" bl_options = {'HIDE_HEADER'} @@ -1299,7 +1405,7 @@ class CyclesTexture_PT_context(CyclesButtonsPanel, Panel): split.prop(tex, "type", text="") -class CyclesTexture_PT_node(CyclesButtonsPanel, Panel): +class CYCLES_TEXTURE_PT_node(CyclesButtonsPanel, Panel): bl_label = "Node" bl_context = "texture" @@ -1316,7 +1422,7 @@ class CyclesTexture_PT_node(CyclesButtonsPanel, Panel): layout.template_node_view(ntree, node, None) -class CyclesTexture_PT_mapping(CyclesButtonsPanel, Panel): +class CYCLES_TEXTURE_PT_mapping(CyclesButtonsPanel, Panel): bl_label = "Mapping" bl_context = "texture" @@ -1349,7 +1455,7 @@ class CyclesTexture_PT_mapping(CyclesButtonsPanel, Panel): row.prop(mapping, "mapping_z", text="") -class CyclesTexture_PT_colors(CyclesButtonsPanel, Panel): +class CYCLES_TEXTURE_PT_colors(CyclesButtonsPanel, Panel): bl_label = "Color" bl_context = "texture" bl_options = {'DEFAULT_CLOSED'} @@ -1388,7 +1494,7 @@ class CyclesTexture_PT_colors(CyclesButtonsPanel, Panel): layout.template_color_ramp(mapping, "color_ramp", expand=True) -class CyclesParticle_PT_textures(CyclesButtonsPanel, Panel): +class CYCLES_PARTICLE_PT_textures(CyclesButtonsPanel, Panel): bl_label = "Textures" bl_context = "particle" bl_options = {'DEFAULT_CLOSED'} @@ -1419,7 +1525,7 @@ class CyclesParticle_PT_textures(CyclesButtonsPanel, Panel): layout.template_ID(slot, "texture", new="texture.new") -class CyclesRender_PT_bake(CyclesButtonsPanel, Panel): +class CYCLES_RENDER_PT_bake(CyclesButtonsPanel, Panel): bl_label = "Bake" bl_context = "render" bl_options = {'DEFAULT_CLOSED'} @@ -1492,7 +1598,7 @@ class CyclesRender_PT_bake(CyclesButtonsPanel, Panel): sub.prop(cbk, "cage_extrusion", text="Ray Distance") -class CyclesRender_PT_debug(CyclesButtonsPanel, Panel): +class CYCLES_RENDER_PT_debug(CyclesButtonsPanel, Panel): bl_label = "Debug" bl_context = "render" bl_options = {'DEFAULT_CLOSED'} @@ -1517,20 +1623,33 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel): row.prop(cscene, "debug_use_cpu_sse41", toggle=True) row.prop(cscene, "debug_use_cpu_avx", toggle=True) row.prop(cscene, "debug_use_cpu_avx2", toggle=True) - col.prop(cscene, "debug_use_qbvh") + col.prop(cscene, "debug_bvh_layout") + col.prop(cscene, "debug_use_cpu_split_kernel") + + col.separator() col = layout.column() col.label('CUDA Flags:') col.prop(cscene, "debug_use_cuda_adaptive_compile") + col.prop(cscene, "debug_use_cuda_split_kernel") + + col.separator() col = layout.column() col.label('OpenCL Flags:') col.prop(cscene, "debug_opencl_kernel_type", text="Kernel") col.prop(cscene, "debug_opencl_device_type", text="Device") + col.prop(cscene, "debug_opencl_kernel_single_program", text="Single Program") col.prop(cscene, "debug_use_opencl_debug", text="Debug") + col.prop(cscene, "debug_opencl_mem_limit") + + col.separator() + + col = layout.column() + col.prop(cscene, "debug_bvh_type") -class CyclesParticle_PT_CurveSettings(CyclesButtonsPanel, Panel): +class CYCLES_PARTICLE_PT_curve_settings(CyclesButtonsPanel, Panel): bl_label = "Cycles Hair Settings" bl_context = "particle" @@ -1561,7 +1680,7 @@ class CyclesParticle_PT_CurveSettings(CyclesButtonsPanel, Panel): row.prop(cpsys, "use_closetip", text="Close tip") -class CyclesScene_PT_simplify(CyclesButtonsPanel, Panel): +class CYCLES_SCENE_PT_simplify(CyclesButtonsPanel, Panel): bl_label = "Simplify" bl_context = "scene" COMPAT_ENGINES = {'CYCLES'} @@ -1630,10 +1749,10 @@ def draw_device(self, context): layout.prop(cscene, "feature_set") - split = layout.split(percentage=1/3) + split = layout.split(percentage=1 / 3) split.label("Device:") row = split.row() - row.active = show_device_selection(context) + row.active = show_device_active(context) row.prop(cscene, "device", text="") if engine.with_osl() and use_cpu(context): @@ -1712,17 +1831,77 @@ def get_panels(): return panels + +classes = ( + CYCLES_MT_sampling_presets, + CYCLES_MT_integrator_presets, + CYCLES_RENDER_PT_sampling, + CYCLES_RENDER_PT_geometry, + CYCLES_RENDER_PT_light_paths, + CYCLES_RENDER_PT_motion_blur, + CYCLES_RENDER_PT_film, + CYCLES_RENDER_PT_performance, + CYCLES_RENDER_PT_layer_options, + CYCLES_RENDER_PT_layer_passes, + CYCLES_RENDER_PT_views, + CYCLES_RENDER_PT_denoising, + CYCLES_PT_post_processing, + CYCLES_CAMERA_PT_dof, + CYCLES_PT_context_material, + CYCLES_OBJECT_PT_motion_blur, + CYCLES_OBJECT_PT_cycles_settings, + CYCLES_OT_use_shading_nodes, + CYCLES_LAMP_PT_preview, + CYCLES_LAMP_PT_lamp, + CYCLES_LAMP_PT_nodes, + CYCLES_LAMP_PT_spot, + CYCLES_WORLD_PT_preview, + CYCLES_WORLD_PT_surface, + CYCLES_WORLD_PT_volume, + CYCLES_WORLD_PT_ambient_occlusion, + CYCLES_WORLD_PT_mist, + CYCLES_WORLD_PT_ray_visibility, + CYCLES_WORLD_PT_settings, + CYCLES_MATERIAL_PT_preview, + CYCLES_MATERIAL_PT_surface, + CYCLES_MATERIAL_PT_volume, + CYCLES_MATERIAL_PT_displacement, + CYCLES_MATERIAL_PT_settings, + CYCLES_MATERIAL_PT_viewport, + CYCLES_TEXTURE_PT_context, + CYCLES_TEXTURE_PT_node, + CYCLES_TEXTURE_PT_mapping, + CYCLES_TEXTURE_PT_colors, + CYCLES_PARTICLE_PT_textures, + CYCLES_RENDER_PT_bake, + CYCLES_RENDER_PT_debug, + CYCLES_PARTICLE_PT_curve_settings, + CYCLES_SCENE_PT_simplify, +) + + def register(): + from bpy.utils import register_class + bpy.types.RENDER_PT_render.append(draw_device) bpy.types.VIEW3D_HT_header.append(draw_pause) for panel in get_panels(): panel.COMPAT_ENGINES.add('CYCLES') + for cls in classes: + register_class(cls) + + def unregister(): + from bpy.utils import unregister_class + bpy.types.RENDER_PT_render.remove(draw_device) bpy.types.VIEW3D_HT_header.remove(draw_pause) for panel in get_panels(): if 'CYCLES' in panel.COMPAT_ENGINES: panel.COMPAT_ENGINES.remove('CYCLES') + + for cls in classes: + unregister_class(cls) diff --git a/intern/cycles/blender/addon/version_update.py b/intern/cycles/blender/addon/version_update.py index b2a745500a1..292f0a1fa90 100644 --- a/intern/cycles/blender/addon/version_update.py +++ b/intern/cycles/blender/addon/version_update.py @@ -17,6 +17,7 @@ # <pep8 compliant> import bpy +import math from bpy.app.handlers import persistent @@ -89,6 +90,106 @@ def foreach_cycles_node(callback): traversed) +def displacement_node_insert(material, nodetree, traversed): + if nodetree in traversed: + return + traversed.add(nodetree) + + for node in nodetree.nodes: + if node.bl_idname == 'ShaderNodeGroup': + displacement_node_insert(material, node.node_tree, traversed) + + # Gather links to replace + displacement_links = [] + for link in nodetree.links: + if link.to_node.bl_idname == 'ShaderNodeOutputMaterial' and \ + link.from_node.bl_idname != 'ShaderNodeDisplacement' and \ + link.to_socket.identifier == 'Displacement': + displacement_links.append(link) + + # Replace links with displacement node + for link in displacement_links: + from_node = link.from_node + from_socket = link.from_socket + to_node = link.to_node + to_socket = link.to_socket + + nodetree.links.remove(link) + + node = nodetree.nodes.new(type='ShaderNodeDisplacement') + node.location[0] = 0.5 * (from_node.location[0] + to_node.location[0]); + node.location[1] = 0.5 * (from_node.location[1] + to_node.location[1]); + node.inputs['Scale'].default_value = 0.1 + node.inputs['Midlevel'].default_value = 0.0 + + nodetree.links.new(from_socket, node.inputs['Height']) + nodetree.links.new(node.outputs['Displacement'], to_socket) + +def displacement_nodes_insert(): + traversed = set() + for material in bpy.data.materials: + if check_is_new_shading_material(material): + displacement_node_insert(material, material.node_tree, traversed) + +def displacement_principled_nodes(node): + if node.bl_idname == 'ShaderNodeDisplacement': + if node.space != 'WORLD': + node.space = 'OBJECT' + if node.bl_idname == 'ShaderNodeBsdfPrincipled': + if node.subsurface_method != 'RANDOM_WALK': + node.subsurface_method = 'BURLEY' + +def square_roughness_node_insert(material, nodetree, traversed): + if nodetree in traversed: + return + traversed.add(nodetree) + + roughness_node_types = { + 'ShaderNodeBsdfAnisotropic', + 'ShaderNodeBsdfGlass', + 'ShaderNodeBsdfGlossy', + 'ShaderNodeBsdfRefraction'} + + # Update default values + for node in nodetree.nodes: + if node.bl_idname == 'ShaderNodeGroup': + square_roughness_node_insert(material, node.node_tree, traversed) + elif node.bl_idname in roughness_node_types: + roughness_input = node.inputs['Roughness'] + roughness_input.default_value = math.sqrt(max(roughness_input.default_value, 0.0)) + + # Gather roughness links to replace + roughness_links = [] + for link in nodetree.links: + if link.to_node.bl_idname in roughness_node_types and \ + link.to_socket.identifier == 'Roughness': + roughness_links.append(link) + + # Replace links with sqrt node + for link in roughness_links: + from_node = link.from_node + from_socket = link.from_socket + to_node = link.to_node + to_socket = link.to_socket + + nodetree.links.remove(link) + + node = nodetree.nodes.new(type='ShaderNodeMath') + node.operation = 'POWER' + node.location[0] = 0.5 * (from_node.location[0] + to_node.location[0]); + node.location[1] = 0.5 * (from_node.location[1] + to_node.location[1]); + + nodetree.links.new(from_socket, node.inputs[0]) + node.inputs[1].default_value = 0.5 + nodetree.links.new(node.outputs['Value'], to_socket) + +def square_roughness_nodes_insert(): + traversed = set() + for material in bpy.data.materials: + if check_is_new_shading_material(material): + square_roughness_node_insert(material, material.node_tree, traversed) + + def mapping_node_order_flip(node): """ Flip euler order of mapping shader node @@ -302,3 +403,31 @@ def do_versions(self): cscene = scene.cycles if not cscene.is_property_set("light_sampling_threshold"): cscene.light_sampling_threshold = 0.0 + + if bpy.data.version <= (2, 79, 0): + for scene in bpy.data.scenes: + cscene = scene.cycles + # Default changes + if not cscene.is_property_set("aa_samples"): + cscene.aa_samples = 4 + if not cscene.is_property_set("preview_aa_samples"): + cscene.preview_aa_samples = 4 + if not cscene.is_property_set("blur_glossy"): + cscene.blur_glossy = 0.0 + if not cscene.is_property_set("sample_clamp_indirect"): + cscene.sample_clamp_indirect = 0.0 + + if bpy.data.version <= (2, 79, 1): + displacement_nodes_insert() + + if bpy.data.version <= (2, 79, 2): + for mat in bpy.data.materials: + cmat = mat.cycles + if not cmat.is_property_set("displacement_method"): + cmat.displacement_method = 'BUMP' + + foreach_cycles_node(displacement_principled_nodes) + + if bpy.data.version <= (2, 79, 3): + # Switch to squared roughness convention + square_roughness_nodes_insert() diff --git a/intern/cycles/blender/blender_camera.cpp b/intern/cycles/blender/blender_camera.cpp index f02fc553908..f00ade320e7 100644 --- a/intern/cycles/blender/blender_camera.cpp +++ b/intern/cycles/blender/blender_camera.cpp @@ -14,13 +14,13 @@ * limitations under the License. */ -#include "camera.h" -#include "scene.h" +#include "render/camera.h" +#include "render/scene.h" -#include "blender_sync.h" -#include "blender_util.h" +#include "blender/blender_sync.h" +#include "blender/blender_util.h" -#include "util_logging.h" +#include "util/util_logging.h" CCL_NAMESPACE_BEGIN @@ -81,6 +81,10 @@ struct BlenderCamera { BoundBox2D viewport_camera_border; Transform matrix; + + float offscreen_dicing_scale; + + int motion_steps; }; static void blender_camera_init(BlenderCamera *bcam, @@ -104,6 +108,7 @@ static void blender_camera_init(BlenderCamera *bcam, bcam->pano_viewplane.top = 1.0f; bcam->viewport_camera_border.right = 1.0f; bcam->viewport_camera_border.top = 1.0f; + bcam->offscreen_dicing_scale = 1.0f; /* render resolution */ bcam->full_width = render_resolution_x(b_render); @@ -223,6 +228,8 @@ static void blender_camera_from_object(BlenderCamera *bcam, bcam->sensor_fit = BlenderCamera::HORIZONTAL; else bcam->sensor_fit = BlenderCamera::VERTICAL; + + bcam->motion_steps = object_motion_steps(b_ob, b_ob); } else { /* from lamp not implemented yet */ @@ -243,8 +250,7 @@ static Transform blender_camera_matrix(const Transform& tfm, result = tfm * make_transform(1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, - 0.0f, 1.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 0.0f, 1.0f); + 0.0f, 1.0f, 0.0f, 0.0f); } else { /* Make it so environment camera needs to be pointed in the direction @@ -254,8 +260,7 @@ static Transform blender_camera_matrix(const Transform& tfm, result = tfm * make_transform( 0.0f, -1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, - -1.0f, 0.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 0.0f, 1.0f); + -1.0f, 0.0f, 0.0f, 0.0f); } } else { @@ -353,7 +358,11 @@ static void blender_camera_viewplane(BlenderCamera *bcam, } } -static void blender_camera_sync(Camera *cam, BlenderCamera *bcam, int width, int height, const char *viewname) +static void blender_camera_sync(Camera *cam, + BlenderCamera *bcam, + int width, int height, + const char *viewname, + PointerRNA *cscene) { /* copy camera to compare later */ Camera prevcam = *cam; @@ -448,9 +457,7 @@ static void blender_camera_sync(Camera *cam, BlenderCamera *bcam, int width, int cam->matrix = blender_camera_matrix(bcam->matrix, bcam->type, bcam->panorama_type); - cam->motion.pre = cam->matrix; - cam->motion.post = cam->matrix; - cam->use_motion = false; + cam->motion.resize(bcam->motion_steps, cam->matrix); cam->use_perspective_motion = false; cam->shuttertime = bcam->shuttertime; cam->fov_pre = cam->fov; @@ -466,6 +473,9 @@ static void blender_camera_sync(Camera *cam, BlenderCamera *bcam, int width, int cam->border = bcam->border; cam->viewport_camera_border = bcam->viewport_camera_border; + bcam->offscreen_dicing_scale = RNA_float_get(cscene, "offscreen_dicing_scale"); + cam->offscreen_dicing_scale = bcam->offscreen_dicing_scale; + /* set update flag */ if(cam->modified(prevcam)) cam->tag_update(); @@ -525,7 +535,21 @@ void BlenderSync::sync_camera(BL::RenderSettings& b_render, /* sync */ Camera *cam = scene->camera; - blender_camera_sync(cam, &bcam, width, height, viewname); + blender_camera_sync(cam, &bcam, width, height, viewname, &cscene); + + /* dicing camera */ + b_ob = BL::Object(RNA_pointer_get(&cscene, "dicing_camera")); + if(b_ob) { + BL::Array<float, 16> b_ob_matrix; + blender_camera_from_object(&bcam, b_engine, b_ob); + b_engine.camera_model_matrix(b_ob, bcam.use_spherical_stereo, b_ob_matrix); + bcam.matrix = get_transform(b_ob_matrix); + + blender_camera_sync(scene->dicing_camera, &bcam, width, height, viewname, &cscene); + } + else { + *scene->dicing_camera = *cam; + } } void BlenderSync::sync_camera_motion(BL::RenderSettings& b_render, @@ -542,16 +566,15 @@ void BlenderSync::sync_camera_motion(BL::RenderSettings& b_render, Transform tfm = get_transform(b_ob_matrix); tfm = blender_camera_matrix(tfm, cam->type, cam->panorama_type); - if(tfm != cam->matrix) { - VLOG(1) << "Camera " << b_ob.name() << " motion detected."; - if(motion_time == -1.0f) { - cam->motion.pre = tfm; - cam->use_motion = true; - } - else if(motion_time == 1.0f) { - cam->motion.post = tfm; - cam->use_motion = true; - } + if(motion_time == 0.0f) { + /* When motion blur is not centered in frame, cam->matrix gets reset. */ + cam->matrix = tfm; + } + + /* Set transform in motion array. */ + int motion_step = cam->motion_step(motion_time); + if(motion_step >= 0) { + cam->motion[motion_step] = tfm; } if(cam->type == CAMERA_PERSPECTIVE) { @@ -573,7 +596,10 @@ void BlenderSync::sync_camera_motion(BL::RenderSettings& b_render, float fov = 2.0f * atanf((0.5f * sensor_size) / bcam.lens / aspectratio); if(fov != cam->fov) { VLOG(1) << "Camera " << b_ob.name() << " FOV change detected."; - if(motion_time == -1.0f) { + if(motion_time == 0.0f) { + cam->fov = fov; + } + else if(motion_time == -1.0f) { cam->fov_pre = fov; cam->use_perspective_motion = true; } @@ -811,7 +837,22 @@ void BlenderSync::sync_view(BL::SpaceView3D& b_v3d, b_v3d, b_rv3d, width, height); - blender_camera_sync(scene->camera, &bcam, width, height, ""); + PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles"); + blender_camera_sync(scene->camera, &bcam, width, height, "", &cscene); + + /* dicing camera */ + BL::Object b_ob = BL::Object(RNA_pointer_get(&cscene, "dicing_camera")); + if(b_ob) { + BL::Array<float, 16> b_ob_matrix; + blender_camera_from_object(&bcam, b_engine, b_ob); + b_engine.camera_model_matrix(b_ob, bcam.use_spherical_stereo, b_ob_matrix); + bcam.matrix = get_transform(b_ob_matrix); + + blender_camera_sync(scene->dicing_camera, &bcam, width, height, "", &cscene); + } + else { + *scene->dicing_camera = *scene->camera; + } } BufferParams BlenderSync::get_buffer_params(BL::RenderSettings& b_render, diff --git a/intern/cycles/blender/blender_curves.cpp b/intern/cycles/blender/blender_curves.cpp index e42ff5d72a6..daccb89f5a2 100644 --- a/intern/cycles/blender/blender_curves.cpp +++ b/intern/cycles/blender/blender_curves.cpp @@ -14,18 +14,19 @@ * limitations under the License. */ -#include "attribute.h" -#include "camera.h" -#include "curves.h" -#include "mesh.h" -#include "object.h" -#include "scene.h" +#include "render/attribute.h" +#include "render/camera.h" +#include "render/curves.h" +#include "render/mesh.h" +#include "render/object.h" +#include "render/scene.h" -#include "blender_sync.h" -#include "blender_util.h" +#include "blender/blender_sync.h" +#include "blender/blender_util.h" -#include "util_foreach.h" -#include "util_logging.h" +#include "util/util_foreach.h" +#include "util/util_hash.h" +#include "util/util_logging.h" CCL_NAMESPACE_BEGIN @@ -411,6 +412,7 @@ static void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData, } } + mesh->resize_mesh(mesh->verts.size(), mesh->num_triangles()); mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL); mesh->attributes.remove(ATTR_STD_FACE_NORMAL); mesh->add_face_normals(); @@ -434,8 +436,8 @@ static void ExportCurveTriangleGeometry(Mesh *mesh, if(CData->curve_keynum[curve] <= 1 || CData->curve_length[curve] == 0.0f) continue; - numverts += (CData->curve_keynum[curve] - 2)*2*resolution + resolution; - numtris += (CData->curve_keynum[curve] - 2)*resolution; + numverts += (CData->curve_keynum[curve] - 1)*resolution + resolution; + numtris += (CData->curve_keynum[curve] - 1)*2*resolution; } } @@ -545,6 +547,7 @@ static void ExportCurveTriangleGeometry(Mesh *mesh, } } + mesh->resize_mesh(mesh->verts.size(), mesh->num_triangles()); mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL); mesh->attributes.remove(ATTR_STD_FACE_NORMAL); mesh->add_face_normals(); @@ -563,9 +566,12 @@ static void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CDa return; Attribute *attr_intercept = NULL; + Attribute *attr_random = NULL; if(mesh->need_attribute(scene, ATTR_STD_CURVE_INTERCEPT)) attr_intercept = mesh->curve_attributes.add(ATTR_STD_CURVE_INTERCEPT); + if(mesh->need_attribute(scene, ATTR_STD_CURVE_RANDOM)) + attr_random = mesh->curve_attributes.add(ATTR_STD_CURVE_RANDOM); /* compute and reserve size of arrays */ for(int sys = 0; sys < CData->psys_firstcurve.size(); sys++) { @@ -610,6 +616,10 @@ static void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CDa num_curve_keys++; } + if(attr_random != NULL) { + attr_random->add(hash_int_01(num_curves)); + } + mesh->add_curve(num_keys, CData->psys_shader[sys]); num_keys += num_curve_keys; num_curves++; @@ -623,10 +633,10 @@ static void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CDa } } -static void ExportCurveSegmentsMotion(Mesh *mesh, ParticleCurveData *CData, int time_index) +static void ExportCurveSegmentsMotion(Mesh *mesh, ParticleCurveData *CData, int motion_step) { VLOG(1) << "Exporting curve motion segments for mesh " << mesh->name - << ", time index " << time_index; + << ", motion step " << motion_step; /* find attribute */ Attribute *attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); @@ -641,7 +651,7 @@ static void ExportCurveSegmentsMotion(Mesh *mesh, ParticleCurveData *CData, int /* export motion vectors for curve keys */ size_t numkeys = mesh->curve_keys.size(); - float4 *mP = attr_mP->data_float4() + time_index*numkeys; + float4 *mP = attr_mP->data_float4() + motion_step*numkeys; bool have_motion = false; int i = 0; @@ -692,12 +702,12 @@ static void ExportCurveSegmentsMotion(Mesh *mesh, ParticleCurveData *CData, int } mesh->curve_attributes.remove(ATTR_STD_MOTION_VERTEX_POSITION); } - else if(time_index > 0) { - VLOG(1) << "Filling in new motion vertex position for time_index " - << time_index; + else if(motion_step > 0) { + VLOG(1) << "Filling in new motion vertex position for motion_step " + << motion_step; /* motion, fill up previous steps that we might have skipped because * they had no motion, but we need them anyway now */ - for(int step = 0; step < time_index; step++) { + for(int step = 0; step < motion_step; step++) { float4 *mP = attr_mP->data_float4() + step*numkeys; for(int key = 0; key < numkeys; key++) { @@ -774,17 +784,17 @@ static void ExportCurveTriangleVcol(ParticleCurveData *CData, for(int curvekey = CData->curve_firstkey[curve]; curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1; curvekey++) { for(int section = 0; section < resol; section++) { - cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve])); + cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve])); vertexindex++; - cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve])); + cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve])); vertexindex++; - cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve])); + cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve])); vertexindex++; - cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve])); + cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve])); vertexindex++; - cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve])); + cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve])); vertexindex++; - cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear(CData->curve_vcol[curve])); + cdata[vertexindex] = color_float_to_byte(color_srgb_to_scene_linear_v3(CData->curve_vcol[curve])); vertexindex++; } } @@ -878,7 +888,7 @@ void BlenderSync::sync_curves(Mesh *mesh, BL::Mesh& b_mesh, BL::Object& b_ob, bool motion, - int time_index) + int motion_step) { if(!motion) { /* Clear stored curve data */ @@ -890,7 +900,7 @@ void BlenderSync::sync_curves(Mesh *mesh, } /* obtain general settings */ - bool use_curves = scene->curve_system_manager->use_curves; + const bool use_curves = scene->curve_system_manager->use_curves; if(!(use_curves && b_ob.mode() != b_ob.mode_PARTICLE_EDIT)) { if(!motion) @@ -898,11 +908,11 @@ void BlenderSync::sync_curves(Mesh *mesh, return; } - int primitive = scene->curve_system_manager->primitive; - int triangle_method = scene->curve_system_manager->triangle_method; - int resolution = scene->curve_system_manager->resolution; - size_t vert_num = mesh->verts.size(); - size_t tri_num = mesh->num_triangles(); + const int primitive = scene->curve_system_manager->primitive; + const int triangle_method = scene->curve_system_manager->triangle_method; + const int resolution = scene->curve_system_manager->resolution; + const size_t vert_num = mesh->verts.size(); + const size_t tri_num = mesh->num_triangles(); int used_res = 1; /* extract particle hair data - should be combined with connecting to mesh later*/ @@ -941,7 +951,7 @@ void BlenderSync::sync_curves(Mesh *mesh, } else { if(motion) - ExportCurveSegmentsMotion(mesh, &CData, time_index); + ExportCurveSegmentsMotion(mesh, &CData, motion_step); else ExportCurveSegments(scene, mesh, &CData); } @@ -1002,7 +1012,7 @@ void BlenderSync::sync_curves(Mesh *mesh, for(size_t curve = 0; curve < CData.curve_vcol.size(); curve++) if(!(CData.curve_keynum[curve] <= 1 || CData.curve_length[curve] == 0.0f)) - fdata[i++] = color_srgb_to_scene_linear(CData.curve_vcol[curve]); + fdata[i++] = color_srgb_to_scene_linear_v3(CData.curve_vcol[curve]); } } } diff --git a/intern/cycles/blender/blender_logging.cpp b/intern/cycles/blender/blender_logging.cpp index f4f86929168..d0f82e37662 100644 --- a/intern/cycles/blender/blender_logging.cpp +++ b/intern/cycles/blender/blender_logging.cpp @@ -14,8 +14,8 @@ * limitations under the License. */ -#include "CCL_api.h" -#include "util_logging.h" +#include "blender/CCL_api.h" +#include "util/util_logging.h" void CCL_init_logging(const char *argv0) { diff --git a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp index fdc287084eb..7d6ca18b074 100644 --- a/intern/cycles/blender/blender_mesh.cpp +++ b/intern/cycles/blender/blender_mesh.cpp @@ -14,23 +14,22 @@ * limitations under the License. */ - -#include "mesh.h" -#include "object.h" -#include "scene.h" -#include "camera.h" +#include "render/mesh.h" +#include "render/object.h" +#include "render/scene.h" +#include "render/camera.h" -#include "blender_sync.h" -#include "blender_session.h" -#include "blender_util.h" +#include "blender/blender_sync.h" +#include "blender/blender_session.h" +#include "blender/blender_util.h" -#include "subd_patch.h" -#include "subd_split.h" +#include "subd/subd_patch.h" +#include "subd/subd_split.h" -#include "util_algorithm.h" -#include "util_foreach.h" -#include "util_logging.h" -#include "util_math.h" +#include "util/util_algorithm.h" +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_math.h" #include "mikktspace.h" @@ -51,8 +50,7 @@ enum { * Two triangles has vertex indices in the original Blender-side face. * If face is already a quad tri_b will not be initialized. */ -inline void face_split_tri_indices(const int num_verts, - const int face_flag, +inline void face_split_tri_indices(const int face_flag, int tri_a[3], int tri_b[3]) { @@ -60,143 +58,253 @@ inline void face_split_tri_indices(const int num_verts, tri_a[0] = 0; tri_a[1] = 1; tri_a[2] = 3; - if(num_verts == 4) { - tri_b[0] = 2; - tri_b[1] = 3; - tri_b[2] = 1; - } + + tri_b[0] = 2; + tri_b[1] = 3; + tri_b[2] = 1; } - else /*if(face_flag & FACE_FLAG_DIVIDE_13)*/ { + else { + /* Quad with FACE_FLAG_DIVIDE_13 or single triangle. */ tri_a[0] = 0; tri_a[1] = 1; tri_a[2] = 2; - if(num_verts == 4) { - tri_b[0] = 0; - tri_b[1] = 2; - tri_b[2] = 3; - } + + tri_b[0] = 0; + tri_b[1] = 2; + tri_b[2] = 3; } } /* Tangent Space */ struct MikkUserData { - MikkUserData(const BL::Mesh& mesh_, - BL::MeshTextureFaceLayer *layer_, - int num_faces_) - : mesh(mesh_), layer(layer_), num_faces(num_faces_) + MikkUserData(const BL::Mesh& b_mesh, + const char *layer_name, + const Mesh *mesh, + float3 *tangent, + float *tangent_sign) + : mesh(mesh), + texface(NULL), + orco(NULL), + tangent(tangent), + tangent_sign(tangent_sign) { - tangent.resize(num_faces*4); + const AttributeSet& attributes = (mesh->subd_faces.size()) ? + mesh->subd_attributes : mesh->attributes; + + Attribute *attr_vN = attributes.find(ATTR_STD_VERTEX_NORMAL); + vertex_normal = attr_vN->data_float3(); + + if(layer_name == NULL) { + Attribute *attr_orco = attributes.find(ATTR_STD_GENERATED); + + if(attr_orco) { + orco = attr_orco->data_float3(); + mesh_texture_space(*(BL::Mesh*)&b_mesh, orco_loc, orco_size); + } + } + else { + Attribute *attr_uv = attributes.find(ustring(layer_name)); + if(attr_uv != NULL) { + texface = attr_uv->data_float3(); + } + } } - BL::Mesh mesh; - BL::MeshTextureFaceLayer *layer; + const Mesh *mesh; int num_faces; - vector<float4> tangent; + + float3 *vertex_normal; + float3 *texface; + float3 *orco; + float3 orco_loc, orco_size; + + float3 *tangent; + float *tangent_sign; }; static int mikk_get_num_faces(const SMikkTSpaceContext *context) { - MikkUserData *userdata = (MikkUserData*)context->m_pUserData; - return userdata->num_faces; + const MikkUserData *userdata = (const MikkUserData *)context->m_pUserData; + if(userdata->mesh->subd_faces.size()) { + return userdata->mesh->subd_faces.size(); + } + else { + return userdata->mesh->num_triangles(); + } } -static int mikk_get_num_verts_of_face(const SMikkTSpaceContext *context, const int face_num) +static int mikk_get_num_verts_of_face(const SMikkTSpaceContext *context, + const int face_num) { - MikkUserData *userdata = (MikkUserData*)context->m_pUserData; - BL::MeshTessFace f = userdata->mesh.tessfaces[face_num]; - int4 vi = get_int4(f.vertices_raw()); + const MikkUserData *userdata = (const MikkUserData *)context->m_pUserData; + if(userdata->mesh->subd_faces.size()) { + const Mesh *mesh = userdata->mesh; + return mesh->subd_faces[face_num].num_corners; + } + else { + return 3; + } +} - return (vi[3] == 0)? 3: 4; +static int mikk_vertex_index(const Mesh *mesh, const int face_num, const int vert_num) +{ + if(mesh->subd_faces.size()) { + const Mesh::SubdFace& face = mesh->subd_faces[face_num]; + return mesh->subd_face_corners[face.start_corner + vert_num]; + } + else { + return mesh->triangles[face_num * 3 + vert_num]; + } } -static void mikk_get_position(const SMikkTSpaceContext *context, float P[3], const int face_num, const int vert_num) +static int mikk_corner_index(const Mesh *mesh, const int face_num, const int vert_num) { - MikkUserData *userdata = (MikkUserData*)context->m_pUserData; - BL::MeshTessFace f = userdata->mesh.tessfaces[face_num]; - int4 vi = get_int4(f.vertices_raw()); - BL::MeshVertex v = userdata->mesh.vertices[vi[vert_num]]; - float3 vP = get_float3(v.co()); + if(mesh->subd_faces.size()) { + const Mesh::SubdFace& face = mesh->subd_faces[face_num]; + return face.start_corner + vert_num; + } + else { + return face_num * 3 + vert_num; + } +} +static void mikk_get_position(const SMikkTSpaceContext *context, + float P[3], + const int face_num, const int vert_num) +{ + const MikkUserData *userdata = (const MikkUserData *)context->m_pUserData; + const Mesh *mesh = userdata->mesh; + const int vertex_index = mikk_vertex_index(mesh, face_num, vert_num); + const float3 vP = mesh->verts[vertex_index]; P[0] = vP.x; P[1] = vP.y; P[2] = vP.z; } -static void mikk_get_texture_coordinate(const SMikkTSpaceContext *context, float uv[2], const int face_num, const int vert_num) +static void mikk_get_texture_coordinate(const SMikkTSpaceContext *context, + float uv[2], + const int face_num, const int vert_num) { - MikkUserData *userdata = (MikkUserData*)context->m_pUserData; - if(userdata->layer != NULL) { - BL::MeshTextureFace tf = userdata->layer->data[face_num]; - float3 tfuv; - - switch(vert_num) { - case 0: - tfuv = get_float3(tf.uv1()); - break; - case 1: - tfuv = get_float3(tf.uv2()); - break; - case 2: - tfuv = get_float3(tf.uv3()); - break; - default: - tfuv = get_float3(tf.uv4()); - break; - } - + const MikkUserData *userdata = (const MikkUserData *)context->m_pUserData; + const Mesh *mesh = userdata->mesh; + if(userdata->texface != NULL) { + const int corner_index = mikk_corner_index(mesh, face_num, vert_num); + float3 tfuv = userdata->texface[corner_index]; uv[0] = tfuv.x; uv[1] = tfuv.y; } - else { - int vert_idx = userdata->mesh.tessfaces[face_num].vertices()[vert_num]; - float3 orco = - get_float3(userdata->mesh.vertices[vert_idx].undeformed_co()); - float2 tmp = map_to_sphere(make_float3(orco[0], orco[1], orco[2])); + else if(userdata->orco != NULL) { + const int vertex_index = mikk_vertex_index(mesh, face_num, vert_num); + const float3 orco_loc = userdata->orco_loc; + const float3 orco_size = userdata->orco_size; + const float3 orco = (userdata->orco[vertex_index] + orco_loc) / orco_size; + + const float2 tmp = map_to_sphere(orco); uv[0] = tmp.x; uv[1] = tmp.y; } + else { + uv[0] = 0.0f; + uv[1] = 0.0f; + } } -static void mikk_get_normal(const SMikkTSpaceContext *context, float N[3], const int face_num, const int vert_num) +static void mikk_get_normal(const SMikkTSpaceContext *context, float N[3], + const int face_num, const int vert_num) { - MikkUserData *userdata = (MikkUserData*)context->m_pUserData; - BL::MeshTessFace f = userdata->mesh.tessfaces[face_num]; + const MikkUserData *userdata = (const MikkUserData *)context->m_pUserData; + const Mesh *mesh = userdata->mesh; float3 vN; - - if(f.use_smooth()) { - int4 vi = get_int4(f.vertices_raw()); - BL::MeshVertex v = userdata->mesh.vertices[vi[vert_num]]; - vN = get_float3(v.normal()); + if(mesh->subd_faces.size()) { + const Mesh::SubdFace& face = mesh->subd_faces[face_num]; + if(face.smooth) { + const int vertex_index = mikk_vertex_index(mesh, face_num, vert_num); + vN = userdata->vertex_normal[vertex_index]; + } + else { + vN = face.normal(mesh); + } } else { - vN = get_float3(f.normal()); + if(mesh->smooth[face_num]) { + const int vertex_index = mikk_vertex_index(mesh, face_num, vert_num); + vN = userdata->vertex_normal[vertex_index]; + } + else { + const Mesh::Triangle tri = mesh->get_triangle(face_num); + vN = tri.compute_normal(&mesh->verts[0]); + } } - N[0] = vN.x; N[1] = vN.y; N[2] = vN.z; } -static void mikk_set_tangent_space(const SMikkTSpaceContext *context, const float T[], const float sign, const int face, const int vert) +static void mikk_set_tangent_space(const SMikkTSpaceContext *context, + const float T[], + const float sign, + const int face_num, const int vert_num) { - MikkUserData *userdata = (MikkUserData*)context->m_pUserData; - - userdata->tangent[face*4 + vert] = make_float4(T[0], T[1], T[2], sign); + MikkUserData *userdata = (MikkUserData *)context->m_pUserData; + const Mesh *mesh = userdata->mesh; + const int corner_index = mikk_corner_index(mesh, face_num, vert_num); + userdata->tangent[corner_index] = make_float3(T[0], T[1], T[2]); + if(userdata->tangent_sign != NULL) { + userdata->tangent_sign[corner_index] = sign; + } } -static void mikk_compute_tangents(BL::Mesh& b_mesh, - BL::MeshTextureFaceLayer *b_layer, +static void mikk_compute_tangents(const BL::Mesh& b_mesh, + const char *layer_name, Mesh *mesh, - const vector<int>& nverts, - const vector<int>& face_flags, bool need_sign, bool active_render) { - /* setup userdata */ - MikkUserData userdata(b_mesh, b_layer, nverts.size()); + /* Create tangent attributes. */ + AttributeSet& attributes = (mesh->subd_faces.size()) ? + mesh->subd_attributes : mesh->attributes; + Attribute *attr; + ustring name; + if(layer_name != NULL) { + name = ustring((string(layer_name) + ".tangent").c_str()); + } + else { + name = ustring("orco.tangent"); + } + if(active_render) { + attr = attributes.add(ATTR_STD_UV_TANGENT, name); + } + else { + attr = attributes.add(name, TypeDesc::TypeVector, ATTR_ELEMENT_CORNER); + } + float3 *tangent = attr->data_float3(); + /* Create bitangent sign attribute. */ + float *tangent_sign = NULL; + if(need_sign) { + Attribute *attr_sign; + ustring name_sign; + if(layer_name != NULL) { + name_sign = ustring((string(layer_name) + + ".tangent_sign").c_str()); + } + else { + name_sign = ustring("orco.tangent_sign"); + } - /* setup interface */ + if(active_render) { + attr_sign = attributes.add(ATTR_STD_UV_TANGENT_SIGN, name_sign); + } + else { + attr_sign = attributes.add(name_sign, + TypeDesc::TypeFloat, + ATTR_ELEMENT_CORNER); + } + tangent_sign = attr_sign->data_float(); + } + /* Setup userdata. */ + MikkUserData userdata(b_mesh, layer_name, mesh, tangent, tangent_sign); + /* Setup interface. */ SMikkTSpaceInterface sm_interface; memset(&sm_interface, 0, sizeof(sm_interface)); sm_interface.m_getNumFaces = mikk_get_num_faces; @@ -205,80 +313,13 @@ static void mikk_compute_tangents(BL::Mesh& b_mesh, sm_interface.m_getTexCoord = mikk_get_texture_coordinate; sm_interface.m_getNormal = mikk_get_normal; sm_interface.m_setTSpaceBasic = mikk_set_tangent_space; - - /* setup context */ + /* Setup context. */ SMikkTSpaceContext context; memset(&context, 0, sizeof(context)); context.m_pUserData = &userdata; context.m_pInterface = &sm_interface; - - /* compute tangents */ + /* Compute tangents. */ genTangSpaceDefault(&context); - - /* create tangent attributes */ - Attribute *attr; - ustring name; - if(b_layer != NULL) - name = ustring((string(b_layer->name().c_str()) + ".tangent").c_str()); - else - name = ustring("orco.tangent"); - - if(active_render) - attr = mesh->attributes.add(ATTR_STD_UV_TANGENT, name); - else - attr = mesh->attributes.add(name, TypeDesc::TypeVector, ATTR_ELEMENT_CORNER); - - float3 *tangent = attr->data_float3(); - - /* create bitangent sign attribute */ - float *tangent_sign = NULL; - - if(need_sign) { - Attribute *attr_sign; - ustring name_sign; - if(b_layer != NULL) - name_sign = ustring((string(b_layer->name().c_str()) + ".tangent_sign").c_str()); - else - name_sign = ustring("orco.tangent_sign"); - - if(active_render) - attr_sign = mesh->attributes.add(ATTR_STD_UV_TANGENT_SIGN, name_sign); - else - attr_sign = mesh->attributes.add(name_sign, TypeDesc::TypeFloat, ATTR_ELEMENT_CORNER); - - tangent_sign = attr_sign->data_float(); - } - - for(int i = 0; i < nverts.size(); i++) { - int tri_a[3], tri_b[3]; - face_split_tri_indices(nverts[i], face_flags[i], tri_a, tri_b); - - tangent[0] = float4_to_float3(userdata.tangent[i*4 + tri_a[0]]); - tangent[1] = float4_to_float3(userdata.tangent[i*4 + tri_a[1]]); - tangent[2] = float4_to_float3(userdata.tangent[i*4 + tri_a[2]]); - tangent += 3; - - if(tangent_sign) { - tangent_sign[0] = userdata.tangent[i*4 + tri_a[0]].w; - tangent_sign[1] = userdata.tangent[i*4 + tri_a[1]].w; - tangent_sign[2] = userdata.tangent[i*4 + tri_a[2]].w; - tangent_sign += 3; - } - - if(nverts[i] == 4) { - tangent[0] = float4_to_float3(userdata.tangent[i*4 + tri_b[0]]); - tangent[1] = float4_to_float3(userdata.tangent[i*4 + tri_b[1]]); - tangent[2] = float4_to_float3(userdata.tangent[i*4 + tri_b[2]]); - tangent += 3; - - if(tangent_sign) { - tangent_sign[0] = userdata.tangent[i*4 + tri_b[0]].w; - tangent_sign[1] = userdata.tangent[i*4 + tri_b[1]].w; - tangent_sign[2] = userdata.tangent[i*4 + tri_b[2]].w; - tangent_sign += 3; - } - } - } } /* Create Volume Attribute */ @@ -293,11 +334,14 @@ static void create_mesh_volume_attribute(BL::Object& b_ob, if(!b_domain) return; - + + mesh->volume_isovalue = b_domain.clipping(); + Attribute *attr = mesh->attributes.add(std); VoxelAttribute *volume_data = attr->data_voxel(); - bool is_float, is_linear; + ImageMetaData metadata; bool animated = false; + bool use_alpha = true; volume_data->manager = image_manager; volume_data->slot = image_manager->add_image( @@ -305,11 +349,10 @@ static void create_mesh_volume_attribute(BL::Object& b_ob, b_ob.ptr.data, animated, frame, - is_float, - is_linear, INTERPOLATION_LINEAR, EXTENSION_CLIP, - true); + use_alpha, + metadata); } static void create_mesh_volume_attributes(Scene *scene, @@ -326,6 +369,8 @@ static void create_mesh_volume_attributes(Scene *scene, create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_FLAME, frame); if(mesh->need_attribute(scene, ATTR_STD_VOLUME_HEAT)) create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_HEAT, frame); + if(mesh->need_attribute(scene, ATTR_STD_VOLUME_TEMPERATURE)) + create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_TEMPERATURE, frame); if(mesh->need_attribute(scene, ATTR_STD_VOLUME_VELOCITY)) create_mesh_volume_attribute(b_ob, mesh, scene->image_manager, ATTR_STD_VOLUME_VELOCITY, frame); } @@ -356,7 +401,7 @@ static void attr_create_vertex_color(Scene *scene, int n = p->loop_total(); for(int i = 0; i < n; i++) { float3 color = get_float3(l->data[p->loop_start() + i].color()); - *(cdata++) = color_float_to_byte(color_srgb_to_scene_linear(color)); + *(cdata++) = color_float_to_byte(color_srgb_to_scene_linear_v3(color)); } } } @@ -377,14 +422,14 @@ static void attr_create_vertex_color(Scene *scene, for(l->data.begin(c); c != l->data.end(); ++c, ++i) { int tri_a[3], tri_b[3]; - face_split_tri_indices(nverts[i], face_flags[i], tri_a, tri_b); + face_split_tri_indices(face_flags[i], tri_a, tri_b); uchar4 colors[4]; - colors[0] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color1()))); - colors[1] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color2()))); - colors[2] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color3()))); + colors[0] = color_float_to_byte(color_srgb_to_scene_linear_v3(get_float3(c->color1()))); + colors[1] = color_float_to_byte(color_srgb_to_scene_linear_v3(get_float3(c->color2()))); + colors[2] = color_float_to_byte(color_srgb_to_scene_linear_v3(get_float3(c->color3()))); if(nverts[i] == 4) { - colors[3] = color_float_to_byte(color_srgb_to_scene_linear(get_float3(c->color4()))); + colors[3] = color_float_to_byte(color_srgb_to_scene_linear_v3(get_float3(c->color4()))); } cdata[0] = colors[tri_a[0]]; @@ -409,68 +454,50 @@ static void attr_create_uv_map(Scene *scene, Mesh *mesh, BL::Mesh& b_mesh, const vector<int>& nverts, - const vector<int>& face_flags, - bool subdivision, - bool subdivide_uvs) + const vector<int>& face_flags) { - if(subdivision) { - BL::Mesh::uv_layers_iterator l; - int i = 0; - - for(b_mesh.uv_layers.begin(l); l != b_mesh.uv_layers.end(); ++l, ++i) { - bool active_render = b_mesh.uv_textures[i].active_render(); - AttributeStandard std = (active_render)? ATTR_STD_UV: ATTR_STD_NONE; - ustring name = ustring(l->name().c_str()); - - /* UV map */ - if(mesh->need_attribute(scene, name) || mesh->need_attribute(scene, std)) { - Attribute *attr; - - if(active_render) - attr = mesh->subd_attributes.add(std, name); - else - attr = mesh->subd_attributes.add(name, TypeDesc::TypePoint, ATTR_ELEMENT_CORNER); - - if(subdivide_uvs) { - attr->flags |= ATTR_SUBDIVIDED; - } - - BL::Mesh::polygons_iterator p; - float3 *fdata = attr->data_float3(); - - for(b_mesh.polygons.begin(p); p != b_mesh.polygons.end(); ++p) { - int n = p->loop_total(); - for(int j = 0; j < n; j++) { - *(fdata++) = get_float3(l->data[p->loop_start() + j].uv()); - } - } - } - } - } - else if(b_mesh.tessface_uv_textures.length() != 0) { + if(b_mesh.tessface_uv_textures.length() != 0) { BL::Mesh::tessface_uv_textures_iterator l; for(b_mesh.tessface_uv_textures.begin(l); l != b_mesh.tessface_uv_textures.end(); ++l) { - bool active_render = l->active_render(); - AttributeStandard std = (active_render)? ATTR_STD_UV: ATTR_STD_NONE; - ustring name = ustring(l->name().c_str()); + const bool active_render = l->active_render(); + AttributeStandard uv_std = (active_render)? ATTR_STD_UV: ATTR_STD_NONE; + ustring uv_name = ustring(l->name().c_str()); + AttributeStandard tangent_std = (active_render)? ATTR_STD_UV_TANGENT + : ATTR_STD_NONE; + ustring tangent_name = ustring( + (string(l->name().c_str()) + ".tangent").c_str()); + + /* Denotes whether UV map was requested directly. */ + const bool need_uv = mesh->need_attribute(scene, uv_name) || + mesh->need_attribute(scene, uv_std); + /* Denotes whether tangent was requested directly. */ + const bool need_tangent = + mesh->need_attribute(scene, tangent_name) || + (active_render && mesh->need_attribute(scene, tangent_std)); /* UV map */ - if(mesh->need_attribute(scene, name) || mesh->need_attribute(scene, std)) { - Attribute *attr; - - if(active_render) - attr = mesh->attributes.add(std, name); - else - attr = mesh->attributes.add(name, TypeDesc::TypePoint, ATTR_ELEMENT_CORNER); + /* NOTE: We create temporary UV layer if its needed for tangent but + * wasn't requested by other nodes in shaders. + */ + Attribute *uv_attr = NULL; + if(need_uv || need_tangent) { + if(active_render) { + uv_attr = mesh->attributes.add(uv_std, uv_name); + } + else { + uv_attr = mesh->attributes.add(uv_name, + TypeDesc::TypePoint, + ATTR_ELEMENT_CORNER); + } BL::MeshTextureFaceLayer::data_iterator t; - float3 *fdata = attr->data_float3(); + float3 *fdata = uv_attr->data_float3(); size_t i = 0; for(l->data.begin(t); t != l->data.end(); ++t, ++i) { int tri_a[3], tri_b[3]; - face_split_tri_indices(nverts[i], face_flags[i], tri_a, tri_b); + face_split_tri_indices(face_flags[i], tri_a, tri_b); float3 uvs[4]; uvs[0] = get_float3(t->uv1()); @@ -495,33 +522,112 @@ static void attr_create_uv_map(Scene *scene, } /* UV tangent */ - std = (active_render)? ATTR_STD_UV_TANGENT: ATTR_STD_NONE; - name = ustring((string(l->name().c_str()) + ".tangent").c_str()); + if(need_tangent) { + AttributeStandard sign_std = + (active_render)? ATTR_STD_UV_TANGENT_SIGN + : ATTR_STD_NONE; + ustring sign_name = ustring( + (string(l->name().c_str()) + ".tangent_sign").c_str()); + bool need_sign = (mesh->need_attribute(scene, sign_name) || + mesh->need_attribute(scene, sign_std)); + mikk_compute_tangents(b_mesh, + l->name().c_str(), + mesh, + need_sign, + active_render); + } + /* Remove temporarily created UV attribute. */ + if(!need_uv && uv_attr != NULL) { + mesh->attributes.remove(uv_attr); + } + } + } + else if(mesh->need_attribute(scene, ATTR_STD_UV_TANGENT)) { + bool need_sign = mesh->need_attribute(scene, ATTR_STD_UV_TANGENT_SIGN); + mikk_compute_tangents(b_mesh, NULL, mesh, need_sign, true); + if(!mesh->need_attribute(scene, ATTR_STD_GENERATED)) { + mesh->attributes.remove(ATTR_STD_GENERATED); + } + } +} + +static void attr_create_subd_uv_map(Scene *scene, + Mesh *mesh, + BL::Mesh& b_mesh, + bool subdivide_uvs) +{ + if(b_mesh.uv_layers.length() != 0) { + BL::Mesh::uv_layers_iterator l; + int i = 0; + + for(b_mesh.uv_layers.begin(l); l != b_mesh.uv_layers.end(); ++l, ++i) { + bool active_render = b_mesh.uv_textures[i].active_render(); + AttributeStandard uv_std = (active_render)? ATTR_STD_UV: ATTR_STD_NONE; + ustring uv_name = ustring(l->name().c_str()); + AttributeStandard tangent_std = (active_render)? ATTR_STD_UV_TANGENT + : ATTR_STD_NONE; + ustring tangent_name = ustring( + (string(l->name().c_str()) + ".tangent").c_str()); + + /* Denotes whether UV map was requested directly. */ + const bool need_uv = mesh->need_attribute(scene, uv_name) || + mesh->need_attribute(scene, uv_std); + /* Denotes whether tangent was requested directly. */ + const bool need_tangent = + mesh->need_attribute(scene, tangent_name) || + (active_render && mesh->need_attribute(scene, tangent_std)); + + Attribute *uv_attr = NULL; + + /* UV map */ + if(need_uv || need_tangent) { + if(active_render) + uv_attr = mesh->subd_attributes.add(uv_std, uv_name); + else + uv_attr = mesh->subd_attributes.add(uv_name, TypeDesc::TypePoint, ATTR_ELEMENT_CORNER); - if(mesh->need_attribute(scene, name) || (active_render && mesh->need_attribute(scene, std))) { - std = (active_render)? ATTR_STD_UV_TANGENT_SIGN: ATTR_STD_NONE; - name = ustring((string(l->name().c_str()) + ".tangent_sign").c_str()); - bool need_sign = (mesh->need_attribute(scene, name) || mesh->need_attribute(scene, std)); + if(subdivide_uvs) { + uv_attr->flags |= ATTR_SUBDIVIDED; + } + + BL::Mesh::polygons_iterator p; + float3 *fdata = uv_attr->data_float3(); + for(b_mesh.polygons.begin(p); p != b_mesh.polygons.end(); ++p) { + int n = p->loop_total(); + for(int j = 0; j < n; j++) { + *(fdata++) = get_float3(l->data[p->loop_start() + j].uv()); + } + } + } + + /* UV tangent */ + if(need_tangent) { + AttributeStandard sign_std = + (active_render)? ATTR_STD_UV_TANGENT_SIGN + : ATTR_STD_NONE; + ustring sign_name = ustring( + (string(l->name().c_str()) + ".tangent_sign").c_str()); + bool need_sign = (mesh->need_attribute(scene, sign_name) || + mesh->need_attribute(scene, sign_std)); mikk_compute_tangents(b_mesh, - &(*l), + l->name().c_str(), mesh, - nverts, - face_flags, need_sign, active_render); } + /* Remove temporarily created UV attribute. */ + if(!need_uv && uv_attr != NULL) { + mesh->subd_attributes.remove(uv_attr); + } } } else if(mesh->need_attribute(scene, ATTR_STD_UV_TANGENT)) { bool need_sign = mesh->need_attribute(scene, ATTR_STD_UV_TANGENT_SIGN); - mikk_compute_tangents(b_mesh, - NULL, - mesh, - nverts, - face_flags, - need_sign, - true); + mikk_compute_tangents(b_mesh, NULL, mesh, need_sign, true); + if(!mesh->need_attribute(scene, ATTR_STD_GENERATED)) { + mesh->subd_attributes.remove(ATTR_STD_GENERATED); + } } } @@ -560,6 +666,9 @@ static void attr_create_pointiness(Scene *scene, return; } const int num_verts = b_mesh.vertices.length(); + if(num_verts == 0) { + return; + } /* STEP 1: Find out duplicated vertices and point duplicates to a single * original vertex. */ @@ -588,8 +697,8 @@ static void attr_create_pointiness(Scene *scene, sorted_vert_indeices[other_sorted_vert_index]; const float3 &other_vert_co = mesh->verts[other_vert_index]; /* We are too far away now, we wouldn't have duplicate. */ - if ((other_vert_co.x + other_vert_co.y + other_vert_co.z) - - (vert_co.x + vert_co.y + vert_co.z) > 3 * FLT_EPSILON) + if((other_vert_co.x + other_vert_co.y + other_vert_co.z) - + (vert_co.x + vert_co.y + vert_co.z) > 3 * FLT_EPSILON) { break; } @@ -717,6 +826,11 @@ static void create_mesh(Scene *scene, int numngons = 0; bool use_loop_normals = b_mesh.use_auto_smooth() && (mesh->subdivision_type != Mesh::SUBDIVISION_CATMULL_CLARK); + /* If no faces, create empty mesh. */ + if(numfaces == 0) { + return; + } + BL::Mesh::vertices_iterator v; BL::Mesh::tessfaces_iterator f; BL::Mesh::polygons_iterator p; @@ -751,7 +865,13 @@ static void create_mesh(Scene *scene, N = attr_N->data_float3(); /* create generated coordinates from undeformed coordinates */ - if(mesh->need_attribute(scene, ATTR_STD_GENERATED)) { + const bool need_default_tangent = + (subdivision == false) && + (b_mesh.tessface_uv_textures.length() == 0) && + (mesh->need_attribute(scene, ATTR_STD_UV_TANGENT)); + if(mesh->need_attribute(scene, ATTR_STD_GENERATED) || + need_default_tangent) + { Attribute *attr = attributes.add(ATTR_STD_GENERATED); attr->flags |= ATTR_SUBDIVIDED; @@ -761,8 +881,9 @@ static void create_mesh(Scene *scene, float3 *generated = attr->data_float3(); size_t i = 0; - for(b_mesh.vertices.begin(v); v != b_mesh.vertices.end(); ++v) + for(b_mesh.vertices.begin(v); v != b_mesh.vertices.end(); ++v) { generated[i++] = get_float3(v->undeformed_co())*size - loc; + } } /* create faces */ @@ -819,7 +940,7 @@ static void create_mesh(Scene *scene, int shader = clamp(p->material_index(), 0, used_shaders.size()-1); bool smooth = p->use_smooth() || use_loop_normals; - vi.reserve(n); + vi.resize(n); for(int i = 0; i < n; i++) { /* NOTE: Autosmooth is already taken care about. */ vi[i] = b_mesh.loops[p->loop_start() + i].vertex_index(); @@ -835,7 +956,13 @@ static void create_mesh(Scene *scene, */ attr_create_pointiness(scene, mesh, b_mesh, subdivision); attr_create_vertex_color(scene, mesh, b_mesh, nverts, face_flags, subdivision); - attr_create_uv_map(scene, mesh, b_mesh, nverts, face_flags, subdivision, subdivide_uvs); + + if(subdivision) { + attr_create_subd_uv_map(scene, mesh, b_mesh, subdivide_uvs); + } + else { + attr_create_uv_map(scene, mesh, b_mesh, nverts, face_flags); + } /* for volume objects, create a matrix to transform from object space to * mesh texture space. this does not work with deformations but that can @@ -898,8 +1025,8 @@ static void create_subd_mesh(Scene *scene, sdparams.dicing_rate = max(0.1f, RNA_float_get(&cobj, "dicing_rate") * dicing_rate); sdparams.max_level = max_subdivisions; - scene->camera->update(); - sdparams.camera = scene->camera; + scene->dicing_camera->update(scene); + sdparams.camera = scene->dicing_camera; sdparams.objecttoworld = get_transform(b_ob.matrix_world()); } @@ -979,7 +1106,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob, else used_shaders.push_back(scene->default_surface); } - + /* test if we need to sync */ int requested_geometry_flags = Mesh::GEOMETRY_NONE; if(render_layer.use_surfaces) { @@ -1003,7 +1130,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob, bool attribute_recalc = false; foreach(Shader *shader, mesh->used_shaders) - if(shader->need_update_attributes) + if(shader->need_update_mesh) attribute_recalc = true; if(!attribute_recalc) @@ -1014,16 +1141,23 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob, /* ensure we only sync instanced meshes once */ if(mesh_synced.find(mesh) != mesh_synced.end()) return mesh; - + mesh_synced.insert(mesh); /* create derived mesh */ - array<int> oldtriangle = mesh->triangles; - + array<int> oldtriangles; + array<Mesh::SubdFace> oldsubd_faces; + array<int> oldsubd_face_corners; + oldtriangles.steal_data(mesh->triangles); + oldsubd_faces.steal_data(mesh->subd_faces); + oldsubd_face_corners.steal_data(mesh->subd_face_corners); + /* compares curve_keys rather than strands in order to handle quick hair * adjustments in dynamic BVH - other methods could probably do this better*/ - array<float3> oldcurve_keys = mesh->curve_keys; - array<float> oldcurve_radius = mesh->curve_radius; + array<float3> oldcurve_keys; + array<float> oldcurve_radius; + oldcurve_keys.steal_data(mesh->curve_keys); + oldcurve_radius.steal_data(mesh->curve_radius); mesh->clear(); mesh->used_shaders = used_shaders; @@ -1077,7 +1211,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob, } /* free derived mesh */ - b_data.meshes.remove(b_mesh, false); + b_data.meshes.remove(b_mesh, false, true, false); } } mesh->geometry_flags = requested_geometry_flags; @@ -1086,29 +1220,12 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob, sync_mesh_fluid_motion(b_ob, scene, mesh); /* tag update */ - bool rebuild = false; - - if(oldtriangle.size() != mesh->triangles.size()) - rebuild = true; - else if(oldtriangle.size()) { - if(memcmp(&oldtriangle[0], &mesh->triangles[0], sizeof(int)*oldtriangle.size()) != 0) - rebuild = true; - } + bool rebuild = (oldtriangles != mesh->triangles) || + (oldsubd_faces != mesh->subd_faces) || + (oldsubd_face_corners != mesh->subd_face_corners) || + (oldcurve_keys != mesh->curve_keys) || + (oldcurve_radius != mesh->curve_radius); - if(oldcurve_keys.size() != mesh->curve_keys.size()) - rebuild = true; - else if(oldcurve_keys.size()) { - if(memcmp(&oldcurve_keys[0], &mesh->curve_keys[0], sizeof(float3)*oldcurve_keys.size()) != 0) - rebuild = true; - } - - if(oldcurve_radius.size() != mesh->curve_radius.size()) - rebuild = true; - else if(oldcurve_radius.size()) { - if(memcmp(&oldcurve_radius[0], &mesh->curve_radius[0], sizeof(float)*oldcurve_radius.size()) != 0) - rebuild = true; - } - mesh->tag_update(scene, rebuild); return mesh; @@ -1131,45 +1248,19 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob, if(mesh_synced.find(mesh) == mesh_synced.end()) return; - /* for motion pass always compute, for motion blur it can be disabled */ - int time_index = 0; - - if(scene->need_motion() == Scene::MOTION_BLUR) { - if(!mesh->use_motion_blur) - return; - - /* see if this mesh needs motion data at this time */ - vector<float> object_times = object->motion_times(); - bool found = false; - - foreach(float object_time, object_times) { - if(motion_time == object_time) { - found = true; - break; - } - else - time_index++; - } - - if(!found) - return; - } - else { - if(motion_time == -1.0f) - time_index = 0; - else if(motion_time == 1.0f) - time_index = 1; - else - return; + /* Find time matching motion step required by mesh. */ + int motion_step = mesh->motion_step(motion_time); + if(motion_step < 0) { + return; } /* skip empty meshes */ - size_t numverts = mesh->verts.size(); - size_t numkeys = mesh->curve_keys.size(); + const size_t numverts = mesh->verts.size(); + const size_t numkeys = mesh->curve_keys.size(); if(!numverts && !numkeys) return; - + /* skip objects without deforming modifiers. this is not totally reliable, * would need a more extensive check to see which objects are animated */ BL::Mesh b_mesh(PointerRNA_NULL); @@ -1202,9 +1293,9 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob, float3 *P = &mesh->verts[0]; float3 *N = (attr_N)? attr_N->data_float3(): NULL; - memcpy(attr_mP->data_float3() + time_index*numverts, P, sizeof(float3)*numverts); + memcpy(attr_mP->data_float3() + motion_step*numverts, P, sizeof(float3)*numverts); if(attr_mN) - memcpy(attr_mN->data_float3() + time_index*numverts, N, sizeof(float3)*numverts); + memcpy(attr_mN->data_float3() + motion_step*numverts, N, sizeof(float3)*numverts); } } @@ -1214,7 +1305,7 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob, if(attr_mP) { float3 *keys = &mesh->curve_keys[0]; - memcpy(attr_mP->data_float3() + time_index*numkeys, keys, sizeof(float3)*numkeys); + memcpy(attr_mP->data_float3() + motion_step*numkeys, keys, sizeof(float3)*numkeys); } } @@ -1223,13 +1314,12 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob, /* TODO(sergey): Perform preliminary check for number of verticies. */ if(numverts) { - /* find attributes */ + /* Find attributes. */ Attribute *attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); Attribute *attr_mN = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_NORMAL); Attribute *attr_N = mesh->attributes.find(ATTR_STD_VERTEX_NORMAL); bool new_attribute = false; - - /* add new attributes if they don't exist already */ + /* Add new attributes if they don't exist already. */ if(!attr_mP) { attr_mP = mesh->attributes.add(ATTR_STD_MOTION_VERTEX_POSITION); if(attr_N) @@ -1237,22 +1327,21 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob, new_attribute = true; } - - /* load vertex data from mesh */ - float3 *mP = attr_mP->data_float3() + time_index*numverts; - float3 *mN = (attr_mN)? attr_mN->data_float3() + time_index*numverts: NULL; - + /* Load vertex data from mesh. */ + float3 *mP = attr_mP->data_float3() + motion_step*numverts; + float3 *mN = (attr_mN)? attr_mN->data_float3() + motion_step*numverts: NULL; + /* NOTE: We don't copy more that existing amount of vertices to prevent + * possible memory corruption. + */ BL::Mesh::vertices_iterator v; int i = 0; - for(b_mesh.vertices.begin(v); v != b_mesh.vertices.end() && i < numverts; ++v, ++i) { mP[i] = get_float3(v->co()); if(mN) mN[i] = get_float3(v->normal()); } - - /* in case of new attribute, we verify if there really was any motion */ if(new_attribute) { + /* In case of new attribute, we verify if there really was any motion. */ if(b_mesh.vertices.length() != numverts || memcmp(mP, &mesh->verts[0], sizeof(float3)*numverts) == 0) { @@ -1269,28 +1358,37 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob, if(attr_mN) mesh->attributes.remove(ATTR_STD_MOTION_VERTEX_NORMAL); } - else if(time_index > 0) { + else if(motion_step > 0) { VLOG(1) << "Filling deformation motion for object " << b_ob.name(); /* motion, fill up previous steps that we might have skipped because * they had no motion, but we need them anyway now */ float3 *P = &mesh->verts[0]; float3 *N = (attr_N)? attr_N->data_float3(): NULL; - - for(int step = 0; step < time_index; step++) { + for(int step = 0; step < motion_step; step++) { memcpy(attr_mP->data_float3() + step*numverts, P, sizeof(float3)*numverts); if(attr_mN) memcpy(attr_mN->data_float3() + step*numverts, N, sizeof(float3)*numverts); } } } + else { + if(b_mesh.vertices.length() != numverts) { + VLOG(1) << "Topology differs, discarding motion blur for object " + << b_ob.name() << " at time " << motion_step; + memcpy(mP, &mesh->verts[0], sizeof(float3)*numverts); + if(mN != NULL) { + memcpy(mN, attr_N->data_float3(), sizeof(float3)*numverts); + } + } + } } /* hair motion */ if(numkeys) - sync_curves(mesh, b_mesh, b_ob, true, time_index); + sync_curves(mesh, b_mesh, b_ob, true, motion_step); /* free derived mesh */ - b_data.meshes.remove(b_mesh, false); + b_data.meshes.remove(b_mesh, false, true, false); } CCL_NAMESPACE_END diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp index 637cf7abda8..077ceb4ebef 100644 --- a/intern/cycles/blender/blender_object.cpp +++ b/intern/cycles/blender/blender_object.cpp @@ -14,24 +14,24 @@ * limitations under the License. */ -#include "camera.h" -#include "integrator.h" -#include "graph.h" -#include "light.h" -#include "mesh.h" -#include "object.h" -#include "scene.h" -#include "nodes.h" -#include "particles.h" -#include "shader.h" - -#include "blender_object_cull.h" -#include "blender_sync.h" -#include "blender_util.h" - -#include "util_foreach.h" -#include "util_hash.h" -#include "util_logging.h" +#include "render/camera.h" +#include "render/integrator.h" +#include "render/graph.h" +#include "render/light.h" +#include "render/mesh.h" +#include "render/object.h" +#include "render/scene.h" +#include "render/nodes.h" +#include "render/particles.h" +#include "render/shader.h" + +#include "blender/blender_object_cull.h" +#include "blender/blender_sync.h" +#include "blender/blender_util.h" + +#include "util/util_foreach.h" +#include "util/util_hash.h" +#include "util/util_logging.h" CCL_NAMESPACE_BEGIN @@ -63,8 +63,26 @@ bool BlenderSync::object_is_mesh(BL::Object& b_ob) { BL::ID b_ob_data = b_ob.data(); - return (b_ob_data && (b_ob_data.is_a(&RNA_Mesh) || - b_ob_data.is_a(&RNA_Curve) || b_ob_data.is_a(&RNA_MetaBall))); + if(!b_ob_data) { + return false; + } + + if(b_ob.type() == BL::Object::type_CURVE) { + /* Skip exporting curves without faces, overhead can be + * significant if there are many for path animation. */ + BL::Curve b_curve(b_ob.data()); + + return (b_curve.bevel_object() || + b_curve.extrude() != 0.0f || + b_curve.bevel_depth() != 0.0f || + b_curve.dimensions() == BL::Curve::dimensions_2D || + b_ob.modifiers.length()); + } + else { + return (b_ob_data.is_a(&RNA_Mesh) || + b_ob_data.is_a(&RNA_Curve) || + b_ob_data.is_a(&RNA_MetaBall)); + } } bool BlenderSync::object_is_light(BL::Object& b_ob) @@ -94,6 +112,7 @@ static uint object_ray_visibility(BL::Object& b_ob) void BlenderSync::sync_light(BL::Object& b_parent, int persistent_id[OBJECT_PERSISTENT_ID_SIZE], BL::Object& b_ob, + BL::DupliObject& b_dupli_ob, Transform& tfm, bool *use_portal) { @@ -175,6 +194,13 @@ void BlenderSync::sync_light(BL::Object& b_parent, light->max_bounces = get_int(clamp, "max_bounces"); + if(b_dupli_ob) { + light->random_id = b_dupli_ob.random_id(); + } + else { + light->random_id = hash_int_2d(hash_string(b_ob.name().c_str()), 0); + } + if(light->type == LIGHT_AREA) light->is_portal = get_boolean(clamp, "is_portal"); else @@ -253,7 +279,7 @@ Object *BlenderSync::sync_object(BL::Object& b_parent, if(object_is_light(b_ob)) { /* don't use lamps for excluded layers used as mask layer */ if(!motion && !((layer_flag & render_layer.holdout_layer) && (layer_flag & render_layer.exclude_layer))) - sync_light(b_parent, persistent_id, b_ob, tfm, use_portal); + sync_light(b_parent, persistent_id, b_ob, b_dupli_ob, tfm, use_portal); return NULL; } @@ -268,6 +294,31 @@ Object *BlenderSync::sync_object(BL::Object& b_parent, return NULL; } + /* Visibility flags for both parent and child. */ + PointerRNA cobject = RNA_pointer_get(&b_ob.ptr, "cycles"); + bool use_holdout = (layer_flag & render_layer.holdout_layer) != 0 || + get_boolean(cobject, "is_holdout"); + uint visibility = object_ray_visibility(b_ob) & PATH_RAY_ALL_VISIBILITY; + + if(b_parent.ptr.data != b_ob.ptr.data) { + visibility &= object_ray_visibility(b_parent); + } + + /* Make holdout objects on excluded layer invisible for non-camera rays. */ + if(use_holdout && (layer_flag & render_layer.exclude_layer)) { + visibility &= ~(PATH_RAY_ALL_VISIBILITY - PATH_RAY_CAMERA); + } + + /* Hide objects not on render layer from camera rays. */ + if(!(layer_flag & render_layer.layer)) { + visibility &= ~PATH_RAY_CAMERA; + } + + /* Don't export completely invisible objects. */ + if(visibility == 0) { + return NULL; + } + /* key to lookup object */ ObjectKey key(b_parent, persistent_id, b_ob); Object *object; @@ -276,22 +327,11 @@ Object *BlenderSync::sync_object(BL::Object& b_parent, if(motion) { object = object_map.find(key); - if(object && (scene->need_motion() == Scene::MOTION_PASS || - object_use_motion(b_parent, b_ob))) - { - /* object transformation */ - if(tfm != object->tfm) { - VLOG(1) << "Object " << b_ob.name() << " motion detected."; - if(motion_time == -1.0f || motion_time == 1.0f) { - object->use_motion = true; - } - } - - if(motion_time == -1.0f) { - object->motion.pre = tfm; - } - else if(motion_time == 1.0f) { - object->motion.post = tfm; + if(object && object->use_motion()) { + /* Set transform at matching motion time step. */ + int time_index = object->motion_step(motion_time); + if(time_index >= 0) { + object->motion[time_index] = tfm; } /* mesh deformation */ @@ -308,8 +348,6 @@ Object *BlenderSync::sync_object(BL::Object& b_parent, if(object_map.sync(&object, b_ob, b_parent, key)) object_updated = true; - bool use_holdout = (layer_flag & render_layer.holdout_layer) != 0; - /* mesh sync */ object->mesh = sync_mesh(b_ob, object_updated, hide_tris); @@ -322,27 +360,17 @@ Object *BlenderSync::sync_object(BL::Object& b_parent, object_updated = true; } - /* visibility flags for both parent and child */ - uint visibility = object_ray_visibility(b_ob) & PATH_RAY_ALL_VISIBILITY; - if(b_parent.ptr.data != b_ob.ptr.data) { - visibility &= object_ray_visibility(b_parent); - } - - /* make holdout objects on excluded layer invisible for non-camera rays */ - if(use_holdout && (layer_flag & render_layer.exclude_layer)) { - visibility &= ~(PATH_RAY_ALL_VISIBILITY - PATH_RAY_CAMERA); - } - - /* hide objects not on render layer from camera rays */ - if(!(layer_flag & render_layer.layer)) { - visibility &= ~PATH_RAY_CAMERA; - } - if(visibility != object->visibility) { object->visibility = visibility; object_updated = true; } + bool is_shadow_catcher = get_boolean(cobject, "is_shadow_catcher"); + if(is_shadow_catcher != object->is_shadow_catcher) { + object->is_shadow_catcher = is_shadow_catcher; + object_updated = true; + } + /* object sync * transform comparison should not be needed, but duplis don't work perfect * in the depsgraph and may not signal changes, so this is a workaround */ @@ -350,49 +378,50 @@ Object *BlenderSync::sync_object(BL::Object& b_parent, object->name = b_ob.name().c_str(); object->pass_id = b_ob.pass_index(); object->tfm = tfm; - object->motion.pre = transform_empty(); - object->motion.post = transform_empty(); - object->use_motion = false; + object->motion.clear(); /* motion blur */ - if(scene->need_motion() == Scene::MOTION_BLUR && object->mesh) { + Scene::MotionType need_motion = scene->need_motion(); + if(need_motion != Scene::MOTION_NONE && object->mesh) { Mesh *mesh = object->mesh; - mesh->use_motion_blur = false; + mesh->motion_steps = 0; - if(object_use_motion(b_parent, b_ob)) { - if(object_use_deform_motion(b_parent, b_ob)) { - mesh->motion_steps = object_motion_steps(b_ob); + uint motion_steps; + + if(scene->need_motion() == Scene::MOTION_BLUR) { + motion_steps = object_motion_steps(b_parent, b_ob); + if(motion_steps && object_use_deform_motion(b_parent, b_ob)) { + mesh->motion_steps = motion_steps; mesh->use_motion_blur = true; } - - vector<float> times = object->motion_times(); - foreach(float time, times) - motion_times.insert(time); } - } + else { + motion_steps = 3; + mesh->motion_steps = motion_steps; + } - /* random number */ - object->random_id = hash_string(object->name.c_str()); + object->motion.resize(motion_steps, transform_empty()); - if(persistent_id) { - for(int i = 0; i < OBJECT_PERSISTENT_ID_SIZE; i++) - object->random_id = hash_int_2d(object->random_id, persistent_id[i]); - } - else - object->random_id = hash_int_2d(object->random_id, 0); + if(motion_steps) { + object->motion[motion_steps/2] = tfm; - if(b_parent.ptr.data != b_ob.ptr.data) - object->random_id ^= hash_int(hash_string(b_parent.name().c_str())); + for(size_t step = 0; step < motion_steps; step++) { + motion_times.insert(object->motion_time(step)); + } + } + } - /* dupli texture coordinates */ + /* dupli texture coordinates and random_id */ if(b_dupli_ob) { object->dupli_generated = 0.5f*get_float3(b_dupli_ob.orco()) - make_float3(0.5f, 0.5f, 0.5f); object->dupli_uv = get_float2(b_dupli_ob.uv()); + object->random_id = b_dupli_ob.random_id(); } else { object->dupli_generated = make_float3(0.0f, 0.0f, 0.0f); object->dupli_uv = make_float2(0.0f, 0.0f); + object->random_id = hash_int_2d(hash_string(object->name.c_str()), 0); } object->tag_update(scene); @@ -482,7 +511,7 @@ static bool object_render_hide_duplis(BL::Object& b_ob) /* Object Loop */ -void BlenderSync::sync_objects(BL::SpaceView3D& b_v3d, float motion_time) +void BlenderSync::sync_objects(float motion_time) { /* layer data */ uint scene_layer = render_layer.scene_layer; @@ -510,7 +539,7 @@ void BlenderSync::sync_objects(BL::SpaceView3D& b_v3d, float motion_time) * 1 : DAG_EVAL_PREVIEW * 2 : DAG_EVAL_RENDER */ - int dupli_settings = preview ? 1 : 2; + int dupli_settings = (render_layer.use_viewport_visibility) ? 1 : 2; bool cancel = false; bool use_portal = false; @@ -545,7 +574,7 @@ void BlenderSync::sync_objects(BL::SpaceView3D& b_v3d, float motion_time) for(b_ob.dupli_list.begin(b_dup); b_dup != b_ob.dupli_list.end(); ++b_dup) { Transform tfm = get_transform(b_dup->matrix()); BL::Object b_dup_ob = b_dup->object(); - bool dup_hide = (b_v3d)? b_dup_ob.hide(): b_dup_ob.hide_render(); + bool dup_hide = (render_layer.use_viewport_visibility)? b_dup_ob.hide(): b_dup_ob.hide_render(); bool in_dupli_group = (b_dup->type() == BL::DupliObject::type_GROUP); bool hide_tris; @@ -621,7 +650,6 @@ void BlenderSync::sync_objects(BL::SpaceView3D& b_v3d, float motion_time) } void BlenderSync::sync_motion(BL::RenderSettings& b_render, - BL::SpaceView3D& b_v3d, BL::Object& b_override, int width, int height, void **python_thread_state) @@ -658,7 +686,7 @@ void BlenderSync::sync_motion(BL::RenderSettings& b_render, b_engine.frame_set(frame, subframe); python_thread_state_save(python_thread_state); sync_camera_motion(b_render, b_cam, width, height, 0.0f); - sync_objects(b_v3d, 0.0f); + sync_objects(0.0f); } /* always sample these times for camera motion */ @@ -667,6 +695,11 @@ void BlenderSync::sync_motion(BL::RenderSettings& b_render, /* note iteration over motion_times set happens in sorted order */ foreach(float relative_time, motion_times) { + /* center time is already handled. */ + if(relative_time == 0.0f) { + continue; + } + VLOG(1) << "Synchronizing motion for the relative time " << relative_time << "."; @@ -692,7 +725,7 @@ void BlenderSync::sync_motion(BL::RenderSettings& b_render, } /* sync object */ - sync_objects(b_v3d, relative_time); + sync_objects(relative_time); } /* we need to set the python thread state again because this diff --git a/intern/cycles/blender/blender_object_cull.cpp b/intern/cycles/blender/blender_object_cull.cpp index 08918dd1a49..bdf7dc469b2 100644 --- a/intern/cycles/blender/blender_object_cull.cpp +++ b/intern/cycles/blender/blender_object_cull.cpp @@ -16,9 +16,9 @@ #include <cstdlib> -#include "camera.h" +#include "render/camera.h" -#include "blender_object_cull.h" +#include "blender/blender_object_cull.h" CCL_NAMESPACE_BEGIN @@ -62,7 +62,7 @@ void BlenderObjectCulling::init_object(Scene *scene, BL::Object& b_ob) if(use_camera_cull_ || use_distance_cull_) { /* Need to have proper projection matrix. */ - scene->camera->update(); + scene->camera->update(scene); } } @@ -96,7 +96,7 @@ bool BlenderObjectCulling::test(Scene *scene, BL::Object& b_ob, Transform& tfm) bool BlenderObjectCulling::test_camera(Scene *scene, float3 bb[8]) { Camera *cam = scene->camera; - Transform& worldtondc = cam->worldtondc; + const ProjectionTransform& worldtondc = cam->worldtondc; float3 bb_min = make_float3(FLT_MAX, FLT_MAX, FLT_MAX), bb_max = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX); bool all_behind = true; diff --git a/intern/cycles/blender/blender_object_cull.h b/intern/cycles/blender/blender_object_cull.h index b6f0ca5cd31..2147877a860 100644 --- a/intern/cycles/blender/blender_object_cull.h +++ b/intern/cycles/blender/blender_object_cull.h @@ -17,8 +17,8 @@ #ifndef __BLENDER_OBJECT_CULL_H__ #define __BLENDER_OBJECT_CULL_H__ -#include "blender_sync.h" -#include "util_types.h" +#include "blender/blender_sync.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/blender/blender_particles.cpp b/intern/cycles/blender/blender_particles.cpp index dd2900a8d5b..00f8cb3cf1b 100644 --- a/intern/cycles/blender/blender_particles.cpp +++ b/intern/cycles/blender/blender_particles.cpp @@ -14,14 +14,14 @@ * limitations under the License. */ -#include "mesh.h" -#include "object.h" -#include "particles.h" +#include "render/mesh.h" +#include "render/object.h" +#include "render/particles.h" -#include "blender_sync.h" -#include "blender_util.h" +#include "blender/blender_sync.h" +#include "blender/blender_util.h" -#include "util_foreach.h" +#include "util/util_foreach.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp index 438abc49f88..792597cbad5 100644 --- a/intern/cycles/blender/blender_python.cpp +++ b/intern/cycles/blender/blender_python.cpp @@ -16,21 +16,22 @@ #include <Python.h> -#include "CCL_api.h" +#include "blender/CCL_api.h" -#include "blender_sync.h" -#include "blender_session.h" +#include "blender/blender_sync.h" +#include "blender/blender_session.h" -#include "util_foreach.h" -#include "util_logging.h" -#include "util_md5.h" -#include "util_opengl.h" -#include "util_path.h" -#include "util_string.h" -#include "util_types.h" +#include "util/util_debug.h" +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_md5.h" +#include "util/util_opengl.h" +#include "util/util_path.h" +#include "util/util_string.h" +#include "util/util_types.h" #ifdef WITH_OSL -#include "osl.h" +#include "render/osl.h" #include <OSL/oslquery.h> #include <OSL/oslconfig.h> @@ -60,15 +61,19 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene) /* Backup some settings for comparison. */ DebugFlags::OpenCL::DeviceType opencl_device_type = flags.opencl.device_type; DebugFlags::OpenCL::KernelType opencl_kernel_type = flags.opencl.kernel_type; + /* Synchronize shared flags. */ + flags.viewport_static_bvh = get_enum(cscene, "debug_bvh_type"); /* Synchronize CPU flags. */ flags.cpu.avx2 = get_boolean(cscene, "debug_use_cpu_avx2"); flags.cpu.avx = get_boolean(cscene, "debug_use_cpu_avx"); flags.cpu.sse41 = get_boolean(cscene, "debug_use_cpu_sse41"); flags.cpu.sse3 = get_boolean(cscene, "debug_use_cpu_sse3"); flags.cpu.sse2 = get_boolean(cscene, "debug_use_cpu_sse2"); - flags.cpu.qbvh = get_boolean(cscene, "debug_use_qbvh"); + flags.cpu.bvh_layout = (BVHLayout)get_enum(cscene, "debug_bvh_layout"); + flags.cpu.split_kernel = get_boolean(cscene, "debug_use_cpu_split_kernel"); /* Synchronize CUDA flags. */ flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile"); + flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel"); /* Synchronize OpenCL kernel type. */ switch(get_enum(cscene, "debug_opencl_kernel_type")) { case 0: @@ -104,6 +109,8 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene) } /* Synchronize other OpenCL flags. */ flags.opencl.debug = get_boolean(cscene, "debug_use_opencl_debug"); + flags.opencl.mem_limit = ((size_t)get_int(cscene, "debug_opencl_mem_limit"))*1024*1024; + flags.opencl.single_program = get_boolean(cscene, "debug_opencl_kernel_single_program"); return flags.opencl.device_type != opencl_device_type || flags.opencl.kernel_type != opencl_kernel_type; } @@ -641,7 +648,7 @@ static PyObject *debug_flags_reset_func(PyObject * /*self*/, PyObject * /*args*/ Py_RETURN_NONE; } -static PyObject *set_resumable_chunks_func(PyObject * /*self*/, PyObject *args) +static PyObject *set_resumable_chunk_func(PyObject * /*self*/, PyObject *args) { int num_resumable_chunks, current_resumable_chunk; if(!PyArg_ParseTuple(args, "ii", @@ -676,6 +683,53 @@ static PyObject *set_resumable_chunks_func(PyObject * /*self*/, PyObject *args) Py_RETURN_NONE; } +static PyObject *set_resumable_chunk_range_func(PyObject * /*self*/, PyObject *args) +{ + int num_chunks, start_chunk, end_chunk; + if(!PyArg_ParseTuple(args, "iii", + &num_chunks, + &start_chunk, + &end_chunk)) { + Py_RETURN_NONE; + } + + if(num_chunks <= 0) { + fprintf(stderr, "Cycles: Bad value for number of resumable chunks.\n"); + abort(); + Py_RETURN_NONE; + } + if(start_chunk < 1 || start_chunk > num_chunks) { + fprintf(stderr, "Cycles: Bad value for start chunk number.\n"); + abort(); + Py_RETURN_NONE; + } + if(end_chunk < 1 || end_chunk > num_chunks) { + fprintf(stderr, "Cycles: Bad value for start chunk number.\n"); + abort(); + Py_RETURN_NONE; + } + if(start_chunk > end_chunk) { + fprintf(stderr, "Cycles: End chunk should be higher than start one.\n"); + abort(); + Py_RETURN_NONE; + } + + VLOG(1) << "Initialized resumable render: " + << "num_resumable_chunks=" << num_chunks << ", " + << "start_resumable_chunk=" << start_chunk + << "end_resumable_chunk=" << end_chunk; + BlenderSession::num_resumable_chunks = num_chunks; + BlenderSession::start_resumable_chunk = start_chunk; + BlenderSession::end_resumable_chunk = end_chunk; + + printf("Cycles: Will render chunks %d to %d of %d\n", + start_chunk, + end_chunk, + num_chunks); + + Py_RETURN_NONE; +} + static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/) { vector<DeviceInfo>& devices = Device::available_devices(); @@ -715,7 +769,8 @@ static PyMethodDef methods[] = { {"debug_flags_reset", debug_flags_reset_func, METH_NOARGS, ""}, /* Resumable render */ - {"set_resumable_chunks", set_resumable_chunks_func, METH_VARARGS, ""}, + {"set_resumable_chunk", set_resumable_chunk_func, METH_VARARGS, ""}, + {"set_resumable_chunk_range", set_resumable_chunk_range_func, METH_VARARGS, ""}, /* Compute Device selection */ {"get_device_types", get_device_types_func, METH_VARARGS, ""}, @@ -760,6 +815,14 @@ void *CCL_python_module_init() PyModule_AddStringConstant(mod, "osl_version_string", "unknown"); #endif +#ifdef WITH_CYCLES_DEBUG + PyModule_AddObject(mod, "with_cycles_debug", Py_True); + Py_INCREF(Py_True); +#else + PyModule_AddObject(mod, "with_cycles_debug", Py_False); + Py_INCREF(Py_False); +#endif + #ifdef WITH_NETWORK PyModule_AddObject(mod, "with_network", Py_True); Py_INCREF(Py_True); diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp index 2f30cbd961f..00d23b9095e 100644 --- a/intern/cycles/blender/blender_session.cpp +++ b/intern/cycles/blender/blender_session.cpp @@ -16,36 +16,38 @@ #include <stdlib.h> -#include "background.h" -#include "buffers.h" -#include "camera.h" -#include "device.h" -#include "integrator.h" -#include "film.h" -#include "light.h" -#include "mesh.h" -#include "object.h" -#include "scene.h" -#include "session.h" -#include "shader.h" - -#include "util_color.h" -#include "util_foreach.h" -#include "util_function.h" -#include "util_hash.h" -#include "util_logging.h" -#include "util_progress.h" -#include "util_time.h" - -#include "blender_sync.h" -#include "blender_session.h" -#include "blender_util.h" +#include "render/background.h" +#include "render/buffers.h" +#include "render/camera.h" +#include "device/device.h" +#include "render/integrator.h" +#include "render/film.h" +#include "render/light.h" +#include "render/mesh.h" +#include "render/object.h" +#include "render/scene.h" +#include "render/session.h" +#include "render/shader.h" + +#include "util/util_color.h" +#include "util/util_foreach.h" +#include "util/util_function.h" +#include "util/util_hash.h" +#include "util/util_logging.h" +#include "util/util_progress.h" +#include "util/util_time.h" + +#include "blender/blender_sync.h" +#include "blender/blender_session.h" +#include "blender/blender_util.h" CCL_NAMESPACE_BEGIN bool BlenderSession::headless = false; int BlenderSession::num_resumable_chunks = 0; int BlenderSession::current_resumable_chunk = 0; +int BlenderSession::start_resumable_chunk = 0; +int BlenderSession::end_resumable_chunk = 0; BlenderSession::BlenderSession(BL::RenderEngine& b_engine, BL::UserPreferences& b_userpref, @@ -68,6 +70,7 @@ BlenderSession::BlenderSession(BL::RenderEngine& b_engine, background = true; last_redraw_time = 0.0; start_resize_time = 0.0; + last_status_time = 0.0; } BlenderSession::BlenderSession(BL::RenderEngine& b_engine, @@ -93,6 +96,7 @@ BlenderSession::BlenderSession(BL::RenderEngine& b_engine, background = false; last_redraw_time = 0.0; start_resize_time = 0.0; + last_status_time = 0.0; } BlenderSession::~BlenderSession() @@ -111,8 +115,7 @@ void BlenderSession::create() void BlenderSession::create_session() { SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background); - bool is_cpu = session_params.device.type == DEVICE_CPU; - SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background, is_cpu); + SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background); bool session_pause = BlenderSync::get_session_pause(b_scene, background); /* reset status/progress */ @@ -121,14 +124,6 @@ void BlenderSession::create_session() last_progress = -1.0f; start_resize_time = 0.0; - /* create scene */ - scene = new Scene(scene_params, session_params.device); - - /* setup callbacks for builtin image support */ - scene->image_manager->builtin_image_info_cb = function_bind(&BlenderSession::builtin_image_info, this, _1, _2, _3, _4, _5, _6, _7); - scene->image_manager->builtin_image_pixels_cb = function_bind(&BlenderSession::builtin_image_pixels, this, _1, _2, _3, _4); - scene->image_manager->builtin_image_float_pixels_cb = function_bind(&BlenderSession::builtin_image_float_pixels, this, _1, _2, _3, _4); - /* create session */ session = new Session(session_params); session->scene = scene; @@ -136,8 +131,18 @@ void BlenderSession::create_session() session->progress.set_cancel_callback(function_bind(&BlenderSession::test_cancel, this)); session->set_pause(session_pause); + /* create scene */ + scene = new Scene(scene_params, session->device); + + /* setup callbacks for builtin image support */ + scene->image_manager->builtin_image_info_cb = function_bind(&BlenderSession::builtin_image_info, this, _1, _2, _3); + scene->image_manager->builtin_image_pixels_cb = function_bind(&BlenderSession::builtin_image_pixels, this, _1, _2, _3, _4, _5); + scene->image_manager->builtin_image_float_pixels_cb = function_bind(&BlenderSession::builtin_image_float_pixels, this, _1, _2, _3, _4, _5); + + session->scene = scene; + /* create sync */ - sync = new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress, is_cpu); + sync = new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress); BL::Object b_camera_override(b_engine.camera_override()); if(b_v3d) { if(session_pause == false) { @@ -175,8 +180,7 @@ void BlenderSession::reset_session(BL::BlendData& b_data_, BL::Scene& b_scene_) b_scene = b_scene_; SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background); - const bool is_cpu = session_params.device.type == DEVICE_CPU; - SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background, is_cpu); + SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background); width = render_resolution_x(b_render); height = render_resolution_y(b_render); @@ -207,7 +211,7 @@ void BlenderSession::reset_session(BL::BlendData& b_data_, BL::Scene& b_scene_) session->stats.mem_peak = session->stats.mem_used; /* sync object should be re-created */ - sync = new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress, is_cpu); + sync = new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress); /* for final render we will do full data sync per render layer, only * do some basic syncing here, no objects or materials for speed */ @@ -239,90 +243,6 @@ void BlenderSession::free_session() delete session; } -static PassType get_pass_type(BL::RenderPass& b_pass) -{ - switch(b_pass.type()) { - case BL::RenderPass::type_COMBINED: - return PASS_COMBINED; - - case BL::RenderPass::type_Z: - return PASS_DEPTH; - case BL::RenderPass::type_MIST: - return PASS_MIST; - case BL::RenderPass::type_NORMAL: - return PASS_NORMAL; - case BL::RenderPass::type_OBJECT_INDEX: - return PASS_OBJECT_ID; - case BL::RenderPass::type_UV: - return PASS_UV; - case BL::RenderPass::type_VECTOR: - return PASS_MOTION; - case BL::RenderPass::type_MATERIAL_INDEX: - return PASS_MATERIAL_ID; - - case BL::RenderPass::type_DIFFUSE_DIRECT: - return PASS_DIFFUSE_DIRECT; - case BL::RenderPass::type_GLOSSY_DIRECT: - return PASS_GLOSSY_DIRECT; - case BL::RenderPass::type_TRANSMISSION_DIRECT: - return PASS_TRANSMISSION_DIRECT; - case BL::RenderPass::type_SUBSURFACE_DIRECT: - return PASS_SUBSURFACE_DIRECT; - - case BL::RenderPass::type_DIFFUSE_INDIRECT: - return PASS_DIFFUSE_INDIRECT; - case BL::RenderPass::type_GLOSSY_INDIRECT: - return PASS_GLOSSY_INDIRECT; - case BL::RenderPass::type_TRANSMISSION_INDIRECT: - return PASS_TRANSMISSION_INDIRECT; - case BL::RenderPass::type_SUBSURFACE_INDIRECT: - return PASS_SUBSURFACE_INDIRECT; - - case BL::RenderPass::type_DIFFUSE_COLOR: - return PASS_DIFFUSE_COLOR; - case BL::RenderPass::type_GLOSSY_COLOR: - return PASS_GLOSSY_COLOR; - case BL::RenderPass::type_TRANSMISSION_COLOR: - return PASS_TRANSMISSION_COLOR; - case BL::RenderPass::type_SUBSURFACE_COLOR: - return PASS_SUBSURFACE_COLOR; - - case BL::RenderPass::type_EMIT: - return PASS_EMISSION; - case BL::RenderPass::type_ENVIRONMENT: - return PASS_BACKGROUND; - case BL::RenderPass::type_AO: - return PASS_AO; - case BL::RenderPass::type_SHADOW: - return PASS_SHADOW; - - case BL::RenderPass::type_DIFFUSE: - case BL::RenderPass::type_COLOR: - case BL::RenderPass::type_REFRACTION: - case BL::RenderPass::type_SPECULAR: - case BL::RenderPass::type_REFLECTION: - return PASS_NONE; -#ifdef WITH_CYCLES_DEBUG - case BL::RenderPass::type_DEBUG: - { - switch(b_pass.debug_type()) { - case BL::RenderPass::debug_type_BVH_TRAVERSED_NODES: - return PASS_BVH_TRAVERSED_NODES; - case BL::RenderPass::debug_type_BVH_TRAVERSED_INSTANCES: - return PASS_BVH_TRAVERSED_INSTANCES; - case BL::RenderPass::debug_type_BVH_INTERSECTIONS: - return PASS_BVH_INTERSECTIONS; - case BL::RenderPass::debug_type_RAY_BOUNCES: - return PASS_RAY_BOUNCES; - } - break; - } -#endif - } - - return PASS_NONE; -} - static ShaderEvalType get_shader_type(const string& pass_type) { const char *shader_type = pass_type.c_str(); @@ -332,6 +252,8 @@ static ShaderEvalType get_shader_type(const string& pass_type) return SHADER_EVAL_NORMAL; else if(strcmp(shader_type, "UV")==0) return SHADER_EVAL_UV; + else if(strcmp(shader_type, "ROUGHNESS")==0) + return SHADER_EVAL_ROUGHNESS; else if(strcmp(shader_type, "DIFFUSE_COLOR")==0) return SHADER_EVAL_DIFFUSE_COLOR; else if(strcmp(shader_type, "GLOSSY_COLOR")==0) @@ -379,18 +301,18 @@ static BL::RenderResult begin_render_result(BL::RenderEngine& b_engine, static void end_render_result(BL::RenderEngine& b_engine, BL::RenderResult& b_rr, bool cancel, + bool highlight, bool do_merge_results) { - b_engine.end_result(b_rr, (int)cancel, (int)do_merge_results); + b_engine.end_result(b_rr, (int)cancel, (int) highlight, (int)do_merge_results); } -void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_update_only) +void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_update_only, bool highlight) { - BufferParams& params = rtile.buffers->params; - int x = params.full_x - session->tile_manager.params.full_x; - int y = params.full_y - session->tile_manager.params.full_y; - int w = params.width; - int h = params.height; + int x = rtile.x - session->tile_manager.params.full_x; + int y = rtile.y - session->tile_manager.params.full_y; + int w = rtile.w; + int h = rtile.h; /* get render result */ BL::RenderResult b_rr = begin_render_result(b_engine, x, y, w, h, b_rlay_name.c_str(), b_rview_name.c_str()); @@ -410,47 +332,47 @@ void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_upda BL::RenderLayer b_rlay = *b_single_rlay; if(do_update_only) { - /* update only needed */ + /* Sample would be zero at initial tile update, which is only needed + * to tag tile form blender side as IN PROGRESS for proper highlight + * no buffers should be sent to blender yet. For denoise we also + * keep showing the noisy buffers until denoise is done. */ + bool merge = (rtile.sample != 0) && (rtile.task != RenderTile::DENOISE); - if(rtile.sample != 0) { - /* sample would be zero at initial tile update, which is only needed - * to tag tile form blender side as IN PROGRESS for proper highlight - * no buffers should be sent to blender yet - */ + if(merge) { update_render_result(b_rr, b_rlay, rtile); } - end_render_result(b_engine, b_rr, true, true); + end_render_result(b_engine, b_rr, true, highlight, merge); } else { - /* write result */ + /* Write final render result. */ write_render_result(b_rr, b_rlay, rtile); - end_render_result(b_engine, b_rr, false, true); + end_render_result(b_engine, b_rr, false, false, true); } } void BlenderSession::write_render_tile(RenderTile& rtile) { - do_write_update_render_tile(rtile, false); + do_write_update_render_tile(rtile, false, false); } -void BlenderSession::update_render_tile(RenderTile& rtile) +void BlenderSession::update_render_tile(RenderTile& rtile, bool highlight) { /* use final write for preview renders, otherwise render result wouldn't be * be updated in blender side * would need to be investigated a bit further, but for now shall be fine */ if(!b_engine.is_preview()) - do_write_update_render_tile(rtile, true); + do_write_update_render_tile(rtile, true, highlight); else - do_write_update_render_tile(rtile, false); + do_write_update_render_tile(rtile, false, false); } void BlenderSession::render() { /* set callback to write out render results */ session->write_render_tile_cb = function_bind(&BlenderSession::write_render_tile, this, _1); - session->update_render_tile_cb = function_bind(&BlenderSession::update_render_tile, this, _1); + session->update_render_tile_cb = function_bind(&BlenderSession::update_render_tile, this, _1, _2); /* get buffer parameters */ SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background); @@ -460,7 +382,10 @@ void BlenderSession::render() BL::RenderSettings r = b_scene.render(); BL::RenderSettings::layers_iterator b_layer_iter; BL::RenderResult::views_iterator b_view_iter; - + + /* We do some special meta attributes when we only have single layer. */ + const bool is_single_layer = (r.layers.length() == 1); + for(r.layers.begin(b_layer_iter); b_layer_iter != r.layers.end(); ++b_layer_iter) { b_rlay_name = b_layer_iter->name(); @@ -471,33 +396,38 @@ void BlenderSession::render() /* layer will be missing if it was disabled in the UI */ if(b_single_rlay == b_rr.layers.end()) { - end_render_result(b_engine, b_rr, true, false); + end_render_result(b_engine, b_rr, true, true, false); continue; } BL::RenderLayer b_rlay = *b_single_rlay; /* add passes */ - array<Pass> passes; - Pass::add(PASS_COMBINED, passes); - - if(session_params.device.advanced_shading) { - - /* loop over passes */ - BL::RenderLayer::passes_iterator b_pass_iter; - - for(b_rlay.passes.begin(b_pass_iter); b_pass_iter != b_rlay.passes.end(); ++b_pass_iter) { - BL::RenderPass b_pass(*b_pass_iter); - PassType pass_type = get_pass_type(b_pass); + array<Pass> passes = sync->sync_render_passes(b_rlay, *b_layer_iter, session_params); + buffer_params.passes = passes; - if(pass_type == PASS_MOTION && scene->integrator->motion_blur) - continue; - if(pass_type != PASS_NONE) - Pass::add(pass_type, passes); - } - } + PointerRNA crl = RNA_pointer_get(&b_layer_iter->ptr, "cycles"); + bool use_denoising = get_boolean(crl, "use_denoising"); + buffer_params.denoising_data_pass = use_denoising; + session->tile_manager.schedule_denoising = use_denoising; + session->params.use_denoising = use_denoising; + scene->film->denoising_data_pass = buffer_params.denoising_data_pass; + scene->film->denoising_flags = 0; + if(!get_boolean(crl, "denoising_diffuse_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_DIFFUSE_DIR; + if(!get_boolean(crl, "denoising_diffuse_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_DIFFUSE_IND; + if(!get_boolean(crl, "denoising_glossy_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_GLOSSY_DIR; + if(!get_boolean(crl, "denoising_glossy_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_GLOSSY_IND; + if(!get_boolean(crl, "denoising_transmission_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_TRANSMISSION_DIR; + if(!get_boolean(crl, "denoising_transmission_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_TRANSMISSION_IND; + if(!get_boolean(crl, "denoising_subsurface_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_SUBSURFACE_DIR; + if(!get_boolean(crl, "denoising_subsurface_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_SUBSURFACE_IND; + scene->film->denoising_clean_pass = (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES); + buffer_params.denoising_clean_pass = scene->film->denoising_clean_pass; + session->params.denoising_radius = get_int(crl, "denoising_radius"); + session->params.denoising_strength = get_float(crl, "denoising_strength"); + session->params.denoising_feature_strength = get_float(crl, "denoising_feature_strength"); + session->params.denoising_relative_pca = get_boolean(crl, "denoising_relative_pca"); - buffer_params.passes = passes; scene->film->pass_alpha_threshold = b_layer_iter->pass_alpha_threshold(); scene->film->tag_passes_update(scene, passes); scene->film->tag_update(scene); @@ -550,8 +480,17 @@ void BlenderSession::render() break; } + if(is_single_layer) { + BL::RenderResult b_rr = b_engine.get_result(); + string num_aa_samples = string_printf("%d", session->params.samples); + b_rr.stamp_data_add_field("Cycles Samples", num_aa_samples.c_str()); + /* TODO(sergey): Report whether we're doing resumable render + * and also start/end sample if so. + */ + } + /* free result without merging */ - end_render_result(b_engine, b_rr, true, false); + end_render_result(b_engine, b_rr, true, true, false); if(session->progress.get_cancel()) break; @@ -632,8 +571,6 @@ void BlenderSession::bake(BL::Object& b_object, float result[]) { ShaderEvalType shader_type = get_shader_type(pass_type); - size_t object_index = OBJECT_NONE; - int tri_offset = 0; /* Set baking flag in advance, so kernel loading can check if we need * any baking capabilities. @@ -643,9 +580,6 @@ void BlenderSession::bake(BL::Object& b_object, /* ensure kernels are loaded before we do any scene updates */ session->load_kernels(); - if(session->progress.get_cancel()) - return; - if(shader_type == SHADER_EVAL_UV) { /* force UV to be available */ Pass::add(PASS_UV, scene->film->passes); @@ -663,50 +597,61 @@ void BlenderSession::bake(BL::Object& b_object, scene->film->tag_update(scene); scene->integrator->tag_update(scene); - /* update scene */ - BL::Object b_camera_override(b_engine.camera_override()); - sync->sync_camera(b_render, b_camera_override, width, height, ""); - sync->sync_data(b_render, - b_v3d, - b_camera_override, - width, height, - &python_thread_state, - b_rlay_name.c_str()); + if(!session->progress.get_cancel()) { + /* update scene */ + BL::Object b_camera_override(b_engine.camera_override()); + sync->sync_camera(b_render, b_camera_override, width, height, ""); + sync->sync_data(b_render, + b_v3d, + b_camera_override, + width, height, + &python_thread_state, + b_rlay_name.c_str()); + } - /* get buffer parameters */ - SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background); - BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_v3d, b_rv3d, scene->camera, width, height); + BakeData *bake_data = NULL; - scene->bake_manager->set_shader_limit((size_t)b_engine.tile_x(), (size_t)b_engine.tile_y()); + if(!session->progress.get_cancel()) { + /* get buffer parameters */ + SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background); + BufferParams buffer_params = BlenderSync::get_buffer_params(b_render, b_v3d, b_rv3d, scene->camera, width, height); - /* set number of samples */ - session->tile_manager.set_samples(session_params.samples); - session->reset(buffer_params, session_params.samples); - session->update_scene(); + scene->bake_manager->set_shader_limit((size_t)b_engine.tile_x(), (size_t)b_engine.tile_y()); - /* find object index. todo: is arbitrary - copied from mesh_displace.cpp */ - for(size_t i = 0; i < scene->objects.size(); i++) { - if(strcmp(scene->objects[i]->name.c_str(), b_object.name().c_str()) == 0) { - object_index = i; - tri_offset = scene->objects[i]->mesh->tri_offset; - break; - } - } + /* set number of samples */ + session->tile_manager.set_samples(session_params.samples); + session->reset(buffer_params, session_params.samples); + session->update_scene(); + + /* find object index. todo: is arbitrary - copied from mesh_displace.cpp */ + size_t object_index = OBJECT_NONE; + int tri_offset = 0; - int object = object_index; + for(size_t i = 0; i < scene->objects.size(); i++) { + if(strcmp(scene->objects[i]->name.c_str(), b_object.name().c_str()) == 0) { + object_index = i; + tri_offset = scene->objects[i]->mesh->tri_offset; + break; + } + } - BakeData *bake_data = scene->bake_manager->init(object, tri_offset, num_pixels); + int object = object_index; - populate_bake_data(bake_data, object_id, pixel_array, num_pixels); + bake_data = scene->bake_manager->init(object, tri_offset, num_pixels); + populate_bake_data(bake_data, object_id, pixel_array, num_pixels); - /* set number of samples */ - session->tile_manager.set_samples(session_params.samples); - session->reset(buffer_params, session_params.samples); - session->update_scene(); + /* set number of samples */ + session->tile_manager.set_samples(session_params.samples); + session->reset(buffer_params, session_params.samples); + session->update_scene(); - session->progress.set_update_callback(function_bind(&BlenderSession::update_bake_progress, this)); + session->progress.set_update_callback(function_bind(&BlenderSession::update_bake_progress, this)); + } - scene->bake_manager->bake(scene->device, &scene->dscene, scene, session->progress, shader_type, bake_pass_filter, bake_data, result); + /* Perform bake. Check cancel to avoid crash with incomplete scene data. */ + if(!session->progress.get_cancel()) { + scene->bake_manager->bake(scene->device, &scene->dscene, scene, session->progress, shader_type, bake_pass_filter, bake_data, result); + } /* free all memory used (host and device), so we wouldn't leave render * engine with extra memory allocated @@ -729,10 +674,9 @@ void BlenderSession::do_write_update_render_result(BL::RenderResult& b_rr, if(!buffers->copy_from_device()) return; - BufferParams& params = buffers->params; float exposure = scene->film->exposure; - vector<float> pixels(params.width*params.height*4); + vector<float> pixels(rtile.w*rtile.h*4); /* Adjust absolute sample number to the range. */ int sample = rtile.sample; @@ -749,19 +693,31 @@ void BlenderSession::do_write_update_render_result(BL::RenderResult& b_rr, BL::RenderPass b_pass(*b_iter); /* find matching pass type */ - PassType pass_type = get_pass_type(b_pass); + PassType pass_type = BlenderSync::get_pass_type(b_pass); int components = b_pass.channels(); - /* copy pixels */ - if(!buffers->get_pass_rect(pass_type, exposure, sample, components, &pixels[0])) + bool read = false; + if(pass_type != PASS_NONE) { + /* copy pixels */ + read = buffers->get_pass_rect(pass_type, exposure, sample, components, &pixels[0]); + } + else { + int denoising_offset = BlenderSync::get_denoising_pass(b_pass); + if(denoising_offset >= 0) { + read = buffers->get_denoising_pass_rect(denoising_offset, exposure, sample, components, &pixels[0]); + } + } + + if(!read) { memset(&pixels[0], 0, pixels.size()*sizeof(float)); + } b_pass.rect(&pixels[0]); } } else { /* copy combined pass */ - BL::RenderPass b_combined_pass(b_rlay.passes.find_by_type(BL::RenderPass::type_COMBINED, b_rview_name.c_str())); + BL::RenderPass b_combined_pass(b_rlay.passes.find_by_name("Combined", b_rview_name.c_str())); if(buffers->get_pass_rect(PASS_COMBINED, exposure, sample, 4, &pixels[0])) b_combined_pass.rect(&pixels[0]); } @@ -792,8 +748,7 @@ void BlenderSession::synchronize() /* on session/scene parameter changes, we recreate session entirely */ SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background); - const bool is_cpu = session_params.device.type == DEVICE_CPU; - SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background, is_cpu); + SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background); bool session_pause = BlenderSync::get_session_pause(b_scene, background); if(session->params.modified(session_params) || @@ -989,10 +944,14 @@ void BlenderSession::update_status_progress() if(substatus.size() > 0) status += " | " + substatus; - if(status != last_status) { + double current_time = time_dt(); + /* When rendering in a window, redraw the status at least once per second to keep the elapsed and remaining time up-to-date. + * For headless rendering, only report when something significant changes to keep the console output readable. */ + if(status != last_status || (!headless && (current_time - last_status_time) > 1.0)) { b_engine.update_stats("", (timestatus + scene + status).c_str()); b_engine.update_memory_stats(mem_used, mem_peak); last_status = status; + last_status_time = current_time; } if(progress != last_progress) { b_engine.update_progress(progress); @@ -1061,18 +1020,11 @@ int BlenderSession::builtin_image_frame(const string &builtin_name) void BlenderSession::builtin_image_info(const string &builtin_name, void *builtin_data, - bool &is_float, - int &width, - int &height, - int &depth, - int &channels) + ImageMetaData& metadata) { /* empty image */ - is_float = false; - width = 1; - height = 1; - depth = 0; - channels = 0; + metadata.width = 1; + metadata.height = 1; if(!builtin_data) return; @@ -1086,32 +1038,34 @@ void BlenderSession::builtin_image_info(const string &builtin_name, /* image data */ BL::Image b_image(b_id); - is_float = b_image.is_float(); - width = b_image.size()[0]; - height = b_image.size()[1]; - depth = 1; - channels = b_image.channels(); + metadata.builtin_free_cache = !b_image.has_data(); + metadata.is_float = b_image.is_float(); + metadata.width = b_image.size()[0]; + metadata.height = b_image.size()[1]; + metadata.depth = 1; + metadata.channels = b_image.channels(); } else if(b_id.is_a(&RNA_Object)) { /* smoke volume data */ BL::Object b_ob(b_id); BL::SmokeDomainSettings b_domain = object_smoke_domain_find(b_ob); - is_float = true; - depth = 1; - channels = 1; + metadata.is_float = true; + metadata.depth = 1; + metadata.channels = 1; if(!b_domain) return; if(builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_DENSITY) || builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_FLAME) || - builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_HEAT)) - channels = 1; + builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_HEAT) || + builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_TEMPERATURE)) + metadata.channels = 1; else if(builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_COLOR)) - channels = 4; + metadata.channels = 4; else if(builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_VELOCITY)) - channels = 3; + metadata.channels = 3; else return; @@ -1125,9 +1079,9 @@ void BlenderSession::builtin_image_info(const string &builtin_name, amplify = 1; } - width = resolution.x * amplify; - height = resolution.y * amplify; - depth = resolution.z * amplify; + metadata.width = resolution.x * amplify; + metadata.height = resolution.y * amplify; + metadata.depth = resolution.z * amplify; } else { /* TODO(sergey): Check we're indeed in shader node tree. */ @@ -1136,9 +1090,11 @@ void BlenderSession::builtin_image_info(const string &builtin_name, BL::Node b_node(ptr); if(b_node.is_a(&RNA_ShaderNodeTexPointDensity)) { BL::ShaderNodeTexPointDensity b_point_density_node(b_node); - channels = 4; - width = height = depth = b_point_density_node.resolution(); - is_float = true; + metadata.channels = 4; + metadata.width = b_point_density_node.resolution(); + metadata.height = metadata.width; + metadata.depth = metadata.width; + metadata.is_float = true; } } } @@ -1146,7 +1102,8 @@ void BlenderSession::builtin_image_info(const string &builtin_name, bool BlenderSession::builtin_image_pixels(const string &builtin_name, void *builtin_data, unsigned char *pixels, - const size_t pixels_size) + const size_t pixels_size, + const bool free_cache) { if(!builtin_data) { return false; @@ -1167,7 +1124,6 @@ bool BlenderSession::builtin_image_pixels(const string &builtin_name, if(image_pixels && num_pixels * channels == pixels_size) { memcpy(pixels, image_pixels, pixels_size * sizeof(unsigned char)); - MEM_freeN(image_pixels); } else { if(channels == 1) { @@ -1186,6 +1142,16 @@ bool BlenderSession::builtin_image_pixels(const string &builtin_name, } } } + + if(image_pixels) { + MEM_freeN(image_pixels); + } + + /* Free image buffers to save memory during render. */ + if(free_cache) { + b_image.buffers_free(); + } + /* Premultiply, byte images are always straight for Blender. */ unsigned char *cp = pixels; for(size_t i = 0; i < num_pixels; i++, cp += channels) { @@ -1199,7 +1165,8 @@ bool BlenderSession::builtin_image_pixels(const string &builtin_name, bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, void *builtin_data, float *pixels, - const size_t pixels_size) + const size_t pixels_size, + const bool free_cache) { if(!builtin_data) { return false; @@ -1224,7 +1191,6 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, if(image_pixels && num_pixels * channels == pixels_size) { memcpy(pixels, image_pixels, pixels_size * sizeof(float)); - MEM_freeN(image_pixels); } else { if(channels == 1) { @@ -1244,6 +1210,15 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, } } + if(image_pixels) { + MEM_freeN(image_pixels); + } + + /* Free image buffers to save memory during render. */ + if(free_cache) { + b_image.buffers_free(); + } + return true; } else if(b_id.is_a(&RNA_Object)) { @@ -1308,6 +1283,13 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, return true; } } + else if(builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_TEMPERATURE)) { + SmokeDomainSettings_temperature_grid_get_length(&b_domain.ptr, &length); + if(length == num_pixels) { + SmokeDomainSettings_temperature_grid_get(&b_domain.ptr, pixels); + return true; + } + } else { fprintf(stderr, "Cycles error: unknown volume attribute %s, skipping\n", @@ -1342,9 +1324,21 @@ void BlenderSession::update_resumable_tile_manager(int num_samples) return; } - int num_samples_per_chunk = (int)ceilf((float)num_samples / num_resumable_chunks); - int range_start_sample = num_samples_per_chunk * (current_resumable_chunk - 1); - int range_num_samples = num_samples_per_chunk; + const int num_samples_per_chunk = (int)ceilf((float)num_samples / num_resumable_chunks); + + int range_start_sample, range_num_samples; + if(current_resumable_chunk != 0) { + /* Single chunk rendering. */ + range_start_sample = num_samples_per_chunk * (current_resumable_chunk - 1); + range_num_samples = num_samples_per_chunk; + } + else { + /* Ranged-chunks. */ + const int num_chunks = end_resumable_chunk - start_resumable_chunk + 1; + range_start_sample = num_samples_per_chunk * (start_resumable_chunk - 1); + range_num_samples = num_chunks * num_samples_per_chunk; + } + /* Make sure we don't overshoot. */ if(range_start_sample + range_num_samples > num_samples) { range_num_samples = num_samples - range_num_samples; } diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h index 82fe218b4ce..3804e07cffc 100644 --- a/intern/cycles/blender/blender_session.h +++ b/intern/cycles/blender/blender_session.h @@ -17,15 +17,16 @@ #ifndef __BLENDER_SESSION_H__ #define __BLENDER_SESSION_H__ -#include "device.h" -#include "scene.h" -#include "session.h" -#include "bake.h" +#include "device/device.h" +#include "render/scene.h" +#include "render/session.h" +#include "render/bake.h" -#include "util_vector.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN +class ImageMetaData; class Scene; class Session; class RenderBuffers; @@ -79,7 +80,7 @@ public: void update_render_result(BL::RenderResult& b_rr, BL::RenderLayer& b_rlay, RenderTile& rtile); - void update_render_tile(RenderTile& rtile); + void update_render_tile(RenderTile& rtile, bool highlight); /* interactive updates */ void synchronize(); @@ -113,6 +114,7 @@ public: string last_status; string last_error; float last_progress; + double last_status_time; int width, height; double start_resize_time; @@ -137,29 +139,31 @@ public: /* Current resumable chunk index to render. */ static int current_resumable_chunk; + /* Alternative to single-chunk rendering to render a range of chunks. */ + static int start_resumable_chunk; + static int end_resumable_chunk; + protected: void do_write_update_render_result(BL::RenderResult& b_rr, BL::RenderLayer& b_rlay, RenderTile& rtile, bool do_update_only); - void do_write_update_render_tile(RenderTile& rtile, bool do_update_only); + void do_write_update_render_tile(RenderTile& rtile, bool do_update_only, bool highlight); int builtin_image_frame(const string &builtin_name); void builtin_image_info(const string &builtin_name, void *builtin_data, - bool &is_float, - int &width, - int &height, - int &depth, - int &channels); + ImageMetaData& metadata); bool builtin_image_pixels(const string &builtin_name, void *builtin_data, unsigned char *pixels, - const size_t pixels_size); + const size_t pixels_size, + const bool free_cache); bool builtin_image_float_pixels(const string &builtin_name, void *builtin_data, float *pixels, - const size_t pixels_size); + const size_t pixels_size, + const bool free_cache); /* Update tile manager to reflect resumable render settings. */ void update_resumable_tile_manager(int num_samples); diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp index 8baa53fc2ec..eb9968a85c2 100644 --- a/intern/cycles/blender/blender_shader.cpp +++ b/intern/cycles/blender/blender_shader.cpp @@ -14,20 +14,23 @@ * limitations under the License. */ -#include "background.h" -#include "graph.h" -#include "light.h" -#include "nodes.h" -#include "osl.h" -#include "scene.h" -#include "shader.h" - -#include "blender_texture.h" -#include "blender_sync.h" -#include "blender_util.h" - -#include "util_debug.h" -#include "util_string.h" +#include "render/background.h" +#include "render/graph.h" +#include "render/light.h" +#include "render/nodes.h" +#include "render/osl.h" +#include "render/scene.h" +#include "render/shader.h" + +#include "blender/blender_texture.h" +#include "blender/blender_sync.h" +#include "blender/blender_util.h" + +#include "util/util_debug.h" +#include "util/util_foreach.h" +#include "util/util_string.h" +#include "util/util_set.h" +#include "util/util_task.h" CCL_NAMESPACE_BEGIN @@ -430,6 +433,9 @@ static ShaderNode *add_node(Scene *scene, case BL::ShaderNodeSubsurfaceScattering::falloff_BURLEY: subsurface->falloff = CLOSURE_BSSRDF_BURLEY_ID; break; + case BL::ShaderNodeSubsurfaceScattering::falloff_RANDOM_WALK: + subsurface->falloff = CLOSURE_BSSRDF_RANDOM_WALK_ID; + break; } node = subsurface; @@ -518,6 +524,27 @@ static ShaderNode *add_node(Scene *scene, } node = hair; } + else if(b_node.is_a(&RNA_ShaderNodeBsdfPrincipled)) { + BL::ShaderNodeBsdfPrincipled b_principled_node(b_node); + PrincipledBsdfNode *principled = new PrincipledBsdfNode(); + switch (b_principled_node.distribution()) { + case BL::ShaderNodeBsdfPrincipled::distribution_GGX: + principled->distribution = CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID; + break; + case BL::ShaderNodeBsdfPrincipled::distribution_MULTI_GGX: + principled->distribution = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID; + break; + } + switch (b_principled_node.subsurface_method()) { + case BL::ShaderNodeBsdfPrincipled::subsurface_method_BURLEY: + principled->subsurface_method = CLOSURE_BSSRDF_PRINCIPLED_ID; + break; + case BL::ShaderNodeBsdfPrincipled::subsurface_method_RANDOM_WALK: + principled->subsurface_method = CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID; + break; + } + node = principled; + } else if(b_node.is_a(&RNA_ShaderNodeBsdfTranslucent)) { node = new TranslucentBsdfNode(); } @@ -539,6 +566,10 @@ static ShaderNode *add_node(Scene *scene, else if(b_node.is_a(&RNA_ShaderNodeVolumeAbsorption)) { node = new AbsorptionVolumeNode(); } + else if(b_node.is_a(&RNA_ShaderNodeVolumePrincipled)) { + PrincipledVolumeNode *principled = new PrincipledVolumeNode(); + node = principled; + } else if(b_node.is_a(&RNA_ShaderNodeNewGeometry)) { node = new GeometryNode(); } @@ -850,6 +881,25 @@ static ShaderNode *add_node(Scene *scene, transform_inverse(get_transform(b_ob.matrix_world())); } } + else if(b_node.is_a(&RNA_ShaderNodeBevel)) { + BL::ShaderNodeBevel b_bevel_node(b_node); + BevelNode *bevel = new BevelNode(); + bevel->samples = b_bevel_node.samples(); + node = bevel; + } + else if(b_node.is_a(&RNA_ShaderNodeDisplacement)) { + BL::ShaderNodeDisplacement b_disp_node(b_node); + DisplacementNode *disp = new DisplacementNode(); + disp->space = (NodeNormalMapSpace)b_disp_node.space(); + node = disp; + } + else if(b_node.is_a(&RNA_ShaderNodeVectorDisplacement)) { + BL::ShaderNodeVectorDisplacement b_disp_node(b_node); + VectorDisplacementNode *disp = new VectorDisplacementNode(); + disp->space = (NodeNormalMapSpace)b_disp_node.space(); + disp->attribute = ""; + node = disp; + } if(node) { node->name = b_node.name(); @@ -978,6 +1028,10 @@ static void add_nodes(Scene *scene, for(b_node->internal_links.begin(b_link); b_link != b_node->internal_links.end(); ++b_link) { BL::NodeSocket to_socket(b_link->to_socket()); SocketType::Type to_socket_type = convert_socket_type(to_socket); + if (to_socket_type == SocketType::UNDEFINED) { + continue; + } + ConvertNode *proxy = new ConvertNode(to_socket_type, to_socket_type, true); input_map[b_link->from_socket().ptr.data] = proxy->inputs[0]; @@ -1001,6 +1055,10 @@ static void add_nodes(Scene *scene, */ for(b_node->inputs.begin(b_input); b_input != b_node->inputs.end(); ++b_input) { SocketType::Type input_type = convert_socket_type(*b_input); + if (input_type == SocketType::UNDEFINED) { + continue; + } + ConvertNode *proxy = new ConvertNode(input_type, input_type, true); graph->add(proxy); @@ -1013,6 +1071,10 @@ static void add_nodes(Scene *scene, } for(b_node->outputs.begin(b_output); b_output != b_node->outputs.end(); ++b_output) { SocketType::Type output_type = convert_socket_type(*b_output); + if (output_type == SocketType::UNDEFINED) { + continue; + } + ConvertNode *proxy = new ConvertNode(output_type, output_type, true); graph->add(proxy); @@ -1164,6 +1226,9 @@ void BlenderSync::sync_materials(bool update_all) /* material loop */ BL::BlendData::materials_iterator b_mat; + TaskPool pool; + set<Shader*> updated_shaders; + for(b_data.materials.begin(b_mat); b_mat != b_data.materials.end(); ++b_mat) { Shader *shader; @@ -1196,12 +1261,40 @@ void BlenderSync::sync_materials(bool update_all) shader->heterogeneous_volume = !get_boolean(cmat, "homogeneous_volume"); shader->volume_sampling_method = get_volume_sampling(cmat); shader->volume_interpolation_method = get_volume_interpolation(cmat); - shader->displacement_method = (experimental) ? get_displacement_method(cmat) : DISPLACE_BUMP; + shader->displacement_method = get_displacement_method(cmat); shader->set_graph(graph); - shader->tag_update(scene); + + /* By simplifying the shader graph as soon as possible, some + * redundant shader nodes might be removed which prevents loading + * unnecessary attributes later. + * + * However, since graph simplification also accounts for e.g. mix + * weight, this would cause frequent expensive resyncs in interactive + * sessions, so for those sessions optimization is only performed + * right before compiling. + */ + if(!preview) { + pool.push(function_bind(&ShaderGraph::simplify, graph, scene)); + /* NOTE: Update shaders out of the threads since those routines + * are accessing and writing to a global context. + */ + updated_shaders.insert(shader); + } + else { + /* NOTE: Update tagging can access links which are being + * optimized out. + */ + shader->tag_update(scene); + } } } + + pool.wait_work(); + + foreach(Shader *shader, updated_shaders) { + shader->tag_update(scene); + } } /* Sync World */ @@ -1242,11 +1335,8 @@ void BlenderSync::sync_world(bool update_all) /* AO */ BL::WorldLighting b_light = b_world.light_settings(); - if(b_light.use_ambient_occlusion()) - background->ao_factor = b_light.ao_factor(); - else - background->ao_factor = 0.0f; - + background->use_ao = b_light.use_ambient_occlusion(); + background->ao_factor = b_light.ao_factor(); background->ao_distance = b_light.distance(); /* visibility */ @@ -1262,6 +1352,7 @@ void BlenderSync::sync_world(bool update_all) background->visibility = visibility; } else { + background->use_ao = false; background->ao_factor = 0.0f; background->ao_distance = FLT_MAX; } @@ -1282,8 +1373,17 @@ void BlenderSync::sync_world(bool update_all) else background->transparent = b_scene.render().alpha_mode() == BL::RenderSettings::alpha_mode_TRANSPARENT; + if(background->transparent) { + background->transparent_glass = get_boolean(cscene, "film_transparent_glass"); + background->transparent_roughness_threshold = get_float(cscene, "film_transparent_roughness"); + } + else { + background->transparent_glass = false; + background->transparent_roughness_threshold = 0.0f; + } + background->use_shader = render_layer.use_background_shader; - background->use_ao = render_layer.use_background_ao; + background->use_ao = background->use_ao && render_layer.use_background_ao; if(background->modified(prevbackground)) background->tag_update(scene); diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index d8043105cd8..283aa5600fd 100644 --- a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -14,29 +14,29 @@ * limitations under the License. */ -#include "background.h" -#include "camera.h" -#include "film.h" -#include "graph.h" -#include "integrator.h" -#include "light.h" -#include "mesh.h" -#include "nodes.h" -#include "object.h" -#include "scene.h" -#include "shader.h" -#include "curves.h" - -#include "device.h" - -#include "blender_sync.h" -#include "blender_session.h" -#include "blender_util.h" - -#include "util_debug.h" -#include "util_foreach.h" -#include "util_opengl.h" -#include "util_hash.h" +#include "render/background.h" +#include "render/camera.h" +#include "render/film.h" +#include "render/graph.h" +#include "render/integrator.h" +#include "render/light.h" +#include "render/mesh.h" +#include "render/nodes.h" +#include "render/object.h" +#include "render/scene.h" +#include "render/shader.h" +#include "render/curves.h" + +#include "device/device.h" + +#include "blender/blender_sync.h" +#include "blender/blender_session.h" +#include "blender/blender_util.h" + +#include "util/util_debug.h" +#include "util/util_foreach.h" +#include "util/util_opengl.h" +#include "util/util_hash.h" CCL_NAMESPACE_BEGIN @@ -47,8 +47,7 @@ BlenderSync::BlenderSync(BL::RenderEngine& b_engine, BL::Scene& b_scene, Scene *scene, bool preview, - Progress &progress, - bool is_cpu) + Progress &progress) : b_engine(b_engine), b_data(b_data), b_scene(b_scene), @@ -62,7 +61,6 @@ BlenderSync::BlenderSync(BL::RenderEngine& b_engine, scene(scene), preview(preview), experimental(false), - is_cpu(is_cpu), dicing_rate(1.0f), max_subdivisions(12), progress(progress) @@ -210,10 +208,9 @@ void BlenderSync::sync_data(BL::RenderSettings& b_render, scene->need_motion() == Scene::MOTION_NONE || scene->camera->motion_position == Camera::MOTION_POSITION_CENTER) { - sync_objects(b_v3d); + sync_objects(); } sync_motion(b_render, - b_v3d, b_override, width, height, python_thread_state); @@ -225,9 +222,7 @@ void BlenderSync::sync_data(BL::RenderSettings& b_render, void BlenderSync::sync_integrator() { -#ifdef __CAMERA_MOTION__ BL::RenderSettings r = b_scene.render(); -#endif PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles"); experimental = (get_enum(cscene, "feature_set") != 0); @@ -235,7 +230,6 @@ void BlenderSync::sync_integrator() Integrator *integrator = scene->integrator; Integrator previntegrator = *integrator; - integrator->min_bounce = get_int(cscene, "min_bounces"); integrator->max_bounce = get_int(cscene, "max_bounces"); integrator->max_diffuse_bounce = get_int(cscene, "diffuse_bounces"); @@ -244,8 +238,6 @@ void BlenderSync::sync_integrator() integrator->max_volume_bounce = get_int(cscene, "volume_bounces"); integrator->transparent_max_bounce = get_int(cscene, "transparent_max_bounces"); - integrator->transparent_min_bounce = get_int(cscene, "transparent_min_bounces"); - integrator->transparent_shadows = get_boolean(cscene, "use_transparent_shadows"); integrator->volume_max_steps = get_int(cscene, "volume_max_steps"); integrator->volume_step_size = get_float(cscene, "volume_step_size"); @@ -275,7 +267,6 @@ void BlenderSync::sync_integrator() integrator->sample_clamp_direct = get_float(cscene, "sample_clamp_direct"); integrator->sample_clamp_indirect = get_float(cscene, "sample_clamp_indirect"); -#ifdef __CAMERA_MOTION__ if(!preview) { if(integrator->motion_blur != r.use_motion_blur()) { scene->object_manager->tag_update(scene); @@ -284,7 +275,6 @@ void BlenderSync::sync_integrator() integrator->motion_blur = r.use_motion_blur(); } -#endif integrator->method = (Integrator::Method)get_enum(cscene, "progressive", @@ -330,6 +320,9 @@ void BlenderSync::sync_integrator() integrator->ao_bounces = get_int(cscene, "ao_bounces_render"); } } + else { + integrator->ao_bounces = 0; + } if(integrator->modified(previntegrator)) integrator->tag_update(scene); @@ -480,11 +473,156 @@ void BlenderSync::sync_images() } } +/* Passes */ +PassType BlenderSync::get_pass_type(BL::RenderPass& b_pass) +{ + string name = b_pass.name(); +#define MAP_PASS(passname, passtype) if(name == passname) return passtype; + /* NOTE: Keep in sync with defined names from DNA_scene_types.h */ + MAP_PASS("Combined", PASS_COMBINED); + MAP_PASS("Depth", PASS_DEPTH); + MAP_PASS("Mist", PASS_MIST); + MAP_PASS("Normal", PASS_NORMAL); + MAP_PASS("IndexOB", PASS_OBJECT_ID); + MAP_PASS("UV", PASS_UV); + MAP_PASS("Vector", PASS_MOTION); + MAP_PASS("IndexMA", PASS_MATERIAL_ID); + + MAP_PASS("DiffDir", PASS_DIFFUSE_DIRECT); + MAP_PASS("GlossDir", PASS_GLOSSY_DIRECT); + MAP_PASS("TransDir", PASS_TRANSMISSION_DIRECT); + MAP_PASS("SubsurfaceDir", PASS_SUBSURFACE_DIRECT); + MAP_PASS("VolumeDir", PASS_VOLUME_DIRECT); + + MAP_PASS("DiffInd", PASS_DIFFUSE_INDIRECT); + MAP_PASS("GlossInd", PASS_GLOSSY_INDIRECT); + MAP_PASS("TransInd", PASS_TRANSMISSION_INDIRECT); + MAP_PASS("SubsurfaceInd", PASS_SUBSURFACE_INDIRECT); + MAP_PASS("VolumeInd", PASS_VOLUME_INDIRECT); + + MAP_PASS("DiffCol", PASS_DIFFUSE_COLOR); + MAP_PASS("GlossCol", PASS_GLOSSY_COLOR); + MAP_PASS("TransCol", PASS_TRANSMISSION_COLOR); + MAP_PASS("SubsurfaceCol", PASS_SUBSURFACE_COLOR); + + MAP_PASS("Emit", PASS_EMISSION); + MAP_PASS("Env", PASS_BACKGROUND); + MAP_PASS("AO", PASS_AO); + MAP_PASS("Shadow", PASS_SHADOW); + +#ifdef __KERNEL_DEBUG__ + MAP_PASS("Debug BVH Traversed Nodes", PASS_BVH_TRAVERSED_NODES); + MAP_PASS("Debug BVH Traversed Instances", PASS_BVH_TRAVERSED_INSTANCES); + MAP_PASS("Debug BVH Intersections", PASS_BVH_INTERSECTIONS); + MAP_PASS("Debug Ray Bounces", PASS_RAY_BOUNCES); +#endif + MAP_PASS("Debug Render Time", PASS_RENDER_TIME); +#undef MAP_PASS + + return PASS_NONE; +} + +int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass) +{ + string name = b_pass.name(); + if(name.substr(0, 10) != "Denoising ") { + return -1; + } + name = name.substr(10); + +#define MAP_PASS(passname, offset) if(name == passname) return offset; + MAP_PASS("Normal", DENOISING_PASS_NORMAL); + MAP_PASS("Normal Variance", DENOISING_PASS_NORMAL_VAR); + MAP_PASS("Albedo", DENOISING_PASS_ALBEDO); + MAP_PASS("Albedo Variance", DENOISING_PASS_ALBEDO_VAR); + MAP_PASS("Depth", DENOISING_PASS_DEPTH); + MAP_PASS("Depth Variance", DENOISING_PASS_DEPTH_VAR); + MAP_PASS("Shadow A", DENOISING_PASS_SHADOW_A); + MAP_PASS("Shadow B", DENOISING_PASS_SHADOW_B); + MAP_PASS("Image", DENOISING_PASS_COLOR); + MAP_PASS("Image Variance", DENOISING_PASS_COLOR_VAR); +#undef MAP_PASS + + return -1; +} + +array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay, + BL::SceneRenderLayer& b_srlay, + const SessionParams &session_params) +{ + array<Pass> passes; + Pass::add(PASS_COMBINED, passes); + + if(!session_params.device.advanced_shading) { + return passes; + } + + /* loop over passes */ + BL::RenderLayer::passes_iterator b_pass_iter; + + for(b_rlay.passes.begin(b_pass_iter); b_pass_iter != b_rlay.passes.end(); ++b_pass_iter) { + BL::RenderPass b_pass(*b_pass_iter); + PassType pass_type = get_pass_type(b_pass); + + if(pass_type == PASS_MOTION && scene->integrator->motion_blur) + continue; + if(pass_type != PASS_NONE) + Pass::add(pass_type, passes); + } + + PointerRNA crp = RNA_pointer_get(&b_srlay.ptr, "cycles"); + if(get_boolean(crp, "denoising_store_passes") && + get_boolean(crp, "use_denoising")) + { + b_engine.add_pass("Denoising Normal", 3, "XYZ", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Normal Variance", 3, "XYZ", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Albedo", 3, "RGB", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Albedo Variance", 3, "RGB", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Depth", 1, "Z", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Depth Variance", 1, "Z", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Shadow A", 3, "XYV", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Shadow B", 3, "XYV", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Image", 3, "RGB", b_srlay.name().c_str()); + b_engine.add_pass("Denoising Image Variance", 3, "RGB", b_srlay.name().c_str()); + } +#ifdef __KERNEL_DEBUG__ + if(get_boolean(crp, "pass_debug_bvh_traversed_nodes")) { + b_engine.add_pass("Debug BVH Traversed Nodes", 1, "X", b_srlay.name().c_str()); + Pass::add(PASS_BVH_TRAVERSED_NODES, passes); + } + if(get_boolean(crp, "pass_debug_bvh_traversed_instances")) { + b_engine.add_pass("Debug BVH Traversed Instances", 1, "X", b_srlay.name().c_str()); + Pass::add(PASS_BVH_TRAVERSED_INSTANCES, passes); + } + if(get_boolean(crp, "pass_debug_bvh_intersections")) { + b_engine.add_pass("Debug BVH Intersections", 1, "X", b_srlay.name().c_str()); + Pass::add(PASS_BVH_INTERSECTIONS, passes); + } + if(get_boolean(crp, "pass_debug_ray_bounces")) { + b_engine.add_pass("Debug Ray Bounces", 1, "X", b_srlay.name().c_str()); + Pass::add(PASS_RAY_BOUNCES, passes); + } +#endif + if(get_boolean(crp, "pass_debug_render_time")) { + b_engine.add_pass("Debug Render Time", 1, "X", b_srlay.name().c_str()); + Pass::add(PASS_RENDER_TIME, passes); + } + if(get_boolean(crp, "use_pass_volume_direct")) { + b_engine.add_pass("VolumeDir", 3, "RGB", b_srlay.name().c_str()); + Pass::add(PASS_VOLUME_DIRECT, passes); + } + if(get_boolean(crp, "use_pass_volume_indirect")) { + b_engine.add_pass("VolumeInd", 3, "RGB", b_srlay.name().c_str()); + Pass::add(PASS_VOLUME_INDIRECT, passes); + } + + return passes; +} + /* Scene Parameters */ SceneParams BlenderSync::get_scene_params(BL::Scene& b_scene, - bool background, - bool is_cpu) + bool background) { BL::RenderSettings r = b_scene.render(); SceneParams params; @@ -496,14 +634,10 @@ SceneParams BlenderSync::get_scene_params(BL::Scene& b_scene, else if(shadingsystem == 1) params.shadingsystem = SHADINGSYSTEM_OSL; - if(background) + if(background || DebugFlags().viewport_static_bvh) params.bvh_type = SceneParams::BVH_STATIC; else - params.bvh_type = (SceneParams::BVHType)get_enum( - cscene, - "debug_bvh_type", - SceneParams::BVH_NUM_TYPES, - SceneParams::BVH_STATIC); + params.bvh_type = SceneParams::BVH_DYNAMIC; params.use_bvh_spatial_split = RNA_boolean_get(&cscene, "debug_use_spatial_splits"); params.use_bvh_unaligned_nodes = RNA_boolean_get(&cscene, "debug_use_hair_bvh"); @@ -528,15 +662,7 @@ SceneParams BlenderSync::get_scene_params(BL::Scene& b_scene, params.texture_limit = 0; } -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) - if(is_cpu) { - params.use_qbvh = DebugFlags().cpu.qbvh && system_cpu_support_sse2(); - } - else -#endif - { - params.use_qbvh = false; - } + params.bvh_layout = DebugFlags().cpu.bvh_layout; return params; } @@ -560,6 +686,16 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine, /* feature set */ params.experimental = (get_enum(cscene, "feature_set") != 0); + /* threads */ + BL::RenderSettings b_r = b_scene.render(); + if(b_r.threads_mode() == BL::RenderSettings::threads_mode_FIXED) + params.threads = b_r.threads(); + else + params.threads = 0; + + /* Background */ + params.background = background; + /* device type */ vector<DeviceInfo>& devices = Device::available_devices(); @@ -588,12 +724,28 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine, } } - int compute_device = get_enum(b_preferences, "compute_device_type"); + enum ComputeDevice { + COMPUTE_DEVICE_CPU = 0, + COMPUTE_DEVICE_CUDA = 1, + COMPUTE_DEVICE_OPENCL = 2, + COMPUTE_DEVICE_NUM = 3, + }; - if(compute_device != 0) { + ComputeDevice compute_device = (ComputeDevice)get_enum(b_preferences, + "compute_device_type", + COMPUTE_DEVICE_NUM, + COMPUTE_DEVICE_CPU); + + if(compute_device != COMPUTE_DEVICE_CPU) { vector<DeviceInfo> used_devices; RNA_BEGIN(&b_preferences, device, "devices") { - if(get_enum(device, "type") == compute_device && get_boolean(device, "use")) { + ComputeDevice device_type = (ComputeDevice)get_enum(device, + "type", + COMPUTE_DEVICE_NUM, + COMPUTE_DEVICE_CPU); + + if(get_boolean(device, "use") && + (device_type == compute_device || device_type == COMPUTE_DEVICE_CPU)) { string id = get_string(device, "id"); foreach(DeviceInfo& info, devices) { if(info.id == id) { @@ -608,15 +760,14 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine, params.device = used_devices[0]; } else if(used_devices.size() > 1) { - params.device = Device::get_multi_device(used_devices); + params.device = Device::get_multi_device(used_devices, + params.threads, + params.background); } /* Else keep using the CPU device that was set before. */ } } - /* Background */ - params.background = background; - /* samples */ int samples = get_int(cscene, "samples"); int aa_samples = get_int(cscene, "aa_samples"); @@ -676,19 +827,28 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine, params.tile_order = TILE_BOTTOM_TO_TOP; } + /* other parameters */ params.start_resolution = get_int(cscene, "preview_start_resolution"); + params.pixel_size = b_engine.get_preview_pixel_size(b_scene); /* other parameters */ - if(b_scene.render().threads_mode() == BL::RenderSettings::threads_mode_FIXED) - params.threads = b_scene.render().threads(); - else - params.threads = 0; - params.cancel_timeout = (double)get_float(cscene, "debug_cancel_timeout"); params.reset_timeout = (double)get_float(cscene, "debug_reset_timeout"); params.text_timeout = (double)get_float(cscene, "debug_text_timeout"); - params.progressive_refine = get_boolean(cscene, "use_progressive_refine"); + /* progressive refine */ + params.progressive_refine = get_boolean(cscene, "use_progressive_refine") && + !b_r.use_save_buffers(); + + if(params.progressive_refine) { + BL::RenderSettings::layers_iterator b_rlay; + for(b_r.layers.begin(b_rlay); b_rlay != b_r.layers.end(); ++b_rlay) { + PointerRNA crl = RNA_pointer_get(&b_rlay->ptr, "cycles"); + if(get_boolean(crl, "use_denoising")) { + params.progressive_refine = false; + } + } + } if(background) { if(params.progressive_refine) @@ -697,6 +857,7 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine, params.progressive = false; params.start_resolution = INT_MAX; + params.pixel_size = 1; } else params.progressive = true; diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h index 6984cbda259..1e7b0b32518 100644 --- a/intern/cycles/blender/blender_sync.h +++ b/intern/cycles/blender/blender_sync.h @@ -22,15 +22,15 @@ #include "RNA_access.h" #include "RNA_blender_cpp.h" -#include "blender_util.h" +#include "blender/blender_util.h" -#include "scene.h" -#include "session.h" +#include "render/scene.h" +#include "render/session.h" -#include "util_map.h" -#include "util_set.h" -#include "util_transform.h" -#include "util_vector.h" +#include "util/util_map.h" +#include "util/util_set.h" +#include "util/util_transform.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -54,8 +54,7 @@ public: BL::Scene& b_scene, Scene *scene, bool preview, - Progress &progress, - bool is_cpu); + Progress &progress); ~BlenderSync(); /* sync */ @@ -67,6 +66,9 @@ public: void **python_thread_state, const char *layer = 0); void sync_render_layers(BL::SpaceView3D& b_v3d, const char *layer); + array<Pass> sync_render_passes(BL::RenderLayer& b_rlay, + BL::SceneRenderLayer& b_srlay, + const SessionParams &session_params); void sync_integrator(); void sync_camera(BL::RenderSettings& b_render, BL::Object& b_override, @@ -80,8 +82,7 @@ public: /* get parameters */ static SceneParams get_scene_params(BL::Scene& b_scene, - bool background, - bool is_cpu); + bool background); static SessionParams get_session_params(BL::RenderEngine& b_engine, BL::UserPreferences& b_userpref, BL::Scene& b_scene, @@ -93,13 +94,15 @@ public: Camera *cam, int width, int height); + static PassType get_pass_type(BL::RenderPass& b_pass); + static int get_denoising_pass(BL::RenderPass& b_pass); + private: /* sync */ void sync_lamps(bool update_all); void sync_materials(bool update_all); - void sync_objects(BL::SpaceView3D& b_v3d, float motion_time = 0.0f); + void sync_objects(float motion_time = 0.0f); void sync_motion(BL::RenderSettings& b_render, - BL::SpaceView3D& b_v3d, BL::Object& b_override, int width, int height, void **python_thread_state); @@ -115,7 +118,7 @@ private: BL::Mesh& b_mesh, BL::Object& b_ob, bool motion, - int time_index = 0); + int motion_step = 0); Object *sync_object(BL::Object& b_parent, int persistent_id[OBJECT_PERSISTENT_ID_SIZE], BL::DupliObject& b_dupli_ob, @@ -128,6 +131,7 @@ private: void sync_light(BL::Object& b_parent, int persistent_id[OBJECT_PERSISTENT_ID_SIZE], BL::Object& b_ob, + BL::DupliObject& b_dupli_ob, Transform& tfm, bool *use_portal); void sync_background_light(bool use_portal); @@ -172,7 +176,6 @@ private: Scene *scene; bool preview; bool experimental; - bool is_cpu; float dicing_rate; int max_subdivisions; diff --git a/intern/cycles/blender/blender_texture.cpp b/intern/cycles/blender/blender_texture.cpp index 3807e683c7c..b2e27b76189 100644 --- a/intern/cycles/blender/blender_texture.cpp +++ b/intern/cycles/blender/blender_texture.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "blender_texture.h" +#include "blender/blender_texture.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/blender/blender_texture.h b/intern/cycles/blender/blender_texture.h index ad96f9db8ed..734231a85ec 100644 --- a/intern/cycles/blender/blender_texture.h +++ b/intern/cycles/blender/blender_texture.h @@ -18,7 +18,7 @@ #define __BLENDER_TEXTURE_H__ #include <stdlib.h> -#include "blender_sync.h" +#include "blender/blender_sync.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/blender/blender_util.h b/intern/cycles/blender/blender_util.h index 23df3c1bc30..c418b19a637 100644 --- a/intern/cycles/blender/blender_util.h +++ b/intern/cycles/blender/blender_util.h @@ -17,15 +17,15 @@ #ifndef __BLENDER_UTIL_H__ #define __BLENDER_UTIL_H__ -#include "mesh.h" +#include "render/mesh.h" -#include "util_algorithm.h" -#include "util_map.h" -#include "util_path.h" -#include "util_set.h" -#include "util_transform.h" -#include "util_types.h" -#include "util_vector.h" +#include "util/util_algorithm.h" +#include "util/util_map.h" +#include "util/util_path.h" +#include "util/util_set.h" +#include "util/util_transform.h" +#include "util/util_types.h" +#include "util/util_vector.h" /* Hacks to hook into Blender API * todo: clean this up ... */ @@ -51,8 +51,8 @@ static inline BL::Mesh object_to_mesh(BL::BlendData& data, bool calc_undeformed, Mesh::SubdivisionType subdivision_type) { - bool subsurf_mod_show_render; - bool subsurf_mod_show_viewport; + bool subsurf_mod_show_render = false; + bool subsurf_mod_show_viewport = false; if(subdivision_type != Mesh::SUBDIVISION_NONE) { BL::Modifier subsurf_mod = object.modifiers[object.modifiers.length()-1]; @@ -174,22 +174,19 @@ static inline void curvemapping_color_to_array(BL::CurveMapping& cumap, if(rgb_curve) { BL::CurveMap mapI = cumap.curves[3]; - for(int i = 0; i < size; i++) { - float t = min_x + (float)i/(float)(size-1) * range_x; - - data[i][0] = mapR.evaluate(mapI.evaluate(t)); - data[i][1] = mapG.evaluate(mapI.evaluate(t)); - data[i][2] = mapB.evaluate(mapI.evaluate(t)); + const float t = min_x + (float)i/(float)(size-1) * range_x; + data[i] = make_float3(mapR.evaluate(mapI.evaluate(t)), + mapG.evaluate(mapI.evaluate(t)), + mapB.evaluate(mapI.evaluate(t))); } } else { for(int i = 0; i < size; i++) { float t = min_x + (float)i/(float)(size-1) * range_x; - - data[i][0] = mapR.evaluate(t); - data[i][1] = mapG.evaluate(t); - data[i][2] = mapB.evaluate(t); + data[i] = make_float3(mapR.evaluate(t), + mapG.evaluate(t), + mapB.evaluate(t)); } } } @@ -250,14 +247,15 @@ static inline float *image_get_float_pixels_for_frame(BL::Image& image, static inline Transform get_transform(const BL::Array<float, 16>& array) { - Transform tfm; + ProjectionTransform projection; - /* we assume both types to be just 16 floats, and transpose because blender - * use column major matrix order while we use row major */ - memcpy(&tfm, &array, sizeof(float)*16); - tfm = transform_transpose(tfm); + /* We assume both types to be just 16 floats, and transpose because blender + * use column major matrix order while we use row major. */ + memcpy(&projection, &array, sizeof(float)*16); + projection = projection_transpose(projection); - return tfm; + /* Drop last row, matrix is assumed to be affine transform. */ + return projection_to_transform(projection); } static inline float2 get_float2(const BL::Array<float, 2>& array) @@ -302,7 +300,7 @@ static inline uint get_layer(const BL::Array<int, 20>& array) for(uint i = 0; i < 20; i++) if(array[i]) layer |= (1 << i); - + return layer; } @@ -437,7 +435,7 @@ static inline string get_string(PointerRNA& ptr, const char *name) string str(cstr); if(cstr != cstrbuf) MEM_freeN(cstr); - + return str; } @@ -454,7 +452,7 @@ static inline string blender_absolute_path(BL::BlendData& b_data, { if(path.size() >= 2 && path[0] == '/' && path[1] == '/') { string dirname; - + if(b_id.library()) { BL::ID b_library_id(b_id.library()); dirname = blender_absolute_path(b_data, @@ -486,33 +484,34 @@ static inline void mesh_texture_space(BL::Mesh& b_mesh, loc = loc*size - make_float3(0.5f, 0.5f, 0.5f); } -/* object used for motion blur */ -static inline bool object_use_motion(BL::Object& b_parent, BL::Object& b_ob) +/* Object motion steps, returns 0 if no motion blur needed. */ +static inline uint object_motion_steps(BL::Object& b_parent, BL::Object& b_ob) { + /* Get motion enabled and steps from object itself. */ PointerRNA cobject = RNA_pointer_get(&b_ob.ptr, "cycles"); bool use_motion = get_boolean(cobject, "use_motion_blur"); - /* If motion blur is enabled for the object we also check - * whether it's enabled for the parent object as well. - * - * This way we can control motion blur from the dupligroup - * duplicator much easier. - */ - if(use_motion && b_parent.ptr.data != b_ob.ptr.data) { + if(!use_motion) { + return 0; + } + + uint steps = max(1, get_int(cobject, "motion_steps")); + + /* Also check parent object, so motion blur and steps can be + * controlled by dupligroup duplicator for linked groups. */ + if(b_parent.ptr.data != b_ob.ptr.data) { PointerRNA parent_cobject = RNA_pointer_get(&b_parent.ptr, "cycles"); use_motion &= get_boolean(parent_cobject, "use_motion_blur"); - } - return use_motion; -} -/* object motion steps */ -static inline uint object_motion_steps(BL::Object& b_ob) -{ - PointerRNA cobject = RNA_pointer_get(&b_ob.ptr, "cycles"); - uint steps = get_int(cobject, "motion_steps"); + if(!use_motion) { + return 0; + } + + steps = max(steps, get_int(parent_cobject, "motion_steps")); + } - /* use uneven number of steps so we get one keyframe at the current frame, - * and ue 2^(steps - 1) so objects with more/fewer steps still have samples - * at the same times, to avoid sampling at many different times */ + /* Use uneven number of steps so we get one keyframe at the current frame, + * and use 2^(steps - 1) so objects with more/fewer steps still have samples + * at the same times, to avoid sampling at many different times. */ return (2 << (steps - 1)) + 1; } @@ -547,7 +546,7 @@ static inline BL::SmokeDomainSettings object_smoke_domain_find(BL::Object& b_ob) return b_smd.domain_settings(); } } - + return BL::SmokeDomainSettings(PointerRNA_NULL); } @@ -819,4 +818,3 @@ protected: CCL_NAMESPACE_END #endif /* __BLENDER_UTIL_H__ */ - diff --git a/intern/cycles/bvh/CMakeLists.txt b/intern/cycles/bvh/CMakeLists.txt index 92e48f0d87f..b8171e7f70d 100644 --- a/intern/cycles/bvh/CMakeLists.txt +++ b/intern/cycles/bvh/CMakeLists.txt @@ -1,12 +1,6 @@ set(INC - . - ../graph - ../kernel - ../kernel/svm - ../render - ../util - ../device + .. ) set(INC_SYS @@ -14,6 +8,8 @@ set(INC_SYS set(SRC bvh.cpp + bvh2.cpp + bvh4.cpp bvh_binning.cpp bvh_build.cpp bvh_node.cpp @@ -24,6 +20,8 @@ set(SRC set(SRC_HEADERS bvh.h + bvh2.h + bvh4.h bvh_binning.h bvh_build.h bvh_node.h @@ -36,4 +34,4 @@ set(SRC_HEADERS include_directories(${INC}) include_directories(SYSTEM ${INC_SYS}) -add_library(cycles_bvh ${SRC} ${SRC_HEADERS}) +cycles_add_library(cycles_bvh ${SRC} ${SRC_HEADERS}) diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp index 1fb2f371a0f..b524ca07d8d 100644 --- a/intern/cycles/bvh/bvh.cpp +++ b/intern/cycles/bvh/bvh.cpp @@ -15,45 +15,68 @@ * limitations under the License. */ -#include "mesh.h" -#include "object.h" -#include "scene.h" -#include "curves.h" - -#include "bvh.h" -#include "bvh_build.h" -#include "bvh_node.h" -#include "bvh_params.h" -#include "bvh_unaligned.h" - -#include "util_debug.h" -#include "util_foreach.h" -#include "util_logging.h" -#include "util_map.h" -#include "util_progress.h" -#include "util_system.h" -#include "util_types.h" -#include "util_math.h" +#include "bvh/bvh.h" + +#include "render/mesh.h" +#include "render/object.h" + +#include "bvh/bvh2.h" +#include "bvh/bvh4.h" +#include "bvh/bvh_build.h" +#include "bvh/bvh_node.h" + +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_progress.h" CCL_NAMESPACE_BEGIN -/* Pack Utility */ +/* BVH Parameters. */ -struct BVHStackEntry +const char *bvh_layout_name(BVHLayout layout) { - const BVHNode *node; - int idx; + switch(layout) { + case BVH_LAYOUT_BVH2: return "BVH2"; + case BVH_LAYOUT_BVH4: return "BVH4"; + case BVH_LAYOUT_NONE: return "NONE"; + case BVH_LAYOUT_ALL: return "ALL"; + } + LOG(DFATAL) << "Unsupported BVH layout was passed."; + return ""; +} - BVHStackEntry(const BVHNode* n = 0, int i = 0) - : node(n), idx(i) - { +BVHLayout BVHParams::best_bvh_layout(BVHLayout requested_layout, + BVHLayoutMask supported_layouts) +{ + const BVHLayoutMask requested_layout_mask = (BVHLayoutMask)requested_layout; + /* Check whether requested layout is supported, if so -- no need to do + * any extra computation. + */ + if(supported_layouts & requested_layout_mask) { + return requested_layout; } + /* Some bit magic to get widest supported BVH layout. */ + /* This is a mask of supported BVH layouts which are narrower than the + * requested one. + */ + const BVHLayoutMask allowed_layouts_mask = + (supported_layouts & (requested_layout_mask - 1)); + /* We get widest from allowed ones and convert mask to actual layout. */ + const BVHLayoutMask widest_allowed_layout_mask = __bsr(allowed_layouts_mask); + return (BVHLayout)(1 << widest_allowed_layout_mask); +} - int encodeIdx() const - { - return (node->is_leaf())? ~idx: idx; - } -}; +/* Pack Utility */ + +BVHStackEntry::BVHStackEntry(const BVHNode *n, int i) + : node(n), idx(i) +{ +} + +int BVHStackEntry::encodeIdx() const +{ + return (node->is_leaf())? ~idx: idx; +} /* BVH */ @@ -64,10 +87,17 @@ BVH::BVH(const BVHParams& params_, const vector<Object*>& objects_) BVH *BVH::create(const BVHParams& params, const vector<Object*>& objects) { - if(params.use_qbvh) - return new QBVH(params, objects); - else - return new RegularBVH(params, objects); + switch(params.bvh_layout) { + case BVH_LAYOUT_BVH2: + return new BVH2(params, objects); + case BVH_LAYOUT_BVH4: + return new BVH4(params, objects); + case BVH_LAYOUT_NONE: + case BVH_LAYOUT_ALL: + break; + } + LOG(DFATAL) << "Requested unsupported BVH layout."; + return NULL; } /* Building */ @@ -121,6 +151,73 @@ void BVH::refit(Progress& progress) refit_nodes(); } +void BVH::refit_primitives(int start, int end, BoundBox& bbox, uint& visibility) +{ + /* Refit range of primitives. */ + for(int prim = start; prim < end; prim++) { + int pidx = pack.prim_index[prim]; + int tob = pack.prim_object[prim]; + Object *ob = objects[tob]; + + if(pidx == -1) { + /* Object instance. */ + bbox.grow(ob->bounds); + } + else { + /* Primitives. */ + const Mesh *mesh = ob->mesh; + + if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) { + /* Curves. */ + int str_offset = (params.top_level)? mesh->curve_offset: 0; + Mesh::Curve curve = mesh->get_curve(pidx - str_offset); + int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]); + + curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox); + + visibility |= PATH_RAY_CURVE; + + /* Motion curves. */ + if(mesh->use_motion_blur) { + Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + + if(attr) { + size_t mesh_size = mesh->curve_keys.size(); + size_t steps = mesh->motion_steps - 1; + float3 *key_steps = attr->data_float3(); + + for(size_t i = 0; i < steps; i++) + curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bbox); + } + } + } + else { + /* Triangles. */ + int tri_offset = (params.top_level)? mesh->tri_offset: 0; + Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset); + const float3 *vpos = &mesh->verts[0]; + + triangle.bounds_grow(vpos, bbox); + + /* Motion triangles. */ + if(mesh->use_motion_blur) { + Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + + if(attr) { + size_t mesh_size = mesh->verts.size(); + size_t steps = mesh->motion_steps - 1; + float3 *vert_steps = attr->data_float3(); + + for(size_t i = 0; i < steps; i++) + triangle.bounds_grow(vert_steps + i*mesh_size, bbox); + } + } + } + } + visibility |= ob->visibility_for_tracing(); + } +} + /* Triangles */ void BVH::pack_triangle(int idx, float4 tri_verts[3]) @@ -166,7 +263,6 @@ void BVH::pack_primitives() if(pack.prim_index[i] != -1) { int tob = pack.prim_object[i]; Object *ob = objects[tob]; - if((pack.prim_type[i] & PRIMITIVE_ALL_TRIANGLE) != 0) { pack_triangle(i, (float4*)&pack.prim_tri_verts[3 * prim_triangle_index]); pack.prim_tri_index[i] = 3 * prim_triangle_index; @@ -175,11 +271,10 @@ void BVH::pack_primitives() else { pack.prim_tri_index[i] = -1; } - - pack.prim_visibility[i] = ob->visibility; - - if(pack.prim_type[i] & PRIMITIVE_ALL_CURVE) + pack.prim_visibility[i] = ob->visibility_for_tracing(); + if(pack.prim_type[i] & PRIMITIVE_ALL_CURVE) { pack.prim_visibility[i] |= PATH_RAY_CURVE; + } } else { pack.prim_tri_index[i] = -1; @@ -196,7 +291,8 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) * BVH's are stored in global arrays. This function merges them into the * top level BVH, adjusting indexes and offsets where appropriate. */ - const bool use_qbvh = params.use_qbvh; + /* TODO(sergey): This code needs adjustment for wider BVH than 4. */ + const bool use_qbvh = (params.bvh_layout == BVH_LAYOUT_BVH4); /* Adjust primitive index to point to the triangle in the global array, for * meshes with transform applied and already in the top level BVH. @@ -418,832 +514,4 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) } } -/* Regular BVH */ - -static bool node_bvh_is_unaligned(const BVHNode *node) -{ - const BVHNode *node0 = node->get_child(0), - *node1 = node->get_child(1); - return node0->is_unaligned() || node1->is_unaligned(); -} - -RegularBVH::RegularBVH(const BVHParams& params_, const vector<Object*>& objects_) -: BVH(params_, objects_) -{ -} - -void RegularBVH::pack_leaf(const BVHStackEntry& e, - const LeafNode *leaf) -{ - assert(e.idx + BVH_NODE_LEAF_SIZE <= pack.leaf_nodes.size()); - float4 data[BVH_NODE_LEAF_SIZE]; - memset(data, 0, sizeof(data)); - if(leaf->num_triangles() == 1 && pack.prim_index[leaf->m_lo] == -1) { - /* object */ - data[0].x = __int_as_float(~(leaf->m_lo)); - data[0].y = __int_as_float(0); - } - else { - /* triangle */ - data[0].x = __int_as_float(leaf->m_lo); - data[0].y = __int_as_float(leaf->m_hi); - } - data[0].z = __uint_as_float(leaf->m_visibility); - if(leaf->num_triangles() != 0) { - data[0].w = __uint_as_float(pack.prim_type[leaf->m_lo]); - } - - memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_NODE_LEAF_SIZE); -} - -void RegularBVH::pack_inner(const BVHStackEntry& e, - const BVHStackEntry& e0, - const BVHStackEntry& e1) -{ - if(e0.node->is_unaligned() || e1.node->is_unaligned()) { - pack_unaligned_inner(e, e0, e1); - } else { - pack_aligned_inner(e, e0, e1); - } -} - -void RegularBVH::pack_aligned_inner(const BVHStackEntry& e, - const BVHStackEntry& e0, - const BVHStackEntry& e1) -{ - pack_aligned_node(e.idx, - e0.node->m_bounds, e1.node->m_bounds, - e0.encodeIdx(), e1.encodeIdx(), - e0.node->m_visibility, e1.node->m_visibility); -} - -void RegularBVH::pack_aligned_node(int idx, - const BoundBox& b0, - const BoundBox& b1, - int c0, int c1, - uint visibility0, uint visibility1) -{ - assert(idx + BVH_NODE_SIZE <= pack.nodes.size()); - assert(c0 < 0 || c0 < pack.nodes.size()); - assert(c1 < 0 || c1 < pack.nodes.size()); - - int4 data[BVH_NODE_SIZE] = { - make_int4(visibility0 & ~PATH_RAY_NODE_UNALIGNED, - visibility1 & ~PATH_RAY_NODE_UNALIGNED, - c0, c1), - make_int4(__float_as_int(b0.min.x), - __float_as_int(b1.min.x), - __float_as_int(b0.max.x), - __float_as_int(b1.max.x)), - make_int4(__float_as_int(b0.min.y), - __float_as_int(b1.min.y), - __float_as_int(b0.max.y), - __float_as_int(b1.max.y)), - make_int4(__float_as_int(b0.min.z), - __float_as_int(b1.min.z), - __float_as_int(b0.max.z), - __float_as_int(b1.max.z)), - }; - - memcpy(&pack.nodes[idx], data, sizeof(int4)*BVH_NODE_SIZE); -} - -void RegularBVH::pack_unaligned_inner(const BVHStackEntry& e, - const BVHStackEntry& e0, - const BVHStackEntry& e1) -{ - pack_unaligned_node(e.idx, - e0.node->get_aligned_space(), - e1.node->get_aligned_space(), - e0.node->m_bounds, - e1.node->m_bounds, - e0.encodeIdx(), e1.encodeIdx(), - e0.node->m_visibility, e1.node->m_visibility); -} - -void RegularBVH::pack_unaligned_node(int idx, - const Transform& aligned_space0, - const Transform& aligned_space1, - const BoundBox& bounds0, - const BoundBox& bounds1, - int c0, int c1, - uint visibility0, uint visibility1) -{ - assert(idx + BVH_UNALIGNED_NODE_SIZE <= pack.nodes.size()); - assert(c0 < 0 || c0 < pack.nodes.size()); - assert(c1 < 0 || c1 < pack.nodes.size()); - - float4 data[BVH_UNALIGNED_NODE_SIZE]; - Transform space0 = BVHUnaligned::compute_node_transform(bounds0, - aligned_space0); - Transform space1 = BVHUnaligned::compute_node_transform(bounds1, - aligned_space1); - data[0] = make_float4(__int_as_float(visibility0 | PATH_RAY_NODE_UNALIGNED), - __int_as_float(visibility1 | PATH_RAY_NODE_UNALIGNED), - __int_as_float(c0), - __int_as_float(c1)); - - data[1] = space0.x; - data[2] = space0.y; - data[3] = space0.z; - data[4] = space1.x; - data[5] = space1.y; - data[6] = space1.z; - - memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_NODE_SIZE); -} - -void RegularBVH::pack_nodes(const BVHNode *root) -{ - const size_t num_nodes = root->getSubtreeSize(BVH_STAT_NODE_COUNT); - const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT); - assert(num_leaf_nodes <= num_nodes); - const size_t num_inner_nodes = num_nodes - num_leaf_nodes; - size_t node_size; - if(params.use_unaligned_nodes) { - const size_t num_unaligned_nodes = - root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_COUNT); - node_size = (num_unaligned_nodes * BVH_UNALIGNED_NODE_SIZE) + - (num_inner_nodes - num_unaligned_nodes) * BVH_NODE_SIZE; - } - else { - node_size = num_inner_nodes * BVH_NODE_SIZE; - } - /* Resize arrays */ - pack.nodes.clear(); - pack.leaf_nodes.clear(); - /* For top level BVH, first merge existing BVH's so we know the offsets. */ - if(params.top_level) { - pack_instances(node_size, num_leaf_nodes*BVH_NODE_LEAF_SIZE); - } - else { - pack.nodes.resize(node_size); - pack.leaf_nodes.resize(num_leaf_nodes*BVH_NODE_LEAF_SIZE); - } - - int nextNodeIdx = 0, nextLeafNodeIdx = 0; - - vector<BVHStackEntry> stack; - stack.reserve(BVHParams::MAX_DEPTH*2); - if(root->is_leaf()) { - stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++)); - } - else { - stack.push_back(BVHStackEntry(root, nextNodeIdx)); - nextNodeIdx += node_bvh_is_unaligned(root) - ? BVH_UNALIGNED_NODE_SIZE - : BVH_NODE_SIZE; - } - - while(stack.size()) { - BVHStackEntry e = stack.back(); - stack.pop_back(); - - if(e.node->is_leaf()) { - /* leaf node */ - const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node); - pack_leaf(e, leaf); - } - else { - /* innner node */ - int idx[2]; - for(int i = 0; i < 2; ++i) { - if(e.node->get_child(i)->is_leaf()) { - idx[i] = nextLeafNodeIdx++; - } - else { - idx[i] = nextNodeIdx; - nextNodeIdx += node_bvh_is_unaligned(e.node->get_child(i)) - ? BVH_UNALIGNED_NODE_SIZE - : BVH_NODE_SIZE; - } - } - - stack.push_back(BVHStackEntry(e.node->get_child(0), idx[0])); - stack.push_back(BVHStackEntry(e.node->get_child(1), idx[1])); - - pack_inner(e, stack[stack.size()-2], stack[stack.size()-1]); - } - } - assert(node_size == nextNodeIdx); - /* root index to start traversal at, to handle case of single leaf node */ - pack.root_index = (root->is_leaf())? -1: 0; -} - -void RegularBVH::refit_nodes() -{ - assert(!params.top_level); - - BoundBox bbox = BoundBox::empty; - uint visibility = 0; - refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility); -} - -void RegularBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) -{ - if(leaf) { - assert(idx + BVH_NODE_LEAF_SIZE <= pack.leaf_nodes.size()); - const int4 *data = &pack.leaf_nodes[idx]; - const int c0 = data[0].x; - const int c1 = data[0].y; - /* refit leaf node */ - for(int prim = c0; prim < c1; prim++) { - int pidx = pack.prim_index[prim]; - int tob = pack.prim_object[prim]; - Object *ob = objects[tob]; - - if(pidx == -1) { - /* object instance */ - bbox.grow(ob->bounds); - } - else { - /* primitives */ - const Mesh *mesh = ob->mesh; - - if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) { - /* curves */ - int str_offset = (params.top_level)? mesh->curve_offset: 0; - Mesh::Curve curve = mesh->get_curve(pidx - str_offset); - int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]); - - curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox); - - visibility |= PATH_RAY_CURVE; - - /* motion curves */ - if(mesh->use_motion_blur) { - Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - - if(attr) { - size_t mesh_size = mesh->curve_keys.size(); - size_t steps = mesh->motion_steps - 1; - float3 *key_steps = attr->data_float3(); - - for(size_t i = 0; i < steps; i++) - curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bbox); - } - } - } - else { - /* triangles */ - int tri_offset = (params.top_level)? mesh->tri_offset: 0; - Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset); - const float3 *vpos = &mesh->verts[0]; - - triangle.bounds_grow(vpos, bbox); - - /* motion triangles */ - if(mesh->use_motion_blur) { - Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - - if(attr) { - size_t mesh_size = mesh->verts.size(); - size_t steps = mesh->motion_steps - 1; - float3 *vert_steps = attr->data_float3(); - - for(size_t i = 0; i < steps; i++) - triangle.bounds_grow(vert_steps + i*mesh_size, bbox); - } - } - } - } - - visibility |= ob->visibility; - } - - /* TODO(sergey): De-duplicate with pack_leaf(). */ - float4 leaf_data[BVH_NODE_LEAF_SIZE]; - leaf_data[0].x = __int_as_float(c0); - leaf_data[0].y = __int_as_float(c1); - leaf_data[0].z = __uint_as_float(visibility); - leaf_data[0].w = __uint_as_float(data[0].w); - memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_NODE_LEAF_SIZE); - } - else { - assert(idx + BVH_NODE_SIZE <= pack.nodes.size()); - - const int4 *data = &pack.nodes[idx]; - const bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0; - const int c0 = data[0].z; - const int c1 = data[0].w; - /* refit inner node, set bbox from children */ - BoundBox bbox0 = BoundBox::empty, bbox1 = BoundBox::empty; - uint visibility0 = 0, visibility1 = 0; - - refit_node((c0 < 0)? -c0-1: c0, (c0 < 0), bbox0, visibility0); - refit_node((c1 < 0)? -c1-1: c1, (c1 < 0), bbox1, visibility1); - - if(is_unaligned) { - Transform aligned_space = transform_identity(); - pack_unaligned_node(idx, - aligned_space, aligned_space, - bbox0, bbox1, - c0, c1, - visibility0, - visibility1); - } - else { - pack_aligned_node(idx, - bbox0, bbox1, - c0, c1, - visibility0, - visibility1); - } - - bbox.grow(bbox0); - bbox.grow(bbox1); - visibility = visibility0|visibility1; - } -} - -/* QBVH */ - -/* Can we avoid this somehow or make more generic? - * - * Perhaps we can merge nodes in actual tree and make our - * life easier all over the place. - */ -static bool node_qbvh_is_unaligned(const BVHNode *node) -{ - const BVHNode *node0 = node->get_child(0), - *node1 = node->get_child(1); - bool has_unaligned = false; - if(node0->is_leaf()) { - has_unaligned |= node0->is_unaligned(); - } - else { - has_unaligned |= node0->get_child(0)->is_unaligned(); - has_unaligned |= node0->get_child(1)->is_unaligned(); - } - if(node1->is_leaf()) { - has_unaligned |= node1->is_unaligned(); - } - else { - has_unaligned |= node1->get_child(0)->is_unaligned(); - has_unaligned |= node1->get_child(1)->is_unaligned(); - } - return has_unaligned; -} - -QBVH::QBVH(const BVHParams& params_, const vector<Object*>& objects_) -: BVH(params_, objects_) -{ - params.use_qbvh = true; -} - -void QBVH::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf) -{ - float4 data[BVH_QNODE_LEAF_SIZE]; - memset(data, 0, sizeof(data)); - if(leaf->num_triangles() == 1 && pack.prim_index[leaf->m_lo] == -1) { - /* object */ - data[0].x = __int_as_float(~(leaf->m_lo)); - data[0].y = __int_as_float(0); - } - else { - /* triangle */ - data[0].x = __int_as_float(leaf->m_lo); - data[0].y = __int_as_float(leaf->m_hi); - } - data[0].z = __uint_as_float(leaf->m_visibility); - if(leaf->num_triangles() != 0) { - data[0].w = __uint_as_float(pack.prim_type[leaf->m_lo]); - } - - memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_QNODE_LEAF_SIZE); -} - -void QBVH::pack_inner(const BVHStackEntry& e, - const BVHStackEntry *en, - int num) -{ - bool has_unaligned = false; - /* Check whether we have to create unaligned node or all nodes are aligned - * and we can cut some corner here. - */ - if(params.use_unaligned_nodes) { - for(int i = 0; i < num; i++) { - if(en[i].node->is_unaligned()) { - has_unaligned = true; - break; - } - } - } - if(has_unaligned) { - /* There's no unaligned children, pack into AABB node. */ - pack_unaligned_inner(e, en, num); - } - else { - /* Create unaligned node with orientation transform for each of the - * children. - */ - pack_aligned_inner(e, en, num); - } -} - -void QBVH::pack_aligned_inner(const BVHStackEntry& e, - const BVHStackEntry *en, - int num) -{ - BoundBox bounds[4]; - int child[4]; - for(int i = 0; i < num; ++i) { - bounds[i] = en[i].node->m_bounds; - child[i] = en[i].encodeIdx(); - } - pack_aligned_node(e.idx, - bounds, - child, - e.node->m_visibility, - e.node->m_time_from, - e.node->m_time_to, - num); -} - -void QBVH::pack_aligned_node(int idx, - const BoundBox *bounds, - const int *child, - const uint visibility, - const float time_from, - const float time_to, - const int num) -{ - float4 data[BVH_QNODE_SIZE]; - memset(data, 0, sizeof(data)); - - data[0].x = __uint_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED); - data[0].y = time_from; - data[0].z = time_to; - - for(int i = 0; i < num; i++) { - float3 bb_min = bounds[i].min; - float3 bb_max = bounds[i].max; - - data[1][i] = bb_min.x; - data[2][i] = bb_max.x; - data[3][i] = bb_min.y; - data[4][i] = bb_max.y; - data[5][i] = bb_min.z; - data[6][i] = bb_max.z; - - data[7][i] = __int_as_float(child[i]); - } - - for(int i = num; i < 4; i++) { - /* We store BB which would never be recorded as intersection - * so kernel might safely assume there are always 4 child nodes. - */ - data[1][i] = FLT_MAX; - data[2][i] = -FLT_MAX; - - data[3][i] = FLT_MAX; - data[4][i] = -FLT_MAX; - - data[5][i] = FLT_MAX; - data[6][i] = -FLT_MAX; - - data[7][i] = __int_as_float(0); - } - - memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_QNODE_SIZE); -} - -void QBVH::pack_unaligned_inner(const BVHStackEntry& e, - const BVHStackEntry *en, - int num) -{ - Transform aligned_space[4]; - BoundBox bounds[4]; - int child[4]; - for(int i = 0; i < num; ++i) { - aligned_space[i] = en[i].node->get_aligned_space(); - bounds[i] = en[i].node->m_bounds; - child[i] = en[i].encodeIdx(); - } - pack_unaligned_node(e.idx, - aligned_space, - bounds, - child, - e.node->m_visibility, - e.node->m_time_from, - e.node->m_time_to, - num); -} - -void QBVH::pack_unaligned_node(int idx, - const Transform *aligned_space, - const BoundBox *bounds, - const int *child, - const uint visibility, - const float time_from, - const float time_to, - const int num) -{ - float4 data[BVH_UNALIGNED_QNODE_SIZE]; - memset(data, 0, sizeof(data)); - - data[0].x = __uint_as_float(visibility | PATH_RAY_NODE_UNALIGNED); - data[0].y = time_from; - data[0].z = time_to; - - for(int i = 0; i < num; i++) { - Transform space = BVHUnaligned::compute_node_transform( - bounds[i], - aligned_space[i]); - - data[1][i] = space.x.x; - data[2][i] = space.x.y; - data[3][i] = space.x.z; - - data[4][i] = space.y.x; - data[5][i] = space.y.y; - data[6][i] = space.y.z; - - data[7][i] = space.z.x; - data[8][i] = space.z.y; - data[9][i] = space.z.z; - - data[10][i] = space.x.w; - data[11][i] = space.y.w; - data[12][i] = space.z.w; - - data[13][i] = __int_as_float(child[i]); - } - - for(int i = num; i < 4; i++) { - /* We store BB which would never be recorded as intersection - * so kernel might safely assume there are always 4 child nodes. - */ - - data[1][i] = 1.0f; - data[2][i] = 0.0f; - data[3][i] = 0.0f; - - data[4][i] = 0.0f; - data[5][i] = 0.0f; - data[6][i] = 0.0f; - - data[7][i] = 0.0f; - data[8][i] = 0.0f; - data[9][i] = 0.0f; - - data[10][i] = -FLT_MAX; - data[11][i] = -FLT_MAX; - data[12][i] = -FLT_MAX; - - data[13][i] = __int_as_float(0); - } - - memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_QNODE_SIZE); -} - -/* Quad SIMD Nodes */ - -void QBVH::pack_nodes(const BVHNode *root) -{ - /* Calculate size of the arrays required. */ - const size_t num_nodes = root->getSubtreeSize(BVH_STAT_QNODE_COUNT); - const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT); - assert(num_leaf_nodes <= num_nodes); - const size_t num_inner_nodes = num_nodes - num_leaf_nodes; - size_t node_size; - if(params.use_unaligned_nodes) { - const size_t num_unaligned_nodes = - root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_QNODE_COUNT); - node_size = (num_unaligned_nodes * BVH_UNALIGNED_QNODE_SIZE) + - (num_inner_nodes - num_unaligned_nodes) * BVH_QNODE_SIZE; - } - else { - node_size = num_inner_nodes * BVH_QNODE_SIZE; - } - /* Resize arrays. */ - pack.nodes.clear(); - pack.leaf_nodes.clear(); - /* For top level BVH, first merge existing BVH's so we know the offsets. */ - if(params.top_level) { - pack_instances(node_size, num_leaf_nodes*BVH_QNODE_LEAF_SIZE); - } - else { - pack.nodes.resize(node_size); - pack.leaf_nodes.resize(num_leaf_nodes*BVH_QNODE_LEAF_SIZE); - } - - int nextNodeIdx = 0, nextLeafNodeIdx = 0; - - vector<BVHStackEntry> stack; - stack.reserve(BVHParams::MAX_DEPTH*2); - if(root->is_leaf()) { - stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++)); - } - else { - stack.push_back(BVHStackEntry(root, nextNodeIdx)); - nextNodeIdx += node_qbvh_is_unaligned(root) - ? BVH_UNALIGNED_QNODE_SIZE - : BVH_QNODE_SIZE; - } - - while(stack.size()) { - BVHStackEntry e = stack.back(); - stack.pop_back(); - - if(e.node->is_leaf()) { - /* leaf node */ - const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node); - pack_leaf(e, leaf); - } - else { - /* Inner node. */ - const BVHNode *node = e.node; - const BVHNode *node0 = node->get_child(0); - const BVHNode *node1 = node->get_child(1); - /* Collect nodes. */ - const BVHNode *nodes[4]; - int numnodes = 0; - if(node0->is_leaf()) { - nodes[numnodes++] = node0; - } - else { - nodes[numnodes++] = node0->get_child(0); - nodes[numnodes++] = node0->get_child(1); - } - if(node1->is_leaf()) { - nodes[numnodes++] = node1; - } - else { - nodes[numnodes++] = node1->get_child(0); - nodes[numnodes++] = node1->get_child(1); - } - /* Push entries on the stack. */ - for(int i = 0; i < numnodes; ++i) { - int idx; - if(nodes[i]->is_leaf()) { - idx = nextLeafNodeIdx++; - } - else { - idx = nextNodeIdx; - nextNodeIdx += node_qbvh_is_unaligned(nodes[i]) - ? BVH_UNALIGNED_QNODE_SIZE - : BVH_QNODE_SIZE; - } - stack.push_back(BVHStackEntry(nodes[i], idx)); - } - /* Set node. */ - pack_inner(e, &stack[stack.size()-numnodes], numnodes); - } - } - assert(node_size == nextNodeIdx); - /* Root index to start traversal at, to handle case of single leaf node. */ - pack.root_index = (root->is_leaf())? -1: 0; -} - -void QBVH::refit_nodes() -{ - assert(!params.top_level); - - BoundBox bbox = BoundBox::empty; - uint visibility = 0; - refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility); -} - -void QBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) -{ - if(leaf) { - int4 *data = &pack.leaf_nodes[idx]; - int4 c = data[0]; - /* Refit leaf node. */ - for(int prim = c.x; prim < c.y; prim++) { - int pidx = pack.prim_index[prim]; - int tob = pack.prim_object[prim]; - Object *ob = objects[tob]; - - if(pidx == -1) { - /* Object instance. */ - bbox.grow(ob->bounds); - } - else { - /* Primitives. */ - const Mesh *mesh = ob->mesh; - - if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) { - /* Curves. */ - int str_offset = (params.top_level)? mesh->curve_offset: 0; - Mesh::Curve curve = mesh->get_curve(pidx - str_offset); - int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]); - - curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox); - - visibility |= PATH_RAY_CURVE; - - /* Motion curves. */ - if(mesh->use_motion_blur) { - Attribute *attr = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - - if(attr) { - size_t mesh_size = mesh->curve_keys.size(); - size_t steps = mesh->motion_steps - 1; - float3 *key_steps = attr->data_float3(); - - for(size_t i = 0; i < steps; i++) - curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bbox); - } - } - } - else { - /* Triangles. */ - int tri_offset = (params.top_level)? mesh->tri_offset: 0; - Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset); - const float3 *vpos = &mesh->verts[0]; - - triangle.bounds_grow(vpos, bbox); - - /* Motion triangles. */ - if(mesh->use_motion_blur) { - Attribute *attr = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - - if(attr) { - size_t mesh_size = mesh->verts.size(); - size_t steps = mesh->motion_steps - 1; - float3 *vert_steps = attr->data_float3(); - - for(size_t i = 0; i < steps; i++) - triangle.bounds_grow(vert_steps + i*mesh_size, bbox); - } - } - } - } - - visibility |= ob->visibility; - } - - /* TODO(sergey): This is actually a copy of pack_leaf(), - * but this chunk of code only knows actual data and has - * no idea about BVHNode. - * - * Would be nice to de-duplicate code, but trying to make - * making code more general ends up in much nastier code - * in my opinion so far. - * - * Same applies to the inner nodes case below. - */ - float4 leaf_data[BVH_QNODE_LEAF_SIZE]; - leaf_data[0].x = __int_as_float(c.x); - leaf_data[0].y = __int_as_float(c.y); - leaf_data[0].z = __uint_as_float(visibility); - leaf_data[0].w = __uint_as_float(c.w); - memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_QNODE_LEAF_SIZE); - } - else { - int4 *data = &pack.nodes[idx]; - bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0; - int4 c; - if(is_unaligned) { - c = data[13]; - } - else { - c = data[7]; - } - /* Refit inner node, set bbox from children. */ - BoundBox child_bbox[4] = {BoundBox::empty, - BoundBox::empty, - BoundBox::empty, - BoundBox::empty}; - uint child_visibility[4] = {0}; - int num_nodes = 0; - - for(int i = 0; i < 4; ++i) { - if(c[i] != 0) { - refit_node((c[i] < 0)? -c[i]-1: c[i], (c[i] < 0), - child_bbox[i], child_visibility[i]); - ++num_nodes; - bbox.grow(child_bbox[i]); - visibility |= child_visibility[i]; - } - } - - if(is_unaligned) { - Transform aligned_space[4] = {transform_identity(), - transform_identity(), - transform_identity(), - transform_identity()}; - pack_unaligned_node(idx, - aligned_space, - child_bbox, - &c[0], - visibility, - 0.0f, - 1.0f, - 4); - } - else { - pack_aligned_node(idx, - child_bbox, - &c[0], - visibility, - 0.0f, - 1.0f, - 4); - } - } -} - CCL_NAMESPACE_END diff --git a/intern/cycles/bvh/bvh.h b/intern/cycles/bvh/bvh.h index 08f41fc736f..6a82f915692 100644 --- a/intern/cycles/bvh/bvh.h +++ b/intern/cycles/bvh/bvh.h @@ -18,10 +18,10 @@ #ifndef __BVH_H__ #define __BVH_H__ -#include "bvh_params.h" +#include "bvh/bvh_params.h" -#include "util_types.h" -#include "util_vector.h" +#include "util/util_types.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -33,15 +33,8 @@ class LeafNode; class Object; class Progress; -#define BVH_NODE_SIZE 4 -#define BVH_NODE_LEAF_SIZE 1 -#define BVH_QNODE_SIZE 8 -#define BVH_QNODE_LEAF_SIZE 1 -#define BVH_ALIGN 4096 -#define TRI_NODE_SIZE 3 - -#define BVH_UNALIGNED_NODE_SIZE 7 -#define BVH_UNALIGNED_QNODE_SIZE 14 +#define BVH_ALIGN 4096 +#define TRI_NODE_SIZE 3 /* Packed BVH * @@ -54,7 +47,7 @@ struct PackedBVH { /* BVH leaf nodes storage. */ array<int4> leaf_nodes; /* object index to BVH node index mapping for instances */ - array<int> object_node; + array<int> object_node; /* Mapping from primitive index to index in triangle array. */ array<uint> prim_tri_index; /* Continuous storage of triangle vertices. */ @@ -98,6 +91,9 @@ public: protected: BVH(const BVHParams& params, const vector<Object*>& objects); + /* Refit range of primitives. */ + void refit_primitives(int start, int end, BoundBox& bbox, uint& visibility); + /* triangles and strands */ void pack_primitives(); void pack_triangle(int idx, float4 storage[3]); @@ -110,95 +106,16 @@ protected: virtual void refit_nodes() = 0; }; -/* Regular BVH - * - * Typical BVH with each node having two children. */ - -class RegularBVH : public BVH { -protected: - /* constructor */ - friend class BVH; - RegularBVH(const BVHParams& params, const vector<Object*>& objects); - - /* pack */ - void pack_nodes(const BVHNode *root); - - void pack_leaf(const BVHStackEntry& e, - const LeafNode *leaf); - void pack_inner(const BVHStackEntry& e, - const BVHStackEntry& e0, - const BVHStackEntry& e1); - - void pack_aligned_inner(const BVHStackEntry& e, - const BVHStackEntry& e0, - const BVHStackEntry& e1); - void pack_aligned_node(int idx, - const BoundBox& b0, - const BoundBox& b1, - int c0, int c1, - uint visibility0, uint visibility1); - - void pack_unaligned_inner(const BVHStackEntry& e, - const BVHStackEntry& e0, - const BVHStackEntry& e1); - void pack_unaligned_node(int idx, - const Transform& aligned_space0, - const Transform& aligned_space1, - const BoundBox& b0, - const BoundBox& b1, - int c0, int c1, - uint visibility0, uint visibility1); - - /* refit */ - void refit_nodes(); - void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility); -}; - -/* QBVH - * - * Quad BVH, with each node having four children, to use with SIMD instructions. */ +/* Pack Utility */ +struct BVHStackEntry +{ + const BVHNode *node; + int idx; -class QBVH : public BVH { -protected: - /* constructor */ - friend class BVH; - QBVH(const BVHParams& params, const vector<Object*>& objects); - - /* pack */ - void pack_nodes(const BVHNode *root); - - void pack_leaf(const BVHStackEntry& e, const LeafNode *leaf); - void pack_inner(const BVHStackEntry& e, const BVHStackEntry *en, int num); - - void pack_aligned_inner(const BVHStackEntry& e, - const BVHStackEntry *en, - int num); - void pack_aligned_node(int idx, - const BoundBox *bounds, - const int *child, - const uint visibility, - const float time_from, - const float time_to, - const int num); - - void pack_unaligned_inner(const BVHStackEntry& e, - const BVHStackEntry *en, - int num); - void pack_unaligned_node(int idx, - const Transform *aligned_space, - const BoundBox *bounds, - const int *child, - const uint visibility, - const float time_from, - const float time_to, - const int num); - - /* refit */ - void refit_nodes(); - void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility); + BVHStackEntry(const BVHNode *n = 0, int i = 0); + int encodeIdx() const; }; CCL_NAMESPACE_END #endif /* __BVH_H__ */ - diff --git a/intern/cycles/bvh/bvh2.cpp b/intern/cycles/bvh/bvh2.cpp new file mode 100644 index 00000000000..9d89d2b6afb --- /dev/null +++ b/intern/cycles/bvh/bvh2.cpp @@ -0,0 +1,303 @@ +/* + * Adapted from code copyright 2009-2010 NVIDIA Corporation + * Modifications Copyright 2011, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "bvh/bvh2.h" + +#include "render/mesh.h" +#include "render/object.h" + +#include "bvh/bvh_node.h" +#include "bvh/bvh_unaligned.h" + +CCL_NAMESPACE_BEGIN + +static bool node_bvh_is_unaligned(const BVHNode *node) +{ + const BVHNode *node0 = node->get_child(0), + *node1 = node->get_child(1); + return node0->is_unaligned || node1->is_unaligned; +} + +BVH2::BVH2(const BVHParams& params_, const vector<Object*>& objects_) +: BVH(params_, objects_) +{ +} + +void BVH2::pack_leaf(const BVHStackEntry& e, + const LeafNode *leaf) +{ + assert(e.idx + BVH_NODE_LEAF_SIZE <= pack.leaf_nodes.size()); + float4 data[BVH_NODE_LEAF_SIZE]; + memset(data, 0, sizeof(data)); + if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) { + /* object */ + data[0].x = __int_as_float(~(leaf->lo)); + data[0].y = __int_as_float(0); + } + else { + /* triangle */ + data[0].x = __int_as_float(leaf->lo); + data[0].y = __int_as_float(leaf->hi); + } + data[0].z = __uint_as_float(leaf->visibility); + if(leaf->num_triangles() != 0) { + data[0].w = __uint_as_float(pack.prim_type[leaf->lo]); + } + + memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_NODE_LEAF_SIZE); +} + +void BVH2::pack_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1) +{ + if(e0.node->is_unaligned || e1.node->is_unaligned) { + pack_unaligned_inner(e, e0, e1); + } else { + pack_aligned_inner(e, e0, e1); + } +} + +void BVH2::pack_aligned_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1) +{ + pack_aligned_node(e.idx, + e0.node->bounds, e1.node->bounds, + e0.encodeIdx(), e1.encodeIdx(), + e0.node->visibility, e1.node->visibility); +} + +void BVH2::pack_aligned_node(int idx, + const BoundBox& b0, + const BoundBox& b1, + int c0, int c1, + uint visibility0, uint visibility1) +{ + assert(idx + BVH_NODE_SIZE <= pack.nodes.size()); + assert(c0 < 0 || c0 < pack.nodes.size()); + assert(c1 < 0 || c1 < pack.nodes.size()); + + int4 data[BVH_NODE_SIZE] = { + make_int4(visibility0 & ~PATH_RAY_NODE_UNALIGNED, + visibility1 & ~PATH_RAY_NODE_UNALIGNED, + c0, c1), + make_int4(__float_as_int(b0.min.x), + __float_as_int(b1.min.x), + __float_as_int(b0.max.x), + __float_as_int(b1.max.x)), + make_int4(__float_as_int(b0.min.y), + __float_as_int(b1.min.y), + __float_as_int(b0.max.y), + __float_as_int(b1.max.y)), + make_int4(__float_as_int(b0.min.z), + __float_as_int(b1.min.z), + __float_as_int(b0.max.z), + __float_as_int(b1.max.z)), + }; + + memcpy(&pack.nodes[idx], data, sizeof(int4)*BVH_NODE_SIZE); +} + +void BVH2::pack_unaligned_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1) +{ + pack_unaligned_node(e.idx, + e0.node->get_aligned_space(), + e1.node->get_aligned_space(), + e0.node->bounds, + e1.node->bounds, + e0.encodeIdx(), e1.encodeIdx(), + e0.node->visibility, e1.node->visibility); +} + +void BVH2::pack_unaligned_node(int idx, + const Transform& aligned_space0, + const Transform& aligned_space1, + const BoundBox& bounds0, + const BoundBox& bounds1, + int c0, int c1, + uint visibility0, uint visibility1) +{ + assert(idx + BVH_UNALIGNED_NODE_SIZE <= pack.nodes.size()); + assert(c0 < 0 || c0 < pack.nodes.size()); + assert(c1 < 0 || c1 < pack.nodes.size()); + + float4 data[BVH_UNALIGNED_NODE_SIZE]; + Transform space0 = BVHUnaligned::compute_node_transform(bounds0, + aligned_space0); + Transform space1 = BVHUnaligned::compute_node_transform(bounds1, + aligned_space1); + data[0] = make_float4(__int_as_float(visibility0 | PATH_RAY_NODE_UNALIGNED), + __int_as_float(visibility1 | PATH_RAY_NODE_UNALIGNED), + __int_as_float(c0), + __int_as_float(c1)); + + data[1] = space0.x; + data[2] = space0.y; + data[3] = space0.z; + data[4] = space1.x; + data[5] = space1.y; + data[6] = space1.z; + + memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_NODE_SIZE); +} + +void BVH2::pack_nodes(const BVHNode *root) +{ + const size_t num_nodes = root->getSubtreeSize(BVH_STAT_NODE_COUNT); + const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT); + assert(num_leaf_nodes <= num_nodes); + const size_t num_inner_nodes = num_nodes - num_leaf_nodes; + size_t node_size; + if(params.use_unaligned_nodes) { + const size_t num_unaligned_nodes = + root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_COUNT); + node_size = (num_unaligned_nodes * BVH_UNALIGNED_NODE_SIZE) + + (num_inner_nodes - num_unaligned_nodes) * BVH_NODE_SIZE; + } + else { + node_size = num_inner_nodes * BVH_NODE_SIZE; + } + /* Resize arrays */ + pack.nodes.clear(); + pack.leaf_nodes.clear(); + /* For top level BVH, first merge existing BVH's so we know the offsets. */ + if(params.top_level) { + pack_instances(node_size, num_leaf_nodes*BVH_NODE_LEAF_SIZE); + } + else { + pack.nodes.resize(node_size); + pack.leaf_nodes.resize(num_leaf_nodes*BVH_NODE_LEAF_SIZE); + } + + int nextNodeIdx = 0, nextLeafNodeIdx = 0; + + vector<BVHStackEntry> stack; + stack.reserve(BVHParams::MAX_DEPTH*2); + if(root->is_leaf()) { + stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++)); + } + else { + stack.push_back(BVHStackEntry(root, nextNodeIdx)); + nextNodeIdx += node_bvh_is_unaligned(root) + ? BVH_UNALIGNED_NODE_SIZE + : BVH_NODE_SIZE; + } + + while(stack.size()) { + BVHStackEntry e = stack.back(); + stack.pop_back(); + + if(e.node->is_leaf()) { + /* leaf node */ + const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node); + pack_leaf(e, leaf); + } + else { + /* innner node */ + int idx[2]; + for(int i = 0; i < 2; ++i) { + if(e.node->get_child(i)->is_leaf()) { + idx[i] = nextLeafNodeIdx++; + } + else { + idx[i] = nextNodeIdx; + nextNodeIdx += node_bvh_is_unaligned(e.node->get_child(i)) + ? BVH_UNALIGNED_NODE_SIZE + : BVH_NODE_SIZE; + } + } + + stack.push_back(BVHStackEntry(e.node->get_child(0), idx[0])); + stack.push_back(BVHStackEntry(e.node->get_child(1), idx[1])); + + pack_inner(e, stack[stack.size()-2], stack[stack.size()-1]); + } + } + assert(node_size == nextNodeIdx); + /* root index to start traversal at, to handle case of single leaf node */ + pack.root_index = (root->is_leaf())? -1: 0; +} + +void BVH2::refit_nodes() +{ + assert(!params.top_level); + + BoundBox bbox = BoundBox::empty; + uint visibility = 0; + refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility); +} + +void BVH2::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) +{ + if(leaf) { + /* refit leaf node */ + assert(idx + BVH_NODE_LEAF_SIZE <= pack.leaf_nodes.size()); + const int4 *data = &pack.leaf_nodes[idx]; + const int c0 = data[0].x; + const int c1 = data[0].y; + + BVH::refit_primitives(c0, c1, bbox, visibility); + + /* TODO(sergey): De-duplicate with pack_leaf(). */ + float4 leaf_data[BVH_NODE_LEAF_SIZE]; + leaf_data[0].x = __int_as_float(c0); + leaf_data[0].y = __int_as_float(c1); + leaf_data[0].z = __uint_as_float(visibility); + leaf_data[0].w = __uint_as_float(data[0].w); + memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_NODE_LEAF_SIZE); + } + else { + assert(idx + BVH_NODE_SIZE <= pack.nodes.size()); + + const int4 *data = &pack.nodes[idx]; + const bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0; + const int c0 = data[0].z; + const int c1 = data[0].w; + /* refit inner node, set bbox from children */ + BoundBox bbox0 = BoundBox::empty, bbox1 = BoundBox::empty; + uint visibility0 = 0, visibility1 = 0; + + refit_node((c0 < 0)? -c0-1: c0, (c0 < 0), bbox0, visibility0); + refit_node((c1 < 0)? -c1-1: c1, (c1 < 0), bbox1, visibility1); + + if(is_unaligned) { + Transform aligned_space = transform_identity(); + pack_unaligned_node(idx, + aligned_space, aligned_space, + bbox0, bbox1, + c0, c1, + visibility0, + visibility1); + } + else { + pack_aligned_node(idx, + bbox0, bbox1, + c0, c1, + visibility0, + visibility1); + } + + bbox.grow(bbox0); + bbox.grow(bbox1); + visibility = visibility0|visibility1; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/bvh/bvh2.h b/intern/cycles/bvh/bvh2.h new file mode 100644 index 00000000000..df65ddca5b7 --- /dev/null +++ b/intern/cycles/bvh/bvh2.h @@ -0,0 +1,87 @@ +/* + * Adapted from code copyright 2009-2010 NVIDIA Corporation + * Modifications Copyright 2011, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __BVH2_H__ +#define __BVH2_H__ + +#include "bvh/bvh.h" +#include "bvh/bvh_params.h" + +#include "util/util_types.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +class BVHNode; +struct BVHStackEntry; +class BVHParams; +class BoundBox; +class LeafNode; +class Object; +class Progress; + +#define BVH_NODE_SIZE 4 +#define BVH_NODE_LEAF_SIZE 1 +#define BVH_UNALIGNED_NODE_SIZE 7 + +/* BVH2 + * + * Typical BVH with each node having two children. + */ +class BVH2 : public BVH { +protected: + /* constructor */ + friend class BVH; + BVH2(const BVHParams& params, const vector<Object*>& objects); + + /* pack */ + void pack_nodes(const BVHNode *root); + + void pack_leaf(const BVHStackEntry& e, + const LeafNode *leaf); + void pack_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1); + + void pack_aligned_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1); + void pack_aligned_node(int idx, + const BoundBox& b0, + const BoundBox& b1, + int c0, int c1, + uint visibility0, uint visibility1); + + void pack_unaligned_inner(const BVHStackEntry& e, + const BVHStackEntry& e0, + const BVHStackEntry& e1); + void pack_unaligned_node(int idx, + const Transform& aligned_space0, + const Transform& aligned_space1, + const BoundBox& b0, + const BoundBox& b1, + int c0, int c1, + uint visibility0, uint visibility1); + + /* refit */ + void refit_nodes(); + void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility); +}; + +CCL_NAMESPACE_END + +#endif /* __BVH2_H__ */ diff --git a/intern/cycles/bvh/bvh4.cpp b/intern/cycles/bvh/bvh4.cpp new file mode 100644 index 00000000000..4faf47af7bb --- /dev/null +++ b/intern/cycles/bvh/bvh4.cpp @@ -0,0 +1,455 @@ +/* + * Adapted from code copyright 2009-2010 NVIDIA Corporation + * Modifications Copyright 2011, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "bvh/bvh4.h" + +#include "render/mesh.h" +#include "render/object.h" + +#include "bvh/bvh_node.h" +#include "bvh/bvh_unaligned.h" + +CCL_NAMESPACE_BEGIN + +/* Can we avoid this somehow or make more generic? + * + * Perhaps we can merge nodes in actual tree and make our + * life easier all over the place. + */ +static bool node_qbvh_is_unaligned(const BVHNode *node) +{ + const BVHNode *node0 = node->get_child(0), + *node1 = node->get_child(1); + bool has_unaligned = false; + if(node0->is_leaf()) { + has_unaligned |= node0->is_unaligned; + } + else { + has_unaligned |= node0->get_child(0)->is_unaligned; + has_unaligned |= node0->get_child(1)->is_unaligned; + } + if(node1->is_leaf()) { + has_unaligned |= node1->is_unaligned; + } + else { + has_unaligned |= node1->get_child(0)->is_unaligned; + has_unaligned |= node1->get_child(1)->is_unaligned; + } + return has_unaligned; +} + +BVH4::BVH4(const BVHParams& params_, const vector<Object*>& objects_) +: BVH(params_, objects_) +{ + params.bvh_layout = BVH_LAYOUT_BVH4; +} + +void BVH4::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf) +{ + float4 data[BVH_QNODE_LEAF_SIZE]; + memset(data, 0, sizeof(data)); + if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) { + /* object */ + data[0].x = __int_as_float(~(leaf->lo)); + data[0].y = __int_as_float(0); + } + else { + /* triangle */ + data[0].x = __int_as_float(leaf->lo); + data[0].y = __int_as_float(leaf->hi); + } + data[0].z = __uint_as_float(leaf->visibility); + if(leaf->num_triangles() != 0) { + data[0].w = __uint_as_float(pack.prim_type[leaf->lo]); + } + + memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_QNODE_LEAF_SIZE); +} + +void BVH4::pack_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num) +{ + bool has_unaligned = false; + /* Check whether we have to create unaligned node or all nodes are aligned + * and we can cut some corner here. + */ + if(params.use_unaligned_nodes) { + for(int i = 0; i < num; i++) { + if(en[i].node->is_unaligned) { + has_unaligned = true; + break; + } + } + } + if(has_unaligned) { + /* There's no unaligned children, pack into AABB node. */ + pack_unaligned_inner(e, en, num); + } + else { + /* Create unaligned node with orientation transform for each of the + * children. + */ + pack_aligned_inner(e, en, num); + } +} + +void BVH4::pack_aligned_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num) +{ + BoundBox bounds[4]; + int child[4]; + for(int i = 0; i < num; ++i) { + bounds[i] = en[i].node->bounds; + child[i] = en[i].encodeIdx(); + } + pack_aligned_node(e.idx, + bounds, + child, + e.node->visibility, + e.node->time_from, + e.node->time_to, + num); +} + +void BVH4::pack_aligned_node(int idx, + const BoundBox *bounds, + const int *child, + const uint visibility, + const float time_from, + const float time_to, + const int num) +{ + float4 data[BVH_QNODE_SIZE]; + memset(data, 0, sizeof(data)); + + data[0].x = __uint_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED); + data[0].y = time_from; + data[0].z = time_to; + + for(int i = 0; i < num; i++) { + float3 bb_min = bounds[i].min; + float3 bb_max = bounds[i].max; + + data[1][i] = bb_min.x; + data[2][i] = bb_max.x; + data[3][i] = bb_min.y; + data[4][i] = bb_max.y; + data[5][i] = bb_min.z; + data[6][i] = bb_max.z; + + data[7][i] = __int_as_float(child[i]); + } + + for(int i = num; i < 4; i++) { + /* We store BB which would never be recorded as intersection + * so kernel might safely assume there are always 4 child nodes. + */ + data[1][i] = FLT_MAX; + data[2][i] = -FLT_MAX; + + data[3][i] = FLT_MAX; + data[4][i] = -FLT_MAX; + + data[5][i] = FLT_MAX; + data[6][i] = -FLT_MAX; + + data[7][i] = __int_as_float(0); + } + + memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_QNODE_SIZE); +} + +void BVH4::pack_unaligned_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num) +{ + Transform aligned_space[4]; + BoundBox bounds[4]; + int child[4]; + for(int i = 0; i < num; ++i) { + aligned_space[i] = en[i].node->get_aligned_space(); + bounds[i] = en[i].node->bounds; + child[i] = en[i].encodeIdx(); + } + pack_unaligned_node(e.idx, + aligned_space, + bounds, + child, + e.node->visibility, + e.node->time_from, + e.node->time_to, + num); +} + +void BVH4::pack_unaligned_node(int idx, + const Transform *aligned_space, + const BoundBox *bounds, + const int *child, + const uint visibility, + const float time_from, + const float time_to, + const int num) +{ + float4 data[BVH_UNALIGNED_QNODE_SIZE]; + memset(data, 0, sizeof(data)); + + data[0].x = __uint_as_float(visibility | PATH_RAY_NODE_UNALIGNED); + data[0].y = time_from; + data[0].z = time_to; + + for(int i = 0; i < num; i++) { + Transform space = BVHUnaligned::compute_node_transform( + bounds[i], + aligned_space[i]); + + data[1][i] = space.x.x; + data[2][i] = space.x.y; + data[3][i] = space.x.z; + + data[4][i] = space.y.x; + data[5][i] = space.y.y; + data[6][i] = space.y.z; + + data[7][i] = space.z.x; + data[8][i] = space.z.y; + data[9][i] = space.z.z; + + data[10][i] = space.x.w; + data[11][i] = space.y.w; + data[12][i] = space.z.w; + + data[13][i] = __int_as_float(child[i]); + } + + for(int i = num; i < 4; i++) { + /* We store BB which would never be recorded as intersection + * so kernel might safely assume there are always 4 child nodes. + */ + + data[1][i] = NAN; + data[2][i] = NAN; + data[3][i] = NAN; + + data[4][i] = NAN; + data[5][i] = NAN; + data[6][i] = NAN; + + data[7][i] = NAN; + data[8][i] = NAN; + data[9][i] = NAN; + + data[10][i] = NAN; + data[11][i] = NAN; + data[12][i] = NAN; + + data[13][i] = __int_as_float(0); + } + + memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_QNODE_SIZE); +} + +/* Quad SIMD Nodes */ + +void BVH4::pack_nodes(const BVHNode *root) +{ + /* Calculate size of the arrays required. */ + const size_t num_nodes = root->getSubtreeSize(BVH_STAT_QNODE_COUNT); + const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT); + assert(num_leaf_nodes <= num_nodes); + const size_t num_inner_nodes = num_nodes - num_leaf_nodes; + size_t node_size; + if(params.use_unaligned_nodes) { + const size_t num_unaligned_nodes = + root->getSubtreeSize(BVH_STAT_UNALIGNED_INNER_QNODE_COUNT); + node_size = (num_unaligned_nodes * BVH_UNALIGNED_QNODE_SIZE) + + (num_inner_nodes - num_unaligned_nodes) * BVH_QNODE_SIZE; + } + else { + node_size = num_inner_nodes * BVH_QNODE_SIZE; + } + /* Resize arrays. */ + pack.nodes.clear(); + pack.leaf_nodes.clear(); + /* For top level BVH, first merge existing BVH's so we know the offsets. */ + if(params.top_level) { + pack_instances(node_size, num_leaf_nodes*BVH_QNODE_LEAF_SIZE); + } + else { + pack.nodes.resize(node_size); + pack.leaf_nodes.resize(num_leaf_nodes*BVH_QNODE_LEAF_SIZE); + } + + int nextNodeIdx = 0, nextLeafNodeIdx = 0; + + vector<BVHStackEntry> stack; + stack.reserve(BVHParams::MAX_DEPTH*2); + if(root->is_leaf()) { + stack.push_back(BVHStackEntry(root, nextLeafNodeIdx++)); + } + else { + stack.push_back(BVHStackEntry(root, nextNodeIdx)); + nextNodeIdx += node_qbvh_is_unaligned(root) + ? BVH_UNALIGNED_QNODE_SIZE + : BVH_QNODE_SIZE; + } + + while(stack.size()) { + BVHStackEntry e = stack.back(); + stack.pop_back(); + + if(e.node->is_leaf()) { + /* leaf node */ + const LeafNode *leaf = reinterpret_cast<const LeafNode*>(e.node); + pack_leaf(e, leaf); + } + else { + /* Inner node. */ + const BVHNode *node = e.node; + const BVHNode *node0 = node->get_child(0); + const BVHNode *node1 = node->get_child(1); + /* Collect nodes. */ + const BVHNode *nodes[4]; + int numnodes = 0; + if(node0->is_leaf()) { + nodes[numnodes++] = node0; + } + else { + nodes[numnodes++] = node0->get_child(0); + nodes[numnodes++] = node0->get_child(1); + } + if(node1->is_leaf()) { + nodes[numnodes++] = node1; + } + else { + nodes[numnodes++] = node1->get_child(0); + nodes[numnodes++] = node1->get_child(1); + } + /* Push entries on the stack. */ + for(int i = 0; i < numnodes; ++i) { + int idx; + if(nodes[i]->is_leaf()) { + idx = nextLeafNodeIdx++; + } + else { + idx = nextNodeIdx; + nextNodeIdx += node_qbvh_is_unaligned(nodes[i]) + ? BVH_UNALIGNED_QNODE_SIZE + : BVH_QNODE_SIZE; + } + stack.push_back(BVHStackEntry(nodes[i], idx)); + } + /* Set node. */ + pack_inner(e, &stack[stack.size()-numnodes], numnodes); + } + } + assert(node_size == nextNodeIdx); + /* Root index to start traversal at, to handle case of single leaf node. */ + pack.root_index = (root->is_leaf())? -1: 0; +} + +void BVH4::refit_nodes() +{ + assert(!params.top_level); + + BoundBox bbox = BoundBox::empty; + uint visibility = 0; + refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility); +} + +void BVH4::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility) +{ + if(leaf) { + /* Refit leaf node. */ + int4 *data = &pack.leaf_nodes[idx]; + int4 c = data[0]; + + BVH::refit_primitives(c.x, c.y, bbox, visibility); + + /* TODO(sergey): This is actually a copy of pack_leaf(), + * but this chunk of code only knows actual data and has + * no idea about BVHNode. + * + * Would be nice to de-duplicate code, but trying to make + * making code more general ends up in much nastier code + * in my opinion so far. + * + * Same applies to the inner nodes case below. + */ + float4 leaf_data[BVH_QNODE_LEAF_SIZE]; + leaf_data[0].x = __int_as_float(c.x); + leaf_data[0].y = __int_as_float(c.y); + leaf_data[0].z = __uint_as_float(visibility); + leaf_data[0].w = __uint_as_float(c.w); + memcpy(&pack.leaf_nodes[idx], leaf_data, sizeof(float4)*BVH_QNODE_LEAF_SIZE); + } + else { + int4 *data = &pack.nodes[idx]; + bool is_unaligned = (data[0].x & PATH_RAY_NODE_UNALIGNED) != 0; + int4 c; + if(is_unaligned) { + c = data[13]; + } + else { + c = data[7]; + } + /* Refit inner node, set bbox from children. */ + BoundBox child_bbox[4] = {BoundBox::empty, + BoundBox::empty, + BoundBox::empty, + BoundBox::empty}; + uint child_visibility[4] = {0}; + int num_nodes = 0; + + for(int i = 0; i < 4; ++i) { + if(c[i] != 0) { + refit_node((c[i] < 0)? -c[i]-1: c[i], (c[i] < 0), + child_bbox[i], child_visibility[i]); + ++num_nodes; + bbox.grow(child_bbox[i]); + visibility |= child_visibility[i]; + } + } + + if(is_unaligned) { + Transform aligned_space[4] = {transform_identity(), + transform_identity(), + transform_identity(), + transform_identity()}; + pack_unaligned_node(idx, + aligned_space, + child_bbox, + &c[0], + visibility, + 0.0f, + 1.0f, + 4); + } + else { + pack_aligned_node(idx, + child_bbox, + &c[0], + visibility, + 0.0f, + 1.0f, + 4); + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/bvh/bvh4.h b/intern/cycles/bvh/bvh4.h new file mode 100644 index 00000000000..310909a37e1 --- /dev/null +++ b/intern/cycles/bvh/bvh4.h @@ -0,0 +1,87 @@ +/* + * Adapted from code copyright 2009-2010 NVIDIA Corporation + * Modifications Copyright 2011, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __BVH4_H__ +#define __BVH4_H__ + +#include "bvh/bvh.h" +#include "bvh/bvh_params.h" + +#include "util/util_types.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +class BVHNode; +struct BVHStackEntry; +class BVHParams; +class BoundBox; +class LeafNode; +class Object; +class Progress; + +#define BVH_QNODE_SIZE 8 +#define BVH_QNODE_LEAF_SIZE 1 +#define BVH_UNALIGNED_QNODE_SIZE 14 + +/* BVH4 + * + * Quad BVH, with each node having four children, to use with SIMD instructions. + */ +class BVH4 : public BVH { +protected: + /* constructor */ + friend class BVH; + BVH4(const BVHParams& params, const vector<Object*>& objects); + + /* pack */ + void pack_nodes(const BVHNode *root); + + void pack_leaf(const BVHStackEntry& e, const LeafNode *leaf); + void pack_inner(const BVHStackEntry& e, const BVHStackEntry *en, int num); + + void pack_aligned_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num); + void pack_aligned_node(int idx, + const BoundBox *bounds, + const int *child, + const uint visibility, + const float time_from, + const float time_to, + const int num); + + void pack_unaligned_inner(const BVHStackEntry& e, + const BVHStackEntry *en, + int num); + void pack_unaligned_node(int idx, + const Transform *aligned_space, + const BoundBox *bounds, + const int *child, + const uint visibility, + const float time_from, + const float time_to, + const int num); + + /* refit */ + void refit_nodes(); + void refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility); +}; + +CCL_NAMESPACE_END + +#endif /* __BVH4_H__ */ diff --git a/intern/cycles/bvh/bvh_binning.cpp b/intern/cycles/bvh/bvh_binning.cpp index 5ddd7349f7b..63a7fc11668 100644 --- a/intern/cycles/bvh/bvh_binning.cpp +++ b/intern/cycles/bvh/bvh_binning.cpp @@ -17,13 +17,13 @@ //#define __KERNEL_SSE__ -#include <stdlib.h> +#include "bvh/bvh_binning.h" -#include "bvh_binning.h" +#include <stdlib.h> -#include "util_algorithm.h" -#include "util_boundbox.h" -#include "util_types.h" +#include "util/util_algorithm.h" +#include "util/util_boundbox.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/bvh/bvh_binning.h b/intern/cycles/bvh/bvh_binning.h index 52955f70151..c2e259b1696 100644 --- a/intern/cycles/bvh/bvh_binning.h +++ b/intern/cycles/bvh/bvh_binning.h @@ -18,10 +18,10 @@ #ifndef __BVH_BINNING_H__ #define __BVH_BINNING_H__ -#include "bvh_params.h" -#include "bvh_unaligned.h" +#include "bvh/bvh_params.h" +#include "bvh/bvh_unaligned.h" -#include "util_types.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN @@ -111,5 +111,4 @@ protected: CCL_NAMESPACE_END -#endif - +#endif /* __BVH_BINNING_H__ */ diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp index fcbc50f4f6f..c0b3d683e37 100644 --- a/intern/cycles/bvh/bvh_build.cpp +++ b/intern/cycles/bvh/bvh_build.cpp @@ -15,26 +15,26 @@ * limitations under the License. */ -#include "bvh_binning.h" -#include "bvh_build.h" -#include "bvh_node.h" -#include "bvh_params.h" +#include "bvh/bvh_build.h" + +#include "bvh/bvh_binning.h" +#include "bvh/bvh_node.h" +#include "bvh/bvh_params.h" #include "bvh_split.h" -#include "mesh.h" -#include "object.h" -#include "scene.h" -#include "curves.h" - -#include "util_algorithm.h" -#include "util_debug.h" -#include "util_foreach.h" -#include "util_logging.h" -#include "util_progress.h" -#include "util_stack_allocator.h" -#include "util_simd.h" -#include "util_time.h" -#include "util_queue.h" +#include "render/mesh.h" +#include "render/object.h" +#include "render/scene.h" +#include "render/curves.h" + +#include "util/util_algorithm.h" +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_progress.h" +#include "util/util_stack_allocator.h" +#include "util/util_simd.h" +#include "util/util_time.h" +#include "util/util_queue.h" CCL_NAMESPACE_BEGIN @@ -128,7 +128,7 @@ void BVHBuild::add_reference_triangles(BoundBox& root, BoundBox& center, Mesh *m if(attr_mP == NULL) { BoundBox bounds = BoundBox::empty; t.bounds_grow(verts, bounds); - if(bounds.valid()) { + if(bounds.valid() && t.valid(verts)) { references.push_back(BVHReference(bounds, j, i, @@ -528,7 +528,9 @@ BVHNode* BVHBuild::run() << " Allocation slop factor: " << ((prim_type.capacity() != 0) ? (float)prim_type.size() / prim_type.capacity() - : 1.0f) << "\n"; + : 1.0f) << "\n" + << " Maximum depth: " + << string_human_readable_number(rootnode->getSubtreeSize(BVH_STAT_DEPTH)) << "\n"; } } @@ -670,7 +672,7 @@ BVHNode* BVHBuild::build_node(const BVHObjectBinning& range, int level) return create_leaf_node(range, references); } } - /* Check whether unaligned split is better than the regulat one. */ + /* Check whether unaligned split is better than the regular one. */ if(unalignedSplitSAH < splitSAH) { do_unalinged_split = true; } @@ -864,10 +866,10 @@ BVHNode *BVHBuild::create_object_leaf_nodes(const BVHReference *ref, int start, prim_time[start] = make_float2(ref->time_from(), ref->time_to()); } - uint visibility = objects[ref->prim_object()]->visibility; + const uint visibility = objects[ref->prim_object()]->visibility_for_tracing(); BVHNode *leaf_node = new LeafNode(ref->bounds(), visibility, start, start+1); - leaf_node->m_time_from = ref->time_from(); - leaf_node->m_time_to = ref->time_to(); + leaf_node->time_from = ref->time_from(); + leaf_node->time_to = ref->time_to(); return leaf_node; } else { @@ -876,12 +878,12 @@ BVHNode *BVHBuild::create_object_leaf_nodes(const BVHReference *ref, int start, BVHNode *leaf1 = create_object_leaf_nodes(ref+mid, start+mid, num-mid); BoundBox bounds = BoundBox::empty; - bounds.grow(leaf0->m_bounds); - bounds.grow(leaf1->m_bounds); + bounds.grow(leaf0->bounds); + bounds.grow(leaf1->bounds); BVHNode *inner_node = new InnerNode(bounds, leaf0, leaf1); - inner_node->m_time_from = min(leaf0->m_time_from, leaf1->m_time_from); - inner_node->m_time_to = max(leaf0->m_time_to, leaf1->m_time_to); + inner_node->time_from = min(leaf0->time_from, leaf1->time_from); + inner_node->time_to = max(leaf0->time_to, leaf1->time_to); return inner_node; } } @@ -905,12 +907,13 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, * can not control. */ typedef StackAllocator<256, int> LeafStackAllocator; + typedef StackAllocator<256, float2> LeafTimeStackAllocator; typedef StackAllocator<256, BVHReference> LeafReferenceStackAllocator; vector<int, LeafStackAllocator> p_type[PRIMITIVE_NUM_TOTAL]; vector<int, LeafStackAllocator> p_index[PRIMITIVE_NUM_TOTAL]; vector<int, LeafStackAllocator> p_object[PRIMITIVE_NUM_TOTAL]; - vector<float2, LeafStackAllocator> p_time[PRIMITIVE_NUM_TOTAL]; + vector<float2, LeafTimeStackAllocator> p_time[PRIMITIVE_NUM_TOTAL]; vector<BVHReference, LeafReferenceStackAllocator> p_ref[PRIMITIVE_NUM_TOTAL]; /* TODO(sergey): In theory we should be able to store references. */ @@ -937,7 +940,7 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, ref.time_to())); bounds[type_index].grow(ref.bounds()); - visibility[type_index] |= objects[ref.prim_object()]->visibility; + visibility[type_index] |= objects[ref.prim_object()]->visibility_for_tracing(); if(ref.prim_type() & PRIMITIVE_ALL_CURVE) { visibility[type_index] |= PATH_RAY_CURVE; } @@ -964,7 +967,7 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, vector<int, LeafStackAllocator> local_prim_type, local_prim_index, local_prim_object; - vector<float2, LeafStackAllocator> local_prim_time; + vector<float2, LeafTimeStackAllocator> local_prim_time; local_prim_type.resize(num_new_prims); local_prim_index.resize(num_new_prims); local_prim_object.resize(num_new_prims); @@ -1003,19 +1006,19 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, time_from = min(time_from, ref.time_from()); time_to = max(time_to, ref.time_to()); } - leaf_node->m_time_from = time_from; - leaf_node->m_time_to = time_to; + leaf_node->time_from = time_from; + leaf_node->time_to = time_to; } if(alignment_found) { /* Need to recalculate leaf bounds with new alignment. */ - leaf_node->m_bounds = BoundBox::empty; + leaf_node->bounds = BoundBox::empty; for(int j = 0; j < num; ++j) { const BVHReference &ref = p_ref[i][j]; BoundBox ref_bounds = unaligned_heuristic.compute_aligned_prim_boundbox( ref, aligned_space); - leaf_node->m_bounds.grow(ref_bounds); + leaf_node->bounds.grow(ref_bounds); } /* Set alignment space. */ leaf_node->set_aligned_space(aligned_space); @@ -1038,7 +1041,6 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, */ start_index = spatial_free_index; spatial_free_index += range.size(); - /* Extend an array when needed. */ const size_t range_end = start_index + range.size(); if(prim_type.size() < range_end) { @@ -1064,8 +1066,6 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, prim_time.resize(range_end); } } - spatial_spin_lock.unlock(); - /* Perform actual data copy. */ if(new_leaf_data_size > 0) { memcpy(&prim_type[start_index], &local_prim_type[0], new_leaf_data_size); @@ -1075,6 +1075,7 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, memcpy(&prim_time[start_index], &local_prim_time[0], sizeof(float2)*num_new_leaf_data); } } + spatial_spin_lock.unlock(); } else { /* For the regular BVH builder we simply copy new data starting at the @@ -1098,8 +1099,8 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, */ for(int i = 0; i < num_leaves; ++i) { LeafNode *leaf = (LeafNode *)leaves[i]; - leaf->m_lo += start_index; - leaf->m_hi += start_index; + leaf->lo += start_index; + leaf->hi += start_index; } /* Create leaf node for object. */ @@ -1128,17 +1129,17 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, return new InnerNode(range.bounds(), leaves[0], leaves[1]); } else if(num_leaves == 3) { - BoundBox inner_bounds = merge(leaves[1]->m_bounds, leaves[2]->m_bounds); + BoundBox inner_bounds = merge(leaves[1]->bounds, leaves[2]->bounds); BVHNode *inner = new InnerNode(inner_bounds, leaves[1], leaves[2]); return new InnerNode(range.bounds(), leaves[0], inner); } else { /* Should be doing more branches if more primitive types added. */ assert(num_leaves <= 5); - BoundBox inner_bounds_a = merge(leaves[0]->m_bounds, leaves[1]->m_bounds); - BoundBox inner_bounds_b = merge(leaves[2]->m_bounds, leaves[3]->m_bounds); + BoundBox inner_bounds_a = merge(leaves[0]->bounds, leaves[1]->bounds); + BoundBox inner_bounds_b = merge(leaves[2]->bounds, leaves[3]->bounds); BVHNode *inner_a = new InnerNode(inner_bounds_a, leaves[0], leaves[1]); BVHNode *inner_b = new InnerNode(inner_bounds_b, leaves[2], leaves[3]); - BoundBox inner_bounds_c = merge(inner_a->m_bounds, inner_b->m_bounds); + BoundBox inner_bounds_c = merge(inner_a->bounds, inner_b->bounds); BVHNode *inner_c = new InnerNode(inner_bounds_c, inner_a, inner_b); if(num_leaves == 5) { return new InnerNode(range.bounds(), inner_c, leaves[4]); @@ -1173,8 +1174,8 @@ void BVHBuild::rotate(BVHNode *node, int max_depth) rotate(parent->children[c], max_depth-1); /* compute current area of all children */ - BoundBox bounds0 = parent->children[0]->m_bounds; - BoundBox bounds1 = parent->children[1]->m_bounds; + BoundBox bounds0 = parent->children[0]->bounds; + BoundBox bounds1 = parent->children[1]->bounds; float area0 = bounds0.half_area(); float area1 = bounds1.half_area(); @@ -1194,8 +1195,8 @@ void BVHBuild::rotate(BVHNode *node, int max_depth) BoundBox& other = (c == 0)? bounds1: bounds0; /* transpose child bounds */ - BoundBox target0 = child->children[0]->m_bounds; - BoundBox target1 = child->children[1]->m_bounds; + BoundBox target0 = child->children[0]->bounds; + BoundBox target1 = child->children[1]->bounds; /* compute cost for both possible swaps */ float cost0 = merge(other, target1).half_area() - child_area[c]; @@ -1227,7 +1228,7 @@ void BVHBuild::rotate(BVHNode *node, int max_depth) InnerNode *child = (InnerNode*)parent->children[best_child]; swap(parent->children[best_other], child->children[best_target]); - child->m_bounds = merge(child->children[0]->m_bounds, child->children[1]->m_bounds); + child->bounds = merge(child->children[0]->bounds, child->children[1]->bounds); } CCL_NAMESPACE_END diff --git a/intern/cycles/bvh/bvh_build.h b/intern/cycles/bvh/bvh_build.h index 430efc3e0f6..7b245139819 100644 --- a/intern/cycles/bvh/bvh_build.h +++ b/intern/cycles/bvh/bvh_build.h @@ -20,17 +20,17 @@ #include <float.h> -#include "bvh.h" -#include "bvh_binning.h" -#include "bvh_unaligned.h" +#include "bvh/bvh_params.h" +#include "bvh/bvh_unaligned.h" -#include "util_boundbox.h" -#include "util_task.h" -#include "util_vector.h" +#include "util/util_task.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN +class Boundbox; class BVHBuildTask; +class BVHNode; class BVHSpatialSplitBuildTask; class BVHParams; class InnerNode; diff --git a/intern/cycles/bvh/bvh_node.cpp b/intern/cycles/bvh/bvh_node.cpp index 67580e1bc7b..24af919ff46 100644 --- a/intern/cycles/bvh/bvh_node.cpp +++ b/intern/cycles/bvh/bvh_node.cpp @@ -15,12 +15,12 @@ * limitations under the License. */ -#include "bvh.h" -#include "bvh_build.h" -#include "bvh_node.h" +#include "bvh/bvh_node.h" -#include "util_debug.h" -#include "util_vector.h" +#include "bvh/bvh.h" +#include "bvh/bvh_build.h" + +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -62,12 +62,12 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const } return cnt; case BVH_STAT_ALIGNED_COUNT: - if(!is_unaligned()) { + if(!is_unaligned) { cnt = 1; } break; case BVH_STAT_UNALIGNED_COUNT: - if(is_unaligned()) { + if(is_unaligned) { cnt = 1; } break; @@ -75,7 +75,7 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const if(!is_leaf()) { bool has_unaligned = false; for(int j = 0; j < num_children(); j++) { - has_unaligned |= get_child(j)->is_unaligned(); + has_unaligned |= get_child(j)->is_unaligned; } cnt += has_unaligned? 0: 1; } @@ -84,7 +84,7 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const if(!is_leaf()) { bool has_unaligned = false; for(int j = 0; j < num_children(); j++) { - has_unaligned |= get_child(j)->is_unaligned(); + has_unaligned |= get_child(j)->is_unaligned; } cnt += has_unaligned? 1: 0; } @@ -95,12 +95,12 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const for(int i = 0; i < num_children(); i++) { BVHNode *node = get_child(i); if(node->is_leaf()) { - has_unaligned |= node->is_unaligned(); + has_unaligned |= node->is_unaligned; } else { for(int j = 0; j < node->num_children(); j++) { cnt += node->get_child(j)->getSubtreeSize(stat); - has_unaligned |= node->get_child(j)->is_unaligned(); + has_unaligned |= node->get_child(j)->is_unaligned; } } } @@ -113,12 +113,12 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const for(int i = 0; i < num_children(); i++) { BVHNode *node = get_child(i); if(node->is_leaf()) { - has_unaligned |= node->is_unaligned(); + has_unaligned |= node->is_unaligned; } else { for(int j = 0; j < node->num_children(); j++) { cnt += node->get_child(j)->getSubtreeSize(stat); - has_unaligned |= node->get_child(j)->is_unaligned(); + has_unaligned |= node->get_child(j)->is_unaligned; } } } @@ -126,11 +126,22 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const } return cnt; case BVH_STAT_ALIGNED_LEAF_COUNT: - cnt = (is_leaf() && !is_unaligned()) ? 1 : 0; + cnt = (is_leaf() && !is_unaligned) ? 1 : 0; break; case BVH_STAT_UNALIGNED_LEAF_COUNT: - cnt = (is_leaf() && is_unaligned()) ? 1 : 0; + cnt = (is_leaf() && is_unaligned) ? 1 : 0; break; + case BVH_STAT_DEPTH: + if(is_leaf()) { + cnt = 1; + } + else { + for(int i = 0; i < num_children(); i++) { + cnt = max(cnt, get_child(i)->getSubtreeSize(stat)); + } + cnt += 1; + } + return cnt; default: assert(0); /* unknown mode */ } @@ -157,7 +168,7 @@ float BVHNode::computeSubtreeSAHCost(const BVHParams& p, float probability) cons for(int i = 0; i < num_children(); i++) { BVHNode *child = get_child(i); - SAH += child->computeSubtreeSAHCost(p, probability * child->m_bounds.safe_area()/m_bounds.safe_area()); + SAH += child->computeSubtreeSAHCost(p, probability * child->bounds.safe_area()/bounds.safe_area()); } return SAH; @@ -165,15 +176,15 @@ float BVHNode::computeSubtreeSAHCost(const BVHParams& p, float probability) cons uint BVHNode::update_visibility() { - if(!is_leaf() && m_visibility == 0) { + if(!is_leaf() && visibility == 0) { InnerNode *inner = (InnerNode*)this; BVHNode *child0 = inner->children[0]; BVHNode *child1 = inner->children[1]; - m_visibility = child0->update_visibility()|child1->update_visibility(); + visibility = child0->update_visibility()|child1->update_visibility(); } - return m_visibility; + return visibility; } void BVHNode::update_time() @@ -184,8 +195,8 @@ void BVHNode::update_time() BVHNode *child1 = inner->children[1]; child0->update_time(); child1->update_time(); - m_time_from = min(child0->m_time_from, child1->m_time_from); - m_time_to = max(child0->m_time_to, child1->m_time_to); + time_from = min(child0->time_from, child1->time_from); + time_to = max(child0->time_to, child1->time_to); } } @@ -209,7 +220,7 @@ void LeafNode::print(int depth) const for(int i = 0; i < depth; i++) printf(" "); - printf("leaf node %d to %d\n", m_lo, m_hi); + printf("leaf node %d to %d\n", lo, hi); } CCL_NAMESPACE_END diff --git a/intern/cycles/bvh/bvh_node.h b/intern/cycles/bvh/bvh_node.h index 090c426de56..94cf5ab730c 100644 --- a/intern/cycles/bvh/bvh_node.h +++ b/intern/cycles/bvh/bvh_node.h @@ -18,9 +18,8 @@ #ifndef __BVH_NODE_H__ #define __BVH_NODE_H__ -#include "util_boundbox.h" -#include "util_debug.h" -#include "util_types.h" +#include "util/util_boundbox.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN @@ -39,6 +38,7 @@ enum BVH_STAT { BVH_STAT_UNALIGNED_INNER_QNODE_COUNT, BVH_STAT_ALIGNED_LEAF_COUNT, BVH_STAT_UNALIGNED_LEAF_COUNT, + BVH_STAT_DEPTH, }; class BVHParams; @@ -46,16 +46,16 @@ class BVHParams; class BVHNode { public: - BVHNode() : m_is_unaligned(false), - m_aligned_space(NULL), - m_time_from(0.0f), - m_time_to(1.0f) + BVHNode() : is_unaligned(false), + aligned_space(NULL), + time_from(0.0f), + time_to(1.0f) { } virtual ~BVHNode() { - delete m_aligned_space; + delete aligned_space; } virtual bool is_leaf() const = 0; @@ -63,30 +63,26 @@ public: virtual BVHNode *get_child(int i) const = 0; virtual int num_triangles() const { return 0; } virtual void print(int depth = 0) const = 0; - bool is_unaligned() const { return m_is_unaligned; } inline void set_aligned_space(const Transform& aligned_space) { - m_is_unaligned = true; - if(m_aligned_space == NULL) { - m_aligned_space = new Transform(aligned_space); + is_unaligned = true; + if(this->aligned_space == NULL) { + this->aligned_space = new Transform(aligned_space); } else { - *m_aligned_space = aligned_space; + *this->aligned_space = aligned_space; } } inline Transform get_aligned_space() const { - if(m_aligned_space == NULL) { + if(aligned_space == NULL) { return transform_identity(); } - return *m_aligned_space; + return *aligned_space; } - BoundBox m_bounds; - uint m_visibility; - // Subtree functions int getSubtreeSize(BVH_STAT stat=BVH_STAT_NODE_COUNT) const; float computeSubtreeSAHCost(const BVHParams& p, float probability = 1.0f) const; @@ -95,13 +91,18 @@ public: uint update_visibility(); void update_time(); - bool m_is_unaligned; + // Properties. + BoundBox bounds; + uint visibility; + + bool is_unaligned; - // TODO(sergey): Can be stored as 3x3 matrix, but better to have some - // utilities and type defines in util_transform first. - Transform *m_aligned_space; + /* TODO(sergey): Can be stored as 3x3 matrix, but better to have some + * utilities and type defines in util_transform first. + */ + Transform *aligned_space; - float m_time_from, m_time_to; + float time_from, time_to; }; class InnerNode : public BVHNode @@ -111,20 +112,20 @@ public: BVHNode* child0, BVHNode* child1) { - m_bounds = bounds; + this->bounds = bounds; children[0] = child0; children[1] = child1; if(child0 && child1) - m_visibility = child0->m_visibility|child1->m_visibility; + visibility = child0->visibility|child1->visibility; else - m_visibility = 0; /* happens on build cancel */ + visibility = 0; /* happens on build cancel */ } explicit InnerNode(const BoundBox& bounds) { - m_bounds = bounds; - m_visibility = 0; + this->bounds = bounds; + visibility = 0; children[0] = NULL; children[1] = NULL; } @@ -140,12 +141,12 @@ public: class LeafNode : public BVHNode { public: - LeafNode(const BoundBox& bounds, uint visibility, int lo, int hi) + LeafNode(const BoundBox& bounds, uint visibility, int lo, int hi) + : lo(lo), + hi(hi) { - m_bounds = bounds; - m_visibility = visibility; - m_lo = lo; - m_hi = hi; + this->bounds = bounds; + this->visibility = visibility; } LeafNode(const LeafNode& s) @@ -157,14 +158,13 @@ public: bool is_leaf() const { return true; } int num_children() const { return 0; } BVHNode *get_child(int) const { return NULL; } - int num_triangles() const { return m_hi - m_lo; } + int num_triangles() const { return hi - lo; } void print(int depth) const; - int m_lo; - int m_hi; + int lo; + int hi; }; CCL_NAMESPACE_END #endif /* __BVH_NODE_H__ */ - diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h index 7b309504728..89a379cf356 100644 --- a/intern/cycles/bvh/bvh_params.h +++ b/intern/cycles/bvh/bvh_params.h @@ -18,17 +18,35 @@ #ifndef __BVH_PARAMS_H__ #define __BVH_PARAMS_H__ -#include "util_boundbox.h" +#include "util/util_boundbox.h" -#include "kernel_types.h" +#include "kernel/kernel_types.h" CCL_NAMESPACE_BEGIN +/* Layout of BVH tree. + * + * For example, how wide BVH tree is, in terms of number of children + * per node. + */ +typedef KernelBVHLayout BVHLayout; + +/* Names bitflag type to denote which BVH layouts are supported by + * particular area. + * + * Bitflags are the BVH_LAYOUT_* values. + */ +typedef int BVHLayoutMask; + +/* Get human readable name of BVH layout. */ +const char *bvh_layout_name(BVHLayout layout); + /* BVH Parameters */ class BVHParams { public: + /* spatial split area threshold */ bool use_spatial_split; float spatial_split_alpha; @@ -50,8 +68,8 @@ public: /* object or mesh level bvh */ bool top_level; - /* QBVH */ - bool use_qbvh; + /* BVH layout to be built. */ + BVHLayout bvh_layout; /* Mask of primitives to be included into the BVH. */ int primitive_mask; @@ -98,7 +116,7 @@ public: max_motion_curve_leaf_size = 4; top_level = false; - use_qbvh = false; + bvh_layout = BVH_LAYOUT_BVH2; use_unaligned_nodes = false; primitive_mask = PRIMITIVE_ALL; @@ -119,6 +137,14 @@ public: __forceinline bool small_enough_for_leaf(int size, int level) { return (size <= min_leaf_size || level >= MAX_DEPTH); } + + /* Gets best matching BVH. + * + * If the requested layout is supported by the device, it will be used. + * Otherwise, widest supported layout below that will be used. + */ + static BVHLayout best_bvh_layout(BVHLayout requested_layout, + BVHLayoutMask supported_layouts); }; /* BVH Reference @@ -246,4 +272,3 @@ struct BVHSpatialStorage { CCL_NAMESPACE_END #endif /* __BVH_PARAMS_H__ */ - diff --git a/intern/cycles/bvh/bvh_sort.cpp b/intern/cycles/bvh/bvh_sort.cpp index e5bcf9995bf..b40bf5bb21b 100644 --- a/intern/cycles/bvh/bvh_sort.cpp +++ b/intern/cycles/bvh/bvh_sort.cpp @@ -15,12 +15,12 @@ * limitations under the License. */ -#include "bvh_build.h" -#include "bvh_sort.h" +#include "bvh/bvh_sort.h" -#include "util_algorithm.h" -#include "util_debug.h" -#include "util_task.h" +#include "bvh/bvh_build.h" + +#include "util/util_algorithm.h" +#include "util/util_task.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/bvh/bvh_sort.h b/intern/cycles/bvh/bvh_sort.h index b49ca02eb60..936401d8607 100644 --- a/intern/cycles/bvh/bvh_sort.h +++ b/intern/cycles/bvh/bvh_sort.h @@ -18,8 +18,11 @@ #ifndef __BVH_SORT_H__ #define __BVH_SORT_H__ +#include <cstddef> + CCL_NAMESPACE_BEGIN +class BVHReference; class BVHUnaligned; struct Transform; @@ -33,4 +36,3 @@ void bvh_reference_sort(int start, CCL_NAMESPACE_END #endif /* __BVH_SORT_H__ */ - diff --git a/intern/cycles/bvh/bvh_split.cpp b/intern/cycles/bvh/bvh_split.cpp index d0d5fbe5a7a..c55ba40b565 100644 --- a/intern/cycles/bvh/bvh_split.cpp +++ b/intern/cycles/bvh/bvh_split.cpp @@ -15,14 +15,15 @@ * limitations under the License. */ -#include "bvh_build.h" -#include "bvh_split.h" -#include "bvh_sort.h" +#include "bvh/bvh_split.h" -#include "mesh.h" -#include "object.h" +#include "bvh/bvh_build.h" +#include "bvh/bvh_sort.h" -#include "util_algorithm.h" +#include "render/mesh.h" +#include "render/object.h" + +#include "util/util_algorithm.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/bvh/bvh_split.h b/intern/cycles/bvh/bvh_split.h index dbdb51f1a5b..a874a118b99 100644 --- a/intern/cycles/bvh/bvh_split.h +++ b/intern/cycles/bvh/bvh_split.h @@ -18,8 +18,8 @@ #ifndef __BVH_SPLIT_H__ #define __BVH_SPLIT_H__ -#include "bvh_build.h" -#include "bvh_params.h" +#include "bvh/bvh_build.h" +#include "bvh/bvh_params.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/bvh/bvh_unaligned.cpp b/intern/cycles/bvh/bvh_unaligned.cpp index a876c670914..910f82137c5 100644 --- a/intern/cycles/bvh/bvh_unaligned.cpp +++ b/intern/cycles/bvh/bvh_unaligned.cpp @@ -14,18 +14,16 @@ * limitations under the License. */ +#include "bvh/bvh_unaligned.h" -#include "bvh_unaligned.h" +#include "render/mesh.h" +#include "render/object.h" -#include "mesh.h" -#include "object.h" - -#include "bvh_binning.h" +#include "bvh/bvh_binning.h" #include "bvh_params.h" -#include "util_boundbox.h" -#include "util_debug.h" -#include "util_transform.h" +#include "util/util_boundbox.h" +#include "util/util_transform.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/bvh/bvh_unaligned.h b/intern/cycles/bvh/bvh_unaligned.h index 4d0872f4a39..c3ece051cd5 100644 --- a/intern/cycles/bvh/bvh_unaligned.h +++ b/intern/cycles/bvh/bvh_unaligned.h @@ -17,7 +17,7 @@ #ifndef __BVH_UNALIGNED_H__ #define __BVH_UNALIGNED_H__ -#include "util_vector.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -78,4 +78,3 @@ protected: CCL_NAMESPACE_END #endif /* __BVH_UNALIGNED_H__ */ - diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake index 403a0540963..8d04025e6fd 100644 --- a/intern/cycles/cmake/external_libs.cmake +++ b/intern/cycles/cmake/external_libs.cmake @@ -30,7 +30,7 @@ if(NOT CYCLES_STANDALONE_REPOSITORY) set(GLEW_INCLUDE_DIR "${GLEW_INCLUDE_PATH}") endif() -if(WITH_CYCLES_STANDALONE AND WITH_CYCLES_STANDALONE_GUI) +if(WITH_CYCLES_STANDALONE) set(CYCLES_APP_GLEW_LIBRARY ${BLENDER_GLEW_LIBRARIES}) endif() @@ -135,13 +135,5 @@ if(CYCLES_STANDALONE_REPOSITORY) unset(_lib_DIR) else() - if(WIN32) - set(GLOG_INCLUDE_DIRS ${CMAKE_SOURCE_DIR}/extern/glog/src/windows) - set(GFLAGS_INCLUDE_DIRS ${CMAKE_SOURCE_DIR}/extern/gflags/src) - else() - set(GLOG_INCLUDE_DIRS ${CMAKE_SOURCE_DIR}/extern/glog/src) - set(GFLAGS_INCLUDE_DIRS ${CMAKE_SOURCE_DIR}/extern/gflags/src) - endif() - set(GFLAGS_NAMESPACE "gflags") set(LLVM_LIBRARIES ${LLVM_LIBRARY}) endif() diff --git a/intern/cycles/cmake/macros.cmake b/intern/cycles/cmake/macros.cmake new file mode 100644 index 00000000000..f3ca06ac6b8 --- /dev/null +++ b/intern/cycles/cmake/macros.cmake @@ -0,0 +1,12 @@ +function(cycles_set_solution_folder target) + if(WINDOWS_USE_VISUAL_STUDIO_FOLDERS) + get_filename_component(folderdir ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) + string(REPLACE ${CMAKE_SOURCE_DIR} "" folderdir ${folderdir}) + set_target_properties(${target} PROPERTIES FOLDER ${folderdir}) + endif() +endfunction() + +macro(cycles_add_library target) + add_library(${target} ${ARGN}) + cycles_set_solution_folder(${target}) +endmacro() diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index 966ff5e52ba..75e78e038ea 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -1,12 +1,6 @@ set(INC - . - ../graph - ../kernel - ../kernel/svm - ../kernel/osl - ../util - ../render + .. ../../glew-mx ) @@ -31,18 +25,23 @@ set(SRC device.cpp device_cpu.cpp device_cuda.cpp + device_denoising.cpp + device_memory.cpp device_multi.cpp device_opencl.cpp + device_split_kernel.cpp device_task.cpp ) set(SRC_OPENCL opencl/opencl.h + opencl/memory_manager.h opencl/opencl_base.cpp opencl/opencl_mega.cpp opencl/opencl_split.cpp opencl/opencl_util.cpp + opencl/memory_manager.cpp ) if(WITH_CYCLES_NETWORK) @@ -53,9 +52,11 @@ endif() set(SRC_HEADERS device.h + device_denoising.h device_memory.h device_intern.h device_network.h + device_split_kernel.h device_task.h ) @@ -76,4 +77,4 @@ endif() include_directories(${INC}) include_directories(SYSTEM ${INC_SYS}) -add_library(cycles_device ${SRC} ${SRC_OPENCL} ${SRC_HEADERS}) +cycles_add_library(cycles_device ${SRC} ${SRC_OPENCL} ${SRC_HEADERS}) diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 31c99f49d6d..6959dd73c32 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -17,23 +17,25 @@ #include <stdlib.h> #include <string.h> -#include "device.h" -#include "device_intern.h" - -#include "util_debug.h" -#include "util_foreach.h" -#include "util_half.h" -#include "util_math.h" -#include "util_opengl.h" -#include "util_time.h" -#include "util_types.h" -#include "util_vector.h" -#include "util_string.h" +#include "device/device.h" +#include "device/device_intern.h" + +#include "util/util_foreach.h" +#include "util/util_half.h" +#include "util/util_logging.h" +#include "util/util_math.h" +#include "util/util_opengl.h" +#include "util/util_time.h" +#include "util/util_system.h" +#include "util/util_types.h" +#include "util/util_vector.h" +#include "util/util_string.h" CCL_NAMESPACE_BEGIN bool Device::need_types_update = true; bool Device::need_devices_update = true; +thread_mutex Device::device_mutex; vector<DeviceType> Device::types; vector<DeviceInfo> Device::devices; @@ -44,15 +46,14 @@ std::ostream& operator <<(std::ostream &os, { os << "Experimental features: " << (requested_features.experimental ? "On" : "Off") << std::endl; - os << "Max closure count: " << requested_features.max_closure << std::endl; os << "Max nodes group: " << requested_features.max_nodes_group << std::endl; /* TODO(sergey): Decode bitflag into list of names. */ os << "Nodes features: " << requested_features.nodes_features << std::endl; - os << "Use hair: " + os << "Use Hair: " << string_from_bool(requested_features.use_hair) << std::endl; - os << "Use object motion: " + os << "Use Object Motion: " << string_from_bool(requested_features.use_object_motion) << std::endl; - os << "Use camera motion: " + os << "Use Camera Motion: " << string_from_bool(requested_features.use_camera_motion) << std::endl; os << "Use Baking: " << string_from_bool(requested_features.use_baking) << std::endl; @@ -66,6 +67,10 @@ std::ostream& operator <<(std::ostream &os, << string_from_bool(requested_features.use_patch_evaluation) << std::endl; os << "Use Transparent Shadows: " << string_from_bool(requested_features.use_transparent) << std::endl; + os << "Use Principled BSDF: " + << string_from_bool(requested_features.use_principled) << std::endl; + os << "Use Denoising: " + << string_from_bool(requested_features.use_denoising) << std::endl; return os; } @@ -78,28 +83,12 @@ Device::~Device() } } -void Device::pixels_alloc(device_memory& mem) -{ - mem_alloc(mem, MEM_READ_WRITE); -} - -void Device::pixels_copy_from(device_memory& mem, int y, int w, int h) -{ - if(mem.data_type == TYPE_HALF) - mem_copy_from(mem, y, w, h, sizeof(half4)); - else - mem_copy_from(mem, y, w, h, sizeof(uchar4)); -} - -void Device::pixels_free(device_memory& mem) -{ - mem_free(mem); -} - void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dx, int dy, int width, int height, bool transparent, const DeviceDrawParams &draw_params) { - pixels_copy_from(rgba, y, w, h); + assert(rgba.type == MEM_PIXELS); + + mem_copy_from(rgba, y, w, h, rgba.memory_elements_size(1)); if(transparent) { glEnable(GL_BLEND); @@ -111,17 +100,17 @@ void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dx, int d if(rgba.data_type == TYPE_HALF) { /* for multi devices, this assumes the inefficient method that we allocate * all pixels on the device even though we only render to a subset */ - GLhalf *data_pointer = (GLhalf*)rgba.data_pointer; + GLhalf *host_pointer = (GLhalf*)rgba.host_pointer; float vbuffer[16], *basep; float *vp = NULL; - data_pointer += 4*y*w; + host_pointer += 4*y*w; /* draw half float texture, GLSL shader for display transform assumed to be bound */ GLuint texid; glGenTextures(1, &texid); glBindTexture(GL_TEXTURE_2D, texid); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F_ARB, w, h, 0, GL_RGBA, GL_HALF_FLOAT, data_pointer); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F_ARB, w, h, 0, GL_RGBA, GL_HALF_FLOAT, host_pointer); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); @@ -203,7 +192,7 @@ void Device::draw_pixels(device_memory& rgba, int y, int w, int h, int dx, int d glPixelZoom((float)width/(float)w, (float)height/(float)h); glRasterPos2f(dx, dy); - uint8_t *pixels = (uint8_t*)rgba.data_pointer; + uint8_t *pixels = (uint8_t*)rgba.host_pointer; pixels += 4*y*w; @@ -292,53 +281,49 @@ string Device::string_from_type(DeviceType type) vector<DeviceType>& Device::available_types() { + thread_scoped_lock lock(device_mutex); if(need_types_update) { types.clear(); types.push_back(DEVICE_CPU); - #ifdef WITH_CUDA - if(device_cuda_init()) + if(device_cuda_init()) { types.push_back(DEVICE_CUDA); + } #endif - #ifdef WITH_OPENCL - if(device_opencl_init()) + if(device_opencl_init()) { types.push_back(DEVICE_OPENCL); + } #endif - #ifdef WITH_NETWORK types.push_back(DEVICE_NETWORK); #endif - need_types_update = false; } - return types; } vector<DeviceInfo>& Device::available_devices() { + thread_scoped_lock lock(device_mutex); if(need_devices_update) { devices.clear(); -#ifdef WITH_CUDA - if(device_cuda_init()) - device_cuda_info(devices); -#endif - #ifdef WITH_OPENCL - if(device_opencl_init()) + if(device_opencl_init()) { device_opencl_info(devices); + } +#endif +#ifdef WITH_CUDA + if(device_cuda_init()) { + device_cuda_info(devices); + } #endif - device_cpu_info(devices); - #ifdef WITH_NETWORK device_network_info(devices); #endif - need_devices_update = false; } - return devices; } @@ -346,12 +331,6 @@ string Device::device_capabilities() { string capabilities = "CPU device capabilities: "; capabilities += device_cpu_capabilities() + "\n"; -#ifdef WITH_CUDA - if(device_cuda_init()) { - capabilities += "\nCUDA device capabilities:\n"; - capabilities += device_cuda_capabilities(); - } -#endif #ifdef WITH_OPENCL if(device_opencl_init()) { @@ -360,10 +339,17 @@ string Device::device_capabilities() } #endif +#ifdef WITH_CUDA + if(device_cuda_init()) { + capabilities += "\nCUDA device capabilities:\n"; + capabilities += device_cuda_capabilities(); + } +#endif + return capabilities; } -DeviceInfo Device::get_multi_device(vector<DeviceInfo> subdevices) +DeviceInfo Device::get_multi_device(const vector<DeviceInfo>& subdevices, int threads, bool background) { assert(subdevices.size() > 1); @@ -371,16 +357,47 @@ DeviceInfo Device::get_multi_device(vector<DeviceInfo> subdevices) info.type = DEVICE_MULTI; info.id = "MULTI"; info.description = "Multi Device"; - info.multi_devices = subdevices; info.num = 0; - info.has_bindless_textures = true; - info.pack_images = false; - foreach(DeviceInfo &device, subdevices) { - assert(device.type == info.multi_devices[0].type); + info.has_half_images = true; + info.has_volume_decoupled = true; + info.bvh_layout_mask = BVH_LAYOUT_ALL; + info.has_osl = true; + + foreach(const DeviceInfo &device, subdevices) { + /* Ensure CPU device does not slow down GPU. */ + if(device.type == DEVICE_CPU && subdevices.size() > 1) { + if(background) { + int orig_cpu_threads = (threads)? threads: system_cpu_thread_count(); + int cpu_threads = max(orig_cpu_threads - (subdevices.size() - 1), 0); + + VLOG(1) << "CPU render threads reduced from " + << orig_cpu_threads << " to " << cpu_threads + << ", to dedicate to GPU."; + + if(cpu_threads >= 1) { + DeviceInfo cpu_device = device; + cpu_device.cpu_threads = cpu_threads; + info.multi_devices.push_back(cpu_device); + } + else { + continue; + } + } + else { + VLOG(1) << "CPU render threads disabled for interactive render."; + continue; + } + } + else { + info.multi_devices.push_back(device); + } - info.pack_images |= device.pack_images; - info.has_bindless_textures &= device.has_bindless_textures; + /* Accumulate device info. */ + info.has_half_images &= device.has_half_images; + info.has_volume_decoupled &= device.has_volume_decoupled; + info.bvh_layout_mask = device.bvh_layout_mask & info.bvh_layout_mask; + info.has_osl &= device.has_osl; } return info; diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index ccee25ae34e..b856bdd9d01 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -19,15 +19,18 @@ #include <stdlib.h> -#include "device_memory.h" -#include "device_task.h" +#include "bvh/bvh_params.h" -#include "util_list.h" -#include "util_stats.h" -#include "util_string.h" -#include "util_thread.h" -#include "util_types.h" -#include "util_vector.h" +#include "device/device_memory.h" +#include "device/device_task.h" + +#include "util/util_list.h" +#include "util/util_stats.h" +#include "util/util_string.h" +#include "util/util_thread.h" +#include "util/util_texture.h" +#include "util/util_types.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -51,11 +54,14 @@ public: string description; string id; /* used for user preferences, should stay fixed with changing hardware config */ int num; - bool display_device; - bool advanced_shading; - bool pack_images; - bool has_bindless_textures; /* flag for GPU and Multi device */ - bool use_split_kernel; /* Denotes if the device is going to run cycles using split-kernel */ + bool display_device; /* GPU is used as a display device. */ + bool advanced_shading; /* Supports full shading system. */ + bool has_half_images; /* Support half-float textures. */ + bool has_volume_decoupled; /* Decoupled volume shading. */ + BVHLayoutMask bvh_layout_mask; /* Bitmask of supported BVH layouts. */ + bool has_osl; /* Support Open Shading Language. */ + bool use_split_kernel; /* Use split or mega kernel. */ + int cpu_threads; vector<DeviceInfo> multi_devices; DeviceInfo() @@ -63,10 +69,13 @@ public: type = DEVICE_CPU; id = "CPU"; num = 0; + cpu_threads = 0; display_device = false; advanced_shading = true; - pack_images = false; - has_bindless_textures = false; + has_half_images = false; + has_volume_decoupled = false; + bvh_layout_mask = BVH_LAYOUT_NONE; + has_osl = false; use_split_kernel = false; } @@ -82,9 +91,6 @@ public: /* Use experimental feature set. */ bool experimental; - /* Maximum number of closures in shader trees. */ - int max_closure; - /* Selective nodes compilation. */ /* Identifier of a node group up to which all the nodes needs to be @@ -121,11 +127,22 @@ public: /* Use Transparent shadows */ bool use_transparent; + /* Use various shadow tricks, such as shadow catcher. */ + bool use_shadow_tricks; + + /* Per-uber shader usage flags. */ + bool use_principled; + + /* Denoising features. */ + bool use_denoising; + + /* Use raytracing in shaders. */ + bool use_shader_raytrace; + DeviceRequestedFeatures() { /* TODO(sergey): Find more meaningful defaults. */ experimental = false; - max_closure = 0; max_nodes_group = 0; nodes_features = 0; use_hair = false; @@ -137,12 +154,15 @@ public: use_integrator_branched = false; use_patch_evaluation = false; use_transparent = false; + use_shadow_tricks = false; + use_principled = false; + use_denoising = false; + use_shader_raytrace = false; } bool modified(const DeviceRequestedFeatures& requested_features) { return !(experimental == requested_features.experimental && - max_closure == requested_features.max_closure && max_nodes_group == requested_features.max_nodes_group && nodes_features == requested_features.nodes_features && use_hair == requested_features.use_hair && @@ -153,7 +173,11 @@ public: use_volume == requested_features.use_volume && use_integrator_branched == requested_features.use_integrator_branched && use_patch_evaluation == requested_features.use_patch_evaluation && - use_transparent == requested_features.use_transparent); + use_transparent == requested_features.use_transparent && + use_shadow_tricks == requested_features.use_shadow_tricks && + use_principled == requested_features.use_principled && + use_denoising == requested_features.use_denoising && + use_shader_raytrace == requested_features.use_shader_raytrace); } /* Convert the requested features structure to a build options, @@ -169,7 +193,6 @@ public: string_printf("%d", max_nodes_group); build_options += " -D__NODES_FEATURES__=" + string_printf("%d", nodes_features); - build_options += string_printf(" -D__MAX_CLOSURE__=%d", max_closure); if(!use_hair) { build_options += " -D__NO_HAIR__"; } @@ -194,9 +217,21 @@ public: if(!use_patch_evaluation) { build_options += " -D__NO_PATCH_EVAL__"; } - if(!use_transparent) { + if(!use_transparent && !use_volume) { build_options += " -D__NO_TRANSPARENT__"; } + if(!use_shadow_tricks) { + build_options += " -D__NO_SHADOW_TRICKS__"; + } + if(!use_principled) { + build_options += " -D__NO_PRINCIPLED__"; + } + if(!use_denoising) { + build_options += " -D__NO_DENOISING__"; + } + if(!use_shader_raytrace) { + build_options += " -D__NO_SHADER_RAYTRACE__"; + } return build_options; } }; @@ -212,6 +247,7 @@ struct DeviceDrawParams { }; class Device { + friend class device_sub_ptr; protected: Device(DeviceInfo& info_, Stats &stats_, bool background) : background(background), vertex_buffer(0), info(info_), stats(stats_) {} @@ -221,6 +257,14 @@ protected: /* used for real time display */ unsigned int vertex_buffer; + virtual device_ptr mem_alloc_sub_ptr(device_memory& /*mem*/, int /*offset*/, int /*size*/) + { + /* Only required for devices that implement denoising. */ + assert(false); + return (device_ptr) 0; + } + virtual void mem_free_sub_ptr(device_ptr /*ptr*/) {}; + public: virtual ~Device(); @@ -228,39 +272,25 @@ public: DeviceInfo info; virtual const string& error_message() { return error_msg; } bool have_error() { return !error_message().empty(); } + virtual void set_error(const string& error) + { + if(!have_error()) { + error_msg = error; + } + fprintf(stderr, "%s\n", error.c_str()); + fflush(stderr); + } virtual bool show_samples() const { return false; } /* statistics */ Stats &stats; - /* regular memory */ - virtual void mem_alloc(device_memory& mem, MemoryType type) = 0; - virtual void mem_copy_to(device_memory& mem) = 0; - virtual void mem_copy_from(device_memory& mem, - int y, int w, int h, int elem) = 0; - virtual void mem_zero(device_memory& mem) = 0; - virtual void mem_free(device_memory& mem) = 0; + /* memory alignment */ + virtual int mem_sub_ptr_alignment() { return MIN_ALIGNMENT_CPU_DATA_TYPES; } /* constant memory */ virtual void const_copy_to(const char *name, void *host, size_t size) = 0; - /* texture memory */ - virtual void tex_alloc(const char * /*name*/, - device_memory& /*mem*/, - InterpolationType interpolation = INTERPOLATION_NONE, - ExtensionType extension = EXTENSION_REPEAT) - { - (void)interpolation; /* Ignored. */ - (void)extension; /* Ignored. */ - }; - - virtual void tex_free(device_memory& /*mem*/) {}; - - /* pixel memory */ - virtual void pixels_alloc(device_memory& mem); - virtual void pixels_copy_from(device_memory& mem, int y, int w, int h); - virtual void pixels_free(device_memory& mem); - /* open shading language, only for CPU device */ virtual void *osl_memory() { return NULL; } @@ -288,6 +318,8 @@ public: /* multi device */ virtual void map_tile(Device * /*sub_device*/, RenderTile& /*tile*/) {} virtual int device_number(Device * /*sub_device*/) { return 0; } + virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {} + virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {} /* static */ static Device *create(DeviceInfo& info, Stats &stats, bool background = true); @@ -297,15 +329,32 @@ public: static vector<DeviceType>& available_types(); static vector<DeviceInfo>& available_devices(); static string device_capabilities(); - static DeviceInfo get_multi_device(vector<DeviceInfo> subdevices); + static DeviceInfo get_multi_device(const vector<DeviceInfo>& subdevices, + int threads, + bool background); /* Tag devices lists for update. */ static void tag_update(); static void free_memory(); + +protected: + /* Memory allocation, only accessed through device_memory. */ + friend class MultiDevice; + friend class DeviceServer; + friend class device_memory; + + virtual void mem_alloc(device_memory& mem) = 0; + virtual void mem_copy_to(device_memory& mem) = 0; + virtual void mem_copy_from(device_memory& mem, + int y, int w, int h, int elem) = 0; + virtual void mem_zero(device_memory& mem) = 0; + virtual void mem_free(device_memory& mem) = 0; + private: /* Indicted whether device types and devices lists were initialized. */ static bool need_types_update, need_devices_update; + static thread_mutex device_mutex; static vector<DeviceType> types; static vector<DeviceInfo> devices; }; diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index c8e001ec2fd..6be60f8bbb6 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -20,113 +20,314 @@ /* So ImathMath is included before our kernel_cpu_compat. */ #ifdef WITH_OSL /* So no context pollution happens from indirectly included windows.h */ -# include "util_windows.h" +# include "util/util_windows.h" # include <OSL/oslexec.h> #endif -#include "device.h" -#include "device_intern.h" +#include "device/device.h" +#include "device/device_denoising.h" +#include "device/device_intern.h" +#include "device/device_split_kernel.h" -#include "kernel.h" -#include "kernel_compat_cpu.h" -#include "kernel_types.h" -#include "kernel_globals.h" +#include "kernel/kernel.h" +#include "kernel/kernel_compat_cpu.h" +#include "kernel/kernel_types.h" +#include "kernel/split/kernel_split_data.h" +#include "kernel/kernel_globals.h" -#include "osl_shader.h" -#include "osl_globals.h" +#include "kernel/filter/filter.h" -#include "buffers.h" +#include "kernel/osl/osl_shader.h" +#include "kernel/osl/osl_globals.h" -#include "util_debug.h" -#include "util_foreach.h" -#include "util_function.h" -#include "util_logging.h" -#include "util_opengl.h" -#include "util_progress.h" -#include "util_system.h" -#include "util_thread.h" +#include "render/buffers.h" + +#include "util/util_debug.h" +#include "util/util_foreach.h" +#include "util/util_function.h" +#include "util/util_logging.h" +#include "util/util_map.h" +#include "util/util_opengl.h" +#include "util/util_optimization.h" +#include "util/util_progress.h" +#include "util/util_system.h" +#include "util/util_thread.h" CCL_NAMESPACE_BEGIN -class CPUDevice : public Device -{ -public: - TaskPool task_pool; - KernelGlobals kernel_globals; +class CPUDevice; -#ifdef WITH_OSL - OSLGlobals osl_globals; -#endif - - CPUDevice(DeviceInfo& info, Stats &stats, bool background) - : Device(info, stats, background) +/* Has to be outside of the class to be shared across template instantiations. */ +static const char *logged_architecture = ""; + +template<typename F> +class KernelFunctions { +public: + KernelFunctions() { -#ifdef WITH_OSL - kernel_globals.osl = &osl_globals; -#endif + kernel = (F)NULL; + } - /* do now to avoid thread issues */ - system_cpu_support_sse2(); - system_cpu_support_sse3(); - system_cpu_support_sse41(); - system_cpu_support_avx(); - system_cpu_support_avx2(); + KernelFunctions(F kernel_default, + F kernel_sse2, + F kernel_sse3, + F kernel_sse41, + F kernel_avx, + F kernel_avx2) + { + const char *architecture_name = "default"; + kernel = kernel_default; + /* Silence potential warnings about unused variables + * when compiling without some architectures. */ + (void)kernel_sse2; + (void)kernel_sse3; + (void)kernel_sse41; + (void)kernel_avx; + (void)kernel_avx2; #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - VLOG(1) << "Will be using AVX2 kernels."; + if(DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) { + architecture_name = "AVX2"; + kernel = kernel_avx2; } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - VLOG(1) << "Will be using AVX kernels."; + if(DebugFlags().cpu.has_avx() && system_cpu_support_avx()) { + architecture_name = "AVX"; + kernel = kernel_avx; } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - VLOG(1) << "Will be using SSE4.1 kernels."; + if(DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) { + architecture_name = "SSE4.1"; + kernel = kernel_sse41; } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - VLOG(1) << "Will be using SSE3kernels."; + if(DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) { + architecture_name = "SSE3"; + kernel = kernel_sse3; } else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - VLOG(1) << "Will be using SSE2 kernels."; + if(DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) { + architecture_name = "SSE2"; + kernel = kernel_sse2; } - else #endif - { - VLOG(1) << "Will be using regular kernels."; + + if(strcmp(architecture_name, logged_architecture) != 0) { + VLOG(1) << "Will be using " << architecture_name << " kernels."; + logged_architecture = architecture_name; } } + inline F operator()() const { + assert(kernel); + return kernel; + } +protected: + F kernel; +}; + +class CPUSplitKernel : public DeviceSplitKernel { + CPUDevice *device; +public: + explicit CPUSplitKernel(CPUDevice *device); + + virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& kernel_globals, + device_memory& kernel_data_, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flag, + device_memory& work_pool_wgs); + + virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name, + const DeviceRequestedFeatures&); + virtual int2 split_kernel_local_size(); + virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task); + virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads); +}; + +class CPUDevice : public Device +{ +public: + TaskPool task_pool; + KernelGlobals kernel_globals; + + device_vector<TextureInfo> texture_info; + bool need_texture_info; + +#ifdef WITH_OSL + OSLGlobals osl_globals; +#endif + + bool use_split_kernel; + + DeviceRequestedFeatures requested_features; + + KernelFunctions<void(*)(KernelGlobals *, float *, int, int, int, int, int)> path_trace_kernel; + KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_half_float_kernel; + KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel; + KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)> shader_kernel; + + KernelFunctions<void(*)(int, TilesInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_divide_shadow_kernel; + KernelFunctions<void(*)(int, TilesInfo*, int, int, int, int, float*, float*, int*, int, int)> filter_get_feature_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_detect_outliers_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel; + + KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel; + KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel; + KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel; + KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel; + + KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel; + KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel; + + KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*, + int, int, int, int, int, int, int, int, ccl_global int*, int, + ccl_global char*, ccl_global unsigned int*, unsigned int, ccl_global float*)> data_init_kernel; + unordered_map<string, KernelFunctions<void(*)(KernelGlobals*, KernelData*)> > split_kernels; + +#define KERNEL_FUNCTIONS(name) \ + KERNEL_NAME_EVAL(cpu, name), \ + KERNEL_NAME_EVAL(cpu_sse2, name), \ + KERNEL_NAME_EVAL(cpu_sse3, name), \ + KERNEL_NAME_EVAL(cpu_sse41, name), \ + KERNEL_NAME_EVAL(cpu_avx, name), \ + KERNEL_NAME_EVAL(cpu_avx2, name) + + CPUDevice(DeviceInfo& info_, Stats &stats_, bool background_) + : Device(info_, stats_, background_), + texture_info(this, "__texture_info", MEM_TEXTURE), +#define REGISTER_KERNEL(name) name ## _kernel(KERNEL_FUNCTIONS(name)) + REGISTER_KERNEL(path_trace), + REGISTER_KERNEL(convert_to_half_float), + REGISTER_KERNEL(convert_to_byte), + REGISTER_KERNEL(shader), + REGISTER_KERNEL(filter_divide_shadow), + REGISTER_KERNEL(filter_get_feature), + REGISTER_KERNEL(filter_detect_outliers), + REGISTER_KERNEL(filter_combine_halves), + REGISTER_KERNEL(filter_nlm_calc_difference), + REGISTER_KERNEL(filter_nlm_blur), + REGISTER_KERNEL(filter_nlm_calc_weight), + REGISTER_KERNEL(filter_nlm_update_output), + REGISTER_KERNEL(filter_nlm_normalize), + REGISTER_KERNEL(filter_construct_transform), + REGISTER_KERNEL(filter_nlm_construct_gramian), + REGISTER_KERNEL(filter_finalize), + REGISTER_KERNEL(data_init) +#undef REGISTER_KERNEL + { + if(info.cpu_threads == 0) { + info.cpu_threads = TaskScheduler::num_threads(); + } + +#ifdef WITH_OSL + kernel_globals.osl = &osl_globals; +#endif + use_split_kernel = DebugFlags().cpu.split_kernel; + if(use_split_kernel) { + VLOG(1) << "Will be using split kernel."; + } + need_texture_info = false; + +#define REGISTER_SPLIT_KERNEL(name) split_kernels[#name] = KernelFunctions<void(*)(KernelGlobals*, KernelData*)>(KERNEL_FUNCTIONS(name)) + REGISTER_SPLIT_KERNEL(path_init); + REGISTER_SPLIT_KERNEL(scene_intersect); + REGISTER_SPLIT_KERNEL(lamp_emission); + REGISTER_SPLIT_KERNEL(do_volume); + REGISTER_SPLIT_KERNEL(queue_enqueue); + REGISTER_SPLIT_KERNEL(indirect_background); + REGISTER_SPLIT_KERNEL(shader_setup); + REGISTER_SPLIT_KERNEL(shader_sort); + REGISTER_SPLIT_KERNEL(shader_eval); + REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao); + REGISTER_SPLIT_KERNEL(subsurface_scatter); + REGISTER_SPLIT_KERNEL(direct_lighting); + REGISTER_SPLIT_KERNEL(shadow_blocked_ao); + REGISTER_SPLIT_KERNEL(shadow_blocked_dl); + REGISTER_SPLIT_KERNEL(enqueue_inactive); + REGISTER_SPLIT_KERNEL(next_iteration_setup); + REGISTER_SPLIT_KERNEL(indirect_subsurface); + REGISTER_SPLIT_KERNEL(buffer_update); +#undef REGISTER_SPLIT_KERNEL +#undef KERNEL_FUNCTIONS + } + ~CPUDevice() { task_pool.stop(); + texture_info.free(); } virtual bool show_samples() const { - return (TaskScheduler::num_threads() == 1); + return (info.cpu_threads == 1); } - void mem_alloc(device_memory& mem, MemoryType /*type*/) + void load_texture_info() { - mem.device_pointer = mem.data_pointer; - mem.device_size = mem.memory_size(); - stats.mem_alloc(mem.device_size); + if(need_texture_info) { + texture_info.copy_to_device(); + need_texture_info = false; + } } - void mem_copy_to(device_memory& /*mem*/) + void mem_alloc(device_memory& mem) { - /* no-op */ + if(mem.type == MEM_TEXTURE) { + assert(!"mem_alloc not supported for textures."); + } + else { + if(mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + } + + if(mem.type == MEM_DEVICE_ONLY) { + assert(!mem.host_pointer); + size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES; + void *data = util_aligned_malloc(mem.memory_size(), alignment); + mem.device_pointer = (device_ptr)data; + } + else { + mem.device_pointer = (device_ptr)mem.host_pointer; + } + + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); + } + } + + void mem_copy_to(device_memory& mem) + { + if(mem.type == MEM_TEXTURE) { + tex_free(mem); + tex_alloc(mem); + } + else if(mem.type == MEM_PIXELS) { + assert(!"mem_copy_to not supported for pixels."); + } + else { + if(!mem.device_pointer) { + mem_alloc(mem); + } + + /* copy is no-op */ + } } void mem_copy_from(device_memory& /*mem*/, @@ -138,40 +339,83 @@ public: void mem_zero(device_memory& mem) { - memset((void*)mem.device_pointer, 0, mem.memory_size()); + if(!mem.device_pointer) { + mem_alloc(mem); + } + + if(mem.device_pointer) { + memset((void*)mem.device_pointer, 0, mem.memory_size()); + } } void mem_free(device_memory& mem) { - if(mem.device_pointer) { + if(mem.type == MEM_TEXTURE) { + tex_free(mem); + } + else if(mem.device_pointer) { + if(mem.type == MEM_DEVICE_ONLY) { + util_aligned_free((void*)mem.device_pointer); + } mem.device_pointer = 0; stats.mem_free(mem.device_size); mem.device_size = 0; } } + virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/) + { + return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset)); + } + void const_copy_to(const char *name, void *host, size_t size) { kernel_const_copy(&kernel_globals, name, host, size); } - void tex_alloc(const char *name, - device_memory& mem, - InterpolationType interpolation, - ExtensionType extension) + void tex_alloc(device_memory& mem) { - VLOG(1) << "Texture allocate: " << name << ", " + VLOG(1) << "Texture allocate: " << mem.name << ", " << string_human_readable_number(mem.memory_size()) << " bytes. (" << string_human_readable_size(mem.memory_size()) << ")"; - kernel_tex_copy(&kernel_globals, - name, - mem.data_pointer, - mem.data_width, - mem.data_height, - mem.data_depth, - interpolation, - extension); - mem.device_pointer = mem.data_pointer; + + if(mem.interpolation == INTERPOLATION_NONE) { + /* Data texture. */ + kernel_tex_copy(&kernel_globals, + mem.name, + mem.host_pointer, + mem.data_size); + } + else { + /* Image Texture. */ + int flat_slot = 0; + if(string_startswith(mem.name, "__tex_image")) { + int pos = string(mem.name).rfind("_"); + flat_slot = atoi(mem.name + pos + 1); + } + else { + assert(0); + } + + if(flat_slot >= texture_info.size()) { + /* Allocate some slots in advance, to reduce amount + * of re-allocations. */ + texture_info.resize(flat_slot + 128); + } + + TextureInfo& info = texture_info[flat_slot]; + info.data = (uint64_t)mem.host_pointer; + info.cl_buffer = 0; + info.interpolation = mem.interpolation; + info.extension = mem.extension; + info.width = mem.data_width; + info.height = mem.data_height; + info.depth = mem.data_depth; + + need_texture_info = true; + } + + mem.device_pointer = (device_ptr)mem.host_pointer; mem.device_size = mem.memory_size(); stats.mem_alloc(mem.device_size); } @@ -182,6 +426,7 @@ public: mem.device_pointer = 0; stats.mem_free(mem.device_size); mem.device_size = 0; + need_texture_info = true; } } @@ -196,8 +441,9 @@ public: void thread_run(DeviceTask *task) { - if(task->type == DeviceTask::PATH_TRACE) - thread_path_trace(*task); + if(task->type == DeviceTask::RENDER) { + thread_render(*task); + } else if(task->type == DeviceTask::FILM_CONVERT) thread_film_convert(*task); else if(task->type == DeviceTask::SHADER) @@ -213,74 +459,327 @@ public: } }; - void thread_path_trace(DeviceTask& task) + bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task) { - if(task_pool.canceled()) { - if(task.need_finish_queue == false) - return; + TilesInfo *tiles = (TilesInfo*) task->tiles_mem.host_pointer; + for(int i = 0; i < 9; i++) { + tiles->buffers[i] = buffers[i]; } - KernelGlobals kg = thread_kernel_globals_init(); - RenderTile tile; + task->tiles_mem.copy_to_device(); - void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int); + return true; + } -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - path_trace_kernel = kernel_cpu_avx2_path_trace; + bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr, + DenoisingTask *task) + { + int4 rect = task->rect; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + int w = align_up(rect.z-rect.x, 4); + int h = rect.w-rect.y; + + float *blurDifference = (float*) task->nlm_state.temporary_1_ptr; + float *difference = (float*) task->nlm_state.temporary_2_ptr; + float *weightAccum = (float*) task->nlm_state.temporary_3_ptr; + + memset(weightAccum, 0, sizeof(float)*w*h); + memset((float*) out_ptr, 0, sizeof(float)*w*h); + + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + + int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)}; + filter_nlm_calc_difference_kernel()(dx, dy, + (float*) guide_ptr, + (float*) variance_ptr, + difference, + local_rect, + w, 0, + a, k_2); + + filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f); + filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f); + filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f); + + filter_nlm_update_output_kernel()(dx, dy, + blurDifference, + (float*) image_ptr, + (float*) out_ptr, + weightAccum, + local_rect, + w, f); } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - path_trace_kernel = kernel_cpu_avx_path_trace; + + int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y}; + filter_nlm_normalize_kernel()((float*) out_ptr, weightAccum, local_rect, w); + + return true; + } + + bool denoising_construct_transform(DenoisingTask *task) + { + for(int y = 0; y < task->filter_area.w; y++) { + for(int x = 0; x < task->filter_area.z; x++) { + filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer, + x + task->filter_area.x, + y + task->filter_area.y, + y*task->filter_area.z + x, + (float*) task->storage.transform.device_pointer, + (int*) task->storage.rank.device_pointer, + &task->rect.x, + task->buffer.pass_stride, + task->radius, + task->pca_threshold); + } } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - path_trace_kernel = kernel_cpu_sse41_path_trace; + return true; + } + + bool denoising_reconstruct(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr output_ptr, + DenoisingTask *task) + { + mem_zero(task->storage.XtWX); + mem_zero(task->storage.XtWY); + + float *difference = (float*) task->reconstruction_state.temporary_1_ptr; + float *blurDifference = (float*) task->reconstruction_state.temporary_2_ptr; + + int r = task->radius; + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + + int local_rect[4] = {max(0, -dx), max(0, -dy), + task->reconstruction_state.source_w - max(0, dx), + task->reconstruction_state.source_h - max(0, dy)}; + filter_nlm_calc_difference_kernel()(dx, dy, + (float*) color_ptr, + (float*) color_variance_ptr, + difference, + local_rect, + task->buffer.stride, + task->buffer.pass_stride, + 1.0f, + task->nlm_k_2); + filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); + filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.stride, 4); + filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); + filter_nlm_construct_gramian_kernel()(dx, dy, + blurDifference, + (float*) task->buffer.mem.device_pointer, + (float*) task->storage.transform.device_pointer, + (int*) task->storage.rank.device_pointer, + (float*) task->storage.XtWX.device_pointer, + (float3*) task->storage.XtWY.device_pointer, + local_rect, + &task->reconstruction_state.filter_window.x, + task->buffer.stride, + 4, + task->buffer.pass_stride); } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - path_trace_kernel = kernel_cpu_sse3_path_trace; + for(int y = 0; y < task->filter_area.w; y++) { + for(int x = 0; x < task->filter_area.z; x++) { + filter_finalize_kernel()(x, + y, + y*task->filter_area.z + x, + (float*) output_ptr, + (int*) task->storage.rank.device_pointer, + (float*) task->storage.XtWX.device_pointer, + (float3*) task->storage.XtWY.device_pointer, + &task->reconstruction_state.buffer_params.x, + task->render_buffer.samples); + } } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - path_trace_kernel = kernel_cpu_sse2_path_trace; + return true; + } + + bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr, + device_ptr mean_ptr, device_ptr variance_ptr, + int r, int4 rect, DenoisingTask * /*task*/) + { + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + filter_combine_halves_kernel()(x, y, + (float*) mean_ptr, + (float*) variance_ptr, + (float*) a_ptr, + (float*) b_ptr, + &rect.x, + r); + } } - else -#endif - { - path_trace_kernel = kernel_cpu_path_trace; + return true; + } + + bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr, + device_ptr sample_variance_ptr, device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, DenoisingTask *task) + { + for(int y = task->rect.y; y < task->rect.w; y++) { + for(int x = task->rect.x; x < task->rect.z; x++) { + filter_divide_shadow_kernel()(task->render_buffer.samples, + task->tiles, + x, y, + (float*) a_ptr, + (float*) b_ptr, + (float*) sample_variance_ptr, + (float*) sv_variance_ptr, + (float*) buffer_variance_ptr, + &task->rect.x, + task->render_buffer.pass_stride, + task->render_buffer.denoising_data_offset); + } } - - while(task.acquire_tile(this, tile)) { - float *render_buffer = (float*)tile.buffer; - uint *rng_state = (uint*)tile.rng_state; - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; - - for(int sample = start_sample; sample < end_sample; sample++) { - if(task.get_cancel() || task_pool.canceled()) { - if(task.need_finish_queue == false) - break; - } + return true; + } + + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + DenoisingTask *task) + { + for(int y = task->rect.y; y < task->rect.w; y++) { + for(int x = task->rect.x; x < task->rect.z; x++) { + filter_get_feature_kernel()(task->render_buffer.samples, + task->tiles, + mean_offset, + variance_offset, + x, y, + (float*) mean_ptr, + (float*) variance_ptr, + &task->rect.x, + task->render_buffer.pass_stride, + task->render_buffer.denoising_data_offset); + } + } + return true; + } + + bool denoising_detect_outliers(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr, + DenoisingTask *task) + { + for(int y = task->rect.y; y < task->rect.w; y++) { + for(int x = task->rect.x; x < task->rect.z; x++) { + filter_detect_outliers_kernel()(x, y, + (float*) image_ptr, + (float*) variance_ptr, + (float*) depth_ptr, + (float*) output_ptr, + &task->rect.x, + task->buffer.pass_stride); + } + } + return true; + } + + void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg) + { + scoped_timer timer(&tile.buffers->render_time); + + float *render_buffer = (float*)tile.buffer; + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; + + for(int sample = start_sample; sample < end_sample; sample++) { + if(task.get_cancel() || task_pool.canceled()) { + if(task.need_finish_queue == false) + break; + } - for(int y = tile.y; y < tile.y + tile.h; y++) { - for(int x = tile.x; x < tile.x + tile.w; x++) { - path_trace_kernel(&kg, render_buffer, rng_state, - sample, x, y, tile.offset, tile.stride); - } + for(int y = tile.y; y < tile.y + tile.h; y++) { + for(int x = tile.x; x < tile.x + tile.w; x++) { + path_trace_kernel()(kg, render_buffer, + sample, x, y, tile.offset, tile.stride); } + } + + tile.sample = sample + 1; + + task.update_progress(&tile, tile.w*tile.h); + } + } + + void denoise(DeviceTask &task, DenoisingTask& denoising, RenderTile &tile) + { + tile.sample = tile.start_sample + tile.num_samples; + + denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising); + denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, &denoising); + denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + denoising.functions.detect_outliers = function_bind(&CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); + denoising.functions.set_tiles = function_bind(&CPUDevice::denoising_set_tiles, this, _1, &denoising); + + denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h); + denoising.render_buffer.samples = tile.sample; - tile.sample = sample + 1; + RenderTile rtiles[9]; + rtiles[4] = tile; + task.map_neighbor_tiles(rtiles, this); + denoising.tiles_from_rendertiles(rtiles); - task.update_progress(&tile, tile.w*tile.h); + denoising.init_from_devicetask(task); + + denoising.run_denoising(); + + task.unmap_neighbor_tiles(rtiles, this); + + task.update_progress(&tile, tile.w*tile.h); + } + + void thread_render(DeviceTask& task) + { + if(task_pool.canceled()) { + if(task.need_finish_queue == false) + return; + } + + /* allocate buffer for kernel globals */ + device_only_memory<KernelGlobals> kgbuffer(this, "kernel_globals"); + kgbuffer.alloc_to_device(1); + + KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init()); + + CPUSplitKernel *split_kernel = NULL; + if(use_split_kernel) { + split_kernel = new CPUSplitKernel(this); + if(!split_kernel->load_kernels(requested_features)) { + thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); + kgbuffer.free(); + delete split_kernel; + return; + } + } + + RenderTile tile; + DenoisingTask denoising(this); + + while(task.acquire_tile(this, tile)) { + if(tile.task == RenderTile::PATH_TRACE) { + if(use_split_kernel) { + device_only_memory<uchar> void_buffer(this, "void_buffer"); + split_kernel->path_trace(&task, tile, kgbuffer, void_buffer); + } + else { + path_trace(task, tile, kg); + } + } + else if(tile.task == RenderTile::DENOISE) { + denoise(task, denoising, tile); } task.release_tile(tile); @@ -291,7 +790,10 @@ public: } } - thread_kernel_globals_free(&kg); + thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); + kg->~KernelGlobals(); + kgbuffer.free(); + delete split_kernel; } void thread_film_convert(DeviceTask& task) @@ -299,86 +801,16 @@ public: float sample_scale = 1.0f/(task.sample + 1); if(task.rgba_half) { - void(*convert_to_half_float_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int); -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - convert_to_half_float_kernel = kernel_cpu_avx2_convert_to_half_float; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - convert_to_half_float_kernel = kernel_cpu_avx_convert_to_half_float; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - convert_to_half_float_kernel = kernel_cpu_sse41_convert_to_half_float; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - convert_to_half_float_kernel = kernel_cpu_sse3_convert_to_half_float; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - convert_to_half_float_kernel = kernel_cpu_sse2_convert_to_half_float; - } - else -#endif - { - convert_to_half_float_kernel = kernel_cpu_convert_to_half_float; - } - for(int y = task.y; y < task.y + task.h; y++) for(int x = task.x; x < task.x + task.w; x++) - convert_to_half_float_kernel(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, - sample_scale, x, y, task.offset, task.stride); + convert_to_half_float_kernel()(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); } else { - void(*convert_to_byte_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int); -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - convert_to_byte_kernel = kernel_cpu_avx2_convert_to_byte; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - convert_to_byte_kernel = kernel_cpu_avx_convert_to_byte; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - convert_to_byte_kernel = kernel_cpu_sse41_convert_to_byte; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - convert_to_byte_kernel = kernel_cpu_sse3_convert_to_byte; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - convert_to_byte_kernel = kernel_cpu_sse2_convert_to_byte; - } - else -#endif - { - convert_to_byte_kernel = kernel_cpu_convert_to_byte; - } - for(int y = task.y; y < task.y + task.h; y++) for(int x = task.x; x < task.x + task.w; x++) - convert_to_byte_kernel(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, - sample_scale, x, y, task.offset, task.stride); + convert_to_byte_kernel()(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, + sample_scale, x, y, task.offset, task.stride); } } @@ -390,53 +822,16 @@ public: #ifdef WITH_OSL OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); #endif - void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int, int); - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(system_cpu_support_avx2()) { - shader_kernel = kernel_cpu_avx2_shader; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(system_cpu_support_avx()) { - shader_kernel = kernel_cpu_avx_shader; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(system_cpu_support_sse41()) { - shader_kernel = kernel_cpu_sse41_shader; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(system_cpu_support_sse3()) { - shader_kernel = kernel_cpu_sse3_shader; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(system_cpu_support_sse2()) { - shader_kernel = kernel_cpu_sse2_shader; - } - else -#endif - { - shader_kernel = kernel_cpu_shader; - } - for(int sample = 0; sample < task.num_samples; sample++) { for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) - shader_kernel(&kg, - (uint4*)task.shader_input, - (float4*)task.shader_output, - (float*)task.shader_output_luma, - task.shader_eval_type, - task.shader_filter, - x, - task.offset, - sample); + shader_kernel()(&kg, + (uint4*)task.shader_input, + (float4*)task.shader_output, + task.shader_eval_type, + task.shader_filter, + x, + task.offset, + sample); if(task.get_cancel() || task_pool.canceled()) break; @@ -453,20 +848,23 @@ public: int get_split_task_count(DeviceTask& task) { if(task.type == DeviceTask::SHADER) - return task.get_subtask_count(TaskScheduler::num_threads(), 256); + return task.get_subtask_count(info.cpu_threads, 256); else - return task.get_subtask_count(TaskScheduler::num_threads()); + return task.get_subtask_count(info.cpu_threads); } void task_add(DeviceTask& task) { + /* Load texture info. */ + load_texture_info(); + /* split task into smaller ones */ list<DeviceTask> tasks; if(task.type == DeviceTask::SHADER) - task.split(tasks, TaskScheduler::num_threads(), 256); + task.split(tasks, info.cpu_threads, 256); else - task.split(tasks, TaskScheduler::num_threads()); + task.split(tasks, info.cpu_threads); foreach(DeviceTask& task, tasks) task_pool.push(new CPUDeviceTask(this, task)); @@ -501,6 +899,10 @@ protected: inline void thread_kernel_globals_free(KernelGlobals *kg) { + if(kg == NULL) { + return; + } + if(kg->transparent_shadow_intersections != NULL) { free(kg->transparent_shadow_intersections); } @@ -515,8 +917,121 @@ protected: OSLShader::thread_free(kg); #endif } + + virtual bool load_kernels(const DeviceRequestedFeatures& requested_features_) { + requested_features = requested_features_; + + return true; + } }; +/* split kernel */ + +class CPUSplitKernelFunction : public SplitKernelFunction { +public: + CPUDevice* device; + void (*func)(KernelGlobals *kg, KernelData *data); + + CPUSplitKernelFunction(CPUDevice* device) : device(device), func(NULL) {} + ~CPUSplitKernelFunction() {} + + virtual bool enqueue(const KernelDimensions& dim, device_memory& kernel_globals, device_memory& data) + { + if(!func) { + return false; + } + + KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer; + kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); + + for(int y = 0; y < dim.global_size[1]; y++) { + for(int x = 0; x < dim.global_size[0]; x++) { + kg->global_id = make_int2(x, y); + + func(kg, (KernelData*)data.device_pointer); + } + } + + return true; + } +}; + +CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device) +{ +} + +bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& kernel_globals, + device_memory& data, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flags, + device_memory& work_pool_wgs) +{ + KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer; + kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); + + for(int y = 0; y < dim.global_size[1]; y++) { + for(int x = 0; x < dim.global_size[0]; x++) { + kg->global_id = make_int2(x, y); + + device->data_init_kernel()((KernelGlobals*)kernel_globals.device_pointer, + (KernelData*)data.device_pointer, + (void*)split_data.device_pointer, + num_global_elements, + (char*)ray_state.device_pointer, + rtile.start_sample, + rtile.start_sample + rtile.num_samples, + rtile.x, + rtile.y, + rtile.w, + rtile.h, + rtile.offset, + rtile.stride, + (int*)queue_index.device_pointer, + dim.global_size[0] * dim.global_size[1], + (char*)use_queues_flags.device_pointer, + (uint*)work_pool_wgs.device_pointer, + rtile.num_samples, + (float*)rtile.buffer); + } + } + + return true; +} + +SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(const string& kernel_name, + const DeviceRequestedFeatures&) +{ + CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device); + + kernel->func = device->split_kernels[kernel_name](); + if(!kernel->func) { + delete kernel; + return NULL; + } + + return kernel; +} + +int2 CPUSplitKernel::split_kernel_local_size() +{ + return make_int2(1, 1); +} + +int2 CPUSplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask * /*task*/) { + return make_int2(1, 1); +} + +uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) { + KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer; + + return split_data_buffer_size(kg, num_threads); +} + Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background) { return new CPUDevice(info, stats, background); @@ -531,7 +1046,13 @@ void device_cpu_info(vector<DeviceInfo>& devices) info.id = "CPU"; info.num = 0; info.advanced_shading = true; - info.pack_images = false; + info.bvh_layout_mask = BVH_LAYOUT_BVH2; + if (system_cpu_support_sse2()) { + info.bvh_layout_mask |= BVH_LAYOUT_BVH4; + } + info.has_volume_decoupled = true; + info.has_osl = true; + info.has_half_images = true; devices.insert(devices.begin(), info); } diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index dafac6dfcb3..b4529feffa7 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -15,32 +15,40 @@ */ #include <climits> +#include <limits.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include "device.h" -#include "device_intern.h" +#include "device/device.h" +#include "device/device_denoising.h" +#include "device/device_intern.h" +#include "device/device_split_kernel.h" -#include "buffers.h" +#include "render/buffers.h" + +#include "kernel/filter/filter_defines.h" #ifdef WITH_CUDA_DYNLOAD # include "cuew.h" #else -# include "util_opengl.h" +# include "util/util_opengl.h" # include <cuda.h> # include <cudaGL.h> #endif -#include "util_debug.h" -#include "util_logging.h" -#include "util_map.h" -#include "util_md5.h" -#include "util_opengl.h" -#include "util_path.h" -#include "util_string.h" -#include "util_system.h" -#include "util_types.h" -#include "util_time.h" +#include "util/util_debug.h" +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_map.h" +#include "util/util_md5.h" +#include "util/util_opengl.h" +#include "util/util_path.h" +#include "util/util_string.h" +#include "util/util_system.h" +#include "util/util_types.h" +#include "util/util_time.h" + +#include "kernel/split/kernel_split_data_types.h" CCL_NAMESPACE_BEGIN @@ -78,18 +86,71 @@ int cuewCompilerVersion(void) } /* namespace */ #endif /* WITH_CUDA_DYNLOAD */ +class CUDADevice; + +class CUDASplitKernel : public DeviceSplitKernel { + CUDADevice *device; +public: + explicit CUDASplitKernel(CUDADevice *device); + + virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads); + + virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& kernel_globals, + device_memory& kernel_data_, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flag, + device_memory& work_pool_wgs); + + virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name, + const DeviceRequestedFeatures&); + virtual int2 split_kernel_local_size(); + virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task); +}; + +/* Utility to push/pop CUDA context. */ +class CUDAContextScope { +public: + CUDAContextScope(CUDADevice *device); + ~CUDAContextScope(); + +private: + CUDADevice *device; +}; + class CUDADevice : public Device { public: DedicatedTaskPool task_pool; CUdevice cuDevice; CUcontext cuContext; - CUmodule cuModule; - map<device_ptr, bool> tex_interp_map; - map<device_ptr, uint> tex_bindless_map; + CUmodule cuModule, cuFilterModule; + size_t device_texture_headroom; + size_t device_working_headroom; + bool move_texture_to_host; + size_t map_host_used; + size_t map_host_limit; + int can_map_host; int cuDevId; int cuDevArchitecture; bool first_error; + CUDASplitKernel *split_kernel; + + struct CUDAMem { + CUDAMem() + : texobject(0), array(0), map_host_pointer(0), free_map_host(false) {} + + CUtexObject texobject; + CUarray array; + void *map_host_pointer; + bool free_map_host; + }; + typedef map<device_memory*, CUDAMem> CUDAMemMap; + CUDAMemMap cuda_mem_map; struct PixelMem { GLuint cuPBO; @@ -97,12 +158,11 @@ public: GLuint cuTexId; int w, h; }; - map<device_ptr, PixelMem> pixel_mem_map; /* Bindless Textures */ - device_vector<uint> bindless_mapping; - bool need_bindless_mapping; + device_vector<TextureInfo> texture_info; + bool need_texture_info; CUdeviceptr cuda_device_ptr(device_ptr mem) { @@ -140,7 +200,7 @@ public: CUresult result = stmt; \ \ if(result != CUDA_SUCCESS) { \ - string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \ + string message = string_printf("CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \ if(error_msg == "") \ error_msg = message; \ fprintf(stderr, "%s\n", message.c_str()); \ @@ -172,18 +232,9 @@ public: cuda_error_documentation(); } - void cuda_push_context() - { - cuda_assert(cuCtxSetCurrent(cuContext)); - } - - void cuda_pop_context() - { - cuda_assert(cuCtxSetCurrent(NULL)); - } - CUDADevice(DeviceInfo& info, Stats &stats, bool background_) - : Device(info, stats, background_) + : Device(info, stats, background_), + texture_info(this, "__texture_info", MEM_TEXTURE) { first_error = true; background = background_; @@ -192,26 +243,50 @@ public: cuDevice = 0; cuContext = 0; - need_bindless_mapping = false; + cuModule = 0; + cuFilterModule = 0; + + split_kernel = NULL; + + need_texture_info = false; + + device_texture_headroom = 0; + device_working_headroom = 0; + move_texture_to_host = false; + map_host_limit = 0; + map_host_used = 0; + can_map_host = 0; - /* intialize */ + /* Intialize CUDA. */ if(cuda_error(cuInit(0))) return; - /* setup device and context */ + /* Setup device and context. */ if(cuda_error(cuDeviceGet(&cuDevice, cuDevId))) return; + /* CU_CTX_MAP_HOST for mapping host memory when out of device memory. + * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render, + * so we can predict which memory to map to host. */ + cuda_assert(cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice)); + + unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX; + if(can_map_host) { + ctx_flags |= CU_CTX_MAP_HOST; + init_host_memory(); + } + + /* Create context. */ CUresult result; if(background) { - result = cuCtxCreate(&cuContext, 0, cuDevice); + result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); } else { - result = cuGLCtxCreate(&cuContext, 0, cuDevice); + result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice); if(result != CUDA_SUCCESS) { - result = cuCtxCreate(&cuContext, 0, cuDevice); + result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); background = true; } } @@ -224,16 +299,17 @@ public: cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); cuDevArchitecture = major*100 + minor*10; - cuda_pop_context(); + /* Pop context set by cuCtxCreate. */ + cuCtxPopCurrent(NULL); } ~CUDADevice() { task_pool.stop(); - if(info.has_bindless_textures) { - tex_free(bindless_mapping); - } + delete split_kernel; + + texture_info.free(); cuda_assert(cuCtxDestroy(cuContext)); } @@ -244,9 +320,9 @@ public: cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); - /* We only support sm_20 and above */ - if(major < 2) { - cuda_error_message(string_printf("CUDA device supported only with compute capability 2.0 or up, found %d.%d.", major, minor)); + /* We only support sm_30 and above */ + if(major < 3) { + cuda_error_message(string_printf("CUDA device supported only with compute capability 3.0 or up, found %d.%d.", major, minor)); return false; } @@ -258,26 +334,29 @@ public: return DebugFlags().cuda.adaptive_compile; } + bool use_split_kernel() + { + return DebugFlags().cuda.split_kernel; + } + /* Common NVCC flags which stays the same regardless of shading model, * kernel sources md5 and only depends on compiler or compilation settings. */ string compile_kernel_get_common_cflags( - const DeviceRequestedFeatures& requested_features) + const DeviceRequestedFeatures& requested_features, + bool filter=false, bool split=false) { - const int cuda_version = cuewCompilerVersion(); const int machine = system_cpu_bits(); - const string kernel_path = path_get("kernel"); - const string include = kernel_path; + const string source_path = path_get("source"); + const string include_path = source_path; string cflags = string_printf("-m%d " "--ptxas-options=\"-v\" " "--use_fast_math " "-DNVCC " - "-D__KERNEL_CUDA_VERSION__=%d " "-I\"%s\"", machine, - cuda_version, - include.c_str()); - if(use_adaptive_compilation()) { + include_path.c_str()); + if(!filter && use_adaptive_compilation()) { cflags += " " + requested_features.get_build_options(); } const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS"); @@ -287,6 +366,11 @@ public: #ifdef WITH_CYCLES_DEBUG cflags += " -D__KERNEL_DEBUG__"; #endif + + if(split) { + cflags += " -D__SPLIT__"; + } + return cflags; } @@ -306,22 +390,36 @@ public: cuda_error_message("CUDA nvcc compiler version could not be parsed."); return false; } - if(cuda_version < 75) { + if(cuda_version < 80) { printf("Unsupported CUDA version %d.%d detected, " - "you need CUDA 7.5 or newer.\n", + "you need CUDA 8.0 or newer.\n", major, minor); return false; } - else if(cuda_version != 75 && cuda_version != 80) { + else if(cuda_version != 80) { printf("CUDA version %d.%d detected, build may succeed but only " - "CUDA 7.5 and 8.0 are officially supported.\n", + "CUDA 8.0 is officially supported.\n", major, minor); } return true; } - string compile_kernel(const DeviceRequestedFeatures& requested_features) + string compile_kernel(const DeviceRequestedFeatures& requested_features, + bool filter=false, bool split=false) { + const char *name, *source; + if(filter) { + name = "filter"; + source = "filter.cu"; + } + else if(split) { + name = "kernel_split"; + source = "kernel_split.cu"; + } + else { + name = "kernel"; + source = "kernel.cu"; + } /* Compute cubin name. */ int major, minor; cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); @@ -329,8 +427,8 @@ public: /* Attempt to use kernel provided with Blender. */ if(!use_adaptive_compilation()) { - const string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", - major, minor)); + const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", + name, major, minor)); VLOG(1) << "Testing for pre-compiled kernel " << cubin << "."; if(path_exists(cubin)) { VLOG(1) << "Using precompiled kernel."; @@ -339,19 +437,19 @@ public: } const string common_cflags = - compile_kernel_get_common_cflags(requested_features); + compile_kernel_get_common_cflags(requested_features, filter, split); /* Try to use locally compiled kernel. */ - const string kernel_path = path_get("kernel"); - const string kernel_md5 = path_files_md5_hash(kernel_path); + const string source_path = path_get("source"); + const string kernel_md5 = path_files_md5_hash(source_path); /* We include cflags into md5 so changing cuda toolkit or changing other * compiler command line arguments makes sure cubin gets re-built. */ const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags); - const string cubin_file = string_printf("cycles_kernel_sm%d%d_%s.cubin", - major, minor, + const string cubin_file = string_printf("cycles_%s_sm%d%d_%s.cubin", + name, major, minor, cubin_md5.c_str()); const string cubin = path_cache_get(path_join("kernels", cubin_file)); VLOG(1) << "Testing for locally compiled kernel " << cubin << "."; @@ -362,9 +460,9 @@ public: #ifdef _WIN32 if(have_precompiled_kernels()) { - if(major < 2) { + if(major < 3) { cuda_error_message(string_printf( - "CUDA device requires compute capability 2.0 or up, " + "CUDA device requires compute capability 3.0 or up, " "found %d.%d. Your GPU is not supported.", major, minor)); } @@ -383,9 +481,10 @@ public: return ""; } const char *nvcc = cuewCompilerPath(); - const string kernel = path_join(kernel_path, - path_join("kernels", - path_join("cuda", "kernel.cu"))); + const string kernel = path_join( + path_join(source_path, "kernel"), + path_join("kernels", + path_join("cuda", source))); double starttime = time_dt(); printf("Compiling CUDA kernel ...\n"); @@ -424,6 +523,16 @@ public: bool load_kernels(const DeviceRequestedFeatures& requested_features) { + /* TODO(sergey): Support kernels re-load for CUDA devices. + * + * Currently re-loading kernel will invalidate memory pointers, + * causing problems in cuCtxSynchronize. + */ + if(cuFilterModule && cuModule) { + VLOG(1) << "Skipping kernel reload, not currently supported."; + return true; + } + /* check if cuda init succeeded */ if(cuContext == 0) return false; @@ -433,13 +542,16 @@ public: return false; /* get kernel */ - string cubin = compile_kernel(requested_features); - + string cubin = compile_kernel(requested_features, false, use_split_kernel()); if(cubin == "") return false; + string filter_cubin = compile_kernel(requested_features, true, false); + if(filter_cubin == "") + return false; + /* open module */ - cuda_push_context(); + CUDAContextScope scope(this); string cubin_data; CUresult result; @@ -452,112 +564,465 @@ public: if(cuda_error_(result, "cuModuleLoad")) cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str())); - cuda_pop_context(); + if(path_read_text(filter_cubin, cubin_data)) + result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str()); + else + result = CUDA_ERROR_FILE_NOT_FOUND; + + if(cuda_error_(result, "cuModuleLoad")) + cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str())); + + if(result == CUDA_SUCCESS) { + reserve_local_memory(requested_features); + } return (result == CUDA_SUCCESS); } - void load_bindless_mapping() + void reserve_local_memory(const DeviceRequestedFeatures& requested_features) + { + if(use_split_kernel()) { + /* Split kernel mostly uses global memory and adaptive compilation, + * difficult to predict how much is needed currently. */ + return; + } + + /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory + * needed for kernel launches, so that we can reliably figure out when + * to allocate scene data in mapped host memory. */ + CUDAContextScope scope(this); + + size_t total = 0, free_before = 0, free_after = 0; + cuMemGetInfo(&free_before, &total); + + /* Get kernel function. */ + CUfunction cuPathTrace; + + if(requested_features.use_integrator_branched) { + cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace")); + } + else { + cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace")); + } + + cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)); + + int min_blocks, num_threads_per_block; + cuda_assert(cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0)); + + /* Launch kernel, using just 1 block appears sufficient to reserve + * memory for all multiprocessors. It would be good to do this in + * parallel for the multi GPU case still to make it faster. */ + CUdeviceptr d_work_tiles = 0; + uint total_work_size = 0; + + void *args[] = {&d_work_tiles, + &total_work_size}; + + cuda_assert(cuLaunchKernel(cuPathTrace, + 1, 1, 1, + num_threads_per_block, 1, 1, + 0, 0, args, 0)); + + cuda_assert(cuCtxSynchronize()); + + cuMemGetInfo(&free_after, &total); + VLOG(1) << "Local memory reserved " + << string_human_readable_number(free_before - free_after) << " bytes. (" + << string_human_readable_size(free_before - free_after) << ")"; + +#if 0 + /* For testing mapped host memory, fill up device memory. */ + const size_t keep_mb = 1024; + + while(free_after > keep_mb * 1024 * 1024LL) { + CUdeviceptr tmp; + cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL)); + cuMemGetInfo(&free_after, &total); + } +#endif + } + + void init_host_memory() { - if(info.has_bindless_textures && need_bindless_mapping) { - tex_free(bindless_mapping); - tex_alloc("__bindless_mapping", bindless_mapping, INTERPOLATION_NONE, EXTENSION_REPEAT); - need_bindless_mapping = false; + /* Limit amount of host mapped memory, because allocating too much can + * cause system instability. Leave at least half or 4 GB of system + * memory free, whichever is smaller. */ + size_t default_limit = 4 * 1024 * 1024 * 1024LL; + size_t system_ram = system_physical_ram(); + + if(system_ram > 0) { + if(system_ram / 2 > default_limit) { + map_host_limit = system_ram - default_limit; + } + else { + map_host_limit = system_ram / 2; + } + } + else { + VLOG(1) << "Mapped host memory disabled, failed to get system RAM"; + map_host_limit = 0; } + + /* Amount of device memory to keep is free after texture memory + * and working memory allocations respectively. We set the working + * memory limit headroom lower so that some space is left after all + * texture memory allocations. */ + device_working_headroom = 32 * 1024 * 1024LL; // 32MB + device_texture_headroom = 128 * 1024 * 1024LL; // 128MB + + VLOG(1) << "Mapped host memory limit set to " + << string_human_readable_number(map_host_limit) << " bytes. (" + << string_human_readable_size(map_host_limit) << ")"; } - void mem_alloc(device_memory& mem, MemoryType /*type*/) + void load_texture_info() { - cuda_push_context(); - CUdeviceptr device_pointer; - size_t size = mem.memory_size(); - cuda_assert(cuMemAlloc(&device_pointer, size)); + if(need_texture_info) { + texture_info.copy_to_device(); + need_texture_info = false; + } + } + + void move_textures_to_host(size_t size, bool for_texture) + { + /* Signal to reallocate textures in host memory only. */ + move_texture_to_host = true; + + while(size > 0) { + /* Find suitable memory allocation to move. */ + device_memory *max_mem = NULL; + size_t max_size = 0; + bool max_is_image = false; + + foreach(CUDAMemMap::value_type& pair, cuda_mem_map) { + device_memory& mem = *pair.first; + CUDAMem *cmem = &pair.second; + + bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); + + /* Can't move this type of memory. */ + if(!is_texture || cmem->array) { + continue; + } + + /* Already in host memory. */ + if(cmem->map_host_pointer) { + continue; + } + + /* For other textures, only move image textures. */ + if(for_texture && !is_image) { + continue; + } + + /* Try to move largest allocation, prefer moving images. */ + if(is_image > max_is_image || + (is_image == max_is_image && mem.device_size > max_size)) { + max_is_image = is_image; + max_size = mem.device_size; + max_mem = &mem; + } + } + + /* Move to host memory. This part is mutex protected since + * multiple CUDA devices could be moving the memory. The + * first one will do it, and the rest will adopt the pointer. */ + if(max_mem) { + VLOG(1) << "Move memory from device to host: " << max_mem->name; + + static thread_mutex move_mutex; + thread_scoped_lock lock(move_mutex); + + /* Preserve the original device pointer, in case of multi device + * we can't change it because the pointer mapping would break. */ + device_ptr prev_pointer = max_mem->device_pointer; + size_t prev_size = max_mem->device_size; + + tex_free(*max_mem); + tex_alloc(*max_mem); + size = (max_size >= size)? 0: size - max_size; + + max_mem->device_pointer = prev_pointer; + max_mem->device_size = prev_size; + } + else { + break; + } + } + + /* Update texture info array with new pointers. */ + load_texture_info(); + + move_texture_to_host = false; + } + + CUDAMem *generic_alloc(device_memory& mem, size_t pitch_padding = 0) + { + CUDAContextScope scope(this); + + CUdeviceptr device_pointer = 0; + size_t size = mem.memory_size() + pitch_padding; + + CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY; + const char *status = ""; + + /* First try allocating in device memory, respecting headroom. We make + * an exception for texture info. It is small and frequently accessed, + * so treat it as working memory. + * + * If there is not enough room for working memory, we will try to move + * textures to host memory, assuming the performance impact would have + * been worse for working memory. */ + bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); + + size_t headroom = (is_texture)? device_texture_headroom: + device_working_headroom; + + size_t total = 0, free = 0; + cuMemGetInfo(&free, &total); + + /* Move textures to host memory if needed. */ + if(!move_texture_to_host && !is_image && (size + headroom) >= free) { + move_textures_to_host(size + headroom - free, is_texture); + cuMemGetInfo(&free, &total); + } + + /* Allocate in device memory. */ + if(!move_texture_to_host && (size + headroom) < free) { + mem_alloc_result = cuMemAlloc(&device_pointer, size); + if(mem_alloc_result == CUDA_SUCCESS) { + status = " in device memory"; + } + } + + /* Fall back to mapped host memory if needed and possible. */ + void *map_host_pointer = 0; + bool free_map_host = false; + + if(mem_alloc_result != CUDA_SUCCESS && can_map_host && + map_host_used + size < map_host_limit) { + if(mem.shared_pointer) { + /* Another device already allocated host memory. */ + mem_alloc_result = CUDA_SUCCESS; + map_host_pointer = mem.shared_pointer; + } + else { + /* Allocate host memory ourselves. */ + mem_alloc_result = cuMemHostAlloc(&map_host_pointer, size, + CU_MEMHOSTALLOC_DEVICEMAP | + CU_MEMHOSTALLOC_WRITECOMBINED); + mem.shared_pointer = map_host_pointer; + free_map_host = true; + } + + if(mem_alloc_result == CUDA_SUCCESS) { + cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, mem.shared_pointer, 0)); + map_host_used += size; + status = " in host memory"; + + /* Replace host pointer with our host allocation. Only works if + * CUDA memory layout is the same and has no pitch padding. Also + * does not work if we move textures to host during a render, + * since other devices might be using the memory. */ + if(!move_texture_to_host && pitch_padding == 0 && + mem.host_pointer && mem.host_pointer != mem.shared_pointer) { + memcpy(mem.shared_pointer, mem.host_pointer, size); + mem.host_free(); + mem.host_pointer = mem.shared_pointer; + } + } + else { + status = " failed, out of host memory"; + } + } + else if(mem_alloc_result != CUDA_SUCCESS) { + status = " failed, out of device and host memory"; + } + + if(mem_alloc_result != CUDA_SUCCESS) { + cuda_assert(mem_alloc_result); + } + + if(mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")" + << status; + } + mem.device_pointer = (device_ptr)device_pointer; mem.device_size = size; stats.mem_alloc(size); - cuda_pop_context(); + + if(!mem.device_pointer) { + return NULL; + } + + /* Insert into map of allocations. */ + CUDAMem *cmem = &cuda_mem_map[&mem]; + cmem->map_host_pointer = map_host_pointer; + cmem->free_map_host = free_map_host; + return cmem; + } + + void generic_copy_to(device_memory& mem) + { + if(mem.host_pointer && mem.device_pointer) { + CUDAContextScope scope(this); + + if(mem.host_pointer != mem.shared_pointer) { + cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), + mem.host_pointer, + mem.memory_size())); + } + } + } + + void generic_free(device_memory& mem) + { + if(mem.device_pointer) { + CUDAContextScope scope(this); + const CUDAMem& cmem = cuda_mem_map[&mem]; + + if(cmem.map_host_pointer) { + /* Free host memory. */ + if(cmem.free_map_host) { + cuMemFreeHost(cmem.map_host_pointer); + if(mem.host_pointer == mem.shared_pointer) { + mem.host_pointer = 0; + } + mem.shared_pointer = 0; + } + + map_host_used -= mem.device_size; + } + else { + /* Free device memory. */ + cuMemFree(mem.device_pointer); + } + + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + cuda_mem_map.erase(cuda_mem_map.find(&mem)); + } + } + + void mem_alloc(device_memory& mem) + { + if(mem.type == MEM_PIXELS && !background) { + pixels_alloc(mem); + } + else if(mem.type == MEM_TEXTURE) { + assert(!"mem_alloc not supported for textures."); + } + else { + generic_alloc(mem); + } } void mem_copy_to(device_memory& mem) { - cuda_push_context(); - if(mem.device_pointer) - cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size())); - cuda_pop_context(); + if(mem.type == MEM_PIXELS) { + assert(!"mem_copy_to not supported for pixels."); + } + else if(mem.type == MEM_TEXTURE) { + tex_free(mem); + tex_alloc(mem); + } + else { + if(!mem.device_pointer) { + generic_alloc(mem); + } + + generic_copy_to(mem); + } } void mem_copy_from(device_memory& mem, int y, int w, int h, int elem) { - size_t offset = elem*y*w; - size_t size = elem*w*h; - - cuda_push_context(); - if(mem.device_pointer) { - cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset, - (CUdeviceptr)(mem.device_pointer + offset), size)); + if(mem.type == MEM_PIXELS && !background) { + pixels_copy_from(mem, y, w, h); + } + else if(mem.type == MEM_TEXTURE) { + assert(!"mem_copy_from not supported for textures."); } else { - memset((char*)mem.data_pointer + offset, 0, size); + CUDAContextScope scope(this); + size_t offset = elem*y*w; + size_t size = elem*w*h; + + if(mem.host_pointer && mem.device_pointer) { + cuda_assert(cuMemcpyDtoH((uchar*)mem.host_pointer + offset, + (CUdeviceptr)(mem.device_pointer + offset), size)); + } + else if(mem.host_pointer) { + memset((char*)mem.host_pointer + offset, 0, size); + } } - cuda_pop_context(); } void mem_zero(device_memory& mem) { - memset((void*)mem.data_pointer, 0, mem.memory_size()); + if(!mem.device_pointer) { + mem_alloc(mem); + } - cuda_push_context(); - if(mem.device_pointer) + if(mem.host_pointer) { + memset(mem.host_pointer, 0, mem.memory_size()); + } + + if(mem.device_pointer && + (!mem.host_pointer || mem.host_pointer != mem.shared_pointer)) { + CUDAContextScope scope(this); cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size())); - cuda_pop_context(); + } } void mem_free(device_memory& mem) { - if(mem.device_pointer) { - cuda_push_context(); - cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer))); - cuda_pop_context(); - - mem.device_pointer = 0; - - stats.mem_free(mem.device_size); - mem.device_size = 0; + if(mem.type == MEM_PIXELS && !background) { + pixels_free(mem); + } + else if(mem.type == MEM_TEXTURE) { + tex_free(mem); + } + else { + generic_free(mem); } } + virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/) + { + return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset)); + } + void const_copy_to(const char *name, void *host, size_t size) { + CUDAContextScope scope(this); CUdeviceptr mem; size_t bytes; - cuda_push_context(); cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name)); //assert(bytes == size); cuda_assert(cuMemcpyHtoD(mem, host, size)); - cuda_pop_context(); } - void tex_alloc(const char *name, - device_memory& mem, - InterpolationType interpolation, - ExtensionType extension) + void tex_alloc(device_memory& mem) { - VLOG(1) << "Texture allocate: " << name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - /* Check if we are on sm_30 or above. - * We use arrays and bindles textures for storage there */ - bool has_bindless_textures = info.has_bindless_textures; + CUDAContextScope scope(this); /* General variables for both architectures */ - string bind_name = name; + string bind_name = mem.name; size_t dsize = datatype_size(mem.data_type); size_t size = mem.memory_size(); CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; - switch(extension) { + switch(mem.extension) { case EXTENSION_REPEAT: address_mode = CU_TR_ADDRESS_MODE_WRAP; break; @@ -573,13 +1038,37 @@ public: } CUfilter_mode filter_mode; - if(interpolation == INTERPOLATION_CLOSEST) { + if(mem.interpolation == INTERPOLATION_CLOSEST) { filter_mode = CU_TR_FILTER_MODE_POINT; } else { filter_mode = CU_TR_FILTER_MODE_LINEAR; } + /* Data Storage */ + if(mem.interpolation == INTERPOLATION_NONE) { + generic_alloc(mem); + generic_copy_to(mem); + + CUdeviceptr cumem; + size_t cubytes; + + cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str())); + + if(cubytes == 8) { + /* 64 bit device pointer */ + uint64_t ptr = mem.device_pointer; + cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); + } + else { + /* 32 bit device pointer */ + uint32_t ptr = (uint32_t)mem.device_pointer; + cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); + } + return; + } + + /* Image Texture Storage */ CUarray_format_enum format; switch(mem.data_type) { case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break; @@ -590,300 +1079,647 @@ public: default: assert(0); return; } - /* General variables for Fermi */ - CUtexref texref = NULL; + CUDAMem *cmem = NULL; + CUarray array_3d = NULL; + size_t src_pitch = mem.data_width * dsize * mem.data_elements; + size_t dst_pitch = src_pitch; - if(!has_bindless_textures) { - if(mem.data_depth > 1) { - /* Kernel uses different bind names for 2d and 3d float textures, - * so we have to adjust couple of things here. - */ - vector<string> tokens; - string_split(tokens, name, "_"); - bind_name = string_printf("__tex_image_%s_3d_%s", - tokens[2].c_str(), - tokens[3].c_str()); - } + if(mem.data_depth > 1) { + /* 3D texture using array, there is no API for linear memory. */ + CUDA_ARRAY3D_DESCRIPTOR desc; - cuda_push_context(); - cuda_assert(cuModuleGetTexRef(&texref, cuModule, bind_name.c_str())); - cuda_pop_context(); + desc.Width = mem.data_width; + desc.Height = mem.data_height; + desc.Depth = mem.data_depth; + desc.Format = format; + desc.NumChannels = mem.data_elements; + desc.Flags = 0; - if(!texref) { - return; - } - } + VLOG(1) << "Array 3D allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; - /* Data Storage */ - if(interpolation == INTERPOLATION_NONE) { - if(has_bindless_textures) { - mem_alloc(mem, MEM_READ_ONLY); - mem_copy_to(mem); + cuda_assert(cuArray3DCreate(&array_3d, &desc)); - cuda_push_context(); + if(!array_3d) { + return; + } - CUdeviceptr cumem; - size_t cubytes; + CUDA_MEMCPY3D param; + memset(¶m, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_ARRAY; + param.dstArray = array_3d; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + param.Depth = mem.data_depth; - cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str())); + cuda_assert(cuMemcpy3D(¶m)); - if(cubytes == 8) { - /* 64 bit device pointer */ - uint64_t ptr = mem.device_pointer; - cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); - } - else { - /* 32 bit device pointer */ - uint32_t ptr = (uint32_t)mem.device_pointer; - cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); - } + mem.device_pointer = (device_ptr)array_3d; + mem.device_size = size; + stats.mem_alloc(size); - cuda_pop_context(); + cmem = &cuda_mem_map[&mem]; + cmem->texobject = 0; + cmem->array = array_3d; + } + else if(mem.data_height > 0) { + /* 2D texture, using pitch aligned linear memory. */ + int alignment = 0; + cuda_assert(cuDeviceGetAttribute(&alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice)); + dst_pitch = align_up(src_pitch, alignment); + size_t dst_size = dst_pitch * mem.data_height; + + cmem = generic_alloc(mem, dst_size - mem.memory_size()); + if(!cmem) { + return; } - else { - mem_alloc(mem, MEM_READ_ONLY); - mem_copy_to(mem); - cuda_push_context(); + CUDA_MEMCPY2D param; + memset(¶m, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_DEVICE; + param.dstDevice = mem.device_pointer; + param.dstPitch = dst_pitch; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + + cuda_assert(cuMemcpy2DUnaligned(¶m)); + } + else { + /* 1D texture, using linear memory. */ + cmem = generic_alloc(mem); + if(!cmem) { + return; + } - cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size)); - cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT)); - cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER)); + cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size)); + } - cuda_pop_context(); - } + /* Kepler+, bindless textures. */ + int flat_slot = 0; + if(string_startswith(mem.name, "__tex_image")) { + int pos = string(mem.name).rfind("_"); + flat_slot = atoi(mem.name + pos + 1); } - /* Texture Storage */ else { - CUarray handle = NULL; + assert(0); + } - cuda_push_context(); + CUDA_RESOURCE_DESC resDesc; + memset(&resDesc, 0, sizeof(resDesc)); - if(mem.data_depth > 1) { - CUDA_ARRAY3D_DESCRIPTOR desc; + if(array_3d) { + resDesc.resType = CU_RESOURCE_TYPE_ARRAY; + resDesc.res.array.hArray = array_3d; + resDesc.flags = 0; + } + else if(mem.data_height > 0) { + resDesc.resType = CU_RESOURCE_TYPE_PITCH2D; + resDesc.res.pitch2D.devPtr = mem.device_pointer; + resDesc.res.pitch2D.format = format; + resDesc.res.pitch2D.numChannels = mem.data_elements; + resDesc.res.pitch2D.height = mem.data_height; + resDesc.res.pitch2D.width = mem.data_width; + resDesc.res.pitch2D.pitchInBytes = dst_pitch; + } + else { + resDesc.resType = CU_RESOURCE_TYPE_LINEAR; + resDesc.res.linear.devPtr = mem.device_pointer; + resDesc.res.linear.format = format; + resDesc.res.linear.numChannels = mem.data_elements; + resDesc.res.linear.sizeInBytes = mem.device_size; + } - desc.Width = mem.data_width; - desc.Height = mem.data_height; - desc.Depth = mem.data_depth; - desc.Format = format; - desc.NumChannels = mem.data_elements; - desc.Flags = 0; + CUDA_TEXTURE_DESC texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.addressMode[0] = address_mode; + texDesc.addressMode[1] = address_mode; + texDesc.addressMode[2] = address_mode; + texDesc.filterMode = filter_mode; + texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; + + cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL)); + + /* Resize once */ + if(flat_slot >= texture_info.size()) { + /* Allocate some slots in advance, to reduce amount + * of re-allocations. */ + texture_info.resize(flat_slot + 128); + } - cuda_assert(cuArray3DCreate(&handle, &desc)); - } - else { - CUDA_ARRAY_DESCRIPTOR desc; + /* Set Mapping and tag that we need to (re-)upload to device */ + TextureInfo& info = texture_info[flat_slot]; + info.data = (uint64_t)cmem->texobject; + info.cl_buffer = 0; + info.interpolation = mem.interpolation; + info.extension = mem.extension; + info.width = mem.data_width; + info.height = mem.data_height; + info.depth = mem.data_depth; + need_texture_info = true; + } - desc.Width = mem.data_width; - desc.Height = mem.data_height; - desc.Format = format; - desc.NumChannels = mem.data_elements; + void tex_free(device_memory& mem) + { + if(mem.device_pointer) { + CUDAContextScope scope(this); + const CUDAMem& cmem = cuda_mem_map[&mem]; - cuda_assert(cuArrayCreate(&handle, &desc)); + if(cmem.texobject) { + /* Free bindless texture. */ + cuTexObjectDestroy(cmem.texobject); } - if(!handle) { - cuda_pop_context(); - return; - } + if(cmem.array) { + /* Free array. */ + cuArrayDestroy(cmem.array); + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; - /* Allocate 3D, 2D or 1D memory */ - if(mem.data_depth > 1) { - CUDA_MEMCPY3D param; - memset(¶m, 0, sizeof(param)); - param.dstMemoryType = CU_MEMORYTYPE_ARRAY; - param.dstArray = handle; - param.srcMemoryType = CU_MEMORYTYPE_HOST; - param.srcHost = (void*)mem.data_pointer; - param.srcPitch = mem.data_width*dsize*mem.data_elements; - param.WidthInBytes = param.srcPitch; - param.Height = mem.data_height; - param.Depth = mem.data_depth; - - cuda_assert(cuMemcpy3D(¶m)); + cuda_mem_map.erase(cuda_mem_map.find(&mem)); } - else if(mem.data_height > 1) { - CUDA_MEMCPY2D param; - memset(¶m, 0, sizeof(param)); - param.dstMemoryType = CU_MEMORYTYPE_ARRAY; - param.dstArray = handle; - param.srcMemoryType = CU_MEMORYTYPE_HOST; - param.srcHost = (void*)mem.data_pointer; - param.srcPitch = mem.data_width*dsize*mem.data_elements; - param.WidthInBytes = param.srcPitch; - param.Height = mem.data_height; - - cuda_assert(cuMemcpy2D(¶m)); + else { + generic_free(mem); } - else - cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size)); + } + } - /* Fermi and Kepler */ - mem.device_pointer = (device_ptr)handle; - mem.device_size = size; + bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task) + { + TilesInfo *tiles = (TilesInfo*) task->tiles_mem.host_pointer; + for(int i = 0; i < 9; i++) { + tiles->buffers[i] = buffers[i]; + } - stats.mem_alloc(size); + task->tiles_mem.copy_to_device(); - /* Bindless Textures - Kepler */ - if(has_bindless_textures) { - int flat_slot = 0; - if(string_startswith(name, "__tex_image")) { - int pos = string(name).rfind("_"); - flat_slot = atoi(name + pos + 1); - } - else { - assert(0); - } + return !have_error(); + } - CUDA_RESOURCE_DESC resDesc; - memset(&resDesc, 0, sizeof(resDesc)); - resDesc.resType = CU_RESOURCE_TYPE_ARRAY; - resDesc.res.array.hArray = handle; - resDesc.flags = 0; - - CUDA_TEXTURE_DESC texDesc; - memset(&texDesc, 0, sizeof(texDesc)); - texDesc.addressMode[0] = address_mode; - texDesc.addressMode[1] = address_mode; - texDesc.addressMode[2] = address_mode; - texDesc.filterMode = filter_mode; - texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; - - CUtexObject tex = 0; - cuda_assert(cuTexObjectCreate(&tex, &resDesc, &texDesc, NULL)); - - /* Safety check */ - if((uint)tex > UINT_MAX) { - assert(0); - } +#define CUDA_GET_BLOCKSIZE(func, w, h) \ + int threads_per_block; \ + cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ + int threads = (int)sqrt((float)threads_per_block); \ + int xblocks = ((w) + threads - 1)/threads; \ + int yblocks = ((h) + threads - 1)/threads; + +#define CUDA_LAUNCH_KERNEL(func, args) \ + cuda_assert(cuLaunchKernel(func, \ + xblocks, yblocks, 1, \ + threads, threads, 1, \ + 0, 0, args, 0)); + +/* Similar as above, but for 1-dimensional blocks. */ +#define CUDA_GET_BLOCKSIZE_1D(func, w, h) \ + int threads_per_block; \ + cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ + int xblocks = ((w) + threads_per_block - 1)/threads_per_block; \ + int yblocks = h; + +#define CUDA_LAUNCH_KERNEL_1D(func, args) \ + cuda_assert(cuLaunchKernel(func, \ + xblocks, yblocks, 1, \ + threads_per_block, 1, 1, \ + 0, 0, args, 0)); + + bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr, + DenoisingTask *task) + { + if(have_error()) + return false; - /* Resize once */ - if(flat_slot >= bindless_mapping.size()) { - /* Allocate some slots in advance, to reduce amount - * of re-allocations. - */ - bindless_mapping.resize(flat_slot + 128); - } + CUDAContextScope scope(this); - /* Set Mapping and tag that we need to (re-)upload to device */ - bindless_mapping.get_data()[flat_slot] = (uint)tex; - tex_bindless_map[mem.device_pointer] = (uint)tex; - need_bindless_mapping = true; - } - /* Regular Textures - Fermi */ - else { - cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT)); - cuda_assert(cuTexRefSetFilterMode(texref, filter_mode)); - cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES)); - } + int stride = task->buffer.stride; + int w = task->buffer.width; + int h = task->buffer.h; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; - cuda_pop_context(); - } + int shift_stride = stride*h; + int num_shifts = (2*r+1)*(2*r+1); + int mem_size = sizeof(float)*shift_stride*num_shifts; + int channel_offset = 0; - /* Fermi, Data and Image Textures */ - if(!has_bindless_textures) { - cuda_push_context(); + device_only_memory<uchar> temporary_mem(this, "Denoising temporary_mem"); + temporary_mem.alloc_to_device(2*mem_size); - cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode)); - cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode)); - if(mem.data_depth > 1) { - cuda_assert(cuTexRefSetAddressMode(texref, 2, address_mode)); - } + if(have_error()) + return false; + + CUdeviceptr difference = cuda_device_ptr(temporary_mem.device_pointer); + CUdeviceptr blurDifference = difference + mem_size; - cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements)); + CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr; + cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*shift_stride)); + cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*shift_stride)); - cuda_pop_context(); + { + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput; + cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output")); + + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1)); + + CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts); + + void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &difference, &w, &h, &stride, &shift_stride, &r, &channel_offset, &a, &k_2}; + void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &shift_stride, &r, &f}; + void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &shift_stride, &r, &f}; + void *update_output_args[] = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &shift_stride, &r, &f}; + + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args); } - /* Fermi and Kepler */ - tex_interp_map[mem.device_pointer] = (interpolation != INTERPOLATION_NONE); + temporary_mem.free(); + + { + CUfunction cuNLMNormalize; + cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize")); + cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1)); + void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride}; + CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h); + CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args); + cuda_assert(cuCtxSynchronize()); + } + + return !have_error(); } - void tex_free(device_memory& mem) + bool denoising_construct_transform(DenoisingTask *task) { - if(mem.device_pointer) { - if(tex_interp_map[mem.device_pointer]) { - cuda_push_context(); - cuArrayDestroy((CUarray)mem.device_pointer); - cuda_pop_context(); - - /* Free CUtexObject (Bindless Textures) */ - if(info.has_bindless_textures && tex_bindless_map[mem.device_pointer]) { - uint flat_slot = tex_bindless_map[mem.device_pointer]; - cuTexObjectDestroy(flat_slot); - } + if(have_error()) + return false; - tex_interp_map.erase(tex_interp_map.find(mem.device_pointer)); - mem.device_pointer = 0; + CUDAContextScope scope(this); + + CUfunction cuFilterConstructTransform; + cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform")); + cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED)); + CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, + task->storage.w, + task->storage.h); + + void *args[] = {&task->buffer.mem.device_pointer, + &task->storage.transform.device_pointer, + &task->storage.rank.device_pointer, + &task->filter_area, + &task->rect, + &task->radius, + &task->pca_threshold, + &task->buffer.pass_stride}; + CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args); + cuda_assert(cuCtxSynchronize()); - stats.mem_free(mem.device_size); - mem.device_size = 0; - } - else { - tex_interp_map.erase(tex_interp_map.find(mem.device_pointer)); - mem_free(mem); - } + return !have_error(); + } + + bool denoising_reconstruct(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr output_ptr, + DenoisingTask *task) + { + if(have_error()) + return false; + + CUDAContextScope scope(this); + + mem_zero(task->storage.XtWX); + mem_zero(task->storage.XtWY); + + int r = task->radius; + int f = 4; + float a = 1.0f; + float k_2 = task->nlm_k_2; + + int w = task->reconstruction_state.source_w; + int h = task->reconstruction_state.source_h; + int stride = task->buffer.stride; + + int shift_stride = stride*h; + int num_shifts = (2*r+1)*(2*r+1); + int mem_size = sizeof(float)*shift_stride*num_shifts; + + device_only_memory<uchar> temporary_mem(this, "Denoising temporary_mem"); + temporary_mem.alloc_to_device(2*mem_size); + + if(have_error()) + return false; + + CUdeviceptr difference = cuda_device_ptr(temporary_mem.device_pointer); + CUdeviceptr blurDifference = difference + mem_size; + + { + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian; + cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); + + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); + + CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, + task->reconstruction_state.source_w * task->reconstruction_state.source_h, + num_shifts); + + void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &shift_stride, &r, &task->buffer.pass_stride, &a, &k_2}; + void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &shift_stride, &r, &f}; + void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &shift_stride, &r, &f}; + void *construct_gramian_args[] = {&blurDifference, + &task->buffer.mem.device_pointer, + &task->storage.transform.device_pointer, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &task->reconstruction_state.filter_window, + &w, &h, &stride, + &shift_stride, &r, + &f, + &task->buffer.pass_stride}; + + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args); } + + temporary_mem.free(); + + { + CUfunction cuFinalize; + cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); + cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); + void *finalize_args[] = {&output_ptr, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &task->filter_area, + &task->reconstruction_state.buffer_params.x, + &task->render_buffer.samples}; + CUDA_GET_BLOCKSIZE(cuFinalize, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); + } + + cuda_assert(cuCtxSynchronize()); + + return !have_error(); } - void path_trace(RenderTile& rtile, int sample, bool branched) + bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr, + device_ptr mean_ptr, device_ptr variance_ptr, + int r, int4 rect, DenoisingTask *task) { if(have_error()) - return; + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterCombineHalves; + cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves")); + cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterCombineHalves, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + void *args[] = {&mean_ptr, + &variance_ptr, + &a_ptr, + &b_ptr, + &rect, + &r}; + CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + + bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr, + device_ptr sample_variance_ptr, device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, DenoisingTask *task) + { + if(have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterDivideShadow; + cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow")); + cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterDivideShadow, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + void *args[] = {&task->render_buffer.samples, + &task->tiles_mem.device_pointer, + &a_ptr, + &b_ptr, + &sample_variance_ptr, + &sv_variance_ptr, + &buffer_variance_ptr, + &task->rect, + &task->render_buffer.pass_stride, + &task->render_buffer.denoising_data_offset}; + CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args); + cuda_assert(cuCtxSynchronize()); - cuda_push_context(); + return !have_error(); + } + + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + DenoisingTask *task) + { + if(have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterGetFeature; + cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature")); + cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterGetFeature, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + void *args[] = {&task->render_buffer.samples, + &task->tiles_mem.device_pointer, + &mean_offset, + &variance_offset, + &mean_ptr, + &variance_ptr, + &task->rect, + &task->render_buffer.pass_stride, + &task->render_buffer.denoising_data_offset}; + CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + bool denoising_detect_outliers(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr, + DenoisingTask *task) + { + if(have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterDetectOutliers; + cuda_assert(cuModuleGetFunction(&cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers")); + cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterDetectOutliers, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + void *args[] = {&image_ptr, + &variance_ptr, + &depth_ptr, + &output_ptr, + &task->rect, + &task->buffer.pass_stride}; + + CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + + void denoise(RenderTile &rtile, DenoisingTask& denoising, const DeviceTask &task) + { + denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising); + denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, &denoising); + denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + denoising.functions.detect_outliers = function_bind(&CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); + denoising.functions.set_tiles = function_bind(&CUDADevice::denoising_set_tiles, this, _1, &denoising); + + denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); + denoising.render_buffer.samples = rtile.sample; + + RenderTile rtiles[9]; + rtiles[4] = rtile; + task.map_neighbor_tiles(rtiles, this); + denoising.tiles_from_rendertiles(rtiles); + + denoising.init_from_devicetask(task); + + denoising.run_denoising(); + + task.unmap_neighbor_tiles(rtiles, this); + } + + void path_trace(DeviceTask& task, RenderTile& rtile, device_vector<WorkTile>& work_tiles) + { + scoped_timer timer(&rtile.buffers->render_time); + + if(have_error()) + return; + + CUDAContextScope scope(this); CUfunction cuPathTrace; - CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer); - CUdeviceptr d_rng_state = cuda_device_ptr(rtile.rng_state); - /* get kernel function */ - if(branched) { + /* Get kernel function. */ + if(task.integrator_branched) { cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace")); } else { cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace")); } - if(have_error()) + if(have_error()) { return; + } - /* pass in parameters */ - void *args[] = {&d_buffer, - &d_rng_state, - &sample, - &rtile.x, - &rtile.y, - &rtile.w, - &rtile.h, - &rtile.offset, - &rtile.stride}; + cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)); - /* launch kernel */ - int threads_per_block; - cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuPathTrace)); + /* Allocate work tile. */ + work_tiles.alloc(1); + + WorkTile *wtile = work_tiles.data(); + wtile->x = rtile.x; + wtile->y = rtile.y; + wtile->w = rtile.w; + wtile->h = rtile.h; + wtile->offset = rtile.offset; + wtile->stride = rtile.stride; + wtile->buffer = (float*)cuda_device_ptr(rtile.buffer); + + /* Prepare work size. More step samples render faster, but for now we + * remain conservative for GPUs connected to a display to avoid driver + * timeouts and display freezing. */ + int min_blocks, num_threads_per_block; + cuda_assert(cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0)); + if(!info.display_device) { + min_blocks *= 8; + } - /*int num_registers; - cuda_assert(cuFuncGetAttribute(&num_registers, CU_FUNC_ATTRIBUTE_NUM_REGS, cuPathTrace)); + uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); - printf("threads_per_block %d\n", threads_per_block); - printf("num_registers %d\n", num_registers);*/ + /* Render all samples. */ + int start_sample = rtile.start_sample; + int end_sample = rtile.start_sample + rtile.num_samples; - int xthreads = (int)sqrt(threads_per_block); - int ythreads = (int)sqrt(threads_per_block); - int xblocks = (rtile.w + xthreads - 1)/xthreads; - int yblocks = (rtile.h + ythreads - 1)/ythreads; + for(int sample = start_sample; sample < end_sample; sample += step_samples) { + /* Setup and copy work tile to device. */ + wtile->start_sample = sample; + wtile->num_samples = min(step_samples, end_sample - sample);; + work_tiles.copy_to_device(); - cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)); + CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer); + uint total_work_size = wtile->w * wtile->h * wtile->num_samples; + uint num_blocks = divide_up(total_work_size, num_threads_per_block); - cuda_assert(cuLaunchKernel(cuPathTrace, - xblocks , yblocks, 1, /* blocks */ - xthreads, ythreads, 1, /* threads */ - 0, 0, args, 0)); + /* Launch kernel. */ + void *args[] = {&d_work_tiles, + &total_work_size}; - cuda_assert(cuCtxSynchronize()); + cuda_assert(cuLaunchKernel(cuPathTrace, + num_blocks, 1, 1, + num_threads_per_block, 1, 1, + 0, 0, args, 0)); + + cuda_assert(cuCtxSynchronize()); - cuda_pop_context(); + /* Update progress. */ + rtile.sample = sample + wtile->num_samples; + task.update_progress(&rtile, rtile.w*rtile.h*wtile->num_samples); + + if(task.get_cancel()) { + if(task.need_finish_queue == false) + break; + } + } } void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half) @@ -891,7 +1727,7 @@ public: if(have_error()) return; - cuda_push_context(); + CUDAContextScope scope(this); CUfunction cuFilmConvert; CUdeviceptr d_rgba = map_pixels((rgba_byte)? rgba_byte: rgba_half); @@ -937,7 +1773,7 @@ public: unmap_pixels((rgba_byte)? rgba_byte: rgba_half); - cuda_pop_context(); + cuda_assert(cuCtxSynchronize()); } void shader(DeviceTask& task) @@ -945,19 +1781,21 @@ public: if(have_error()) return; - cuda_push_context(); + CUDAContextScope scope(this); CUfunction cuShader; CUdeviceptr d_input = cuda_device_ptr(task.shader_input); CUdeviceptr d_output = cuda_device_ptr(task.shader_output); - CUdeviceptr d_output_luma = cuda_device_ptr(task.shader_output_luma); /* get kernel function */ if(task.shader_eval_type >= SHADER_EVAL_BAKE) { cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake")); } + else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) { + cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace")); + } else { - cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_shader")); + cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background")); } /* do tasks in smaller chunks, so we can cancel it */ @@ -976,9 +1814,6 @@ public: int arg = 0; args[arg++] = &d_input; args[arg++] = &d_output; - if(task.shader_eval_type < SHADER_EVAL_BAKE) { - args[arg++] = &d_output_luma; - } args[arg++] = &task.shader_eval_type; if(task.shader_eval_type >= SHADER_EVAL_BAKE) { args[arg++] = &task.shader_filter; @@ -1010,8 +1845,6 @@ public: task.update_progress(NULL); } - - cuda_pop_context(); } CUdeviceptr map_pixels(device_ptr mem) @@ -1041,117 +1874,95 @@ public: void pixels_alloc(device_memory& mem) { - if(!background) { - PixelMem pmem; + PixelMem pmem; - pmem.w = mem.data_width; - pmem.h = mem.data_height; + pmem.w = mem.data_width; + pmem.h = mem.data_height; - cuda_push_context(); - - glGenBuffers(1, &pmem.cuPBO); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - if(mem.data_type == TYPE_HALF) - glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLhalf)*4, NULL, GL_DYNAMIC_DRAW); - else - glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(uint8_t)*4, NULL, GL_DYNAMIC_DRAW); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - - glGenTextures(1, &pmem.cuTexId); - glBindTexture(GL_TEXTURE_2D, pmem.cuTexId); - if(mem.data_type == TYPE_HALF) - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F_ARB, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL); - else - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - glBindTexture(GL_TEXTURE_2D, 0); + CUDAContextScope scope(this); - CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE); + glGenBuffers(1, &pmem.cuPBO); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); + if(mem.data_type == TYPE_HALF) + glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLhalf)*4, NULL, GL_DYNAMIC_DRAW); + else + glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(uint8_t)*4, NULL, GL_DYNAMIC_DRAW); - if(result == CUDA_SUCCESS) { - cuda_pop_context(); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - mem.device_pointer = pmem.cuTexId; - pixel_mem_map[mem.device_pointer] = pmem; + glGenTextures(1, &pmem.cuTexId); + glBindTexture(GL_TEXTURE_2D, pmem.cuTexId); + if(mem.data_type == TYPE_HALF) + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F_ARB, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL); + else + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + glBindTexture(GL_TEXTURE_2D, 0); - mem.device_size = mem.memory_size(); - stats.mem_alloc(mem.device_size); + CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE); - return; - } - else { - /* failed to register buffer, fallback to no interop */ - glDeleteBuffers(1, &pmem.cuPBO); - glDeleteTextures(1, &pmem.cuTexId); + if(result == CUDA_SUCCESS) { + mem.device_pointer = pmem.cuTexId; + pixel_mem_map[mem.device_pointer] = pmem; - cuda_pop_context(); + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); - background = true; - } + return; } + else { + /* failed to register buffer, fallback to no interop */ + glDeleteBuffers(1, &pmem.cuPBO); + glDeleteTextures(1, &pmem.cuTexId); - Device::pixels_alloc(mem); + background = true; + } } void pixels_copy_from(device_memory& mem, int y, int w, int h) { - if(!background) { - PixelMem pmem = pixel_mem_map[mem.device_pointer]; - - cuda_push_context(); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY); - size_t offset = sizeof(uchar)*4*y*w; - memcpy((uchar*)mem.data_pointer + offset, pixels + offset, sizeof(uchar)*4*w*h); - glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - - cuda_pop_context(); + PixelMem pmem = pixel_mem_map[mem.device_pointer]; - return; - } + CUDAContextScope scope(this); - Device::pixels_copy_from(mem, y, w, h); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); + uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY); + size_t offset = sizeof(uchar)*4*y*w; + memcpy((uchar*)mem.host_pointer + offset, pixels + offset, sizeof(uchar)*4*w*h); + glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); } void pixels_free(device_memory& mem) { if(mem.device_pointer) { - if(!background) { - PixelMem pmem = pixel_mem_map[mem.device_pointer]; - - cuda_push_context(); - - cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource)); - glDeleteBuffers(1, &pmem.cuPBO); - glDeleteTextures(1, &pmem.cuTexId); - - cuda_pop_context(); + PixelMem pmem = pixel_mem_map[mem.device_pointer]; - pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer)); - mem.device_pointer = 0; + CUDAContextScope scope(this); - stats.mem_free(mem.device_size); - mem.device_size = 0; + cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource)); + glDeleteBuffers(1, &pmem.cuPBO); + glDeleteTextures(1, &pmem.cuTexId); - return; - } + pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer)); + mem.device_pointer = 0; - Device::pixels_free(mem); + stats.mem_free(mem.device_size); + mem.device_size = 0; } } void draw_pixels(device_memory& mem, int y, int w, int h, int dx, int dy, int width, int height, bool transparent, const DeviceDrawParams &draw_params) { + assert(mem.type == MEM_PIXELS); + if(!background) { PixelMem pmem = pixel_mem_map[mem.device_pointer]; float *vpointer; - cuda_push_context(); + CUDAContextScope scope(this); /* for multi devices, this assumes the inefficient method that we allocate * all pixels on the device even though we only render to a subset */ @@ -1240,8 +2051,6 @@ public: glBindTexture(GL_TEXTURE_2D, 0); glDisable(GL_TEXTURE_2D); - cuda_pop_context(); - return; } @@ -1250,44 +2059,55 @@ public: void thread_run(DeviceTask *task) { - if(task->type == DeviceTask::PATH_TRACE) { - RenderTile tile; - - bool branched = task->integrator_branched; + CUDAContextScope scope(this); + + if(task->type == DeviceTask::RENDER) { + DeviceRequestedFeatures requested_features; + if(use_split_kernel()) { + if(split_kernel == NULL) { + split_kernel = new CUDASplitKernel(this); + split_kernel->load_kernels(requested_features); + } + } - /* Upload Bindless Mapping */ - load_bindless_mapping(); + device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); /* keep rendering tiles until done */ - while(task->acquire_tile(this, tile)) { - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; + RenderTile tile; + DenoisingTask denoising(this); - for(int sample = start_sample; sample < end_sample; sample++) { - if(task->get_cancel()) { - if(task->need_finish_queue == false) - break; + while(task->acquire_tile(this, tile)) { + if(tile.task == RenderTile::PATH_TRACE) { + if(use_split_kernel()) { + device_only_memory<uchar> void_buffer(this, "void_buffer"); + split_kernel->path_trace(task, tile, void_buffer, void_buffer); } + else { + path_trace(*task, tile, work_tiles); + } + } + else if(tile.task == RenderTile::DENOISE) { + tile.sample = tile.start_sample + tile.num_samples; - path_trace(tile, sample, branched); - - tile.sample = sample + 1; + denoise(tile, denoising, *task); task->update_progress(&tile, tile.w*tile.h); } task->release_tile(tile); + + if(task->get_cancel()) { + if(task->need_finish_queue == false) + break; + } } + + work_tiles.free(); } else if(task->type == DeviceTask::SHADER) { - /* Upload Bindless Mapping */ - load_bindless_mapping(); - shader(*task); - cuda_push_context(); cuda_assert(cuCtxSynchronize()); - cuda_pop_context(); } } @@ -1307,13 +2127,17 @@ public: void task_add(DeviceTask& task) { + CUDAContextScope scope(this); + + /* Load texture info. */ + load_texture_info(); + + /* Synchronize all memory copies before executing task. */ + cuda_assert(cuCtxSynchronize()); + if(task.type == DeviceTask::FILM_CONVERT) { /* must be done in main thread due to opengl access */ film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); - - cuda_push_context(); - cuda_assert(cuCtxSynchronize()); - cuda_pop_context(); } else { task_pool.push(new CUDADeviceTask(this, task)); @@ -1329,8 +2153,237 @@ public: { task_pool.cancel(); } + + friend class CUDASplitKernelFunction; + friend class CUDASplitKernel; + friend class CUDAContextScope; }; +/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class + * now that the definition of that class is complete + */ +#undef cuda_assert +#define cuda_assert(stmt) \ + { \ + CUresult result = stmt; \ + \ + if(result != CUDA_SUCCESS) { \ + string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \ + if(device->error_msg == "") \ + device->error_msg = message; \ + fprintf(stderr, "%s\n", message.c_str()); \ + /*cuda_abort();*/ \ + device->cuda_error_documentation(); \ + } \ + } (void)0 + + +/* CUDA context scope. */ + +CUDAContextScope::CUDAContextScope(CUDADevice *device) +: device(device) +{ + cuda_assert(cuCtxPushCurrent(device->cuContext)); +} + +CUDAContextScope::~CUDAContextScope() +{ + cuda_assert(cuCtxPopCurrent(NULL)); +} + +/* split kernel */ + +class CUDASplitKernelFunction : public SplitKernelFunction{ + CUDADevice* device; + CUfunction func; +public: + CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) {} + + /* enqueue the kernel, returns false if there is an error */ + bool enqueue(const KernelDimensions &dim, device_memory &/*kg*/, device_memory &/*data*/) + { + return enqueue(dim, NULL); + } + + /* enqueue the kernel, returns false if there is an error */ + bool enqueue(const KernelDimensions &dim, void *args[]) + { + if(device->have_error()) + return false; + + CUDAContextScope scope(device); + + /* we ignore dim.local_size for now, as this is faster */ + int threads_per_block; + cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); + + int xblocks = (dim.global_size[0]*dim.global_size[1] + threads_per_block - 1)/threads_per_block; + + cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); + + cuda_assert(cuLaunchKernel(func, + xblocks, 1, 1, /* blocks */ + threads_per_block, 1, 1, /* threads */ + 0, 0, args, 0)); + + return !device->have_error(); + } +}; + +CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device) +{ +} + +uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads) +{ + CUDAContextScope scope(device); + + device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE); + size_buffer.alloc(1); + size_buffer.zero_to_device(); + + uint threads = num_threads; + CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer); + + struct args_t { + uint* num_threads; + CUdeviceptr* size; + }; + + args_t args = { + &threads, + &d_size + }; + + CUfunction state_buffer_size; + cuda_assert(cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size")); + + cuda_assert(cuLaunchKernel(state_buffer_size, + 1, 1, 1, + 1, 1, 1, + 0, 0, (void**)&args, 0)); + + size_buffer.copy_from_device(0, 1, 1); + size_t size = size_buffer[0]; + size_buffer.free(); + + return size; +} + +bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& /*kernel_globals*/, + device_memory& /*kernel_data*/, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flag, + device_memory& work_pool_wgs) +{ + CUDAContextScope scope(device); + + CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer); + CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer); + CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer); + CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer); + CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer); + + CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer); + + int end_sample = rtile.start_sample + rtile.num_samples; + int queue_size = dim.global_size[0] * dim.global_size[1]; + + struct args_t { + CUdeviceptr* split_data_buffer; + int* num_elements; + CUdeviceptr* ray_state; + int* start_sample; + int* end_sample; + int* sx; + int* sy; + int* sw; + int* sh; + int* offset; + int* stride; + CUdeviceptr* queue_index; + int* queuesize; + CUdeviceptr* use_queues_flag; + CUdeviceptr* work_pool_wgs; + int* num_samples; + CUdeviceptr* buffer; + }; + + args_t args = { + &d_split_data, + &num_global_elements, + &d_ray_state, + &rtile.start_sample, + &end_sample, + &rtile.x, + &rtile.y, + &rtile.w, + &rtile.h, + &rtile.offset, + &rtile.stride, + &d_queue_index, + &queue_size, + &d_use_queues_flag, + &d_work_pool_wgs, + &rtile.num_samples, + &d_buffer + }; + + CUfunction data_init; + cuda_assert(cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init")); + if(device->have_error()) { + return false; + } + + CUDASplitKernelFunction(device, data_init).enqueue(dim, (void**)&args); + + return !device->have_error(); +} + +SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(const string& kernel_name, + const DeviceRequestedFeatures&) +{ + CUDAContextScope scope(device); + CUfunction func; + + cuda_assert(cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data())); + if(device->have_error()) { + device->cuda_error_message(string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data())); + return NULL; + } + + return new CUDASplitKernelFunction(device, func); +} + +int2 CUDASplitKernel::split_kernel_local_size() +{ + return make_int2(32, 1); +} + +int2 CUDASplitKernel::split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/) +{ + CUDAContextScope scope(device); + size_t free; + size_t total; + + cuda_assert(cuMemGetInfo(&free, &total)); + + VLOG(1) << "Maximum device allocation size: " + << string_human_readable_number(free) << " bytes. (" + << string_human_readable_size(free) << ")."; + + size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2); + size_t side = round_down((int)sqrt(num_elements), 32); + int2 global_size = make_int2(side, round_down(num_elements / side, 16)); + VLOG(1) << "Global size: " << global_size << "."; + return global_size; +} + bool device_cuda_init(void) { #ifdef WITH_CUDA_DYNLOAD @@ -1341,7 +2394,7 @@ bool device_cuda_init(void) return result; initialized = true; - int cuew_result = cuewInit(); + int cuew_result = cuewInit(CUEW_INIT_CUDA); if(cuew_result == CUEW_SUCCESS) { VLOG(1) << "CUEW initialization succeeded"; if(CUDADevice::have_precompiled_kernels()) { @@ -1377,18 +2430,34 @@ Device *device_cuda_create(DeviceInfo& info, Stats &stats, bool background) return new CUDADevice(info, stats, background); } -void device_cuda_info(vector<DeviceInfo>& devices) +static CUresult device_cuda_safe_init() { - CUresult result; - int count = 0; +#ifdef _WIN32 + __try { + return cuInit(0); + } + __except(EXCEPTION_EXECUTE_HANDLER) { + /* Ignore crashes inside the CUDA driver and hope we can + * survive even with corrupted CUDA installs. */ + fprintf(stderr, "Cycles CUDA: driver crashed, continuing without CUDA.\n"); + } + + return CUDA_ERROR_NO_DEVICE; +#else + return cuInit(0); +#endif +} - result = cuInit(0); +void device_cuda_info(vector<DeviceInfo>& devices) +{ + CUresult result = device_cuda_safe_init(); if(result != CUDA_SUCCESS) { if(result != CUDA_ERROR_NO_DEVICE) fprintf(stderr, "CUDA cuInit: %s\n", cuewErrorString(result)); return; } + int count = 0; result = cuDeviceGetCount(&count); if(result != CUDA_SUCCESS) { fprintf(stderr, "CUDA cuDeviceGetCount: %s\n", cuewErrorString(result)); @@ -1399,14 +2468,18 @@ void device_cuda_info(vector<DeviceInfo>& devices) for(int num = 0; num < count; num++) { char name[256]; - int attr; - if(cuDeviceGetName(name, 256, num) != CUDA_SUCCESS) + result = cuDeviceGetName(name, 256, num); + if(result != CUDA_SUCCESS) { + fprintf(stderr, "CUDA cuDeviceGetName: %s\n", cuewErrorString(result)); continue; + } int major; cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, num); - if(major < 2) { + if(major < 3) { + VLOG(1) << "Ignoring device \"" << name + << "\", this graphics card is no longer supported."; continue; } @@ -1416,9 +2489,10 @@ void device_cuda_info(vector<DeviceInfo>& devices) info.description = string(name); info.num = num; - info.advanced_shading = (major >= 2); - info.has_bindless_textures = (major >= 3); - info.pack_images = false; + info.advanced_shading = (major >= 3); + info.has_half_images = (major >= 3); + info.has_volume_decoupled = false; + info.bvh_layout_mask = BVH_LAYOUT_BVH2; int pci_location[3] = {0, 0, 0}; cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num); @@ -1430,14 +2504,23 @@ void device_cuda_info(vector<DeviceInfo>& devices) (unsigned int)pci_location[1], (unsigned int)pci_location[2]); - /* if device has a kernel timeout, assume it is used for display */ - if(cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num) == CUDA_SUCCESS && attr == 1) { + /* If device has a kernel timeout and no compute preemption, we assume + * it is connected to a display and will freeze the display while doing + * computations. */ + int timeout_attr = 0, preempt_attr = 0; + cuDeviceGetAttribute(&timeout_attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num); + cuDeviceGetAttribute(&preempt_attr, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, num); + + if(timeout_attr && !preempt_attr) { + VLOG(1) << "Device is recognized as display."; info.description += " (Display)"; info.display_device = true; display_devices.push_back(info); } - else + else { devices.push_back(info); + } + VLOG(1) << "Added device \"" << name << "\" with id \"" << info.id << "\"."; } if(!display_devices.empty()) @@ -1446,7 +2529,7 @@ void device_cuda_info(vector<DeviceInfo>& devices) string device_cuda_capabilities(void) { - CUresult result = cuInit(0); + CUresult result = device_cuda_safe_init(); if(result != CUDA_SUCCESS) { if(result != CUDA_ERROR_NO_DEVICE) { return string("Error initializing CUDA: ") + cuewErrorString(result); diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp new file mode 100644 index 00000000000..644cf6cd10e --- /dev/null +++ b/intern/cycles/device/device_denoising.cpp @@ -0,0 +1,234 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/device_denoising.h" + +#include "kernel/filter/filter_defines.h" + +CCL_NAMESPACE_BEGIN + +DenoisingTask::DenoisingTask(Device *device) +: tiles_mem(device, "denoising tiles_mem", MEM_READ_WRITE), + storage(device), + buffer(device), + device(device) +{ +} + +DenoisingTask::~DenoisingTask() +{ + storage.XtWX.free(); + storage.XtWY.free(); + storage.transform.free(); + storage.rank.free(); + storage.temporary_1.free(); + storage.temporary_2.free(); + storage.temporary_color.free(); + buffer.mem.free(); + tiles_mem.free(); +} + +void DenoisingTask::init_from_devicetask(const DeviceTask &task) +{ + radius = task.denoising_radius; + nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising_strength)); + if(task.denoising_relative_pca) { + pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising_feature_strength)); + } + else { + pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising_feature_strength)); + } + + render_buffer.pass_stride = task.pass_stride; + render_buffer.denoising_data_offset = task.pass_denoising_data; + render_buffer.denoising_clean_offset = task.pass_denoising_clean; + + /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */ + rect = rect_from_shape(filter_area.x, filter_area.y, filter_area.z, filter_area.w); + rect = rect_expand(rect, radius); + rect = rect_clip(rect, make_int4(tiles->x[0], tiles->y[0], tiles->x[3], tiles->y[3])); +} + +void DenoisingTask::tiles_from_rendertiles(RenderTile *rtiles) +{ + tiles = (TilesInfo*) tiles_mem.alloc(sizeof(TilesInfo)/sizeof(int)); + + device_ptr buffers[9]; + for(int i = 0; i < 9; i++) { + buffers[i] = rtiles[i].buffer; + tiles->offsets[i] = rtiles[i].offset; + tiles->strides[i] = rtiles[i].stride; + } + tiles->x[0] = rtiles[3].x; + tiles->x[1] = rtiles[4].x; + tiles->x[2] = rtiles[5].x; + tiles->x[3] = rtiles[5].x + rtiles[5].w; + tiles->y[0] = rtiles[1].y; + tiles->y[1] = rtiles[4].y; + tiles->y[2] = rtiles[7].y; + tiles->y[3] = rtiles[7].y + rtiles[7].h; + + render_buffer.offset = rtiles[4].offset; + render_buffer.stride = rtiles[4].stride; + render_buffer.ptr = rtiles[4].buffer; + + functions.set_tiles(buffers); +} + +bool DenoisingTask::run_denoising() +{ + /* Allocate denoising buffer. */ + buffer.passes = 14; + buffer.width = rect.z - rect.x; + buffer.stride = align_up(buffer.width, 4); + buffer.h = rect.w - rect.y; + buffer.pass_stride = align_up(buffer.stride * buffer.h, divide_up(device->mem_sub_ptr_alignment(), sizeof(float))); + buffer.mem.alloc_to_device(buffer.pass_stride * buffer.passes, false); + + device_ptr null_ptr = (device_ptr) 0; + + /* Prefilter shadow feature. */ + { + device_sub_ptr unfiltered_a (buffer.mem, 0, buffer.pass_stride); + device_sub_ptr unfiltered_b (buffer.mem, 1*buffer.pass_stride, buffer.pass_stride); + device_sub_ptr sample_var (buffer.mem, 2*buffer.pass_stride, buffer.pass_stride); + device_sub_ptr sample_var_var (buffer.mem, 3*buffer.pass_stride, buffer.pass_stride); + device_sub_ptr buffer_var (buffer.mem, 5*buffer.pass_stride, buffer.pass_stride); + device_sub_ptr filtered_var (buffer.mem, 6*buffer.pass_stride, buffer.pass_stride); + device_sub_ptr nlm_temporary_1(buffer.mem, 7*buffer.pass_stride, buffer.pass_stride); + device_sub_ptr nlm_temporary_2(buffer.mem, 8*buffer.pass_stride, buffer.pass_stride); + device_sub_ptr nlm_temporary_3(buffer.mem, 9*buffer.pass_stride, buffer.pass_stride); + + nlm_state.temporary_1_ptr = *nlm_temporary_1; + nlm_state.temporary_2_ptr = *nlm_temporary_2; + nlm_state.temporary_3_ptr = *nlm_temporary_3; + + /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */ + functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var); + + /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */ + nlm_state.set_parameters(6, 3, 4.0f, 1.0f); + functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var); + + /* Reuse memory, the previous data isn't needed anymore. */ + device_ptr filtered_a = *buffer_var, + filtered_b = *sample_var; + /* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */ + nlm_state.set_parameters(5, 3, 1.0f, 0.25f); + functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a); + functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b); + + device_ptr residual_var = *sample_var_var; + /* Estimate the residual variance between the two filtered halves. */ + functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect); + + device_ptr final_a = *unfiltered_a, + final_b = *unfiltered_b; + /* Use the residual variance for a second filter pass. */ + nlm_state.set_parameters(4, 2, 1.0f, 0.5f); + functions.non_local_means(filtered_a, filtered_b, residual_var, final_a); + functions.non_local_means(filtered_b, filtered_a, residual_var, final_b); + + /* Combine the two double-filtered halves to a final shadow feature. */ + device_sub_ptr shadow_pass(buffer.mem, 4*buffer.pass_stride, buffer.pass_stride); + functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect); + } + + /* Prefilter general features. */ + { + device_sub_ptr unfiltered (buffer.mem, 8*buffer.pass_stride, buffer.pass_stride); + device_sub_ptr variance (buffer.mem, 9*buffer.pass_stride, buffer.pass_stride); + device_sub_ptr nlm_temporary_1(buffer.mem, 10*buffer.pass_stride, buffer.pass_stride); + device_sub_ptr nlm_temporary_2(buffer.mem, 11*buffer.pass_stride, buffer.pass_stride); + device_sub_ptr nlm_temporary_3(buffer.mem, 12*buffer.pass_stride, buffer.pass_stride); + + nlm_state.temporary_1_ptr = *nlm_temporary_1; + nlm_state.temporary_2_ptr = *nlm_temporary_2; + nlm_state.temporary_3_ptr = *nlm_temporary_3; + + int mean_from[] = { 0, 1, 2, 12, 6, 7, 8 }; + int variance_from[] = { 3, 4, 5, 13, 9, 10, 11}; + int pass_to[] = { 1, 2, 3, 0, 5, 6, 7}; + for(int pass = 0; pass < 7; pass++) { + device_sub_ptr feature_pass(buffer.mem, pass_to[pass]*buffer.pass_stride, buffer.pass_stride); + /* Get the unfiltered pass and its variance from the RenderBuffers. */ + functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance); + /* Smooth the pass and store the result in the denoising buffers. */ + nlm_state.set_parameters(2, 2, 1.0f, 0.25f); + functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass); + } + } + + /* Copy color passes. */ + { + int mean_from[] = {20, 21, 22}; + int variance_from[] = {23, 24, 25}; + int mean_to[] = { 8, 9, 10}; + int variance_to[] = {11, 12, 13}; + int num_color_passes = 3; + + storage.temporary_color.alloc_to_device(3*buffer.pass_stride, false); + + for(int pass = 0; pass < num_color_passes; pass++) { + device_sub_ptr color_pass(storage.temporary_color, pass*buffer.pass_stride, buffer.pass_stride); + device_sub_ptr color_var_pass(buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride); + functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass); + } + + { + device_sub_ptr depth_pass (buffer.mem, 0, buffer.pass_stride); + device_sub_ptr color_var_pass(buffer.mem, variance_to[0]*buffer.pass_stride, 3*buffer.pass_stride); + device_sub_ptr output_pass (buffer.mem, mean_to[0]*buffer.pass_stride, 3*buffer.pass_stride); + functions.detect_outliers(storage.temporary_color.device_pointer, *color_var_pass, *depth_pass, *output_pass); + } + } + + storage.w = filter_area.z; + storage.h = filter_area.w; + storage.transform.alloc_to_device(storage.w*storage.h*TRANSFORM_SIZE, false); + storage.rank.alloc_to_device(storage.w*storage.h, false); + + functions.construct_transform(); + + device_only_memory<float> temporary_1(device, "Denoising NLM temporary 1"); + device_only_memory<float> temporary_2(device, "Denoising NLM temporary 2"); + temporary_1.alloc_to_device(buffer.pass_stride, false); + temporary_2.alloc_to_device(buffer.pass_stride, false); + reconstruction_state.temporary_1_ptr = temporary_1.device_pointer; + reconstruction_state.temporary_2_ptr = temporary_2.device_pointer; + + storage.XtWX.alloc_to_device(storage.w*storage.h*XTWX_SIZE, false); + storage.XtWY.alloc_to_device(storage.w*storage.h*XTWY_SIZE, false); + + reconstruction_state.filter_window = rect_from_shape(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h); + int tile_coordinate_offset = filter_area.y*render_buffer.stride + filter_area.x; + reconstruction_state.buffer_params = make_int4(render_buffer.offset + tile_coordinate_offset, + render_buffer.stride, + render_buffer.pass_stride, + render_buffer.denoising_clean_offset); + reconstruction_state.source_w = rect.z-rect.x; + reconstruction_state.source_h = rect.w-rect.y; + + { + device_sub_ptr color_ptr (buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride); + device_sub_ptr color_var_ptr(buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride); + functions.reconstruct(*color_ptr, *color_var_ptr, render_buffer.ptr); + } + + return true; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h new file mode 100644 index 00000000000..77a82d0ad04 --- /dev/null +++ b/intern/cycles/device/device_denoising.h @@ -0,0 +1,167 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DEVICE_DENOISING_H__ +#define __DEVICE_DENOISING_H__ + +#include "device/device.h" + +#include "render/buffers.h" + +#include "kernel/filter/filter_defines.h" + +CCL_NAMESPACE_BEGIN + +class DenoisingTask { +public: + /* Parameters of the denoising algorithm. */ + int radius; + float nlm_k_2; + float pca_threshold; + + /* Pointer and parameters of the RenderBuffers. */ + struct RenderBuffers { + int denoising_data_offset; + int denoising_clean_offset; + int pass_stride; + int offset; + int stride; + device_ptr ptr; + int samples; + } render_buffer; + + TilesInfo *tiles; + device_vector<int> tiles_mem; + void tiles_from_rendertiles(RenderTile *rtiles); + + int4 rect; + int4 filter_area; + + struct DeviceFunctions { + function<bool(device_ptr image_ptr, /* Contains the values that are smoothed. */ + device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */ + device_ptr variance_ptr, /* Contains the variance of the guide image. */ + device_ptr out_ptr /* The filtered output is written into this image. */ + )> non_local_means; + function<bool(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr output_ptr + )> reconstruct; + function<bool()> construct_transform; + + function<bool(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, + int4 rect + )> combine_halves; + function<bool(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr + )> divide_shadow; + function<bool(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr + )> get_feature; + function<bool(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr + )> detect_outliers; + function<bool(device_ptr*)> set_tiles; + } functions; + + /* Stores state of the current Reconstruction operation, + * which is accessed by the device in order to perform the operation. */ + struct ReconstructionState { + device_ptr temporary_1_ptr; /* There two images are used as temporary storage. */ + device_ptr temporary_2_ptr; + + int4 filter_window; + int4 buffer_params; + + int source_w; + int source_h; + } reconstruction_state; + + /* Stores state of the current NLM operation, + * which is accessed by the device in order to perform the operation. */ + struct NLMState { + device_ptr temporary_1_ptr; /* There three images are used as temporary storage. */ + device_ptr temporary_2_ptr; + device_ptr temporary_3_ptr; + + int r; /* Search radius of the filter. */ + int f; /* Patch size of the filter. */ + float a; /* Variance compensation factor in the MSE estimation. */ + float k_2; /* Squared value of the k parameter of the filter. */ + + void set_parameters(int r_, int f_, float a_, float k_2_) { r = r_; f = f_; a = a_, k_2 = k_2_; } + } nlm_state; + + struct Storage { + device_only_memory<float> transform; + device_only_memory<int> rank; + device_only_memory<float> XtWX; + device_only_memory<float3> XtWY; + device_only_memory<float> temporary_1; + device_only_memory<float> temporary_2; + device_only_memory<float> temporary_color; + int w; + int h; + + Storage(Device *device) + : transform(device, "denoising transform"), + rank(device, "denoising rank"), + XtWX(device, "denoising XtWX"), + XtWY(device, "denoising XtWY"), + temporary_1(device, "denoising NLM temporary 1"), + temporary_2(device, "denoising NLM temporary 2"), + temporary_color(device, "denoising temporary color") + {} + } storage; + + DenoisingTask(Device *device); + ~DenoisingTask(); + + void init_from_devicetask(const DeviceTask &task); + + bool run_denoising(); + + struct DenoiseBuffers { + int pass_stride; + int passes; + int stride; + int h; + int width; + device_only_memory<float> mem; + + DenoiseBuffers(Device *device) + : mem(device, "denoising pixel buffer") + {} + } buffer; + +protected: + Device *device; +}; + +CCL_NAMESPACE_END + +#endif /* __DEVICE_DENOISING_H__ */ diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp new file mode 100644 index 00000000000..c6248fcf88b --- /dev/null +++ b/intern/cycles/device/device_memory.cpp @@ -0,0 +1,121 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/device.h" +#include "device/device_memory.h" + +CCL_NAMESPACE_BEGIN + +/* Device Memory */ + +device_memory::device_memory(Device *device, const char *name, MemoryType type) +: data_type(device_type_traits<uchar>::data_type), + data_elements(device_type_traits<uchar>::num_elements), + data_size(0), + device_size(0), + data_width(0), + data_height(0), + data_depth(0), + type(type), + name(name), + interpolation(INTERPOLATION_NONE), + extension(EXTENSION_REPEAT), + device(device), + device_pointer(0), + host_pointer(0), + shared_pointer(0) +{ +} + +device_memory::~device_memory() +{ +} + +void *device_memory::host_alloc(size_t size) +{ + if(!size) { + return 0; + } + + void *ptr = util_aligned_malloc(size, MIN_ALIGNMENT_CPU_DATA_TYPES); + + if(ptr) { + util_guarded_mem_alloc(size); + } + else { + throw std::bad_alloc(); + } + + return ptr; +} + +void device_memory::host_free() +{ + if(host_pointer) { + util_guarded_mem_free(memory_size()); + util_aligned_free((void*)host_pointer); + host_pointer = 0; + } +} + +void device_memory::device_alloc() +{ + assert(!device_pointer && type != MEM_TEXTURE); + device->mem_alloc(*this); +} + +void device_memory::device_free() +{ + if(device_pointer) { + device->mem_free(*this); + } +} + +void device_memory::device_copy_to() +{ + if(host_pointer) { + device->mem_copy_to(*this); + } +} + +void device_memory::device_copy_from(int y, int w, int h, int elem) +{ + assert(type != MEM_TEXTURE && type != MEM_READ_ONLY); + device->mem_copy_from(*this, y, w, h, elem); +} + +void device_memory::device_zero() +{ + if(data_size) { + device->mem_zero(*this); + } +} + +/* Device Sub Ptr */ + +device_sub_ptr::device_sub_ptr(device_memory& mem, int offset, int size) +: device(mem.device) +{ + ptr = device->mem_alloc_sub_ptr(mem, offset, size); +} + +device_sub_ptr::~device_sub_ptr() +{ + device->mem_free_sub_ptr(ptr); +} + +CCL_NAMESPACE_END + diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h index 5b5b4dc6802..d8fe41e78bb 100644 --- a/intern/cycles/device/device_memory.h +++ b/intern/cycles/device/device_memory.h @@ -19,46 +19,47 @@ /* Device Memory * - * This file defines data types that can be used in device memory arrays, and - * a device_vector<T> type to store such arrays. - * - * device_vector<T> contains an STL vector, metadata about the data type, - * dimensions, elements, and a device pointer. For the CPU device this is just - * a pointer to the STL vector data, as no copying needs to take place. For - * other devices this is a pointer to device memory, where we will copy memory - * to and from. */ + * Data types for allocating, copying and freeing device memory. */ -#include "util_debug.h" -#include "util_half.h" -#include "util_types.h" -#include "util_vector.h" +#include "util/util_half.h" +#include "util/util_texture.h" +#include "util/util_types.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN +class Device; + enum MemoryType { MEM_READ_ONLY, - MEM_WRITE_ONLY, - MEM_READ_WRITE + MEM_READ_WRITE, + MEM_DEVICE_ONLY, + MEM_TEXTURE, + MEM_PIXELS }; /* Supported Data Types */ enum DataType { + TYPE_UNKNOWN, TYPE_UCHAR, TYPE_UINT, TYPE_INT, TYPE_FLOAT, - TYPE_HALF + TYPE_HALF, + TYPE_UINT64, }; static inline size_t datatype_size(DataType datatype) { switch(datatype) { + case TYPE_UNKNOWN: return 1; case TYPE_UCHAR: return sizeof(uchar); case TYPE_FLOAT: return sizeof(float); case TYPE_UINT: return sizeof(uint); case TYPE_INT: return sizeof(int); case TYPE_HALF: return sizeof(half); + case TYPE_UINT64: return sizeof(uint64_t); default: return 0; } } @@ -66,8 +67,8 @@ static inline size_t datatype_size(DataType datatype) /* Traits for data types */ template<typename T> struct device_type_traits { - static const DataType data_type = TYPE_UCHAR; - static const int num_elements = 0; + static const DataType data_type = TYPE_UNKNOWN; + static const int num_elements = sizeof(T); }; template<> struct device_type_traits<uchar> { @@ -142,7 +143,7 @@ template<> struct device_type_traits<float2> { template<> struct device_type_traits<float3> { static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 3; + static const int num_elements = 4; }; template<> struct device_type_traits<float4> { @@ -160,126 +161,318 @@ template<> struct device_type_traits<half4> { static const int num_elements = 4; }; -/* Device Memory */ +template<> struct device_type_traits<uint64_t> { + static const DataType data_type = TYPE_UINT64; + static const int num_elements = 1; +}; + +/* Device Memory + * + * Base class for all device memory. This should not be allocated directly, + * instead the appropriate subclass can be used. */ class device_memory { public: size_t memory_size() { return data_size*data_elements*datatype_size(data_type); } + size_t memory_elements_size(int elements) { + return elements*data_elements*datatype_size(data_type); + } - /* data information */ + /* Data information. */ DataType data_type; int data_elements; - device_ptr data_pointer; size_t data_size; size_t device_size; size_t data_width; size_t data_height; size_t data_depth; + MemoryType type; + const char *name; + InterpolationType interpolation; + ExtensionType extension; - /* device pointer */ + /* Pointers. */ + Device *device; device_ptr device_pointer; + void *host_pointer; + void *shared_pointer; + + virtual ~device_memory(); protected: - device_memory() {} - virtual ~device_memory() { assert(!device_pointer); } + friend class CUDADevice; - /* no copying */ + /* Only create through subclasses. */ + device_memory(Device *device, const char *name, MemoryType type); + + /* No copying allowed. */ device_memory(const device_memory&); device_memory& operator = (const device_memory&); + + /* Host allocation on the device. All host_pointer memory should be + * allocated with these functions, for devices that support using + * the same pointer for host and device. */ + void *host_alloc(size_t size); + void host_free(); + + /* Device memory allocation and copying. */ + void device_alloc(); + void device_free(); + void device_copy_to(); + void device_copy_from(int y, int w, int h, int elem); + void device_zero(); +}; + +/* Device Only Memory + * + * Working memory only needed by the device, with no corresponding allocation + * on the host. Only used internally in the device implementations. */ + +template<typename T> +class device_only_memory : public device_memory +{ +public: + device_only_memory(Device *device, const char *name) + : device_memory(device, name, MEM_DEVICE_ONLY) + { + data_type = device_type_traits<T>::data_type; + data_elements = max(device_type_traits<T>::num_elements, 1); + } + + virtual ~device_only_memory() + { + free(); + } + + void alloc_to_device(size_t num, bool shrink_to_fit = true) + { + size_t new_size = num; + bool reallocate; + + if(shrink_to_fit) { + reallocate = (data_size != new_size); + } + else { + reallocate = (data_size < new_size); + } + + if(reallocate) { + device_free(); + data_size = new_size; + device_alloc(); + } + } + + void free() + { + device_free(); + data_size = 0; + } + + void zero_to_device() + { + device_zero(); + } }; -/* Device Vector */ +/* Device Vector + * + * Data vector to exchange data between host and device. Memory will be + * allocated on the host first with alloc() and resize, and then filled + * in and copied to the device with copy_to_device(). Or alternatively + * allocated and set to zero on the device with zero_to_device(). + * + * When using memory type MEM_TEXTURE, a pointer to this memory will be + * automatically attached to kernel globals, using the provided name + * matching an entry in kernel_textures.h. */ template<typename T> class device_vector : public device_memory { public: - device_vector() + device_vector(Device *device, const char *name, MemoryType type) + : device_memory(device, name, type) { data_type = device_type_traits<T>::data_type; data_elements = device_type_traits<T>::num_elements; - data_pointer = 0; - data_size = 0; - device_size = 0; - data_width = 0; - data_height = 0; - data_depth = 0; assert(data_elements > 0); - - device_pointer = 0; } - virtual ~device_vector() {} + virtual ~device_vector() + { + free(); + } - /* vector functions */ - T *resize(size_t width, size_t height = 0, size_t depth = 0) + /* Host memory allocation. */ + T *alloc(size_t width, size_t height = 0, size_t depth = 0) { - data_size = width * ((height == 0)? 1: height) * ((depth == 0)? 1: depth); - if(data.resize(data_size) == NULL) { - clear(); - return NULL; + size_t new_size = size(width, height, depth); + + if(new_size != data_size) { + device_free(); + host_free(); + host_pointer = host_alloc(sizeof(T)*new_size); + assert(device_pointer == 0); } + + data_size = new_size; data_width = width; data_height = height; data_depth = depth; - if(data_size == 0) { - data_pointer = 0; - return NULL; - } - data_pointer = (device_ptr)&data[0]; - return &data[0]; - } - T *copy(T *ptr, size_t width, size_t height = 0, size_t depth = 0) - { - T *mem = resize(width, height, depth); - if(mem != NULL) { - memcpy(mem, ptr, memory_size()); - } - return mem; + return data(); } - void copy_at(T *ptr, size_t offset, size_t size) + /* Host memory resize. Only use this if the original data needs to be + * preserved, it is faster to call alloc() if it can be discarded. */ + T *resize(size_t width, size_t height = 0, size_t depth = 0) { - if(size > 0) { - size_t mem_size = size*data_elements*datatype_size(data_type); - memcpy(&data[0] + offset, ptr, mem_size); + size_t new_size = size(width, height, depth); + + if(new_size != data_size) { + void *new_ptr = host_alloc(sizeof(T)*new_size); + + if(new_size && data_size) { + size_t min_size = ((new_size < data_size)? new_size: data_size); + memcpy((T*)new_ptr, (T*)host_pointer, sizeof(T)*min_size); + } + + device_free(); + host_free(); + host_pointer = new_ptr; + assert(device_pointer == 0); } - } - void reference(T *ptr, size_t width, size_t height = 0, size_t depth = 0) - { - data.clear(); - data_size = width * ((height == 0)? 1: height) * ((depth == 0)? 1: depth); - data_pointer = (device_ptr)ptr; + data_size = new_size; data_width = width; data_height = height; data_depth = depth; + + return data(); } - void clear() + /* Take over data from an existing array. */ + void steal_data(array<T>& from) { - data.clear(); - data_pointer = 0; + device_free(); + host_free(); + + data_size = from.size(); data_width = 0; data_height = 0; data_depth = 0; + host_pointer = from.steal_pointer(); + assert(device_pointer == 0); + } + + /* Free device and host memory. */ + void free() + { + device_free(); + host_free(); + data_size = 0; + data_width = 0; + data_height = 0; + data_depth = 0; + host_pointer = 0; + assert(device_pointer == 0); } size_t size() { - return data.size(); + return data_size; + } + + T* data() + { + return (T*)host_pointer; + } + + T& operator[](size_t i) + { + assert(i < data_size); + return data()[i]; + } + + void copy_to_device() + { + device_copy_to(); + } + + void copy_from_device(int y, int w, int h) + { + device_copy_from(y, w, h, sizeof(T)); + } + + void zero_to_device() + { + device_zero(); } - T* get_data() +protected: + size_t size(size_t width, size_t height, size_t depth) { - return &data[0]; + return width * ((height == 0)? 1: height) * ((depth == 0)? 1: depth); } +}; + +/* Pixel Memory + * + * Device memory to efficiently draw as pixels to the screen in interactive + * rendering. Only copying pixels from the device is supported, not copying to. */ + +template<typename T> class device_pixels : public device_vector<T> +{ +public: + device_pixels(Device *device, const char *name) + : device_vector<T>(device, name, MEM_PIXELS) + { + } + + void alloc_to_device(size_t width, size_t height, size_t depth = 0) + { + device_vector<T>::alloc(width, height, depth); + + if(!device_memory::device_pointer) { + device_memory::device_alloc(); + } + } + + T *copy_from_device(int y, int w, int h) + { + device_memory::device_copy_from(y, w, h, sizeof(T)); + return device_vector<T>::data(); + } +}; + +/* Device Sub Memory + * + * Pointer into existing memory. It is not allocated separately, but created + * from an already allocated base memory. It is freed automatically when it + * goes out of scope, which should happen before base memory is freed. + * + * Note: some devices require offset and size of the sub_ptr to be properly + * aligned to device->mem_address_alingment(). */ + +class device_sub_ptr +{ +public: + device_sub_ptr(device_memory& mem, int offset, int size); + ~device_sub_ptr(); + + device_ptr operator*() const + { + return ptr; + } + +protected: + /* No copying. */ + device_sub_ptr& operator = (const device_sub_ptr&); -private: - array<T> data; + Device *device; + device_ptr ptr; }; CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp index 61d78ee65de..3a4c08b6eb2 100644 --- a/intern/cycles/device/device_multi.cpp +++ b/intern/cycles/device/device_multi.cpp @@ -17,17 +17,17 @@ #include <stdlib.h> #include <sstream> -#include "device.h" -#include "device_intern.h" -#include "device_network.h" +#include "device/device.h" +#include "device/device_intern.h" +#include "device/device_network.h" -#include "buffers.h" +#include "render/buffers.h" -#include "util_foreach.h" -#include "util_list.h" -#include "util_logging.h" -#include "util_map.h" -#include "util_time.h" +#include "util/util_foreach.h" +#include "util/util_list.h" +#include "util/util_logging.h" +#include "util/util_map.h" +#include "util/util_time.h" CCL_NAMESPACE_BEGIN @@ -43,16 +43,22 @@ public: }; list<SubDevice> devices; - device_ptr unique_ptr; + device_ptr unique_key; MultiDevice(DeviceInfo& info, Stats &stats, bool background_) - : Device(info, stats, background_), unique_ptr(1) + : Device(info, stats, background_), unique_key(1) { - Device *device; - foreach(DeviceInfo& subinfo, info.multi_devices) { - device = Device::create(subinfo, sub_stats_, background); - devices.push_back(SubDevice(device)); + Device *device = Device::create(subinfo, sub_stats_, background); + + /* Always add CPU devices at the back since GPU devices can change + * host memory pointers, which CPU uses as device pointer. */ + if(subinfo.type == DEVICE_CPU) { + devices.push_back(SubDevice(device)); + } + else { + devices.push_front(SubDevice(device)); + } } #ifdef WITH_NETWORK @@ -63,7 +69,7 @@ public: vector<string> servers = discovery.get_server_list(); foreach(string& server, servers) { - device = device_network_create(info, stats, server.c_str()); + Device *device = device_network_create(info, stats, server.c_str()); if(device) devices.push_back(SubDevice(device)); } @@ -106,71 +112,102 @@ public: return true; } - void mem_alloc(device_memory& mem, MemoryType type) + void mem_alloc(device_memory& mem) { + device_ptr key = unique_key++; + foreach(SubDevice& sub, devices) { + mem.device = sub.device; mem.device_pointer = 0; - sub.device->mem_alloc(mem, type); - sub.ptr_map[unique_ptr] = mem.device_pointer; + mem.device_size = 0; + + sub.device->mem_alloc(mem); + sub.ptr_map[key] = mem.device_pointer; } - mem.device_pointer = unique_ptr++; + mem.device = this; + mem.device_pointer = key; stats.mem_alloc(mem.device_size); } void mem_copy_to(device_memory& mem) { - device_ptr tmp = mem.device_pointer; + device_ptr existing_key = mem.device_pointer; + device_ptr key = (existing_key)? existing_key: unique_key++; + size_t existing_size = mem.device_size; foreach(SubDevice& sub, devices) { - mem.device_pointer = sub.ptr_map[tmp]; + mem.device = sub.device; + mem.device_pointer = (existing_key)? sub.ptr_map[existing_key]: 0; + mem.device_size = existing_size; + sub.device->mem_copy_to(mem); + sub.ptr_map[key] = mem.device_pointer; } - mem.device_pointer = tmp; + mem.device = this; + mem.device_pointer = key; + stats.mem_alloc(mem.device_size - existing_size); } void mem_copy_from(device_memory& mem, int y, int w, int h, int elem) { - device_ptr tmp = mem.device_pointer; + device_ptr key = mem.device_pointer; int i = 0, sub_h = h/devices.size(); foreach(SubDevice& sub, devices) { int sy = y + i*sub_h; int sh = (i == (int)devices.size() - 1)? h - sub_h*i: sub_h; - mem.device_pointer = sub.ptr_map[tmp]; + mem.device = sub.device; + mem.device_pointer = sub.ptr_map[key]; + sub.device->mem_copy_from(mem, sy, w, sh, elem); i++; } - mem.device_pointer = tmp; + mem.device = this; + mem.device_pointer = key; } void mem_zero(device_memory& mem) { - device_ptr tmp = mem.device_pointer; + device_ptr existing_key = mem.device_pointer; + device_ptr key = (existing_key)? existing_key: unique_key++; + size_t existing_size = mem.device_size; foreach(SubDevice& sub, devices) { - mem.device_pointer = sub.ptr_map[tmp]; + mem.device = sub.device; + mem.device_pointer = (existing_key)? sub.ptr_map[existing_key]: 0; + mem.device_size = existing_size; + sub.device->mem_zero(mem); + sub.ptr_map[key] = mem.device_pointer; } - mem.device_pointer = tmp; + mem.device = this; + mem.device_pointer = key; + stats.mem_alloc(mem.device_size - existing_size); } void mem_free(device_memory& mem) { - device_ptr tmp = mem.device_pointer; - stats.mem_free(mem.device_size); + device_ptr key = mem.device_pointer; + size_t existing_size = mem.device_size; foreach(SubDevice& sub, devices) { - mem.device_pointer = sub.ptr_map[tmp]; + mem.device = sub.device; + mem.device_pointer = sub.ptr_map[key]; + mem.device_size = existing_size; + sub.device->mem_free(mem); - sub.ptr_map.erase(sub.ptr_map.find(tmp)); + sub.ptr_map.erase(sub.ptr_map.find(key)); } + mem.device = this; mem.device_pointer = 0; + mem.device_size = 0; + stats.mem_free(existing_size); } void const_copy_to(const char *name, void *host, size_t size) @@ -179,85 +216,10 @@ public: sub.device->const_copy_to(name, host, size); } - void tex_alloc(const char *name, - device_memory& mem, - InterpolationType - interpolation, - ExtensionType extension) - { - VLOG(1) << "Texture allocate: " << name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - foreach(SubDevice& sub, devices) { - mem.device_pointer = 0; - sub.device->tex_alloc(name, mem, interpolation, extension); - sub.ptr_map[unique_ptr] = mem.device_pointer; - } - - mem.device_pointer = unique_ptr++; - stats.mem_alloc(mem.device_size); - } - - void tex_free(device_memory& mem) - { - device_ptr tmp = mem.device_pointer; - stats.mem_free(mem.device_size); - - foreach(SubDevice& sub, devices) { - mem.device_pointer = sub.ptr_map[tmp]; - sub.device->tex_free(mem); - sub.ptr_map.erase(sub.ptr_map.find(tmp)); - } - - mem.device_pointer = 0; - } - - void pixels_alloc(device_memory& mem) - { - foreach(SubDevice& sub, devices) { - mem.device_pointer = 0; - sub.device->pixels_alloc(mem); - sub.ptr_map[unique_ptr] = mem.device_pointer; - } - - mem.device_pointer = unique_ptr++; - } - - void pixels_free(device_memory& mem) - { - device_ptr tmp = mem.device_pointer; - - foreach(SubDevice& sub, devices) { - mem.device_pointer = sub.ptr_map[tmp]; - sub.device->pixels_free(mem); - sub.ptr_map.erase(sub.ptr_map.find(tmp)); - } - - mem.device_pointer = 0; - } - - void pixels_copy_from(device_memory& mem, int y, int w, int h) - { - device_ptr tmp = mem.device_pointer; - int i = 0, sub_h = h/devices.size(); - - foreach(SubDevice& sub, devices) { - int sy = y + i*sub_h; - int sh = (i == (int)devices.size() - 1)? h - sub_h*i: sub_h; - - mem.device_pointer = sub.ptr_map[tmp]; - sub.device->pixels_copy_from(mem, sy, w, sh); - i++; - } - - mem.device_pointer = tmp; - } - void draw_pixels(device_memory& rgba, int y, int w, int h, int dx, int dy, int width, int height, bool transparent, const DeviceDrawParams &draw_params) { - device_ptr tmp = rgba.device_pointer; + device_ptr key = rgba.device_pointer; int i = 0, sub_h = h/devices.size(); int sub_height = height/devices.size(); @@ -268,12 +230,12 @@ public: int sdy = dy + i*sub_height; /* adjust math for w/width */ - rgba.device_pointer = sub.ptr_map[tmp]; + rgba.device_pointer = sub.ptr_map[key]; sub.device->draw_pixels(rgba, sy, w, sh, dx, sdy, width, sheight, transparent, draw_params); i++; } - rgba.device_pointer = tmp; + rgba.device_pointer = key; } void map_tile(Device *sub_device, RenderTile& tile) @@ -281,7 +243,6 @@ public: foreach(SubDevice& sub, devices) { if(sub.device == sub_device) { if(tile.buffer) tile.buffer = sub.ptr_map[tile.buffer]; - if(tile.rng_state) tile.rng_state = sub.ptr_map[tile.rng_state]; } } } @@ -299,6 +260,81 @@ public: return -1; } + void map_neighbor_tiles(Device *sub_device, RenderTile *tiles) + { + for(int i = 0; i < 9; i++) { + if(!tiles[i].buffers) { + continue; + } + + /* If the tile was rendered on another device, copy its memory to + * to the current device now, for the duration of the denoising task. + * Note that this temporarily modifies the RenderBuffers and calls + * the device, so this function is not thread safe. */ + device_vector<float> &mem = tiles[i].buffers->buffer; + if(mem.device != sub_device) { + /* Only copy from device to host once. This is faster, but + * also required for the case where a CPU thread is denoising + * a tile rendered on the GPU. In that case we have to avoid + * overwriting the buffer being denoised by the CPU thread. */ + if(!tiles[i].buffers->map_neighbor_copied) { + tiles[i].buffers->map_neighbor_copied = true; + mem.copy_from_device(0, mem.data_size, 1); + } + + Device *original_device = mem.device; + device_ptr original_ptr = mem.device_pointer; + size_t original_size = mem.device_size; + + mem.device = sub_device; + mem.device_pointer = 0; + mem.device_size = 0; + + mem.copy_to_device(); + tiles[i].buffer = mem.device_pointer; + + mem.device = original_device; + mem.device_pointer = original_ptr; + mem.device_size = original_size; + } + } + } + + void unmap_neighbor_tiles(Device * sub_device, RenderTile * tiles) + { + for(int i = 0; i < 9; i++) { + if(!tiles[i].buffers) { + continue; + } + + device_vector<float> &mem = tiles[i].buffers->buffer; + if(mem.device != sub_device) { + Device *original_device = mem.device; + device_ptr original_ptr = mem.device_pointer; + size_t original_size = mem.device_size; + + mem.device = sub_device; + mem.device_pointer = tiles[i].buffer; + + /* Copy denoised tile to the host. */ + if(i == 4) { + mem.copy_from_device(0, mem.data_size, 1); + } + + sub_device->mem_free(mem); + + mem.device = original_device; + mem.device_pointer = original_ptr; + mem.device_size = original_size; + + /* Copy denoised tile to the original device. */ + if(i == 4) { + mem.copy_to_device(); + } + } + } + } + int get_split_task_count(DeviceTask& task) { int total_tasks = 0; @@ -330,7 +366,6 @@ public: if(task.rgba_half) subtask.rgba_half = sub.ptr_map[task.rgba_half]; if(task.shader_input) subtask.shader_input = sub.ptr_map[task.shader_input]; if(task.shader_output) subtask.shader_output = sub.ptr_map[task.shader_output]; - if(task.shader_output_luma) subtask.shader_output_luma = sub.ptr_map[task.shader_output_luma]; sub.device->task_add(subtask); } diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp index 53eef6cf199..5ad4405366e 100644 --- a/intern/cycles/device/device_network.cpp +++ b/intern/cycles/device/device_network.cpp @@ -14,12 +14,12 @@ * limitations under the License. */ -#include "device.h" -#include "device_intern.h" -#include "device_network.h" +#include "device/device.h" +#include "device/device_intern.h" +#include "device/device_network.h" -#include "util_foreach.h" -#include "util_logging.h" +#include "util/util_foreach.h" +#include "util/util_logging.h" #if defined(WITH_NETWORK) @@ -87,16 +87,20 @@ public: snd.write(); } - void mem_alloc(device_memory& mem, MemoryType type) + void mem_alloc(device_memory& mem) { + if(mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + } + thread_scoped_lock lock(rpc_lock); mem.device_pointer = ++mem_counter; RPCSend snd(socket, &error_func, "mem_alloc"); - snd.add(mem); - snd.add(type); snd.write(); } @@ -108,7 +112,7 @@ public: snd.add(mem); snd.write(); - snd.write_buffer((void*)mem.data_pointer, mem.memory_size()); + snd.write_buffer(mem.host_pointer, mem.memory_size()); } void mem_copy_from(device_memory& mem, int y, int w, int h, int elem) @@ -127,7 +131,7 @@ public: snd.write(); RPCReceive rcv(socket, &error_func); - rcv.read_buffer((void*)mem.data_pointer, data_size); + rcv.read_buffer(mem.host_pointer, data_size); } void mem_zero(device_memory& mem) @@ -168,45 +172,6 @@ public: snd.write_buffer(host, size); } - void tex_alloc(const char *name, - device_memory& mem, - InterpolationType interpolation, - ExtensionType extension) - { - VLOG(1) << "Texture allocate: " << name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - thread_scoped_lock lock(rpc_lock); - - mem.device_pointer = ++mem_counter; - - RPCSend snd(socket, &error_func, "tex_alloc"); - - string name_string(name); - - snd.add(name_string); - snd.add(mem); - snd.add(interpolation); - snd.add(extension); - snd.write(); - snd.write_buffer((void*)mem.data_pointer, mem.memory_size()); - } - - void tex_free(device_memory& mem) - { - if(mem.device_pointer) { - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "tex_free"); - - snd.add(mem); - snd.write(); - - mem.device_pointer = 0; - } - } - bool load_kernels(const DeviceRequestedFeatures& requested_features) { if(error_func.have_error()) @@ -315,7 +280,7 @@ public: snd.write(); } - int get_split_task_count(DeviceTask& task) + int get_split_task_count(DeviceTask&) { return 1; } @@ -337,8 +302,12 @@ void device_network_info(vector<DeviceInfo>& devices) info.description = "Network Device"; info.id = "NETWORK"; info.num = 0; - info.advanced_shading = true; /* todo: get this info from device */ - info.pack_images = false; + + /* todo: get this info from device */ + info.advanced_shading = true; + info.has_volume_decoupled = false; + info.bvh_layout_mask = BVH_LAYOUT_BVH2; + info.has_osl = false; devices.push_back(info); } @@ -460,61 +429,64 @@ protected: void process(RPCReceive& rcv, thread_scoped_lock &lock) { if(rcv.name == "mem_alloc") { - MemoryType type; - network_device_memory mem; - device_ptr client_pointer; - - rcv.read(mem); - rcv.read(type); - + string name; + network_device_memory mem(device); + rcv.read(mem, name); lock.unlock(); - client_pointer = mem.device_pointer; - - /* create a memory buffer for the device buffer */ + /* Allocate host side data buffer. */ size_t data_size = mem.memory_size(); - DataVector &data_v = data_vector_insert(client_pointer, data_size); + device_ptr client_pointer = mem.device_pointer; - if(data_size) - mem.data_pointer = (device_ptr)&(data_v[0]); - else - mem.data_pointer = 0; + DataVector &data_v = data_vector_insert(client_pointer, data_size); + mem.host_pointer = (data_size)? (void*)&(data_v[0]): 0; - /* perform the allocation on the actual device */ - device->mem_alloc(mem, type); + /* Perform the allocation on the actual device. */ + device->mem_alloc(mem); - /* store a mapping to/from client_pointer and real device pointer */ + /* Store a mapping to/from client_pointer and real device pointer. */ pointer_mapping_insert(client_pointer, mem.device_pointer); } else if(rcv.name == "mem_copy_to") { - network_device_memory mem; - - rcv.read(mem); + string name; + network_device_memory mem(device); + rcv.read(mem, name); lock.unlock(); - device_ptr client_pointer = mem.device_pointer; - - DataVector &data_v = data_vector_find(client_pointer); - size_t data_size = mem.memory_size(); + device_ptr client_pointer = mem.device_pointer; - /* get pointer to memory buffer for device buffer */ - mem.data_pointer = (device_ptr)&data_v[0]; + if(client_pointer) { + /* Lookup existing host side data buffer. */ + DataVector &data_v = data_vector_find(client_pointer); + mem.host_pointer = (void*)&data_v[0]; - /* copy data from network into memory buffer */ - rcv.read_buffer((uint8_t*)mem.data_pointer, data_size); + /* Translate the client pointer to a real device pointer. */ + mem.device_pointer = device_ptr_from_client_pointer(client_pointer); + } + else { + /* Allocate host side data buffer. */ + DataVector &data_v = data_vector_insert(client_pointer, data_size); + mem.host_pointer = (data_size)? (void*)&(data_v[0]): 0; + } - /* translate the client pointer to a real device pointer */ - mem.device_pointer = device_ptr_from_client_pointer(client_pointer); + /* Copy data from network into memory buffer. */ + rcv.read_buffer((uint8_t*)mem.host_pointer, data_size); - /* copy the data from the memory buffer to the device buffer */ + /* Copy the data from the memory buffer to the device buffer. */ device->mem_copy_to(mem); + + if(!client_pointer) { + /* Store a mapping to/from client_pointer and real device pointer. */ + pointer_mapping_insert(client_pointer, mem.device_pointer); + } } else if(rcv.name == "mem_copy_from") { - network_device_memory mem; + string name; + network_device_memory mem(device); int y, w, h, elem; - rcv.read(mem); + rcv.read(mem, name); rcv.read(y); rcv.read(w); rcv.read(h); @@ -525,7 +497,7 @@ protected: DataVector &data_v = data_vector_find(client_pointer); - mem.data_pointer = (device_ptr)&(data_v[0]); + mem.host_pointer = (device_ptr)&(data_v[0]); device->mem_copy_from(mem, y, w, h, elem); @@ -533,32 +505,48 @@ protected: RPCSend snd(socket, &error_func, "mem_copy_from"); snd.write(); - snd.write_buffer((uint8_t*)mem.data_pointer, data_size); + snd.write_buffer((uint8_t*)mem.host_pointer, data_size); lock.unlock(); } else if(rcv.name == "mem_zero") { - network_device_memory mem; - - rcv.read(mem); + string name; + network_device_memory mem(device); + rcv.read(mem, name); lock.unlock(); + size_t data_size = mem.memory_size(); device_ptr client_pointer = mem.device_pointer; - mem.device_pointer = device_ptr_from_client_pointer(client_pointer); - DataVector &data_v = data_vector_find(client_pointer); + if(client_pointer) { + /* Lookup existing host side data buffer. */ + DataVector &data_v = data_vector_find(client_pointer); + mem.host_pointer = (void*)&data_v[0]; - mem.data_pointer = (device_ptr)&(data_v[0]); + /* Translate the client pointer to a real device pointer. */ + mem.device_pointer = device_ptr_from_client_pointer(client_pointer); + } + else { + /* Allocate host side data buffer. */ + DataVector &data_v = data_vector_insert(client_pointer, data_size); + mem.host_pointer = (void*)? (device_ptr)&(data_v[0]): 0; + } + /* Zero memory. */ device->mem_zero(mem); + + if(!client_pointer) { + /* Store a mapping to/from client_pointer and real device pointer. */ + pointer_mapping_insert(client_pointer, mem.device_pointer); + } } else if(rcv.name == "mem_free") { - network_device_memory mem; - device_ptr client_pointer; + string name; + network_device_memory mem(device); - rcv.read(mem); + rcv.read(mem, name); lock.unlock(); - client_pointer = mem.device_pointer; + device_ptr client_pointer = mem.device_pointer; mem.device_pointer = device_ptr_from_client_pointer_erase(client_pointer); @@ -577,49 +565,6 @@ protected: device->const_copy_to(name_string.c_str(), &host_vector[0], size); } - else if(rcv.name == "tex_alloc") { - network_device_memory mem; - string name; - InterpolationType interpolation; - ExtensionType extension_type; - device_ptr client_pointer; - - rcv.read(name); - rcv.read(mem); - rcv.read(interpolation); - rcv.read(extension_type); - lock.unlock(); - - client_pointer = mem.device_pointer; - - size_t data_size = mem.memory_size(); - - DataVector &data_v = data_vector_insert(client_pointer, data_size); - - if(data_size) - mem.data_pointer = (device_ptr)&(data_v[0]); - else - mem.data_pointer = 0; - - rcv.read_buffer((uint8_t*)mem.data_pointer, data_size); - - device->tex_alloc(name.c_str(), mem, interpolation, extension_type); - - pointer_mapping_insert(client_pointer, mem.device_pointer); - } - else if(rcv.name == "tex_free") { - network_device_memory mem; - device_ptr client_pointer; - - rcv.read(mem); - lock.unlock(); - - client_pointer = mem.device_pointer; - - mem.device_pointer = device_ptr_from_client_pointer_erase(client_pointer); - - device->tex_free(mem); - } else if(rcv.name == "load_kernels") { DeviceRequestedFeatures requested_features; rcv.read(requested_features.experimental); @@ -655,10 +600,6 @@ protected: if(task.shader_output) task.shader_output = device_ptr_from_client_pointer(task.shader_output); - if(task.shader_output_luma) - task.shader_output_luma = device_ptr_from_client_pointer(task.shader_output_luma); - - task.acquire_tile = function_bind(&DeviceServer::task_acquire_tile, this, _1, _2); task.release_tile = function_bind(&DeviceServer::task_release_tile, this, _1); task.update_progress_sample = function_bind(&DeviceServer::task_update_progress_sample, this); @@ -708,7 +649,7 @@ protected: } } - bool task_acquire_tile(Device *device, RenderTile& tile) + bool task_acquire_tile(Device *, RenderTile& tile) { thread_scoped_lock acquire_lock(acquire_mutex); @@ -732,7 +673,6 @@ protected: tile = entry.tile; if(tile.buffer) tile.buffer = ptr_map[tile.buffer]; - if(tile.rng_state) tile.rng_state = ptr_map[tile.rng_state]; result = true; break; @@ -764,7 +704,6 @@ protected: thread_scoped_lock acquire_lock(acquire_mutex); if(tile.buffer) tile.buffer = ptr_imap[tile.buffer]; - if(tile.rng_state) tile.rng_state = ptr_imap[tile.rng_state]; { thread_scoped_lock lock(rpc_lock); diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h index d28cfe3121f..b734ba2bda9 100644 --- a/intern/cycles/device/device_network.h +++ b/intern/cycles/device/device_network.h @@ -33,12 +33,13 @@ #include <sstream> #include <deque> -#include "buffers.h" +#include "render/buffers.h" -#include "util_foreach.h" -#include "util_list.h" -#include "util_map.h" -#include "util_string.h" +#include "util/util_foreach.h" +#include "util/util_list.h" +#include "util/util_map.h" +#include "util/util_param.h" +#include "util/util_string.h" CCL_NAMESPACE_BEGIN @@ -68,8 +69,15 @@ typedef boost::archive::binary_iarchive i_archive; class network_device_memory : public device_memory { public: - network_device_memory() {} - ~network_device_memory() { device_pointer = 0; }; + network_device_memory(Device *device) + : device_memory(device, "", MEM_READ_ONLY) + { + } + + ~network_device_memory() + { + device_pointer = 0; + }; vector<char> local_data; }; @@ -119,6 +127,9 @@ public: { archive & mem.data_type & mem.data_elements & mem.data_size; archive & mem.data_width & mem.data_height & mem.data_depth & mem.device_pointer; + archive & mem.type & string(mem.name); + archive & mem.interpolation & mem.extension; + archive & mem.device_pointer; } template<typename T> void add(const T& data) @@ -132,7 +143,7 @@ public: archive & type & task.x & task.y & task.w & task.h; archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples; archive & task.offset & task.stride; - archive & task.shader_input & task.shader_output & task.shader_output_luma & task.shader_eval_type; + archive & task.shader_input & task.shader_output & task.shader_eval_type; archive & task.shader_x & task.shader_w; archive & task.need_finish_queue; } @@ -142,7 +153,7 @@ public: archive & tile.x & tile.y & tile.w & tile.h; archive & tile.start_sample & tile.num_samples & tile.sample; archive & tile.resolution & tile.offset & tile.stride; - archive & tile.buffer & tile.rng_state; + archive & tile.buffer; } void write() @@ -258,12 +269,21 @@ public: delete archive_stream; } - void read(network_device_memory& mem) + void read(network_device_memory& mem, string& name) { *archive & mem.data_type & mem.data_elements & mem.data_size; *archive & mem.data_width & mem.data_height & mem.data_depth & mem.device_pointer; + *archive & mem.type & name; + *archive & mem.interpolation & mem.extension; + *archive & mem.device_pointer; + + mem.name = name.c_str(); + mem.host_pointer = 0; - mem.data_pointer = 0; + /* Can't transfer OpenGL texture over network. */ + if(mem.type == MEM_PIXELS) { + mem.type = MEM_READ_WRITE; + } } template<typename T> void read(T& data) @@ -291,7 +311,7 @@ public: *archive & type & task.x & task.y & task.w & task.h; *archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples; *archive & task.offset & task.stride; - *archive & task.shader_input & task.shader_output & task.shader_output_luma & task.shader_eval_type; + *archive & task.shader_input & task.shader_output & task.shader_eval_type; *archive & task.shader_x & task.shader_w; *archive & task.need_finish_queue; @@ -303,7 +323,7 @@ public: *archive & tile.x & tile.y & tile.w & tile.h; *archive & tile.start_sample & tile.num_samples & tile.sample; *archive & tile.resolution & tile.offset & tile.stride; - *archive & tile.buffer & tile.rng_state; + *archive & tile.buffer; tile.buffers = NULL; } diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp index ba94c592a5f..9d61bbdae5d 100644 --- a/intern/cycles/device/device_opencl.cpp +++ b/intern/cycles/device/device_opencl.cpp @@ -16,12 +16,13 @@ #ifdef WITH_OPENCL -#include "opencl/opencl.h" +#include "device/opencl/opencl.h" -#include "device_intern.h" +#include "device/device_intern.h" -#include "util_foreach.h" -#include "util_logging.h" +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_set.h" CCL_NAMESPACE_BEGIN @@ -73,13 +74,41 @@ bool device_opencl_init(void) return result; } + +static cl_int device_opencl_get_num_platforms_safe(cl_uint *num_platforms) +{ +#ifdef _WIN32 + __try { + return clGetPlatformIDs(0, NULL, num_platforms); + } + __except(EXCEPTION_EXECUTE_HANDLER) { + /* Ignore crashes inside the OpenCL driver and hope we can + * survive even with corrupted OpenCL installs. */ + fprintf(stderr, "Cycles OpenCL: driver crashed, continuing without OpenCL.\n"); + } + + *num_platforms = 0; + return CL_DEVICE_NOT_FOUND; +#else + return clGetPlatformIDs(0, NULL, num_platforms); +#endif +} + void device_opencl_info(vector<DeviceInfo>& devices) { + cl_uint num_platforms = 0; + device_opencl_get_num_platforms_safe(&num_platforms); + if(num_platforms == 0) { + return; + } + vector<OpenCLPlatformDevice> usable_devices; OpenCLInfo::get_usable_devices(&usable_devices); /* Devices are numbered consecutively across platforms. */ int num_devices = 0; + set<string> unique_ids; foreach(OpenCLPlatformDevice& platform_device, usable_devices) { + /* Compute unique ID for persistent user preferences. */ const string& platform_name = platform_device.platform_name; const cl_device_type device_type = platform_device.device_type; const string& device_name = platform_device.device_name; @@ -87,7 +116,15 @@ void device_opencl_info(vector<DeviceInfo>& devices) if(hardware_id == "") { hardware_id = string_printf("ID_%d", num_devices); } + string id = string("OPENCL_") + platform_name + "_" + device_name + "_" + hardware_id; + + /* Hardware ID might not be unique, add device number in that case. */ + if(unique_ids.find(id) != unique_ids.end()) { + id += string_printf("_ID_%d", num_devices); + } + unique_ids.insert(id); + /* Create DeviceInfo. */ DeviceInfo info; info.type = DEVICE_OPENCL; info.description = string_remove_trademark(string(device_name)); @@ -95,10 +132,11 @@ void device_opencl_info(vector<DeviceInfo>& devices) /* We don't know if it's used for display, but assume it is. */ info.display_device = true; info.advanced_shading = OpenCLInfo::kernel_use_advanced_shading(platform_name); - info.pack_images = true; info.use_split_kernel = OpenCLInfo::kernel_use_split(platform_name, device_type); - info.id = string("OPENCL_") + platform_name + "_" + device_name + "_" + hardware_id; + info.has_volume_decoupled = false; + info.bvh_layout_mask = BVH_LAYOUT_BVH2; + info.id = id; devices.push_back(info); num_devices++; } @@ -114,7 +152,7 @@ string device_opencl_capabilities(void) * it could also be nicely reported to the console. */ cl_uint num_platforms = 0; - opencl_assert(clGetPlatformIDs(0, NULL, &num_platforms)); + opencl_assert(device_opencl_get_num_platforms_safe(&num_platforms)); if(num_platforms == 0) { return "No OpenCL platforms found\n"; } @@ -130,10 +168,22 @@ string device_opencl_capabilities(void) opencl_assert(func(id, what, sizeof(data), &data, NULL)); \ result += string_printf("%s: %s\n", name, data); \ } while(false) +#define APPEND_STRING_EXTENSION_INFO(func, id, name, what) \ + do { \ + char data[1024] = "\0"; \ + size_t length = 0; \ + if(func(id, what, sizeof(data), &data, &length) == CL_SUCCESS) { \ + if(length != 0 && data[0] != '\0') { \ + result += string_printf("%s: %s\n", name, data); \ + } \ + } \ + } while(false) #define APPEND_PLATFORM_STRING_INFO(id, name, what) \ APPEND_STRING_INFO(clGetPlatformInfo, id, "\tPlatform " name, what) #define APPEND_DEVICE_STRING_INFO(id, name, what) \ APPEND_STRING_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what) +#define APPEND_DEVICE_STRING_EXTENSION_INFO(id, name, what) \ + APPEND_STRING_EXTENSION_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what) vector<cl_device_id> device_ids; for(cl_uint platform = 0; platform < num_platforms; ++platform) { @@ -167,6 +217,7 @@ string device_opencl_capabilities(void) result += string_printf("\t\tDevice: #%u\n", device); APPEND_DEVICE_STRING_INFO(device_id, "Name", CL_DEVICE_NAME); + APPEND_DEVICE_STRING_EXTENSION_INFO(device_id, "Board Name", CL_DEVICE_BOARD_NAME_AMD); APPEND_DEVICE_STRING_INFO(device_id, "Vendor", CL_DEVICE_VENDOR); APPEND_DEVICE_STRING_INFO(device_id, "OpenCL C Version", CL_DEVICE_OPENCL_C_VERSION); APPEND_DEVICE_STRING_INFO(device_id, "Profile", CL_DEVICE_PROFILE); diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp new file mode 100644 index 00000000000..74135a1f3c8 --- /dev/null +++ b/intern/cycles/device/device_split_kernel.cpp @@ -0,0 +1,326 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/device_split_kernel.h" + +#include "kernel/kernel_types.h" +#include "kernel/split/kernel_split_data_types.h" + +#include "util/util_logging.h" +#include "util/util_time.h" + +CCL_NAMESPACE_BEGIN + +static const double alpha = 0.1; /* alpha for rolling average */ + +DeviceSplitKernel::DeviceSplitKernel(Device *device) +: device(device), + split_data(device, "split_data"), + ray_state(device, "ray_state", MEM_READ_WRITE), + queue_index(device, "queue_index"), + use_queues_flag(device, "use_queues_flag"), + work_pool_wgs(device, "work_pool_wgs") +{ + first_tile = true; + + avg_time_per_sample = 0.0; + + kernel_path_init = NULL; + kernel_scene_intersect = NULL; + kernel_lamp_emission = NULL; + kernel_do_volume = NULL; + kernel_queue_enqueue = NULL; + kernel_indirect_background = NULL; + kernel_shader_setup = NULL; + kernel_shader_sort = NULL; + kernel_shader_eval = NULL; + kernel_holdout_emission_blurring_pathtermination_ao = NULL; + kernel_subsurface_scatter = NULL; + kernel_direct_lighting = NULL; + kernel_shadow_blocked_ao = NULL; + kernel_shadow_blocked_dl = NULL; + kernel_enqueue_inactive = NULL; + kernel_next_iteration_setup = NULL; + kernel_indirect_subsurface = NULL; + kernel_buffer_update = NULL; +} + +DeviceSplitKernel::~DeviceSplitKernel() +{ + split_data.free(); + ray_state.free(); + use_queues_flag.free(); + queue_index.free(); + work_pool_wgs.free(); + + delete kernel_path_init; + delete kernel_scene_intersect; + delete kernel_lamp_emission; + delete kernel_do_volume; + delete kernel_queue_enqueue; + delete kernel_indirect_background; + delete kernel_shader_setup; + delete kernel_shader_sort; + delete kernel_shader_eval; + delete kernel_holdout_emission_blurring_pathtermination_ao; + delete kernel_subsurface_scatter; + delete kernel_direct_lighting; + delete kernel_shadow_blocked_ao; + delete kernel_shadow_blocked_dl; + delete kernel_enqueue_inactive; + delete kernel_next_iteration_setup; + delete kernel_indirect_subsurface; + delete kernel_buffer_update; +} + +bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_features) +{ +#define LOAD_KERNEL(name) \ + kernel_##name = get_split_kernel_function(#name, requested_features); \ + if(!kernel_##name) { \ + device->set_error(string("Split kernel error: failed to load kernel_") + #name); \ + return false; \ + } + + LOAD_KERNEL(path_init); + LOAD_KERNEL(scene_intersect); + LOAD_KERNEL(lamp_emission); + LOAD_KERNEL(do_volume); + LOAD_KERNEL(queue_enqueue); + LOAD_KERNEL(indirect_background); + LOAD_KERNEL(shader_setup); + LOAD_KERNEL(shader_sort); + LOAD_KERNEL(shader_eval); + LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao); + LOAD_KERNEL(subsurface_scatter); + LOAD_KERNEL(direct_lighting); + LOAD_KERNEL(shadow_blocked_ao); + LOAD_KERNEL(shadow_blocked_dl); + LOAD_KERNEL(enqueue_inactive); + LOAD_KERNEL(next_iteration_setup); + LOAD_KERNEL(indirect_subsurface); + LOAD_KERNEL(buffer_update); + +#undef LOAD_KERNEL + + return true; +} + +size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size) +{ + uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024; + VLOG(1) << "Split state element size: " + << string_human_readable_number(size_per_element) << " bytes. (" + << string_human_readable_size(size_per_element) << ")."; + return max_buffer_size / size_per_element; +} + +bool DeviceSplitKernel::path_trace(DeviceTask *task, + RenderTile& tile, + device_memory& kgbuffer, + device_memory& kernel_data) +{ + if(device->have_error()) { + return false; + } + + /* Get local size */ + size_t local_size[2]; + { + int2 lsize = split_kernel_local_size(); + local_size[0] = lsize[0]; + local_size[1] = lsize[1]; + } + + /* Number of elements in the global state buffer */ + int num_global_elements = global_size[0] * global_size[1]; + + /* Allocate all required global memory once. */ + if(first_tile) { + first_tile = false; + + /* Set gloabl size */ + { + int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task); + + /* Make sure that set work size is a multiple of local + * work size dimensions. + */ + global_size[0] = round_up(gsize[0], local_size[0]); + global_size[1] = round_up(gsize[1], local_size[1]); + } + + num_global_elements = global_size[0] * global_size[1]; + assert(num_global_elements % WORK_POOL_SIZE == 0); + + /* Calculate max groups */ + + /* Denotes the maximum work groups possible w.r.t. current requested tile size. */ + unsigned int work_pool_size = (device->info.type == DEVICE_CPU) ? WORK_POOL_SIZE_CPU : WORK_POOL_SIZE_GPU; + unsigned int max_work_groups = num_global_elements / work_pool_size + 1; + + /* Allocate work_pool_wgs memory. */ + work_pool_wgs.alloc_to_device(max_work_groups); + queue_index.alloc_to_device(NUM_QUEUES); + use_queues_flag.alloc_to_device(1); + split_data.alloc_to_device(state_buffer_size(kgbuffer, kernel_data, num_global_elements)); + ray_state.alloc(num_global_elements); + } + +#define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \ + if(device->have_error()) { \ + return false; \ + } \ + if(!kernel_##name->enqueue(KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \ + return false; \ + } + + tile.sample = tile.start_sample; + + /* for exponential increase between tile updates */ + int time_multiplier = 1; + + while(tile.sample < tile.start_sample + tile.num_samples) { + /* to keep track of how long it takes to run a number of samples */ + double start_time = time_dt(); + + /* initial guess to start rolling average */ + const int initial_num_samples = 1; + /* approx number of samples per second */ + int samples_per_second = (avg_time_per_sample > 0.0) ? + int(double(time_multiplier) / avg_time_per_sample) + 1 : initial_num_samples; + + RenderTile subtile = tile; + subtile.start_sample = tile.sample; + subtile.num_samples = min(samples_per_second, tile.start_sample + tile.num_samples - tile.sample); + + if(device->have_error()) { + return false; + } + + /* reset state memory here as global size for data_init + * kernel might not be large enough to do in kernel + */ + work_pool_wgs.zero_to_device(); + split_data.zero_to_device(); + ray_state.zero_to_device(); + + if(!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size), + subtile, + num_global_elements, + kgbuffer, + kernel_data, + split_data, + ray_state, + queue_index, + use_queues_flag, + work_pool_wgs)) + { + return false; + } + + ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size); + + bool activeRaysAvailable = true; + double cancel_time = DBL_MAX; + + while(activeRaysAvailable) { + /* Do path-iteration in host [Enqueue Path-iteration kernels. */ + for(int PathIter = 0; PathIter < 16; PathIter++) { + ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shader_setup, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shader_sort, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(enqueue_inactive, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size); + + if(task->get_cancel() && cancel_time == DBL_MAX) { + /* Wait up to twice as many seconds for current samples to finish + * to avoid artifacts in render result from ending too soon. + */ + cancel_time = time_dt() + 2.0 * time_multiplier; + } + + if(time_dt() > cancel_time) { + return true; + } + } + + /* Decide if we should exit path-iteration in host. */ + ray_state.copy_from_device(0, global_size[0] * global_size[1], 1); + + activeRaysAvailable = false; + + for(int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) { + if(!IS_STATE(ray_state.data(), rayStateIter, RAY_INACTIVE)) { + if(IS_STATE(ray_state.data(), rayStateIter, RAY_INVALID)) { + /* Something went wrong, abort to avoid looping endlessly. */ + device->set_error("Split kernel error: invalid ray state"); + return false; + } + + /* Not all rays are RAY_INACTIVE. */ + activeRaysAvailable = true; + break; + } + } + + if(time_dt() > cancel_time) { + return true; + } + } + + double time_per_sample = ((time_dt()-start_time) / subtile.num_samples); + + if(avg_time_per_sample == 0.0) { + /* start rolling average */ + avg_time_per_sample = time_per_sample; + } + else { + avg_time_per_sample = alpha*time_per_sample + (1.0-alpha)*avg_time_per_sample; + } + +#undef ENQUEUE_SPLIT_KERNEL + + tile.sample += subtile.num_samples; + task->update_progress(&tile, tile.w*tile.h*subtile.num_samples); + + time_multiplier = min(time_multiplier << 1, 10); + + if(task->get_cancel()) { + return true; + } + } + + return true; +} + +CCL_NAMESPACE_END + + diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h new file mode 100644 index 00000000000..2ec0261e847 --- /dev/null +++ b/intern/cycles/device/device_split_kernel.h @@ -0,0 +1,136 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DEVICE_SPLIT_KERNEL_H__ +#define __DEVICE_SPLIT_KERNEL_H__ + +#include "device/device.h" +#include "render/buffers.h" + +CCL_NAMESPACE_BEGIN + +/* When allocate global memory in chunks. We may not be able to + * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks; + * Since some bytes may be needed for aligning chunks of memory; + * This is the amount of memory that we dedicate for that purpose. + */ +#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB + +/* Types used for split kernel */ + +class KernelDimensions { +public: + size_t global_size[2]; + size_t local_size[2]; + + KernelDimensions(size_t global_size_[2], size_t local_size_[2]) + { + memcpy(global_size, global_size_, sizeof(global_size)); + memcpy(local_size, local_size_, sizeof(local_size)); + } +}; + +class SplitKernelFunction { +public: + virtual ~SplitKernelFunction() {} + + /* enqueue the kernel, returns false if there is an error */ + virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data) = 0; +}; + +class DeviceSplitKernel { +private: + Device *device; + + SplitKernelFunction *kernel_path_init; + SplitKernelFunction *kernel_scene_intersect; + SplitKernelFunction *kernel_lamp_emission; + SplitKernelFunction *kernel_do_volume; + SplitKernelFunction *kernel_queue_enqueue; + SplitKernelFunction *kernel_indirect_background; + SplitKernelFunction *kernel_shader_setup; + SplitKernelFunction *kernel_shader_sort; + SplitKernelFunction *kernel_shader_eval; + SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao; + SplitKernelFunction *kernel_subsurface_scatter; + SplitKernelFunction *kernel_direct_lighting; + SplitKernelFunction *kernel_shadow_blocked_ao; + SplitKernelFunction *kernel_shadow_blocked_dl; + SplitKernelFunction *kernel_enqueue_inactive; + SplitKernelFunction *kernel_next_iteration_setup; + SplitKernelFunction *kernel_indirect_subsurface; + SplitKernelFunction *kernel_buffer_update; + + /* Global memory variables [porting]; These memory is used for + * co-operation between different kernels; Data written by one + * kernel will be available to another kernel via this global + * memory. + */ + device_only_memory<uchar> split_data; + device_vector<uchar> ray_state; + device_only_memory<int> queue_index; /* Array of size num_queues that tracks the size of each queue. */ + + /* Flag to make sceneintersect and lampemission kernel use queues. */ + device_only_memory<char> use_queues_flag; + + /* Approximate time it takes to complete one sample */ + double avg_time_per_sample; + + /* Work pool with respect to each work group. */ + device_only_memory<unsigned int> work_pool_wgs; + + /* Marked True in constructor and marked false at the end of path_trace(). */ + bool first_tile; + + /* Cached global size */ + size_t global_size[2]; + +public: + explicit DeviceSplitKernel(Device* device); + virtual ~DeviceSplitKernel(); + + bool load_kernels(const DeviceRequestedFeatures& requested_features); + bool path_trace(DeviceTask *task, + RenderTile& rtile, + device_memory& kgbuffer, + device_memory& kernel_data); + + virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) = 0; + size_t max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size); + + virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& kernel_globals, + device_memory& kernel_data_, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flag, + device_memory& work_pool_wgs) = 0; + + virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name, + const DeviceRequestedFeatures&) = 0; + virtual int2 split_kernel_local_size() = 0; + virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task) = 0; +}; + +CCL_NAMESPACE_END + +#endif /* __DEVICE_SPLIT_KERNEL_H__ */ + + + diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp index 48d18035c13..3c7d24fb5b7 100644 --- a/intern/cycles/device/device_task.cpp +++ b/intern/cycles/device/device_task.cpp @@ -17,12 +17,12 @@ #include <stdlib.h> #include <string.h> -#include "device_task.h" +#include "device/device_task.h" -#include "buffers.h" +#include "render/buffers.h" -#include "util_algorithm.h" -#include "util_time.h" +#include "util/util_algorithm.h" +#include "util/util_time.h" CCL_NAMESPACE_BEGIN @@ -31,7 +31,7 @@ CCL_NAMESPACE_BEGIN DeviceTask::DeviceTask(Type type_) : type(type_), x(0), y(0), w(0), h(0), rgba_byte(0), rgba_half(0), buffer(0), sample(0), num_samples(1), - shader_input(0), shader_output(0), shader_output_luma(0), + shader_input(0), shader_output(0), shader_eval_type(0), shader_filter(0), shader_x(0), shader_w(0) { last_update_time = time_dt(); @@ -56,7 +56,7 @@ int DeviceTask::get_subtask_count(int num, int max_size) if(type == SHADER) { num = min(shader_w, num); } - else if(type == PATH_TRACE) { + else if(type == RENDER) { } else { num = min(h, num); @@ -82,7 +82,7 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size) tasks.push_back(task); } } - else if(type == PATH_TRACE) { + else if(type == RENDER) { for(int i = 0; i < num; i++) tasks.push_back(*this); } @@ -103,7 +103,7 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size) void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples) { - if((type != PATH_TRACE) && + if((type != RENDER) && (type != SHADER)) return; diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index 8bd54c3d2b0..b9658eb978f 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -17,11 +17,11 @@ #ifndef __DEVICE_TASK_H__ #define __DEVICE_TASK_H__ -#include "device_memory.h" +#include "device/device_memory.h" -#include "util_function.h" -#include "util_list.h" -#include "util_task.h" +#include "util/util_function.h" +#include "util/util_list.h" +#include "util/util_task.h" CCL_NAMESPACE_BEGIN @@ -34,7 +34,7 @@ class Tile; class DeviceTask : public Task { public: - typedef enum { PATH_TRACE, FILM_CONVERT, SHADER } Type; + typedef enum { RENDER, FILM_CONVERT, SHADER } Type; Type type; int x, y, w, h; @@ -46,12 +46,14 @@ public: int offset, stride; device_ptr shader_input; - device_ptr shader_output, shader_output_luma; + device_ptr shader_output; int shader_eval_type; int shader_filter; int shader_x, shader_w; - explicit DeviceTask(Type type = PATH_TRACE); + int passes_size; + + explicit DeviceTask(Type type = RENDER); int get_subtask_count(int num, int max_size = 0); void split(list<DeviceTask>& tasks, int num, int max_size = 0); @@ -63,6 +65,16 @@ public: function<void(RenderTile&)> update_tile_sample; function<void(RenderTile&)> release_tile; function<bool(void)> get_cancel; + function<void(RenderTile*, Device*)> map_neighbor_tiles; + function<void(RenderTile*, Device*)> unmap_neighbor_tiles; + + int denoising_radius; + float denoising_strength; + float denoising_feature_strength; + bool denoising_relative_pca; + int pass_stride; + int pass_denoising_data; + int pass_denoising_clean; bool need_finish_queue; bool integrator_branched; diff --git a/intern/cycles/device/opencl/memory_manager.cpp b/intern/cycles/device/opencl/memory_manager.cpp new file mode 100644 index 00000000000..75c9de65035 --- /dev/null +++ b/intern/cycles/device/opencl/memory_manager.cpp @@ -0,0 +1,257 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_OPENCL + +#include "util/util_foreach.h" + +#include "device/opencl/opencl.h" +#include "device/opencl/memory_manager.h" + +CCL_NAMESPACE_BEGIN + +void MemoryManager::DeviceBuffer::add_allocation(Allocation& allocation) +{ + allocations.push_back(&allocation); +} + +void MemoryManager::DeviceBuffer::update_device_memory(OpenCLDeviceBase *device) +{ + bool need_realloc = false; + + /* Calculate total size and remove any freed. */ + size_t total_size = 0; + + for(int i = allocations.size()-1; i >= 0; i--) { + Allocation* allocation = allocations[i]; + + /* Remove allocations that have been freed. */ + if(!allocation->mem || allocation->mem->memory_size() == 0) { + allocation->device_buffer = NULL; + allocation->size = 0; + + allocations.erase(allocations.begin()+i); + + need_realloc = true; + + continue; + } + + /* Get actual size for allocation. */ + size_t alloc_size = align_up(allocation->mem->memory_size(), 16); + + if(allocation->size != alloc_size) { + /* Allocation is either new or resized. */ + allocation->size = alloc_size; + allocation->needs_copy_to_device = true; + + need_realloc = true; + } + + total_size += alloc_size; + } + + if(need_realloc) { + cl_ulong max_buffer_size; + clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); + + if(total_size > max_buffer_size) { + device->set_error("Scene too complex to fit in available memory."); + return; + } + + device_only_memory<uchar> *new_buffer = + new device_only_memory<uchar>(device, "memory manager buffer"); + + new_buffer->alloc_to_device(total_size); + + size_t offset = 0; + + foreach(Allocation* allocation, allocations) { + if(allocation->needs_copy_to_device) { + /* Copy from host to device. */ + opencl_device_assert(device, clEnqueueWriteBuffer(device->cqCommandQueue, + CL_MEM_PTR(new_buffer->device_pointer), + CL_FALSE, + offset, + allocation->mem->memory_size(), + allocation->mem->host_pointer, + 0, NULL, NULL + )); + + allocation->needs_copy_to_device = false; + } + else { + /* Fast copy from memory already on device. */ + opencl_device_assert(device, clEnqueueCopyBuffer(device->cqCommandQueue, + CL_MEM_PTR(buffer->device_pointer), + CL_MEM_PTR(new_buffer->device_pointer), + allocation->desc.offset, + offset, + allocation->mem->memory_size(), + 0, NULL, NULL + )); + } + + allocation->desc.offset = offset; + offset += allocation->size; + } + + delete buffer; + + buffer = new_buffer; + } + else { + assert(total_size == buffer->data_size); + + size_t offset = 0; + + foreach(Allocation* allocation, allocations) { + if(allocation->needs_copy_to_device) { + /* Copy from host to device. */ + opencl_device_assert(device, clEnqueueWriteBuffer(device->cqCommandQueue, + CL_MEM_PTR(buffer->device_pointer), + CL_FALSE, + offset, + allocation->mem->memory_size(), + allocation->mem->host_pointer, + 0, NULL, NULL + )); + + allocation->needs_copy_to_device = false; + } + + offset += allocation->size; + } + } + + /* Not really necessary, but seems to improve responsiveness for some reason. */ + clFinish(device->cqCommandQueue); +} + +void MemoryManager::DeviceBuffer::free(OpenCLDeviceBase *) +{ + buffer->free(); +} + +MemoryManager::DeviceBuffer* MemoryManager::smallest_device_buffer() +{ + DeviceBuffer* smallest = device_buffers; + + foreach(DeviceBuffer& device_buffer, device_buffers) { + if(device_buffer.size < smallest->size) { + smallest = &device_buffer; + } + } + + return smallest; +} + +MemoryManager::MemoryManager(OpenCLDeviceBase *device) +: device(device), need_update(false) +{ + foreach(DeviceBuffer& device_buffer, device_buffers) { + device_buffer.buffer = + new device_only_memory<uchar>(device, "memory manager buffer"); + } +} + +void MemoryManager::free() +{ + foreach(DeviceBuffer& device_buffer, device_buffers) { + device_buffer.free(device); + } +} + +void MemoryManager::alloc(const char *name, device_memory& mem) +{ + Allocation& allocation = allocations[name]; + + allocation.mem = &mem; + allocation.needs_copy_to_device = true; + + if(!allocation.device_buffer) { + DeviceBuffer* device_buffer = smallest_device_buffer(); + allocation.device_buffer = device_buffer; + + allocation.desc.device_buffer = device_buffer - device_buffers; + + device_buffer->add_allocation(allocation); + + device_buffer->size += mem.memory_size(); + } + + need_update = true; +} + +bool MemoryManager::free(device_memory& mem) +{ + foreach(AllocationsMap::value_type& value, allocations) { + Allocation& allocation = value.second; + if(allocation.mem == &mem) { + + allocation.device_buffer->size -= mem.memory_size(); + + allocation.mem = NULL; + allocation.needs_copy_to_device = false; + + need_update = true; + return true; + } + } + + return false; +} + +MemoryManager::BufferDescriptor MemoryManager::get_descriptor(string name) +{ + update_device_memory(); + + Allocation& allocation = allocations[name]; + return allocation.desc; +} + +void MemoryManager::update_device_memory() +{ + if(!need_update) { + return; + } + + need_update = false; + + foreach(DeviceBuffer& device_buffer, device_buffers) { + device_buffer.update_device_memory(device); + } +} + +void MemoryManager::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg) +{ + update_device_memory(); + + foreach(DeviceBuffer& device_buffer, device_buffers) { + if(device_buffer.buffer->device_pointer) { + device->kernel_set_args(kernel, (*narg)++, *device_buffer.buffer); + } + else { + device->kernel_set_args(kernel, (*narg)++, device->null_mem); + } + } +} + +CCL_NAMESPACE_END + +#endif /* WITH_OPENCL */ + diff --git a/intern/cycles/device/opencl/memory_manager.h b/intern/cycles/device/opencl/memory_manager.h new file mode 100644 index 00000000000..b3d861275f0 --- /dev/null +++ b/intern/cycles/device/opencl/memory_manager.h @@ -0,0 +1,107 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "device/device.h" + +#include "util/util_map.h" +#include "util/util_vector.h" +#include "util/util_string.h" + +#include "clew.h" + +CCL_NAMESPACE_BEGIN + +class OpenCLDeviceBase; + +class MemoryManager { +public: + static const int NUM_DEVICE_BUFFERS = 8; + + struct BufferDescriptor { + uint device_buffer; + cl_ulong offset; + }; + +private: + struct DeviceBuffer; + + struct Allocation { + device_memory *mem; + + DeviceBuffer *device_buffer; + size_t size; /* Size of actual allocation, may be larger than requested. */ + + BufferDescriptor desc; + + bool needs_copy_to_device; + + Allocation() : mem(NULL), device_buffer(NULL), size(0), needs_copy_to_device(false) + { + } + }; + + struct DeviceBuffer { + device_only_memory<uchar> *buffer; + vector<Allocation*> allocations; + size_t size; /* Size of all allocations. */ + + DeviceBuffer() + : buffer(NULL), size(0) + { + } + + ~DeviceBuffer() + { + delete buffer; + buffer = NULL; + } + + void add_allocation(Allocation& allocation); + + void update_device_memory(OpenCLDeviceBase *device); + + void free(OpenCLDeviceBase *device); + }; + + OpenCLDeviceBase *device; + + DeviceBuffer device_buffers[NUM_DEVICE_BUFFERS]; + + typedef unordered_map<string, Allocation> AllocationsMap; + AllocationsMap allocations; + + bool need_update; + + DeviceBuffer* smallest_device_buffer(); + +public: + MemoryManager(OpenCLDeviceBase *device); + + void free(); /* Free all memory. */ + + void alloc(const char *name, device_memory& mem); + bool free(device_memory& mem); + + BufferDescriptor get_descriptor(string name); + + void update_device_memory(); + void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg); +}; + +CCL_NAMESPACE_END + diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h index 4023ba89a10..85ef14ee29a 100644 --- a/intern/cycles/device/opencl/opencl.h +++ b/intern/cycles/device/opencl/opencl.h @@ -16,39 +16,42 @@ #ifdef WITH_OPENCL -#include "device.h" +#include "device/device.h" +#include "device/device_denoising.h" -#include "util_map.h" -#include "util_param.h" -#include "util_string.h" +#include "util/util_map.h" +#include "util/util_param.h" +#include "util/util_string.h" #include "clew.h" -CCL_NAMESPACE_BEGIN +#include "device/opencl/memory_manager.h" -#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p)) +CCL_NAMESPACE_BEGIN -/* Macro declarations used with split kernel */ +/* Disable workarounds, seems to be working fine on latest drivers. */ +#define CYCLES_DISABLE_DRIVER_WORKAROUNDS -/* Macro to enable/disable work-stealing */ -#define __WORK_STEALING__ +/* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workaounds for testing */ +#ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS +/* Work around AMD driver hangs by ensuring each command is finished before doing anything else. */ +# undef clEnqueueNDRangeKernel +# define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \ + CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \ + clFinish(a); -#define SPLIT_KERNEL_LOCAL_SIZE_X 64 -#define SPLIT_KERNEL_LOCAL_SIZE_Y 1 +# undef clEnqueueWriteBuffer +# define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \ + CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \ + clFinish(a); -/* This value may be tuned according to the scene we are rendering. - * - * Modifying PATH_ITER_INC_FACTOR value proportional to number of expected - * ray-bounces will improve performance. - */ -#define PATH_ITER_INC_FACTOR 8 +# undef clEnqueueReadBuffer +# define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \ + CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \ + clFinish(a); +#endif /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */ -/* When allocate global memory in chunks. We may not be able to - * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks; - * Since some bytes may be needed for aligning chunks of memory; - * This is the amount of memory that we dedicate for that purpose. - */ -#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB +#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p)) struct OpenCLPlatformDevice { OpenCLPlatformDevice(cl_platform_id platform_id, @@ -86,10 +89,65 @@ public: string *error = NULL); static bool device_version_check(cl_device_id device, string *error = NULL); - static string get_hardware_id(string platform_name, + static string get_hardware_id(const string& platform_name, cl_device_id device_id); static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices, bool force_all = false); + static bool use_single_program(); + + /* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */ + + /* Platform information. */ + static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL); + static cl_uint get_num_platforms(); + + static bool get_platforms(vector<cl_platform_id> *platform_ids, + cl_int *error = NULL); + static vector<cl_platform_id> get_platforms(); + + static bool get_platform_name(cl_platform_id platform_id, + string *platform_name); + static string get_platform_name(cl_platform_id platform_id); + + static bool get_num_platform_devices(cl_platform_id platform_id, + cl_device_type device_type, + cl_uint *num_devices, + cl_int *error = NULL); + static cl_uint get_num_platform_devices(cl_platform_id platform_id, + cl_device_type device_type); + + static bool get_platform_devices(cl_platform_id platform_id, + cl_device_type device_type, + vector<cl_device_id> *device_ids, + cl_int* error = NULL); + static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id, + cl_device_type device_type); + + /* Device information. */ + static bool get_device_name(cl_device_id device_id, + string *device_name, + cl_int* error = NULL); + + static string get_device_name(cl_device_id device_id); + + static bool get_device_type(cl_device_id device_id, + cl_device_type *device_type, + cl_int* error = NULL); + static cl_device_type get_device_type(cl_device_id device_id); + + static bool get_driver_version(cl_device_id device_id, + int *major, + int *minor, + cl_int* error = NULL); + + static int mem_sub_ptr_alignment(cl_device_id device_id); + + /* Get somewhat more readable device name. + * Main difference is AMD OpenCL here which only gives code name + * for the regular device name. This will give more sane device + * name using some extensions. + */ + static string get_readable_device_name(cl_device_id device_id); }; /* Thread safe cache for contexts and programs. @@ -168,12 +226,24 @@ public: static string get_kernel_md5(); }; +#define opencl_device_assert(device, stmt) \ + { \ + cl_int err = stmt; \ + \ + if(err != CL_SUCCESS) { \ + string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ + if((device)->error_message() == "") \ + (device)->set_error(message); \ + fprintf(stderr, "%s\n", message.c_str()); \ + } \ + } (void)0 + #define opencl_assert(stmt) \ { \ cl_int err = stmt; \ \ if(err != CL_SUCCESS) { \ - string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \ + string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ if(error_msg == "") \ error_msg = message; \ fprintf(stderr, "%s\n", message.c_str()); \ @@ -194,17 +264,17 @@ public: public: OpenCLProgram() : loaded(false), device(NULL) {} OpenCLProgram(OpenCLDeviceBase *device, - string program_name, - string kernel_name, - string kernel_build_options, + const string& program_name, + const string& kernel_name, + const string& kernel_build_options, bool use_stdout = true); ~OpenCLProgram(); void add_kernel(ustring name); void load(); - bool is_loaded() { return loaded; } - string get_log() { return log; } + bool is_loaded() const { return loaded; } + const string& get_log() const { return log; } void report_error(); cl_kernel operator()(); @@ -218,8 +288,8 @@ public: bool load_binary(const string& clbin, const string *debug_src = NULL); bool save_binary(const string& clbin); - void add_log(string msg, bool is_debug); - void add_error(string msg); + void add_log(const string& msg, bool is_debug); + void add_error(const string& msg); bool loaded; cl_program program; @@ -237,7 +307,7 @@ public: map<ustring, cl_kernel> kernels; }; - OpenCLProgram base_program; + OpenCLProgram base_program, denoising_program; typedef map<string, device_vector<uchar>*> ConstMemMap; typedef map<string, device_ptr> MemMap; @@ -248,6 +318,7 @@ public: bool device_initialized; string platform_name; + string device_name; bool opencl_error(cl_int err); void opencl_error(const string& message); @@ -266,28 +337,33 @@ public: /* Has to be implemented by the real device classes. * The base device will then load all these programs. */ - virtual void load_kernels(const DeviceRequestedFeatures& requested_features, + virtual bool load_kernels(const DeviceRequestedFeatures& requested_features, vector<OpenCLProgram*> &programs) = 0; - void mem_alloc(device_memory& mem, MemoryType type); + void mem_alloc(device_memory& mem); void mem_copy_to(device_memory& mem); void mem_copy_from(device_memory& mem, int y, int w, int h, int elem); void mem_zero(device_memory& mem); void mem_free(device_memory& mem); + + int mem_sub_ptr_alignment(); + void const_copy_to(const char *name, void *host, size_t size); - void tex_alloc(const char *name, - device_memory& mem, - InterpolationType /*interpolation*/, - ExtensionType /*extension*/); + void tex_alloc(device_memory& mem); void tex_free(device_memory& mem); size_t global_size_round_up(int group_size, int global_size); - void enqueue_kernel(cl_kernel kernel, size_t w, size_t h); + void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, + bool x_workgroups = false, + size_t max_workgroup_size = -1); void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name); + void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg); void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half); void shader(DeviceTask& task); + void denoise(RenderTile& tile, DenoisingTask& denoising, const DeviceTask& task); + class OpenCLDeviceTask : public DeviceTask { public: OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task) @@ -321,21 +397,91 @@ public: virtual void thread_run(DeviceTask * /*task*/) = 0; + virtual bool is_split_kernel() = 0; + protected: string kernel_build_options(const string *debug_src = NULL); + void mem_zero_kernel(device_ptr ptr, size_t size); + + bool denoising_non_local_means(device_ptr image_ptr, + device_ptr guide_ptr, + device_ptr variance_ptr, + device_ptr out_ptr, + DenoisingTask *task); + bool denoising_construct_transform(DenoisingTask *task); + bool denoising_reconstruct(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr output_ptr, + DenoisingTask *task); + bool denoising_combine_halves(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, int4 rect, + DenoisingTask *task); + bool denoising_divide_shadow(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, + DenoisingTask *task); + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + DenoisingTask *task); + bool denoising_detect_outliers(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr, + DenoisingTask *task); + bool denoising_set_tiles(device_ptr *buffers, + DenoisingTask *task); + + device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int size); + void mem_free_sub_ptr(device_ptr ptr); + class ArgumentWrapper { public: - ArgumentWrapper() : size(0), pointer(NULL) {} - template <typename T> + ArgumentWrapper() : size(0), pointer(NULL) + { + } + + ArgumentWrapper(device_memory& argument) : size(sizeof(void*)), + pointer((void*)(&argument.device_pointer)) + { + } + + template<typename T> + ArgumentWrapper(device_vector<T>& argument) : size(sizeof(void*)), + pointer((void*)(&argument.device_pointer)) + { + } + + template<typename T> + ArgumentWrapper(device_only_memory<T>& argument) : size(sizeof(void*)), + pointer((void*)(&argument.device_pointer)) + { + } + template<typename T> ArgumentWrapper(T& argument) : size(sizeof(argument)), - pointer(&argument) { } + pointer(&argument) + { + } + ArgumentWrapper(int argument) : size(sizeof(int)), int_value(argument), - pointer(&int_value) { } + pointer(&int_value) + { + } + ArgumentWrapper(float argument) : size(sizeof(float)), float_value(argument), - pointer(&float_value) { } + pointer(&float_value) + { + } + size_t size; int int_value; float float_value; @@ -398,6 +544,21 @@ protected: virtual string build_options_for_base_program( const DeviceRequestedFeatures& /*requested_features*/); + +private: + MemoryManager memory_manager; + friend class MemoryManager; + + static_assert_align(TextureInfo, 16); + device_vector<TextureInfo> texture_info; + + typedef map<string, device_memory*> TexturesMap; + TexturesMap textures; + + bool textures_need_update; + +protected: + void flush_texture_buffers(); }; Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background); diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index a2b900312e7..bfa2702ad62 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -16,18 +16,29 @@ #ifdef WITH_OPENCL -#include "opencl.h" +#include "device/opencl/opencl.h" -#include "kernel_types.h" +#include "kernel/kernel_types.h" -#include "util_foreach.h" -#include "util_logging.h" -#include "util_md5.h" -#include "util_path.h" -#include "util_time.h" +#include "util/util_algorithm.h" +#include "util/util_debug.h" +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_md5.h" +#include "util/util_path.h" +#include "util/util_time.h" CCL_NAMESPACE_BEGIN +struct texture_slot_t { + texture_slot_t(const string& name, int slot) + : name(name), + slot(slot) { + } + string name; + int slot; +}; + bool OpenCLDeviceBase::opencl_error(cl_int err) { if(err != CL_SUCCESS) { @@ -62,7 +73,9 @@ void OpenCLDeviceBase::opencl_assert_err(cl_int err, const char* where) } OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_) -: Device(info, stats, background_) +: Device(info, stats, background_), + memory_manager(this), + texture_info(this, "__texture_info", MEM_TEXTURE) { cpPlatform = NULL; cdDevice = NULL; @@ -70,6 +83,7 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou cqCommandQueue = NULL; null_mem = 0; device_initialized = false; + textures_need_update = true; vector<OpenCLPlatformDevice> usable_devices; OpenCLInfo::get_usable_devices(&usable_devices); @@ -82,9 +96,10 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou cpPlatform = platform_device.platform_id; cdDevice = platform_device.device_id; platform_name = platform_device.platform_name; + device_name = platform_device.device_name; VLOG(2) << "Creating new Cycles device for OpenCL platform " << platform_name << ", device " - << platform_device.device_name << "."; + << device_name << "."; { /* try to use cached context */ @@ -113,12 +128,20 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou } cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr); - if(opencl_error(ciErr)) + if(opencl_error(ciErr)) { + opencl_error("OpenCL: Error creating command queue"); return; + } null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr); - if(opencl_error(ciErr)) + if(opencl_error(ciErr)) { + opencl_error("OpenCL: Error creating memory buffer for NULL"); return; + } + + /* Allocate this right away so that texture_info is placed at offset 0 in the device memory buffers */ + texture_info.resize(1); + memory_manager.alloc("texture_info", texture_info); fprintf(stderr, "Device init success\n"); device_initialized = true; @@ -128,12 +151,13 @@ OpenCLDeviceBase::~OpenCLDeviceBase() { task_pool.stop(); + memory_manager.free(); + if(null_mem) clReleaseMemObject(CL_MEM_PTR(null_mem)); ConstMemMap::iterator mt; for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) { - mem_free(*(mt->second)); delete mt->second; } @@ -147,10 +171,8 @@ OpenCLDeviceBase::~OpenCLDeviceBase() void CL_CALLBACK OpenCLDeviceBase::context_notify_callback(const char *err_info, const void * /*private_info*/, size_t /*cb*/, void *user_data) { - char name[256]; - clGetDeviceInfo((cl_device_id)user_data, CL_DEVICE_NAME, sizeof(name), &name, NULL); - - fprintf(stderr, "OpenCL error (%s): %s\n", name, err_info); + string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data); + fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info); } bool OpenCLDeviceBase::opencl_version_check() @@ -191,6 +213,8 @@ string OpenCLDeviceBase::device_md5_hash(string kernel_custom_build_options) bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_features) { + VLOG(2) << "Loading kernels for platform " << platform_name + << ", device " << device_name << "."; /* Verify if device was initialized. */ if(!device_initialized) { fprintf(stderr, "OpenCL: failed to initialize device.\n"); @@ -204,13 +228,33 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea base_program = OpenCLProgram(this, "base", "kernel.cl", build_options_for_base_program(requested_features)); base_program.add_kernel(ustring("convert_to_byte")); base_program.add_kernel(ustring("convert_to_half_float")); - base_program.add_kernel(ustring("shader")); + base_program.add_kernel(ustring("displace")); + base_program.add_kernel(ustring("background")); base_program.add_kernel(ustring("bake")); + base_program.add_kernel(ustring("zero_buffer")); + + denoising_program = OpenCLProgram(this, "denoising", "filter.cl", ""); + denoising_program.add_kernel(ustring("filter_divide_shadow")); + denoising_program.add_kernel(ustring("filter_get_feature")); + denoising_program.add_kernel(ustring("filter_detect_outliers")); + denoising_program.add_kernel(ustring("filter_combine_halves")); + denoising_program.add_kernel(ustring("filter_construct_transform")); + denoising_program.add_kernel(ustring("filter_nlm_calc_difference")); + denoising_program.add_kernel(ustring("filter_nlm_blur")); + denoising_program.add_kernel(ustring("filter_nlm_calc_weight")); + denoising_program.add_kernel(ustring("filter_nlm_update_output")); + denoising_program.add_kernel(ustring("filter_nlm_normalize")); + denoising_program.add_kernel(ustring("filter_nlm_construct_gramian")); + denoising_program.add_kernel(ustring("filter_finalize")); + denoising_program.add_kernel(ustring("filter_set_tiles")); vector<OpenCLProgram*> programs; programs.push_back(&base_program); + programs.push_back(&denoising_program); /* Call actual class to fill the vector with its programs. */ - load_kernels(requested_features, programs); + if(!load_kernels(requested_features, programs)) { + return false; + } /* Parallel compilation is supported by Cycles, but currently all OpenCL frameworks * serialize the calls internally, so it's not much use right now. @@ -242,17 +286,40 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea return true; } -void OpenCLDeviceBase::mem_alloc(device_memory& mem, MemoryType type) +void OpenCLDeviceBase::mem_alloc(device_memory& mem) { + if(mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + } + size_t size = mem.memory_size(); + /* check there is enough memory available for the allocation */ + cl_ulong max_alloc_size = 0; + clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL); + + if(DebugFlags().opencl.mem_limit) { + max_alloc_size = min(max_alloc_size, + cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used)); + } + + if(size > max_alloc_size) { + string error = "Scene too complex to fit in available memory."; + if(mem.name != NULL) { + error += string_printf(" (allocating buffer %s failed.)", mem.name); + } + set_error(error); + + return; + } + cl_mem_flags mem_flag; void *mem_ptr = NULL; - if(type == MEM_READ_ONLY) + if(mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE) mem_flag = CL_MEM_READ_ONLY; - else if(type == MEM_WRITE_ONLY) - mem_flag = CL_MEM_WRITE_ONLY; else mem_flag = CL_MEM_READ_WRITE; @@ -279,17 +346,27 @@ void OpenCLDeviceBase::mem_alloc(device_memory& mem, MemoryType type) void OpenCLDeviceBase::mem_copy_to(device_memory& mem) { - /* this is blocking */ - size_t size = mem.memory_size(); - if(size != 0) { - opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, - CL_MEM_PTR(mem.device_pointer), - CL_TRUE, - 0, - size, - (void*)mem.data_pointer, - 0, - NULL, NULL)); + if(mem.type == MEM_TEXTURE) { + tex_free(mem); + tex_alloc(mem); + } + else { + if(!mem.device_pointer) { + mem_alloc(mem); + } + + /* this is blocking */ + size_t size = mem.memory_size(); + if(size != 0) { + opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, + CL_MEM_PTR(mem.device_pointer), + CL_TRUE, + 0, + size, + mem.host_pointer, + 0, + NULL, NULL)); + } } } @@ -303,76 +380,178 @@ void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, in CL_TRUE, offset, size, - (uchar*)mem.data_pointer + offset, + (uchar*)mem.host_pointer + offset, 0, NULL, NULL)); } +void OpenCLDeviceBase::mem_zero_kernel(device_ptr mem, size_t size) +{ + cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); + + size_t global_size[] = {1024, 1024}; + size_t num_threads = global_size[0] * global_size[1]; + + cl_mem d_buffer = CL_MEM_PTR(mem); + cl_ulong d_offset = 0; + cl_ulong d_size = 0; + + while(d_offset < size) { + d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset); + + kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); + + ciErr = clEnqueueNDRangeKernel(cqCommandQueue, + ckZeroBuffer, + 2, + NULL, + global_size, + NULL, + 0, + NULL, + NULL); + opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); + + d_offset += d_size; + } +} + void OpenCLDeviceBase::mem_zero(device_memory& mem) { + if(!mem.device_pointer) { + mem_alloc(mem); + } + if(mem.device_pointer) { - memset((void*)mem.data_pointer, 0, mem.memory_size()); - mem_copy_to(mem); + if(base_program.is_loaded()) { + mem_zero_kernel(mem.device_pointer, mem.memory_size()); + } + + if(mem.host_pointer) { + memset(mem.host_pointer, 0, mem.memory_size()); + } + + if(!base_program.is_loaded()) { + void* zero = mem.host_pointer; + + if(!mem.host_pointer) { + zero = util_aligned_malloc(mem.memory_size(), 16); + memset(zero, 0, mem.memory_size()); + } + + opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, + CL_MEM_PTR(mem.device_pointer), + CL_TRUE, + 0, + mem.memory_size(), + zero, + 0, + NULL, NULL)); + + if(!mem.host_pointer) { + util_aligned_free(zero); + } + } } } void OpenCLDeviceBase::mem_free(device_memory& mem) { - if(mem.device_pointer) { - if(mem.device_pointer != null_mem) { - opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer))); + if(mem.type == MEM_TEXTURE) { + tex_free(mem); + } + else { + if(mem.device_pointer) { + if(mem.device_pointer != null_mem) { + opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer))); + } + mem.device_pointer = 0; + + stats.mem_free(mem.device_size); + mem.device_size = 0; } - mem.device_pointer = 0; + } +} + +int OpenCLDeviceBase::mem_sub_ptr_alignment() +{ + return OpenCLInfo::mem_sub_ptr_alignment(cdDevice); +} + +device_ptr OpenCLDeviceBase::mem_alloc_sub_ptr(device_memory& mem, int offset, int size) +{ + cl_mem_flags mem_flag; + if(mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE) + mem_flag = CL_MEM_READ_ONLY; + else + mem_flag = CL_MEM_READ_WRITE; + + cl_buffer_region info; + info.origin = mem.memory_elements_size(offset); + info.size = mem.memory_elements_size(size); + + device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer), + mem_flag, + CL_BUFFER_CREATE_TYPE_REGION, + &info, + &ciErr); + opencl_assert_err(ciErr, "clCreateSubBuffer"); + return sub_buf; +} - stats.mem_free(mem.device_size); - mem.device_size = 0; +void OpenCLDeviceBase::mem_free_sub_ptr(device_ptr device_pointer) +{ + if(device_pointer && device_pointer != null_mem) { + opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer))); } } void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size) { ConstMemMap::iterator i = const_mem_map.find(name); + device_vector<uchar> *data; if(i == const_mem_map.end()) { - device_vector<uchar> *data = new device_vector<uchar>(); - data->copy((uchar*)host, size); - - mem_alloc(*data, MEM_READ_ONLY); - i = const_mem_map.insert(ConstMemMap::value_type(name, data)).first; + data = new device_vector<uchar>(this, name, MEM_READ_ONLY); + data->alloc(size); + const_mem_map.insert(ConstMemMap::value_type(name, data)); } else { - device_vector<uchar> *data = i->second; - data->copy((uchar*)host, size); + data = i->second; } - mem_copy_to(*i->second); + memcpy(data->data(), host, size); + data->copy_to_device(); } -void OpenCLDeviceBase::tex_alloc(const char *name, - device_memory& mem, - InterpolationType /*interpolation*/, - ExtensionType /*extension*/) +void OpenCLDeviceBase::tex_alloc(device_memory& mem) { - VLOG(1) << "Texture allocate: " << name << ", " + VLOG(1) << "Texture allocate: " << mem.name << ", " << string_human_readable_number(mem.memory_size()) << " bytes. (" << string_human_readable_size(mem.memory_size()) << ")"; - mem_alloc(mem, MEM_READ_ONLY); - mem_copy_to(mem); - assert(mem_map.find(name) == mem_map.end()); - mem_map.insert(MemMap::value_type(name, mem.device_pointer)); + + memory_manager.alloc(mem.name, mem); + /* Set the pointer to non-null to keep code that inspects its value from thinking its unallocated. */ + mem.device_pointer = 1; + textures[mem.name] = &mem; + textures_need_update = true; } void OpenCLDeviceBase::tex_free(device_memory& mem) { if(mem.device_pointer) { - foreach(const MemMap::value_type& value, mem_map) { - if(value.second == mem.device_pointer) { - mem_map.erase(value.first); + mem.device_pointer = 0; + + if(memory_manager.free(mem)) { + textures_need_update = true; + } + + foreach(TexturesMap::value_type& value, textures) { + if(value.second == &mem) { + textures.erase(value.first); break; } } - - mem_free(mem); } } @@ -382,7 +561,7 @@ size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size) return global_size + ((r == 0)? 0: group_size - r); } -void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h) +void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size) { size_t workgroup_size, max_work_items[3]; @@ -391,9 +570,20 @@ void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h) clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL); + if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) { + workgroup_size = max_workgroup_size; + } + /* Try to divide evenly over 2 dimensions. */ - size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); - size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size}; + size_t local_size[2]; + if(x_workgroups) { + local_size[0] = workgroup_size; + local_size[1] = 1; + } + else { + size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); + local_size[0] = local_size[1] = sqrt_workgroup_size; + } /* Some implementations have max size 1 on 2nd dimension. */ if(local_size[1] > max_work_items[1]) { @@ -434,6 +624,76 @@ void OpenCLDeviceBase::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr)); } +void OpenCLDeviceBase::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg) +{ + flush_texture_buffers(); + + memory_manager.set_kernel_arg_buffers(kernel, narg); +} + +void OpenCLDeviceBase::flush_texture_buffers() +{ + if(!textures_need_update) { + return; + } + textures_need_update = false; + + /* Setup slots for textures. */ + int num_slots = 0; + + vector<texture_slot_t> texture_slots; + +#define KERNEL_TEX(type, name) \ + if(textures.find(#name) != textures.end()) { \ + texture_slots.push_back(texture_slot_t(#name, num_slots)); \ + } \ + num_slots++; +#include "kernel/kernel_textures.h" + + int num_data_slots = num_slots; + + foreach(TexturesMap::value_type& tex, textures) { + string name = tex.first; + + if(string_startswith(name, "__tex_image")) { + int pos = name.rfind("_"); + int id = atoi(name.data() + pos + 1); + texture_slots.push_back(texture_slot_t(name, + num_data_slots + id)); + num_slots = max(num_slots, num_data_slots + id + 1); + } + } + + /* Realloc texture descriptors buffer. */ + memory_manager.free(texture_info); + texture_info.resize(num_slots); + memory_manager.alloc("texture_info", texture_info); + + /* Fill in descriptors */ + foreach(texture_slot_t& slot, texture_slots) { + TextureInfo& info = texture_info[slot.slot]; + + MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name); + info.data = desc.offset; + info.cl_buffer = desc.device_buffer; + + if(string_startswith(slot.name, "__tex_image")) { + device_memory *mem = textures[slot.name]; + + info.width = mem->data_width; + info.height = mem->data_height; + info.depth = mem->data_depth; + + info.interpolation = mem->interpolation; + info.extension = mem->extension; + } + } + + /* Force write of descriptors. */ + memory_manager.free(texture_info); + memory_manager.alloc("texture_info", texture_info); +} + void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half) { /* cast arguments to cl types */ @@ -458,10 +718,7 @@ void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_ d_rgba, d_buffer); -#define KERNEL_TEX(type, ttype, name) \ -set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name); -#include "kernel_textures.h" -#undef KERNEL_TEX + set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index); start_arg_index += kernel_set_args(ckFilmConvertKernel, start_arg_index, @@ -476,13 +733,381 @@ set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name); enqueue_kernel(ckFilmConvertKernel, d_w, d_h); } +bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr, + device_ptr guide_ptr, + device_ptr variance_ptr, + device_ptr out_ptr, + DenoisingTask *task) +{ + + int stride = task->buffer.stride; + int w = task->buffer.width; + int h = task->buffer.h; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + int shift_stride = stride*h; + int num_shifts = (2*r+1)*(2*r+1); + int mem_size = sizeof(float)*shift_stride*num_shifts; + + cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr); + + cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr); + opencl_assert_err(ciErr, "clCreateBuffer denoising_non_local_means"); + cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr); + opencl_assert_err(ciErr, "clCreateBuffer denoising_non_local_means"); + + cl_mem image_mem = CL_MEM_PTR(image_ptr); + cl_mem guide_mem = CL_MEM_PTR(guide_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + cl_mem out_mem = CL_MEM_PTR(out_ptr); + + mem_zero_kernel(task->nlm_state.temporary_3_ptr, sizeof(float)*w*h); + mem_zero_kernel(out_ptr, sizeof(float)*w*h); + + cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); + cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); + cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); + cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output")); + cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize")); + + kernel_set_args(ckNLMCalcDifference, 0, + guide_mem, + variance_mem, + difference, + w, h, stride, + shift_stride, + r, 0, a, k_2); + kernel_set_args(ckNLMBlur, 0, + difference, + blurDifference, + w, h, stride, + shift_stride, + r, f); + kernel_set_args(ckNLMCalcWeight, 0, + blurDifference, + difference, + w, h, stride, + shift_stride, + r, f); + kernel_set_args(ckNLMUpdateOutput, 0, + blurDifference, + image_mem, + out_mem, + weightAccum, + w, h, stride, + shift_stride, + r, f); + + enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); + enqueue_kernel(ckNLMCalcWeight, w*h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); + enqueue_kernel(ckNLMUpdateOutput, w*h, num_shifts, true); + + opencl_assert(clReleaseMemObject(difference)); + opencl_assert(clReleaseMemObject(blurDifference)); + + kernel_set_args(ckNLMNormalize, 0, + out_mem, weightAccum, w, h, stride); + enqueue_kernel(ckNLMNormalize, w, h); + + return true; +} + +bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task) +{ + cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); + cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); + cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + + cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform")); + + kernel_set_args(ckFilterConstructTransform, 0, + buffer_mem, + transform_mem, + rank_mem, + task->filter_area, + task->rect, + task->buffer.pass_stride, + task->radius, + task->pca_threshold); + + enqueue_kernel(ckFilterConstructTransform, + task->storage.w, + task->storage.h, + 256); + + return true; +} + +bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr output_ptr, + DenoisingTask *task) +{ + mem_zero(task->storage.XtWX); + mem_zero(task->storage.XtWY); + + cl_mem color_mem = CL_MEM_PTR(color_ptr); + cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr); + cl_mem output_mem = CL_MEM_PTR(output_ptr); + + cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); + cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); + cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); + cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); + + cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); + cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); + cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); + cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian")); + cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); + + int w = task->reconstruction_state.source_w; + int h = task->reconstruction_state.source_h; + int stride = task->buffer.stride; + + int shift_stride = stride*h; + int num_shifts = (2*task->radius + 1)*(2*task->radius + 1); + int mem_size = sizeof(float)*shift_stride*num_shifts; + + cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr); + opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct"); + cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr); + opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct"); + + kernel_set_args(ckNLMCalcDifference, 0, + color_mem, + color_variance_mem, + difference, + w, h, stride, + shift_stride, + task->radius, + task->buffer.pass_stride, + 1.0f, task->nlm_k_2); + kernel_set_args(ckNLMBlur, 0, + difference, + blurDifference, + w, h, stride, + shift_stride, + task->radius, 4); + kernel_set_args(ckNLMCalcWeight, 0, + blurDifference, + difference, + w, h, stride, + shift_stride, + task->radius, 4); + kernel_set_args(ckNLMConstructGramian, 0, + blurDifference, + buffer_mem, + transform_mem, + rank_mem, + XtWX_mem, + XtWY_mem, + task->reconstruction_state.filter_window, + w, h, stride, + shift_stride, + task->radius, 4, + task->buffer.pass_stride); + + enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); + enqueue_kernel(ckNLMCalcWeight, w*h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); + enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256); + + opencl_assert(clReleaseMemObject(difference)); + opencl_assert(clReleaseMemObject(blurDifference)); + + kernel_set_args(ckFinalize, 0, + output_mem, + rank_mem, + XtWX_mem, + XtWY_mem, + task->filter_area, + task->reconstruction_state.buffer_params, + task->render_buffer.samples); + enqueue_kernel(ckFinalize, w, h); + + return true; +} + +bool OpenCLDeviceBase::denoising_combine_halves(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, int4 rect, + DenoisingTask *task) +{ + cl_mem a_mem = CL_MEM_PTR(a_ptr); + cl_mem b_mem = CL_MEM_PTR(b_ptr); + cl_mem mean_mem = CL_MEM_PTR(mean_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + + cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves")); + + kernel_set_args(ckFilterCombineHalves, 0, + mean_mem, + variance_mem, + a_mem, + b_mem, + rect, + r); + enqueue_kernel(ckFilterCombineHalves, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, + DenoisingTask *task) +{ + cl_mem a_mem = CL_MEM_PTR(a_ptr); + cl_mem b_mem = CL_MEM_PTR(b_ptr); + cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr); + cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr); + cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr); + + cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer); + + cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow")); + + kernel_set_args(ckFilterDivideShadow, 0, + task->render_buffer.samples, + tiles_mem, + a_mem, + b_mem, + sample_variance_mem, + sv_variance_mem, + buffer_variance_mem, + task->rect, + task->render_buffer.pass_stride, + task->render_buffer.denoising_data_offset); + enqueue_kernel(ckFilterDivideShadow, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDeviceBase::denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + DenoisingTask *task) +{ + cl_mem mean_mem = CL_MEM_PTR(mean_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + + cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer); + + cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature")); + + kernel_set_args(ckFilterGetFeature, 0, + task->render_buffer.samples, + tiles_mem, + mean_offset, + variance_offset, + mean_mem, + variance_mem, + task->rect, + task->render_buffer.pass_stride, + task->render_buffer.denoising_data_offset); + enqueue_kernel(ckFilterGetFeature, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr, + DenoisingTask *task) +{ + cl_mem image_mem = CL_MEM_PTR(image_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + cl_mem depth_mem = CL_MEM_PTR(depth_ptr); + cl_mem output_mem = CL_MEM_PTR(output_ptr); + + cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers")); + + kernel_set_args(ckFilterDetectOutliers, 0, + image_mem, + variance_mem, + depth_mem, + output_mem, + task->rect, + task->buffer.pass_stride); + enqueue_kernel(ckFilterDetectOutliers, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + return true; +} + +bool OpenCLDeviceBase::denoising_set_tiles(device_ptr *buffers, + DenoisingTask *task) +{ + task->tiles_mem.copy_to_device(); + + cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer); + + cl_kernel ckFilterSetTiles = denoising_program(ustring("filter_set_tiles")); + + kernel_set_args(ckFilterSetTiles, 0, tiles_mem); + for(int i = 0; i < 9; i++) { + cl_mem buffer_mem = CL_MEM_PTR(buffers[i]); + kernel_set_args(ckFilterSetTiles, i+1, buffer_mem); + } + + enqueue_kernel(ckFilterSetTiles, 1, 1); + + return true; +} + +void OpenCLDeviceBase::denoise(RenderTile &rtile, DenoisingTask& denoising, const DeviceTask &task) +{ + denoising.functions.set_tiles = function_bind(&OpenCLDeviceBase::denoising_set_tiles, this, _1, &denoising); + denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising); + denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, &denoising); + denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + denoising.functions.detect_outliers = function_bind(&OpenCLDeviceBase::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); + + denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); + denoising.render_buffer.samples = rtile.sample; + + RenderTile rtiles[9]; + rtiles[4] = rtile; + task.map_neighbor_tiles(rtiles, this); + denoising.tiles_from_rendertiles(rtiles); + + denoising.init_from_devicetask(task); + + denoising.run_denoising(); + + task.unmap_neighbor_tiles(rtiles, this); +} + void OpenCLDeviceBase::shader(DeviceTask& task) { /* cast arguments to cl types */ cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); cl_mem d_input = CL_MEM_PTR(task.shader_input); cl_mem d_output = CL_MEM_PTR(task.shader_output); - cl_mem d_output_luma = CL_MEM_PTR(task.shader_output_luma); cl_int d_shader_eval_type = task.shader_eval_type; cl_int d_shader_filter = task.shader_filter; cl_int d_shader_x = task.shader_x; @@ -491,10 +1116,15 @@ void OpenCLDeviceBase::shader(DeviceTask& task) cl_kernel kernel; - if(task.shader_eval_type >= SHADER_EVAL_BAKE) + if(task.shader_eval_type >= SHADER_EVAL_BAKE) { kernel = base_program(ustring("bake")); - else - kernel = base_program(ustring("shader")); + } + else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) { + kernel = base_program(ustring("displace")); + } + else { + kernel = base_program(ustring("background")); + } cl_uint start_arg_index = kernel_set_args(kernel, @@ -503,16 +1133,7 @@ void OpenCLDeviceBase::shader(DeviceTask& task) d_input, d_output); - if(task.shader_eval_type < SHADER_EVAL_BAKE) { - start_arg_index += kernel_set_args(kernel, - start_arg_index, - d_output_luma); - } - -#define KERNEL_TEX(type, ttype, name) \ - set_kernel_arg_mem(kernel, &start_arg_index, #name); -#include "kernel_textures.h" -#undef KERNEL_TEX + set_kernel_arg_buffers(kernel, &start_arg_index); start_arg_index += kernel_set_args(kernel, start_arg_index, @@ -545,7 +1166,7 @@ void OpenCLDeviceBase::shader(DeviceTask& task) string OpenCLDeviceBase::kernel_build_options(const string *debug_src) { - string build_options = "-cl-fast-relaxed-math "; + string build_options = "-cl-no-signed-zeros -cl-mad-enable "; if(platform_name == "NVIDIA CUDA") { build_options += "-D__KERNEL_OPENCL_NVIDIA__ " @@ -725,7 +1346,7 @@ void OpenCLDeviceBase::store_cached_kernel( } string OpenCLDeviceBase::build_options_for_base_program( - const DeviceRequestedFeatures& /*requested_features*/) + const DeviceRequestedFeatures& requested_features) { /* TODO(sergey): By default we compile all features, meaning * mega kernel is not getting feature-based optimizations. @@ -733,6 +1354,14 @@ string OpenCLDeviceBase::build_options_for_base_program( * Ideally we need always compile kernel with as less features * enabled as possible to keep performance at it's max. */ + + /* For now disable baking when not in use as this has major + * impact on kernel build times. + */ + if(!requested_features.use_baking) { + return "-D__NO_BAKING__"; + } + return ""; } diff --git a/intern/cycles/device/opencl/opencl_mega.cpp b/intern/cycles/device/opencl/opencl_mega.cpp index 6ea7619e022..ef39cfb5f7d 100644 --- a/intern/cycles/device/opencl/opencl_mega.cpp +++ b/intern/cycles/device/opencl/opencl_mega.cpp @@ -16,15 +16,15 @@ #ifdef WITH_OPENCL -#include "opencl.h" +#include "device/opencl/opencl.h" -#include "buffers.h" +#include "render/buffers.h" -#include "kernel_types.h" +#include "kernel/kernel_types.h" -#include "util_md5.h" -#include "util_path.h" -#include "util_time.h" +#include "util/util_md5.h" +#include "util/util_path.h" +#include "util/util_time.h" CCL_NAMESPACE_BEGIN @@ -43,11 +43,12 @@ public: return true; } - virtual void load_kernels(const DeviceRequestedFeatures& /*requested_features*/, + virtual bool load_kernels(const DeviceRequestedFeatures& /*requested_features*/, vector<OpenCLProgram*> &programs) { path_trace_program.add_kernel(ustring("path_trace")); programs.push_back(&path_trace_program); + return true; } ~OpenCLDeviceMegaKernel() @@ -58,10 +59,11 @@ public: void path_trace(RenderTile& rtile, int sample) { + scoped_timer timer(&rtile.buffers->render_time); + /* Cast arguments to cl types. */ cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); cl_mem d_buffer = CL_MEM_PTR(rtile.buffer); - cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state); cl_int d_x = rtile.x; cl_int d_y = rtile.y; cl_int d_w = rtile.w; @@ -78,13 +80,9 @@ public: kernel_set_args(ckPathTraceKernel, 0, d_data, - d_buffer, - d_rng_state); + d_buffer); -#define KERNEL_TEX(type, ttype, name) \ - set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, #name); -#include "kernel_textures.h" -#undef KERNEL_TEX + set_kernel_arg_buffers(ckPathTraceKernel, &start_arg_index); start_arg_index += kernel_set_args(ckPathTraceKernel, start_arg_index, @@ -107,41 +105,55 @@ public: else if(task->type == DeviceTask::SHADER) { shader(*task); } - else if(task->type == DeviceTask::PATH_TRACE) { + else if(task->type == DeviceTask::RENDER) { RenderTile tile; + DenoisingTask denoising(this); + /* Keep rendering tiles until done. */ while(task->acquire_tile(this, tile)) { - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; + if(tile.task == RenderTile::PATH_TRACE) { + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; - for(int sample = start_sample; sample < end_sample; sample++) { - if(task->get_cancel()) { - if(task->need_finish_queue == false) - break; - } + for(int sample = start_sample; sample < end_sample; sample++) { + if(task->get_cancel()) { + if(task->need_finish_queue == false) + break; + } + + path_trace(tile, sample); - path_trace(tile, sample); + tile.sample = sample + 1; - tile.sample = sample + 1; + task->update_progress(&tile, tile.w*tile.h); + } + /* Complete kernel execution before release tile */ + /* This helps in multi-device render; + * The device that reaches the critical-section function + * release_tile waits (stalling other devices from entering + * release_tile) for all kernels to complete. If device1 (a + * slow-render device) reaches release_tile first then it would + * stall device2 (a fast-render device) from proceeding to render + * next tile. + */ + clFinish(cqCommandQueue); + } + else if(tile.task == RenderTile::DENOISE) { + tile.sample = tile.start_sample + tile.num_samples; + denoise(tile, denoising, *task); task->update_progress(&tile, tile.w*tile.h); } - /* Complete kernel execution before release tile */ - /* This helps in multi-device render; - * The device that reaches the critical-section function - * release_tile waits (stalling other devices from entering - * release_tile) for all kernels to complete. If device1 (a - * slow-render device) reaches release_tile first then it would - * stall device2 (a fast-render device) from proceeding to render - * next tile. - */ - clFinish(cqCommandQueue); - task->release_tile(tile); } } } + + bool is_split_kernel() + { + return false; + } }; Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background) diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp index 3c3c2150128..51d3c7bb10f 100644 --- a/intern/cycles/device/opencl/opencl_split.cpp +++ b/intern/cycles/device/opencl/opencl_split.cpp @@ -16,1290 +16,441 @@ #ifdef WITH_OPENCL -#include "opencl.h" +#include "device/opencl/opencl.h" -#include "buffers.h" +#include "render/buffers.h" -#include "kernel_types.h" +#include "kernel/kernel_types.h" +#include "kernel/split/kernel_split_data_types.h" -#include "util_md5.h" -#include "util_path.h" -#include "util_time.h" +#include "device/device_split_kernel.h" + +#include "util/util_algorithm.h" +#include "util/util_debug.h" +#include "util/util_logging.h" +#include "util/util_md5.h" +#include "util/util_path.h" +#include "util/util_time.h" CCL_NAMESPACE_BEGIN -/* TODO(sergey): This is to keep tile split on OpenCL level working - * for now, since without this view-port render does not work as it - * should. - * - * Ideally it'll be done on the higher level, but we need to get ready - * for merge rather soon, so let's keep split logic private here in - * the file. +class OpenCLSplitKernel; + +namespace { + +/* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to + * fetch its size. */ -class SplitRenderTile : public RenderTile { -public: - SplitRenderTile() - : RenderTile(), - buffer_offset_x(0), - buffer_offset_y(0), - rng_state_offset_x(0), - rng_state_offset_y(0), - buffer_rng_state_stride(0) {} - - explicit SplitRenderTile(RenderTile& tile) - : RenderTile(), - buffer_offset_x(0), - buffer_offset_y(0), - rng_state_offset_x(0), - rng_state_offset_y(0), - buffer_rng_state_stride(0) - { - x = tile.x; - y = tile.y; - w = tile.w; - h = tile.h; - start_sample = tile.start_sample; - num_samples = tile.num_samples; - sample = tile.sample; - resolution = tile.resolution; - offset = tile.offset; - stride = tile.stride; - buffer = tile.buffer; - rng_state = tile.rng_state; - buffers = tile.buffers; +typedef struct KernelGlobalsDummy { + ccl_constant KernelData *data; + ccl_global char *buffers[8]; + +#define KERNEL_TEX(type, name) \ + TextureInfo name; +# include "kernel/kernel_textures.h" +#undef KERNEL_TEX + SplitData split_data; + SplitParams split_param_data; +} KernelGlobalsDummy; + +} // namespace + +static string get_build_options(OpenCLDeviceBase *device, const DeviceRequestedFeatures& requested_features) +{ + string build_options = "-D__SPLIT_KERNEL__ "; + build_options += requested_features.get_build_options(); + + /* Set compute device build option. */ + cl_device_type device_type; + OpenCLInfo::get_device_type(device->cdDevice, &device_type, &device->ciErr); + assert(device->ciErr == CL_SUCCESS); + if(device_type == CL_DEVICE_TYPE_GPU) { + build_options += " -D__COMPUTE_DEVICE_GPU__"; } - /* Split kernel is device global memory constrained; - * hence split kernel cant render big tile size's in - * one go. If the user sets a big tile size (big tile size - * is a term relative to the available device global memory), - * we split the tile further and then call path_trace on - * each of those split tiles. The following variables declared, - * assist in achieving that purpose - */ - int buffer_offset_x; - int buffer_offset_y; - int rng_state_offset_x; - int rng_state_offset_y; - int buffer_rng_state_stride; -}; + return build_options; +} /* OpenCLDeviceSplitKernel's declaration/definition. */ class OpenCLDeviceSplitKernel : public OpenCLDeviceBase { public: - /* Kernel declaration. */ + DeviceSplitKernel *split_kernel; OpenCLProgram program_data_init; - OpenCLProgram program_scene_intersect; - OpenCLProgram program_lamp_emission; - OpenCLProgram program_queue_enqueue; - OpenCLProgram program_background_buffer_update; - OpenCLProgram program_shader_eval; - OpenCLProgram program_holdout_emission_blurring_pathtermination_ao; - OpenCLProgram program_direct_lighting; - OpenCLProgram program_shadow_blocked; - OpenCLProgram program_next_iteration_setup; - OpenCLProgram program_sum_all_radiance; - - /* Global memory variables [porting]; These memory is used for - * co-operation between different kernels; Data written by one - * kernel will be available to another kernel via this global - * memory. - */ - cl_mem rng_coop; - cl_mem throughput_coop; - cl_mem L_transparent_coop; - cl_mem PathRadiance_coop; - cl_mem Ray_coop; - cl_mem PathState_coop; - cl_mem Intersection_coop; - cl_mem kgbuffer; /* KernelGlobals buffer. */ - - /* Global buffers for ShaderData. */ - cl_mem sd; /* ShaderData used in the main path-iteration loop. */ - cl_mem sd_DL_shadow; /* ShaderData used in Direct Lighting and - * shadow_blocked kernel. - */ - - /* Global memory required for shadow blocked and accum_radiance. */ - cl_mem BSDFEval_coop; - cl_mem ISLamp_coop; - cl_mem LightRay_coop; - cl_mem AOAlpha_coop; - cl_mem AOBSDF_coop; - cl_mem AOLightRay_coop; - cl_mem Intersection_coop_shadow; - -#ifdef WITH_CYCLES_DEBUG - /* DebugData memory */ - cl_mem debugdata_coop; -#endif - - /* Global state array that tracks ray state. */ - cl_mem ray_state; - - /* Per sample buffers. */ - cl_mem per_sample_output_buffers; - - /* Denotes which sample each ray is being processed for. */ - cl_mem work_array; - - /* Queue */ - cl_mem Queue_data; /* Array of size queuesize * num_queues * sizeof(int). */ - cl_mem Queue_index; /* Array of size num_queues * sizeof(int); - * Tracks the size of each queue. - */ - - /* Flag to make sceneintersect and lampemission kernel use queues. */ - cl_mem use_queues_flag; - - /* Amount of memory in output buffer associated with one pixel/thread. */ - size_t per_thread_output_buffer_size; - - /* Total allocatable available device memory. */ - size_t total_allocatable_memory; - - /* host version of ray_state; Used in checking host path-iteration - * termination. - */ - char *hostRayStateArray; - - /* Number of path-iterations to be done in one shot. */ - unsigned int PathIteration_times; - -#ifdef __WORK_STEALING__ - /* Work pool with respect to each work group. */ - cl_mem work_pool_wgs; - - /* Denotes the maximum work groups possible w.r.t. current tile size. */ - unsigned int max_work_groups; -#endif - - /* clos_max value for which the kernels have been loaded currently. */ - int current_max_closure; - - /* Marked True in constructor and marked false at the end of path_trace(). */ - bool first_tile; - - OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_) - : OpenCLDeviceBase(info, stats, background_) + OpenCLProgram program_state_buffer_size; + + OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_); + + ~OpenCLDeviceSplitKernel() { - background = background_; - - /* Initialize cl_mem variables. */ - kgbuffer = NULL; - sd = NULL; - sd_DL_shadow = NULL; - - rng_coop = NULL; - throughput_coop = NULL; - L_transparent_coop = NULL; - PathRadiance_coop = NULL; - Ray_coop = NULL; - PathState_coop = NULL; - Intersection_coop = NULL; - ray_state = NULL; - - AOAlpha_coop = NULL; - AOBSDF_coop = NULL; - AOLightRay_coop = NULL; - BSDFEval_coop = NULL; - ISLamp_coop = NULL; - LightRay_coop = NULL; - Intersection_coop_shadow = NULL; - -#ifdef WITH_CYCLES_DEBUG - debugdata_coop = NULL; -#endif - - work_array = NULL; - - /* Queue. */ - Queue_data = NULL; - Queue_index = NULL; - use_queues_flag = NULL; - - per_sample_output_buffers = NULL; - - per_thread_output_buffer_size = 0; - hostRayStateArray = NULL; - PathIteration_times = PATH_ITER_INC_FACTOR; -#ifdef __WORK_STEALING__ - work_pool_wgs = NULL; - max_work_groups = 0; -#endif - current_max_closure = -1; - first_tile = true; - - /* Get device's maximum memory that can be allocated. */ - ciErr = clGetDeviceInfo(cdDevice, - CL_DEVICE_MAX_MEM_ALLOC_SIZE, - sizeof(size_t), - &total_allocatable_memory, - NULL); - assert(ciErr == CL_SUCCESS); - if(platform_name == "AMD Accelerated Parallel Processing") { - /* This value is tweak-able; AMD platform does not seem to - * give maximum performance when all of CL_DEVICE_MAX_MEM_ALLOC_SIZE - * is considered for further computation. - */ - total_allocatable_memory /= 2; - } + task_pool.stop(); + + /* Release kernels */ + program_data_init.release(); + + delete split_kernel; } virtual bool show_samples() const { - return false; + return true; } - /* Split kernel utility functions. */ - size_t get_tex_size(const char *tex_name) + virtual bool load_kernels(const DeviceRequestedFeatures& requested_features, + vector<OpenCLDeviceBase::OpenCLProgram*> &programs) { - cl_mem ptr; - size_t ret_size = 0; - MemMap::iterator i = mem_map.find(tex_name); - if(i != mem_map.end()) { - ptr = CL_MEM_PTR(i->second); - ciErr = clGetMemObjectInfo(ptr, - CL_MEM_SIZE, - sizeof(ret_size), - &ret_size, - NULL); - assert(ciErr == CL_SUCCESS); + bool single_program = OpenCLInfo::use_single_program(); + program_data_init = OpenCLDeviceBase::OpenCLProgram(this, + single_program ? "split" : "split_data_init", + single_program ? "kernel_split.cl" : "kernel_data_init.cl", + get_build_options(this, requested_features)); + + program_data_init.add_kernel(ustring("path_trace_data_init")); + programs.push_back(&program_data_init); + + program_state_buffer_size = OpenCLDeviceBase::OpenCLProgram(this, + single_program ? "split" : "split_state_buffer_size", + single_program ? "kernel_split.cl" : "kernel_state_buffer_size.cl", + get_build_options(this, requested_features)); + program_state_buffer_size.add_kernel(ustring("path_trace_state_buffer_size")); + programs.push_back(&program_state_buffer_size); + + return split_kernel->load_kernels(requested_features); + } + + void thread_run(DeviceTask *task) + { + flush_texture_buffers(); + + if(task->type == DeviceTask::FILM_CONVERT) { + film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half); + } + else if(task->type == DeviceTask::SHADER) { + shader(*task); + } + else if(task->type == DeviceTask::RENDER) { + RenderTile tile; + DenoisingTask denoising(this); + + /* Allocate buffer for kernel globals */ + device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals"); + kgbuffer.alloc_to_device(1); + + /* Keep rendering tiles until done. */ + while(task->acquire_tile(this, tile)) { + if(tile.task == RenderTile::PATH_TRACE) { + assert(tile.task == RenderTile::PATH_TRACE); + scoped_timer timer(&tile.buffers->render_time); + + split_kernel->path_trace(task, + tile, + kgbuffer, + *const_mem_map["__data"]); + + /* Complete kernel execution before release tile. */ + /* This helps in multi-device render; + * The device that reaches the critical-section function + * release_tile waits (stalling other devices from entering + * release_tile) for all kernels to complete. If device1 (a + * slow-render device) reaches release_tile first then it would + * stall device2 (a fast-render device) from proceeding to render + * next tile. + */ + clFinish(cqCommandQueue); + } + else if(tile.task == RenderTile::DENOISE) { + tile.sample = tile.start_sample + tile.num_samples; + denoise(tile, denoising, *task); + task->update_progress(&tile, tile.w*tile.h); + } + + task->release_tile(tile); + } + + kgbuffer.free(); } - return ret_size; } - size_t get_shader_data_size(size_t max_closure) + bool is_split_kernel() { - /* ShaderData size with variable size ShaderClosure array */ - return sizeof(ShaderData) - (sizeof(ShaderClosure) * (MAX_CLOSURE - max_closure)); + return true; } - /* Returns size of KernelGlobals structure associated with OpenCL. */ - size_t get_KernelGlobals_size() +protected: + /* ** Those guys are for workign around some compiler-specific bugs ** */ + + string build_options_for_base_program( + const DeviceRequestedFeatures& requested_features) { - /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to - * fetch its size. - */ - typedef struct KernelGlobals { - ccl_constant KernelData *data; -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name; -#include "kernel_textures.h" -#undef KERNEL_TEX - void *sd_input; - void *isect_shadow; - } KernelGlobals; + return requested_features.get_build_options(); + } - return sizeof(KernelGlobals); + friend class OpenCLSplitKernel; + friend class OpenCLSplitKernelFunction; +}; + +struct CachedSplitMemory { + int id; + device_memory *split_data; + device_memory *ray_state; + device_memory *queue_index; + device_memory *use_queues_flag; + device_memory *work_pools; + device_ptr *buffer; +}; + +class OpenCLSplitKernelFunction : public SplitKernelFunction { +public: + OpenCLDeviceSplitKernel* device; + OpenCLDeviceBase::OpenCLProgram program; + CachedSplitMemory& cached_memory; + int cached_id; + + OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device, CachedSplitMemory& cached_memory) : + device(device), cached_memory(cached_memory), cached_id(cached_memory.id-1) + { } - virtual void load_kernels(const DeviceRequestedFeatures& requested_features, - vector<OpenCLProgram*> &programs) + ~OpenCLSplitKernelFunction() { - string build_options = "-D__SPLIT_KERNEL__ "; -#ifdef __WORK_STEALING__ - build_options += "-D__WORK_STEALING__ "; -#endif - build_options += requested_features.get_build_options(); - - /* Set compute device build option. */ - cl_device_type device_type; - ciErr = clGetDeviceInfo(cdDevice, - CL_DEVICE_TYPE, - sizeof(cl_device_type), - &device_type, - NULL); - assert(ciErr == CL_SUCCESS); - if(device_type == CL_DEVICE_TYPE_GPU) { - build_options += " -D__COMPUTE_DEVICE_GPU__"; + program.release(); + } + + virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data) + { + if(cached_id != cached_memory.id) { + cl_uint start_arg_index = + device->kernel_set_args(program(), + 0, + kg, + data, + *cached_memory.split_data, + *cached_memory.ray_state); + + device->set_kernel_arg_buffers(program(), &start_arg_index); + + start_arg_index += + device->kernel_set_args(program(), + start_arg_index, + *cached_memory.queue_index, + *cached_memory.use_queues_flag, + *cached_memory.work_pools, + *cached_memory.buffer); + + cached_id = cached_memory.id; + } + + device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, + program(), + 2, + NULL, + dim.global_size, + dim.local_size, + 0, + NULL, + NULL); + + device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); + + if(device->ciErr != CL_SUCCESS) { + string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", + clewErrorString(device->ciErr)); + device->opencl_error(message); + return false; } -#define GLUE(a, b) a ## b -#define LOAD_KERNEL(name) \ - do { \ - GLUE(program_, name) = OpenCLProgram(this, "split_" #name, "kernel_" #name ".cl", build_options); \ - GLUE(program_, name).add_kernel(ustring("path_trace_" #name)); \ - programs.push_back(&GLUE(program_, name)); \ - } while(false) - - LOAD_KERNEL(data_init); - LOAD_KERNEL(scene_intersect); - LOAD_KERNEL(lamp_emission); - LOAD_KERNEL(queue_enqueue); - LOAD_KERNEL(background_buffer_update); - LOAD_KERNEL(shader_eval); - LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao); - LOAD_KERNEL(direct_lighting); - LOAD_KERNEL(shadow_blocked); - LOAD_KERNEL(next_iteration_setup); - LOAD_KERNEL(sum_all_radiance); - -#undef FIND_KERNEL -#undef GLUE - - current_max_closure = requested_features.max_closure; + return true; } +}; - ~OpenCLDeviceSplitKernel() +class OpenCLSplitKernel : public DeviceSplitKernel { + OpenCLDeviceSplitKernel *device; + CachedSplitMemory cached_memory; +public: + explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : DeviceSplitKernel(device), device(device) { + } + + virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name, + const DeviceRequestedFeatures& requested_features) { - task_pool.stop(); + OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device, cached_memory); - /* Release kernels */ - program_data_init.release(); - program_scene_intersect.release(); - program_lamp_emission.release(); - program_queue_enqueue.release(); - program_background_buffer_update.release(); - program_shader_eval.release(); - program_holdout_emission_blurring_pathtermination_ao.release(); - program_direct_lighting.release(); - program_shadow_blocked.release(); - program_next_iteration_setup.release(); - program_sum_all_radiance.release(); - - /* Release global memory */ - release_mem_object_safe(rng_coop); - release_mem_object_safe(throughput_coop); - release_mem_object_safe(L_transparent_coop); - release_mem_object_safe(PathRadiance_coop); - release_mem_object_safe(Ray_coop); - release_mem_object_safe(PathState_coop); - release_mem_object_safe(Intersection_coop); - release_mem_object_safe(kgbuffer); - release_mem_object_safe(sd); - release_mem_object_safe(sd_DL_shadow); - release_mem_object_safe(ray_state); - release_mem_object_safe(AOAlpha_coop); - release_mem_object_safe(AOBSDF_coop); - release_mem_object_safe(AOLightRay_coop); - release_mem_object_safe(BSDFEval_coop); - release_mem_object_safe(ISLamp_coop); - release_mem_object_safe(LightRay_coop); - release_mem_object_safe(Intersection_coop_shadow); -#ifdef WITH_CYCLES_DEBUG - release_mem_object_safe(debugdata_coop); -#endif - release_mem_object_safe(use_queues_flag); - release_mem_object_safe(Queue_data); - release_mem_object_safe(Queue_index); - release_mem_object_safe(work_array); -#ifdef __WORK_STEALING__ - release_mem_object_safe(work_pool_wgs); -#endif - release_mem_object_safe(per_sample_output_buffers); - - if(hostRayStateArray != NULL) { - free(hostRayStateArray); + bool single_program = OpenCLInfo::use_single_program(); + kernel->program = + OpenCLDeviceBase::OpenCLProgram(device, + single_program ? "split" : "split_" + kernel_name, + single_program ? "kernel_split.cl" : "kernel_" + kernel_name + ".cl", + get_build_options(device, requested_features)); + + kernel->program.add_kernel(ustring("path_trace_" + kernel_name)); + kernel->program.load(); + + if(!kernel->program.is_loaded()) { + delete kernel; + return NULL; } + + return kernel; } - void path_trace(DeviceTask *task, - SplitRenderTile& rtile, - int2 max_render_feasible_tile_size) + virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) { - /* cast arguments to cl types */ - cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); - cl_mem d_buffer = CL_MEM_PTR(rtile.buffer); - cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state); - cl_int d_x = rtile.x; - cl_int d_y = rtile.y; - cl_int d_w = rtile.w; - cl_int d_h = rtile.h; - cl_int d_offset = rtile.offset; - cl_int d_stride = rtile.stride; - - /* Make sure that set render feasible tile size is a multiple of local - * work size dimensions. - */ - assert(max_render_feasible_tile_size.x % SPLIT_KERNEL_LOCAL_SIZE_X == 0); - assert(max_render_feasible_tile_size.y % SPLIT_KERNEL_LOCAL_SIZE_Y == 0); + device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE); + size_buffer.alloc(1); + size_buffer.zero_to_device(); + + uint threads = num_threads; + device->kernel_set_args(device->program_state_buffer_size(), 0, kg, data, threads, size_buffer); + + size_t global_size = 64; + device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, + device->program_state_buffer_size(), + 1, + NULL, + &global_size, + NULL, + 0, + NULL, + NULL); + + device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); + + size_buffer.copy_from_device(0, 1, 1); + size_t size = size_buffer[0]; + size_buffer.free(); + + if(device->ciErr != CL_SUCCESS) { + string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", + clewErrorString(device->ciErr)); + device->opencl_error(message); + return 0; + } + + return size; + } - size_t global_size[2]; - size_t local_size[2] = {SPLIT_KERNEL_LOCAL_SIZE_X, - SPLIT_KERNEL_LOCAL_SIZE_Y}; + virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& kernel_globals, + device_memory& kernel_data, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flag, + device_memory& work_pool_wgs + ) + { + cl_int dQueue_size = dim.global_size[0] * dim.global_size[1]; /* Set the range of samples to be processed for every ray in * path-regeneration logic. */ cl_int start_sample = rtile.start_sample; cl_int end_sample = rtile.start_sample + rtile.num_samples; - cl_int num_samples = rtile.num_samples; - -#ifdef __WORK_STEALING__ - global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0]; - global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1]; - unsigned int num_parallel_samples = 1; -#else - global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1]; - unsigned int num_threads = max_render_feasible_tile_size.x * - max_render_feasible_tile_size.y; - unsigned int num_tile_columns_possible = num_threads / global_size[1]; - /* Estimate number of parallel samples that can be - * processed in parallel. - */ - unsigned int num_parallel_samples = min(num_tile_columns_possible / d_w, - rtile.num_samples); - /* Wavefront size in AMD is 64. - * TODO(sergey): What about other platforms? - */ - if(num_parallel_samples >= 64) { - /* TODO(sergey): Could use generic round-up here. */ - num_parallel_samples = (num_parallel_samples / 64) * 64; - } - assert(num_parallel_samples != 0); - - global_size[0] = d_w * num_parallel_samples; -#endif /* __WORK_STEALING__ */ - - assert(global_size[0] * global_size[1] <= - max_render_feasible_tile_size.x * max_render_feasible_tile_size.y); - - /* Allocate all required global memory once. */ - if(first_tile) { - size_t num_global_elements = max_render_feasible_tile_size.x * - max_render_feasible_tile_size.y; - /* TODO(sergey): This will actually over-allocate if - * particular kernel does not support multiclosure. - */ - size_t shaderdata_size = get_shader_data_size(current_max_closure); - -#ifdef __WORK_STEALING__ - /* Calculate max groups */ - size_t max_global_size[2]; - size_t tile_x = max_render_feasible_tile_size.x; - size_t tile_y = max_render_feasible_tile_size.y; - max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0]; - max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1]; - max_work_groups = (max_global_size[0] * max_global_size[1]) / - (local_size[0] * local_size[1]); - /* Allocate work_pool_wgs memory. */ - work_pool_wgs = mem_alloc(max_work_groups * sizeof(unsigned int)); -#endif /* __WORK_STEALING__ */ - - /* Allocate queue_index memory only once. */ - Queue_index = mem_alloc(NUM_QUEUES * sizeof(int)); - use_queues_flag = mem_alloc(sizeof(char)); - kgbuffer = mem_alloc(get_KernelGlobals_size()); - - /* Create global buffers for ShaderData. */ - sd = mem_alloc(num_global_elements * shaderdata_size); - sd_DL_shadow = mem_alloc(num_global_elements * 2 * shaderdata_size); - - /* Creation of global memory buffers which are shared among - * the kernels. - */ - rng_coop = mem_alloc(num_global_elements * sizeof(RNG)); - throughput_coop = mem_alloc(num_global_elements * sizeof(float3)); - L_transparent_coop = mem_alloc(num_global_elements * sizeof(float)); - PathRadiance_coop = mem_alloc(num_global_elements * sizeof(PathRadiance)); - Ray_coop = mem_alloc(num_global_elements * sizeof(Ray)); - PathState_coop = mem_alloc(num_global_elements * sizeof(PathState)); - Intersection_coop = mem_alloc(num_global_elements * sizeof(Intersection)); - AOAlpha_coop = mem_alloc(num_global_elements * sizeof(float3)); - AOBSDF_coop = mem_alloc(num_global_elements * sizeof(float3)); - AOLightRay_coop = mem_alloc(num_global_elements * sizeof(Ray)); - BSDFEval_coop = mem_alloc(num_global_elements * sizeof(BsdfEval)); - ISLamp_coop = mem_alloc(num_global_elements * sizeof(int)); - LightRay_coop = mem_alloc(num_global_elements * sizeof(Ray)); - Intersection_coop_shadow = mem_alloc(2 * num_global_elements * sizeof(Intersection)); - -#ifdef WITH_CYCLES_DEBUG - debugdata_coop = mem_alloc(num_global_elements * sizeof(DebugData)); -#endif - - ray_state = mem_alloc(num_global_elements * sizeof(char)); - - hostRayStateArray = (char *)calloc(num_global_elements, sizeof(char)); - assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory"); - - Queue_data = mem_alloc(num_global_elements * (NUM_QUEUES * sizeof(int)+sizeof(int))); - work_array = mem_alloc(num_global_elements * sizeof(unsigned int)); - per_sample_output_buffers = mem_alloc(num_global_elements * - per_thread_output_buffer_size); - } - - cl_int dQueue_size = global_size[0] * global_size[1]; cl_uint start_arg_index = - kernel_set_args(program_data_init(), + device->kernel_set_args(device->program_data_init(), 0, - kgbuffer, - sd_DL_shadow, - d_data, - per_sample_output_buffers, - d_rng_state, - rng_coop, - throughput_coop, - L_transparent_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - Intersection_coop_shadow, + kernel_globals, + kernel_data, + split_data, + num_global_elements, ray_state); -/* TODO(sergey): Avoid map lookup here. */ -#define KERNEL_TEX(type, ttype, name) \ - set_kernel_arg_mem(program_data_init(), &start_arg_index, #name); -#include "kernel_textures.h" -#undef KERNEL_TEX + device->set_kernel_arg_buffers(device->program_data_init(), &start_arg_index); start_arg_index += - kernel_set_args(program_data_init(), + device->kernel_set_args(device->program_data_init(), start_arg_index, start_sample, - d_x, - d_y, - d_w, - d_h, - d_offset, - d_stride, - rtile.rng_state_offset_x, - rtile.rng_state_offset_y, - rtile.buffer_rng_state_stride, - Queue_data, - Queue_index, + end_sample, + rtile.x, + rtile.y, + rtile.w, + rtile.h, + rtile.offset, + rtile.stride, + queue_index, dQueue_size, use_queues_flag, - work_array, -#ifdef __WORK_STEALING__ work_pool_wgs, - num_samples, -#endif -#ifdef WITH_CYCLES_DEBUG - debugdata_coop, -#endif - num_parallel_samples); - - kernel_set_args(program_scene_intersect(), - 0, - kgbuffer, - d_data, - rng_coop, - Ray_coop, - PathState_coop, - Intersection_coop, - ray_state, - d_w, - d_h, - Queue_data, - Queue_index, - dQueue_size, - use_queues_flag, -#ifdef WITH_CYCLES_DEBUG - debugdata_coop, -#endif - num_parallel_samples); - - kernel_set_args(program_lamp_emission(), - 0, - kgbuffer, - d_data, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - Intersection_coop, - ray_state, - d_w, - d_h, - Queue_data, - Queue_index, - dQueue_size, - use_queues_flag, - num_parallel_samples); - - kernel_set_args(program_queue_enqueue(), - 0, - Queue_data, - Queue_index, - ray_state, - dQueue_size); - - kernel_set_args(program_background_buffer_update(), - 0, - kgbuffer, - d_data, - per_sample_output_buffers, - d_rng_state, - rng_coop, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - L_transparent_coop, - ray_state, - d_w, - d_h, - d_x, - d_y, - d_stride, - rtile.rng_state_offset_x, - rtile.rng_state_offset_y, - rtile.buffer_rng_state_stride, - work_array, - Queue_data, - Queue_index, - dQueue_size, - end_sample, - start_sample, -#ifdef __WORK_STEALING__ - work_pool_wgs, - num_samples, -#endif -#ifdef WITH_CYCLES_DEBUG - debugdata_coop, -#endif - num_parallel_samples); - - kernel_set_args(program_shader_eval(), - 0, - kgbuffer, - d_data, - sd, - rng_coop, - Ray_coop, - PathState_coop, - Intersection_coop, - ray_state, - Queue_data, - Queue_index, - dQueue_size); - - kernel_set_args(program_holdout_emission_blurring_pathtermination_ao(), - 0, - kgbuffer, - d_data, - sd, - per_sample_output_buffers, - rng_coop, - throughput_coop, - L_transparent_coop, - PathRadiance_coop, - PathState_coop, - Intersection_coop, - AOAlpha_coop, - AOBSDF_coop, - AOLightRay_coop, - d_w, - d_h, - d_x, - d_y, - d_stride, - ray_state, - work_array, - Queue_data, - Queue_index, - dQueue_size, -#ifdef __WORK_STEALING__ - start_sample, -#endif - num_parallel_samples); - - kernel_set_args(program_direct_lighting(), - 0, - kgbuffer, - d_data, - sd, - rng_coop, - PathState_coop, - ISLamp_coop, - LightRay_coop, - BSDFEval_coop, - ray_state, - Queue_data, - Queue_index, - dQueue_size); - - kernel_set_args(program_shadow_blocked(), - 0, - kgbuffer, - d_data, - PathState_coop, - LightRay_coop, - AOLightRay_coop, - ray_state, - Queue_data, - Queue_index, - dQueue_size); - - kernel_set_args(program_next_iteration_setup(), - 0, - kgbuffer, - d_data, - sd, - rng_coop, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - LightRay_coop, - ISLamp_coop, - BSDFEval_coop, - AOLightRay_coop, - AOBSDF_coop, - AOAlpha_coop, - ray_state, - Queue_data, - Queue_index, - dQueue_size, - use_queues_flag); - - kernel_set_args(program_sum_all_radiance(), - 0, - d_data, - d_buffer, - per_sample_output_buffers, - num_parallel_samples, - d_w, - d_h, - d_stride, - rtile.buffer_offset_x, - rtile.buffer_offset_y, - rtile.buffer_rng_state_stride, - start_sample); - - /* Macro for Enqueuing split kernels. */ -#define GLUE(a, b) a ## b -#define ENQUEUE_SPLIT_KERNEL(kernelName, globalSize, localSize) \ - { \ - ciErr = clEnqueueNDRangeKernel(cqCommandQueue, \ - GLUE(program_, \ - kernelName)(), \ - 2, \ - NULL, \ - globalSize, \ - localSize, \ - 0, \ - NULL, \ - NULL); \ - opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); \ - if(ciErr != CL_SUCCESS) { \ - string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", \ - clewErrorString(ciErr)); \ - opencl_error(message); \ - return; \ - } \ - } (void) 0 + rtile.num_samples, + rtile.buffer); /* Enqueue ckPathTraceKernel_data_init kernel. */ - ENQUEUE_SPLIT_KERNEL(data_init, global_size, local_size); - bool activeRaysAvailable = true; - - /* Record number of time host intervention has been made */ - unsigned int numHostIntervention = 0; - unsigned int numNextPathIterTimes = PathIteration_times; - bool canceled = false; - while(activeRaysAvailable) { - /* Twice the global work size of other kernels for - * ckPathTraceKernel_shadow_blocked_direct_lighting. */ - size_t global_size_shadow_blocked[2]; - global_size_shadow_blocked[0] = global_size[0] * 2; - global_size_shadow_blocked[1] = global_size[1]; - - /* Do path-iteration in host [Enqueue Path-iteration kernels. */ - for(int PathIter = 0; PathIter < PathIteration_times; PathIter++) { - ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(background_buffer_update, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shadow_blocked, global_size_shadow_blocked, local_size); - ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size); - - if(task->get_cancel()) { - canceled = true; - break; - } - } - - /* Read ray-state into Host memory to decide if we should exit - * path-iteration in host. - */ - ciErr = clEnqueueReadBuffer(cqCommandQueue, - ray_state, - CL_TRUE, - 0, - global_size[0] * global_size[1] * sizeof(char), - hostRayStateArray, - 0, - NULL, - NULL); - assert(ciErr == CL_SUCCESS); - - activeRaysAvailable = false; - - for(int rayStateIter = 0; - rayStateIter < global_size[0] * global_size[1]; - ++rayStateIter) - { - if(int8_t(hostRayStateArray[rayStateIter]) != RAY_INACTIVE) { - /* Not all rays are RAY_INACTIVE. */ - activeRaysAvailable = true; - break; - } - } - - if(activeRaysAvailable) { - numHostIntervention++; - PathIteration_times = PATH_ITER_INC_FACTOR; - /* Host intervention done before all rays become RAY_INACTIVE; - * Set do more initial iterations for the next tile. - */ - numNextPathIterTimes += PATH_ITER_INC_FACTOR; - } - - if(task->get_cancel()) { - canceled = true; - break; - } - } - - /* Execute SumALLRadiance kernel to accumulate radiance calculated in - * per_sample_output_buffers into RenderTile's output buffer. - */ - if(!canceled) { - size_t sum_all_radiance_local_size[2] = {16, 16}; - size_t sum_all_radiance_global_size[2]; - sum_all_radiance_global_size[0] = - (((d_w - 1) / sum_all_radiance_local_size[0]) + 1) * - sum_all_radiance_local_size[0]; - sum_all_radiance_global_size[1] = - (((d_h - 1) / sum_all_radiance_local_size[1]) + 1) * - sum_all_radiance_local_size[1]; - ENQUEUE_SPLIT_KERNEL(sum_all_radiance, - sum_all_radiance_global_size, - sum_all_radiance_local_size); + device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, + device->program_data_init(), + 2, + NULL, + dim.global_size, + dim.local_size, + 0, + NULL, + NULL); + + device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); + + if(device->ciErr != CL_SUCCESS) { + string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", + clewErrorString(device->ciErr)); + device->opencl_error(message); + return false; } -#undef ENQUEUE_SPLIT_KERNEL -#undef GLUE + cached_memory.split_data = &split_data; + cached_memory.ray_state = &ray_state; + cached_memory.queue_index = &queue_index; + cached_memory.use_queues_flag = &use_queues_flag; + cached_memory.work_pools = &work_pool_wgs; + cached_memory.buffer = &rtile.buffer; + cached_memory.id++; - if(numHostIntervention == 0) { - /* This means that we are executing kernel more than required - * Must avoid this for the next sample/tile. - */ - PathIteration_times = ((numNextPathIterTimes - PATH_ITER_INC_FACTOR) <= 0) ? - PATH_ITER_INC_FACTOR : numNextPathIterTimes - PATH_ITER_INC_FACTOR; - } - else { - /* Number of path-iterations done for this tile is set as - * Initial path-iteration times for the next tile - */ - PathIteration_times = numNextPathIterTimes; - } - - first_tile = false; + return true; } - /* Calculates the amount of memory that has to be always - * allocated in order for the split kernel to function. - * This memory is tile/scene-property invariant (meaning, - * the value returned by this function does not depend - * on the user set tile size or scene properties. - */ - size_t get_invariable_mem_allocated() + virtual int2 split_kernel_local_size() { - size_t total_invariable_mem_allocated = 0; - size_t KernelGlobals_size = 0; - - KernelGlobals_size = get_KernelGlobals_size(); - - total_invariable_mem_allocated += KernelGlobals_size; /* KernelGlobals size */ - total_invariable_mem_allocated += NUM_QUEUES * sizeof(unsigned int); /* Queue index size */ - total_invariable_mem_allocated += sizeof(char); /* use_queues_flag size */ - - return total_invariable_mem_allocated; + return make_int2(64, 1); } - /* Calculate the memory that has-to-be/has-been allocated for - * the split kernel to function. - */ - size_t get_tile_specific_mem_allocated(const int2 tile_size) + virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/) { - size_t tile_specific_mem_allocated = 0; - - /* Get required tile info */ - unsigned int user_set_tile_w = tile_size.x; - unsigned int user_set_tile_h = tile_size.y; - -#ifdef __WORK_STEALING__ - /* Calculate memory to be allocated for work_pools in - * case of work_stealing. - */ - size_t max_global_size[2]; - size_t max_num_work_pools = 0; - max_global_size[0] = - (((user_set_tile_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_X; - max_global_size[1] = - (((user_set_tile_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_Y; - max_num_work_pools = - (max_global_size[0] * max_global_size[1]) / - (SPLIT_KERNEL_LOCAL_SIZE_X * SPLIT_KERNEL_LOCAL_SIZE_Y); - tile_specific_mem_allocated += max_num_work_pools * sizeof(unsigned int); -#endif - - tile_specific_mem_allocated += - user_set_tile_w * user_set_tile_h * per_thread_output_buffer_size; - tile_specific_mem_allocated += - user_set_tile_w * user_set_tile_h * sizeof(RNG); - - return tile_specific_mem_allocated; - } - - /* Calculates the texture memories and KernelData (d_data) memory - * that has been allocated. - */ - size_t get_scene_specific_mem_allocated(cl_mem d_data) - { - size_t scene_specific_mem_allocated = 0; - /* Calculate texture memories. */ -#define KERNEL_TEX(type, ttype, name) \ - scene_specific_mem_allocated += get_tex_size(#name); -#include "kernel_textures.h" -#undef KERNEL_TEX - size_t d_data_size; - ciErr = clGetMemObjectInfo(d_data, - CL_MEM_SIZE, - sizeof(d_data_size), - &d_data_size, - NULL); - assert(ciErr == CL_SUCCESS && "Can't get d_data mem object info"); - scene_specific_mem_allocated += d_data_size; - return scene_specific_mem_allocated; - } - - /* Calculate the memory required for one thread in split kernel. */ - size_t get_per_thread_memory() - { - size_t shaderdata_size = 0; - /* TODO(sergey): This will actually over-allocate if - * particular kernel does not support multiclosure. - */ - shaderdata_size = get_shader_data_size(current_max_closure); - size_t retval = sizeof(RNG) - + sizeof(float3) /* Throughput size */ - + sizeof(float) /* L transparent size */ - + sizeof(char) /* Ray state size */ - + sizeof(unsigned int) /* Work element size */ - + sizeof(int) /* ISLamp_size */ - + sizeof(PathRadiance) + sizeof(Ray) + sizeof(PathState) - + sizeof(Intersection) /* Overall isect */ - + sizeof(Intersection) /* Instersection_coop_AO */ - + sizeof(Intersection) /* Intersection coop DL */ - + shaderdata_size /* Overall ShaderData */ - + (shaderdata_size * 2) /* ShaderData : DL and shadow */ - + sizeof(Ray) + sizeof(BsdfEval) - + sizeof(float3) /* AOAlpha size */ - + sizeof(float3) /* AOBSDF size */ - + sizeof(Ray) - + (sizeof(int) * NUM_QUEUES) - + per_thread_output_buffer_size; - return retval; - } - - /* Considers the total memory available in the device and - * and returns the maximum global work size possible. - */ - size_t get_feasible_global_work_size(int2 tile_size, cl_mem d_data) - { - /* Calculate invariably allocated memory. */ - size_t invariable_mem_allocated = get_invariable_mem_allocated(); - /* Calculate tile specific allocated memory. */ - size_t tile_specific_mem_allocated = - get_tile_specific_mem_allocated(tile_size); - /* Calculate scene specific allocated memory. */ - size_t scene_specific_mem_allocated = - get_scene_specific_mem_allocated(d_data); - /* Calculate total memory available for the threads in global work size. */ - size_t available_memory = total_allocatable_memory - - invariable_mem_allocated - - tile_specific_mem_allocated - - scene_specific_mem_allocated - - DATA_ALLOCATION_MEM_FACTOR; - size_t per_thread_memory_required = get_per_thread_memory(); - return (available_memory / per_thread_memory_required); - } - - /* Checks if the device has enough memory to render the whole tile; - * If not, we should split single tile into multiple tiles of small size - * and process them all. - */ - bool need_to_split_tile(unsigned int d_w, - unsigned int d_h, - int2 max_render_feasible_tile_size) - { - size_t global_size_estimate[2]; - /* TODO(sergey): Such round-ups are in quite few places, need to replace - * them with an utility macro. - */ - global_size_estimate[0] = - (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_X; - global_size_estimate[1] = - (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_Y; - if((global_size_estimate[0] * global_size_estimate[1]) > - (max_render_feasible_tile_size.x * max_render_feasible_tile_size.y)) - { - return true; + cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice); + /* Use small global size on CPU devices as it seems to be much faster. */ + if(type == CL_DEVICE_TYPE_CPU) { + VLOG(1) << "Global size: (64, 64)."; + return make_int2(64, 64); } - else { - return false; - } - } - /* Considers the scene properties, global memory available in the device - * and returns a rectanglular tile dimension (approx the maximum) - * that should render on split kernel. - */ - int2 get_max_render_feasible_tile_size(size_t feasible_global_work_size) - { - int2 max_render_feasible_tile_size; - int square_root_val = (int)sqrt(feasible_global_work_size); - max_render_feasible_tile_size.x = square_root_val; - max_render_feasible_tile_size.y = square_root_val; - /* Ciel round-off max_render_feasible_tile_size. */ - int2 ceil_render_feasible_tile_size; - ceil_render_feasible_tile_size.x = - (((max_render_feasible_tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_X; - ceil_render_feasible_tile_size.y = - (((max_render_feasible_tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_Y; - if(ceil_render_feasible_tile_size.x * ceil_render_feasible_tile_size.y <= - feasible_global_work_size) - { - return ceil_render_feasible_tile_size; - } - /* Floor round-off max_render_feasible_tile_size. */ - int2 floor_render_feasible_tile_size; - floor_render_feasible_tile_size.x = - (max_render_feasible_tile_size.x / SPLIT_KERNEL_LOCAL_SIZE_X) * - SPLIT_KERNEL_LOCAL_SIZE_X; - floor_render_feasible_tile_size.y = - (max_render_feasible_tile_size.y / SPLIT_KERNEL_LOCAL_SIZE_Y) * - SPLIT_KERNEL_LOCAL_SIZE_Y; - return floor_render_feasible_tile_size; - } + cl_ulong max_buffer_size; + clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); - /* Try splitting the current tile into multiple smaller - * almost-square-tiles. - */ - int2 get_split_tile_size(RenderTile rtile, - int2 max_render_feasible_tile_size) - { - int2 split_tile_size; - int num_global_threads = max_render_feasible_tile_size.x * - max_render_feasible_tile_size.y; - int d_w = rtile.w; - int d_h = rtile.h; - /* Ceil round off d_w and d_h */ - d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_X; - d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_Y; - while(d_w * d_h > num_global_threads) { - /* Halve the longer dimension. */ - if(d_w >= d_h) { - d_w = d_w / 2; - d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_X; - } - else { - d_h = d_h / 2; - d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_Y; - } + if(DebugFlags().opencl.mem_limit) { + max_buffer_size = min(max_buffer_size, + cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used)); } - split_tile_size.x = d_w; - split_tile_size.y = d_h; - return split_tile_size; - } - /* Splits existing tile into multiple tiles of tile size split_tile_size. */ - vector<SplitRenderTile> split_tiles(RenderTile rtile, int2 split_tile_size) - { - vector<SplitRenderTile> to_path_trace_rtile; - int d_w = rtile.w; - int d_h = rtile.h; - int num_tiles_x = (((d_w - 1) / split_tile_size.x) + 1); - int num_tiles_y = (((d_h - 1) / split_tile_size.y) + 1); - /* Buffer and rng_state offset calc. */ - size_t offset_index = rtile.offset + (rtile.x + rtile.y * rtile.stride); - size_t offset_x = offset_index % rtile.stride; - size_t offset_y = offset_index / rtile.stride; - /* Resize to_path_trace_rtile. */ - to_path_trace_rtile.resize(num_tiles_x * num_tiles_y); - for(int tile_iter_y = 0; tile_iter_y < num_tiles_y; tile_iter_y++) { - for(int tile_iter_x = 0; tile_iter_x < num_tiles_x; tile_iter_x++) { - int rtile_index = tile_iter_y * num_tiles_x + tile_iter_x; - to_path_trace_rtile[rtile_index].rng_state_offset_x = offset_x + tile_iter_x * split_tile_size.x; - to_path_trace_rtile[rtile_index].rng_state_offset_y = offset_y + tile_iter_y * split_tile_size.y; - to_path_trace_rtile[rtile_index].buffer_offset_x = offset_x + tile_iter_x * split_tile_size.x; - to_path_trace_rtile[rtile_index].buffer_offset_y = offset_y + tile_iter_y * split_tile_size.y; - to_path_trace_rtile[rtile_index].start_sample = rtile.start_sample; - to_path_trace_rtile[rtile_index].num_samples = rtile.num_samples; - to_path_trace_rtile[rtile_index].sample = rtile.sample; - to_path_trace_rtile[rtile_index].resolution = rtile.resolution; - to_path_trace_rtile[rtile_index].offset = rtile.offset; - to_path_trace_rtile[rtile_index].buffers = rtile.buffers; - to_path_trace_rtile[rtile_index].buffer = rtile.buffer; - to_path_trace_rtile[rtile_index].rng_state = rtile.rng_state; - to_path_trace_rtile[rtile_index].x = rtile.x + (tile_iter_x * split_tile_size.x); - to_path_trace_rtile[rtile_index].y = rtile.y + (tile_iter_y * split_tile_size.y); - to_path_trace_rtile[rtile_index].buffer_rng_state_stride = rtile.stride; - /* Fill width and height of the new render tile. */ - to_path_trace_rtile[rtile_index].w = (tile_iter_x == (num_tiles_x - 1)) ? - (d_w - (tile_iter_x * split_tile_size.x)) /* Border tile */ - : split_tile_size.x; - to_path_trace_rtile[rtile_index].h = (tile_iter_y == (num_tiles_y - 1)) ? - (d_h - (tile_iter_y * split_tile_size.y)) /* Border tile */ - : split_tile_size.y; - to_path_trace_rtile[rtile_index].stride = to_path_trace_rtile[rtile_index].w; - } - } - return to_path_trace_rtile; - } - - void thread_run(DeviceTask *task) - { - if(task->type == DeviceTask::FILM_CONVERT) { - film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half); - } - else if(task->type == DeviceTask::SHADER) { - shader(*task); - } - else if(task->type == DeviceTask::PATH_TRACE) { - RenderTile tile; - bool initialize_data_and_check_render_feasibility = false; - bool need_to_split_tiles_further = false; - int2 max_render_feasible_tile_size; - size_t feasible_global_work_size; - const int2 tile_size = task->requested_tile_size; - /* Keep rendering tiles until done. */ - while(task->acquire_tile(this, tile)) { - if(!initialize_data_and_check_render_feasibility) { - /* Initialize data. */ - /* Calculate per_thread_output_buffer_size. */ - size_t output_buffer_size = 0; - ciErr = clGetMemObjectInfo((cl_mem)tile.buffer, - CL_MEM_SIZE, - sizeof(output_buffer_size), - &output_buffer_size, - NULL); - assert(ciErr == CL_SUCCESS && "Can't get tile.buffer mem object info"); - /* This value is different when running on AMD and NV. */ - if(background) { - /* In offline render the number of buffer elements - * associated with tile.buffer is the current tile size. - */ - per_thread_output_buffer_size = - output_buffer_size / (tile.w * tile.h); - } - else { - /* interactive rendering, unlike offline render, the number of buffer elements - * associated with tile.buffer is the entire viewport size. - */ - per_thread_output_buffer_size = - output_buffer_size / (tile.buffers->params.width * - tile.buffers->params.height); - } - /* Check render feasibility. */ - feasible_global_work_size = get_feasible_global_work_size( - tile_size, - CL_MEM_PTR(const_mem_map["__data"]->device_pointer)); - max_render_feasible_tile_size = - get_max_render_feasible_tile_size( - feasible_global_work_size); - need_to_split_tiles_further = - need_to_split_tile(tile_size.x, - tile_size.y, - max_render_feasible_tile_size); - initialize_data_and_check_render_feasibility = true; - } - if(need_to_split_tiles_further) { - int2 split_tile_size = - get_split_tile_size(tile, - max_render_feasible_tile_size); - vector<SplitRenderTile> to_path_trace_render_tiles = - split_tiles(tile, split_tile_size); - /* Print message to console */ - if(background && (to_path_trace_render_tiles.size() > 1)) { - fprintf(stderr, "Message : Tiles need to be split " - "further inside path trace (due to insufficient " - "device-global-memory for split kernel to " - "function) \n" - "The current tile of dimensions %dx%d is split " - "into tiles of dimension %dx%d for render \n", - tile.w, tile.h, - split_tile_size.x, - split_tile_size.y); - } - /* Process all split tiles. */ - for(int tile_iter = 0; - tile_iter < to_path_trace_render_tiles.size(); - ++tile_iter) - { - path_trace(task, - to_path_trace_render_tiles[tile_iter], - max_render_feasible_tile_size); - } - } - else { - /* No splitting required; process the entire tile at once. */ - /* Render feasible tile size is user-set-tile-size itself. */ - max_render_feasible_tile_size.x = - (((tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_X; - max_render_feasible_tile_size.y = - (((tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_Y; - /* buffer_rng_state_stride is stride itself. */ - SplitRenderTile split_tile(tile); - split_tile.buffer_rng_state_stride = tile.stride; - path_trace(task, split_tile, max_render_feasible_tile_size); - } - tile.sample = tile.start_sample + tile.num_samples; - - /* Complete kernel execution before release tile. */ - /* This helps in multi-device render; - * The device that reaches the critical-section function - * release_tile waits (stalling other devices from entering - * release_tile) for all kernels to complete. If device1 (a - * slow-render device) reaches release_tile first then it would - * stall device2 (a fast-render device) from proceeding to render - * next tile. - */ - clFinish(cqCommandQueue); + VLOG(1) << "Maximum device allocation size: " + << string_human_readable_number(max_buffer_size) << " bytes. (" + << string_human_readable_size(max_buffer_size) << ")."; - task->release_tile(tile); - } - } - } + /* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. */ + max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l*1024*1024*1024); -protected: - cl_mem mem_alloc(size_t bufsize, cl_mem_flags mem_flag = CL_MEM_READ_WRITE) - { - cl_mem ptr; - assert(bufsize != 0); - ptr = clCreateBuffer(cxContext, mem_flag, bufsize, NULL, &ciErr); - opencl_assert_err(ciErr, "clCreateBuffer"); - return ptr; + size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size); + int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64), (int)sqrt(num_elements)); + VLOG(1) << "Global size: " << global_size << "."; + return global_size; } +}; - /* ** Those guys are for workign around some compiler-specific bugs ** */ +OpenCLDeviceSplitKernel::OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_) +: OpenCLDeviceBase(info, stats, background_) +{ + split_kernel = new OpenCLSplitKernel(this); - string build_options_for_base_program( - const DeviceRequestedFeatures& requested_features) - { - return requested_features.get_build_options(); - } -}; + background = background_; +} Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, bool background) { diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp index c7760e075cb..78ed401bff5 100644 --- a/intern/cycles/device/opencl/opencl_util.cpp +++ b/intern/cycles/device/opencl/opencl_util.cpp @@ -16,11 +16,13 @@ #ifdef WITH_OPENCL -#include "opencl.h" +#include "device/opencl/opencl.h" -#include "util_logging.h" -#include "util_path.h" -#include "util_time.h" +#include "util/util_debug.h" +#include "util/util_logging.h" +#include "util/util_md5.h" +#include "util/util_path.h" +#include "util/util_time.h" using std::cerr; using std::endl; @@ -234,15 +236,15 @@ string OpenCLCache::get_kernel_md5() thread_scoped_lock lock(self.kernel_md5_lock); if(self.kernel_md5.empty()) { - self.kernel_md5 = path_files_md5_hash(path_get("kernel")); + self.kernel_md5 = path_files_md5_hash(path_get("source")); } return self.kernel_md5; } OpenCLDeviceBase::OpenCLProgram::OpenCLProgram(OpenCLDeviceBase *device, - string program_name, - string kernel_file, - string kernel_build_options, + const string& program_name, + const string& kernel_file, + const string& kernel_build_options, bool use_stdout) : device(device), program_name(program_name), @@ -273,20 +275,21 @@ void OpenCLDeviceBase::OpenCLProgram::release() } } -void OpenCLDeviceBase::OpenCLProgram::add_log(string msg, bool debug) +void OpenCLDeviceBase::OpenCLProgram::add_log(const string& msg, bool debug) { if(!use_stdout) { log += msg + "\n"; } else if(!debug) { printf("%s\n", msg.c_str()); + fflush(stdout); } else { VLOG(2) << msg; } } -void OpenCLDeviceBase::OpenCLProgram::add_error(string msg) +void OpenCLDeviceBase::OpenCLProgram::add_error(const string& msg) { if(use_stdout) { fprintf(stderr, "%s\n", msg.c_str()); @@ -338,12 +341,13 @@ bool OpenCLDeviceBase::OpenCLProgram::build_kernel(const string *debug_src) bool OpenCLDeviceBase::OpenCLProgram::compile_kernel(const string *debug_src) { - string source = "#include \"kernels/opencl/" + kernel_file + "\" // " + OpenCLCache::get_kernel_md5() + "\n"; + string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n"; /* We compile kernels consisting of many files. unfortunately OpenCL * kernel caches do not seem to recognize changes in included files. * so we force recompile on changes by adding the md5 hash of all files. */ - source = path_source_replace_includes(source, path_get("kernel")); + source = path_source_replace_includes(source, path_get("source")); + source += "\n// " + util_md5_string(source) + "\n"; if(debug_src) { path_write_text(*debug_src, source); @@ -354,10 +358,10 @@ bool OpenCLDeviceBase::OpenCLProgram::compile_kernel(const string *debug_src) cl_int ciErr; program = clCreateProgramWithSource(device->cxContext, - 1, - &source_str, - &source_len, - &ciErr); + 1, + &source_str, + &source_len, + &ciErr); if(ciErr != CL_SUCCESS) { add_error(string("OpenCL program creation failed: ") + clewErrorString(ciErr)); @@ -440,7 +444,11 @@ void OpenCLDeviceBase::OpenCLProgram::load() if(!program) { add_log(string("OpenCL program ") + program_name + " not found in cache.", true); - string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + OpenCLCache::get_kernel_md5(); + /* need to create source to get md5 */ + string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n"; + source = path_source_replace_includes(source, path_get("source")); + + string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + util_md5_string(source); basename = path_cache_get(path_join("kernels", basename)); string clbin = basename + ".clbin"; @@ -546,6 +554,11 @@ bool OpenCLInfo::use_debug() return DebugFlags().opencl.debug; } +bool OpenCLInfo::use_single_program() +{ + return DebugFlags().opencl.single_program; +} + bool OpenCLInfo::kernel_use_advanced_shading(const string& platform) { /* keep this in sync with kernel_types.h! */ @@ -589,14 +602,46 @@ bool OpenCLInfo::device_supported(const string& platform_name, const cl_device_id device_id) { cl_device_type device_type; - clGetDeviceInfo(device_id, - CL_DEVICE_TYPE, - sizeof(cl_device_type), - &device_type, - NULL); + if(!get_device_type(device_id, &device_type)) { + return false; + } + string device_name; + if(!get_device_name(device_id, &device_name)) { + return false; + } + + int driver_major = 0; + int driver_minor = 0; + if(!get_driver_version(device_id, &driver_major, &driver_minor)) { + return false; + } + VLOG(3) << "OpenCL driver version " << driver_major << "." << driver_minor; + + /* It is possible tyo have Iris GPU on AMD/Apple OpenCL framework + * (aka, it will not be on Intel framework). This isn't supported + * and needs an explicit blacklist. + */ + if(strstr(device_name.c_str(), "Iris")) { + return false; + } if(platform_name == "AMD Accelerated Parallel Processing" && device_type == CL_DEVICE_TYPE_GPU) { + if(driver_major < 2236) { + VLOG(1) << "AMD driver version " << driver_major << "." << driver_minor << " not supported."; + return false; + } + const char *blacklist[] = { + /* GCN 1 */ + "Tahiti", "Pitcairn", "Capeverde", "Oland", "Hainan", + NULL + }; + for(int i = 0; blacklist[i] != NULL; i++) { + if(device_name == blacklist[i]) { + VLOG(1) << "AMD device " << device_name << " not supported"; + return false; + } + } return true; } if(platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) { @@ -663,7 +708,7 @@ bool OpenCLInfo::device_version_check(cl_device_id device, return true; } -string OpenCLInfo::get_hardware_id(string platform_name, cl_device_id device_id) +string OpenCLInfo::get_hardware_id(const string& platform_name, cl_device_id device_id) { if(platform_name == "AMD Accelerated Parallel Processing" || platform_name == "Apple") { /* Use cl_amd_device_topology extension. */ @@ -707,39 +752,30 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices return; } + cl_int error; vector<cl_device_id> device_ids; - cl_uint num_devices = 0; vector<cl_platform_id> platform_ids; - cl_uint num_platforms = 0; - /* Get devices. */ - if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS || - num_platforms == 0) - { - FIRST_VLOG(2) << "No OpenCL platforms were found."; + /* Get platforms. */ + if(!get_platforms(&platform_ids, &error)) { + FIRST_VLOG(2) << "Error fetching platforms:" + << string(clewErrorString(error)); first_time = false; return; } - platform_ids.resize(num_platforms); - if(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL) != CL_SUCCESS) { - FIRST_VLOG(2) << "Failed to fetch platform IDs from the driver.."; + if(platform_ids.size() == 0) { + FIRST_VLOG(2) << "No OpenCL platforms were found."; first_time = false; return; } /* Devices are numbered consecutively across platforms. */ - for(int platform = 0; platform < num_platforms; platform++) { + for(int platform = 0; platform < platform_ids.size(); platform++) { cl_platform_id platform_id = platform_ids[platform]; - char pname[256]; - if(clGetPlatformInfo(platform_id, - CL_PLATFORM_NAME, - sizeof(pname), - &pname, - NULL) != CL_SUCCESS) - { + string platform_name; + if(!get_platform_name(platform_id, &platform_name)) { FIRST_VLOG(2) << "Failed to get platform name, ignoring."; continue; } - string platform_name = pname; FIRST_VLOG(2) << "Enumerating devices for platform " << platform_name << "."; if(!platform_version_check(platform_id)) { @@ -747,39 +783,28 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices << " due to too old compiler version."; continue; } - num_devices = 0; - cl_int ciErr; - if((ciErr = clGetDeviceIDs(platform_id, - device_type, - 0, - NULL, - &num_devices)) != CL_SUCCESS || num_devices == 0) + if(!get_platform_devices(platform_id, + device_type, + &device_ids, + &error)) { FIRST_VLOG(2) << "Ignoring platform " << platform_name - << ", failed to fetch number of devices: " << string(clewErrorString(ciErr)); + << ", failed to fetch of devices: " + << string(clewErrorString(error)); continue; } - device_ids.resize(num_devices); - if(clGetDeviceIDs(platform_id, - device_type, - num_devices, - &device_ids[0], - NULL) != CL_SUCCESS) - { + if(device_ids.size() == 0) { FIRST_VLOG(2) << "Ignoring platform " << platform_name - << ", failed to fetch devices list."; + << ", it has no devices."; continue; } - for(int num = 0; num < num_devices; num++) { - cl_device_id device_id = device_ids[num]; - char device_name[1024] = "\0"; - if(clGetDeviceInfo(device_id, - CL_DEVICE_NAME, - sizeof(device_name), - &device_name, - NULL) != CL_SUCCESS) - { - FIRST_VLOG(2) << "Failed to fetch device name, ignoring."; + for(int num = 0; num < device_ids.size(); num++) { + const cl_device_id device_id = device_ids[num]; + string device_name; + if(!get_device_name(device_id, &device_name, &error)) { + FIRST_VLOG(2) << "Failed to fetch device name: " + << string(clewErrorString(error)) + << ", ignoring."; continue; } if(!device_version_check(device_id)) { @@ -791,24 +816,28 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices device_supported(platform_name, device_id)) { cl_device_type device_type; - if(clGetDeviceInfo(device_id, - CL_DEVICE_TYPE, - sizeof(cl_device_type), - &device_type, - NULL) != CL_SUCCESS) - { + if(!get_device_type(device_id, &device_type, &error)) { FIRST_VLOG(2) << "Ignoring device " << device_name - << ", failed to fetch device type."; + << ", failed to fetch device type:" + << string(clewErrorString(error)); continue; } - FIRST_VLOG(2) << "Adding new device " << device_name << "."; + string readable_device_name = + get_readable_device_name(device_id); + if(readable_device_name != device_name) { + FIRST_VLOG(2) << "Using more readable device name: " + << readable_device_name; + } + FIRST_VLOG(2) << "Adding new device " + << readable_device_name << "."; string hardware_id = get_hardware_id(platform_name, device_id); - usable_devices->push_back(OpenCLPlatformDevice(platform_id, - platform_name, - device_id, - device_type, - device_name, - hardware_id)); + usable_devices->push_back(OpenCLPlatformDevice( + platform_id, + platform_name, + device_id, + device_type, + readable_device_name, + hardware_id)); } else { FIRST_VLOG(2) << "Ignoring device " << device_name @@ -819,6 +848,308 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices first_time = false; } +bool OpenCLInfo::get_platforms(vector<cl_platform_id> *platform_ids, + cl_int *error) +{ + /* Reset from possible previous state. */ + platform_ids->resize(0); + cl_uint num_platforms; + if(!get_num_platforms(&num_platforms, error)) { + return false; + } + /* Get actual platforms. */ + cl_int err; + platform_ids->resize(num_platforms); + if((err = clGetPlatformIDs(num_platforms, + &platform_ids->at(0), + NULL)) != CL_SUCCESS) { + if(error != NULL) { + *error = err; + } + return false; + } + if(error != NULL) { + *error = CL_SUCCESS; + } + return true; +} + +vector<cl_platform_id> OpenCLInfo::get_platforms() +{ + vector<cl_platform_id> platform_ids; + get_platforms(&platform_ids); + return platform_ids; +} + +bool OpenCLInfo::get_num_platforms(cl_uint *num_platforms, cl_int *error) +{ + cl_int err; + if((err = clGetPlatformIDs(0, NULL, num_platforms)) != CL_SUCCESS) { + if(error != NULL) { + *error = err; + } + *num_platforms = 0; + return false; + } + if(error != NULL) { + *error = CL_SUCCESS; + } + return true; +} + +cl_uint OpenCLInfo::get_num_platforms() +{ + cl_uint num_platforms; + if(!get_num_platforms(&num_platforms)) { + return 0; + } + return num_platforms; +} + +bool OpenCLInfo::get_platform_name(cl_platform_id platform_id, + string *platform_name) +{ + char buffer[256]; + if(clGetPlatformInfo(platform_id, + CL_PLATFORM_NAME, + sizeof(buffer), + &buffer, + NULL) != CL_SUCCESS) + { + *platform_name = ""; + return false; + } + *platform_name = buffer; + return true; +} + +string OpenCLInfo::get_platform_name(cl_platform_id platform_id) +{ + string platform_name; + if(!get_platform_name(platform_id, &platform_name)) { + return ""; + } + return platform_name; +} + +bool OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id, + cl_device_type device_type, + cl_uint *num_devices, + cl_int *error) +{ + cl_int err; + if((err = clGetDeviceIDs(platform_id, + device_type, + 0, + NULL, + num_devices)) != CL_SUCCESS) + { + if(error != NULL) { + *error = err; + } + *num_devices = 0; + return false; + } + if(error != NULL) { + *error = CL_SUCCESS; + } + return true; +} + +cl_uint OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id, + cl_device_type device_type) +{ + cl_uint num_devices; + if(!get_num_platform_devices(platform_id, + device_type, + &num_devices)) + { + return 0; + } + return num_devices; +} + +bool OpenCLInfo::get_platform_devices(cl_platform_id platform_id, + cl_device_type device_type, + vector<cl_device_id> *device_ids, + cl_int* error) +{ + /* Reset from possible previous state. */ + device_ids->resize(0); + /* Get number of devices to pre-allocate memory. */ + cl_uint num_devices; + if(!get_num_platform_devices(platform_id, + device_type, + &num_devices, + error)) + { + return false; + } + /* Get actual device list. */ + device_ids->resize(num_devices); + cl_int err; + if((err = clGetDeviceIDs(platform_id, + device_type, + num_devices, + &device_ids->at(0), + NULL)) != CL_SUCCESS) + { + if(error != NULL) { + *error = err; + } + return false; + } + if(error != NULL) { + *error = CL_SUCCESS; + } + return true; +} + +vector<cl_device_id> OpenCLInfo::get_platform_devices(cl_platform_id platform_id, + cl_device_type device_type) +{ + vector<cl_device_id> devices; + get_platform_devices(platform_id, device_type, &devices); + return devices; +} + +bool OpenCLInfo::get_device_name(cl_device_id device_id, + string *device_name, + cl_int* error) +{ + char buffer[1024]; + cl_int err; + if((err = clGetDeviceInfo(device_id, + CL_DEVICE_NAME, + sizeof(buffer), + &buffer, + NULL)) != CL_SUCCESS) + { + if(error != NULL) { + *error = err; + } + *device_name = ""; + return false; + } + if(error != NULL) { + *error = CL_SUCCESS; + } + *device_name = buffer; + return true; +} + +string OpenCLInfo::get_device_name(cl_device_id device_id) +{ + string device_name; + if(!get_device_name(device_id, &device_name)) { + return ""; + } + return device_name; +} + +bool OpenCLInfo::get_device_type(cl_device_id device_id, + cl_device_type *device_type, + cl_int* error) +{ + cl_int err; + if((err = clGetDeviceInfo(device_id, + CL_DEVICE_TYPE, + sizeof(cl_device_type), + device_type, + NULL)) != CL_SUCCESS) + { + if(error != NULL) { + *error = err; + } + *device_type = 0; + return false; + } + if(error != NULL) { + *error = CL_SUCCESS; + } + return true; +} + +cl_device_type OpenCLInfo::get_device_type(cl_device_id device_id) +{ + cl_device_type device_type; + if(!get_device_type(device_id, &device_type)) { + return 0; + } + return device_type; +} + +string OpenCLInfo::get_readable_device_name(cl_device_id device_id) +{ + string name = ""; + char board_name[1024]; + size_t length = 0; + if(clGetDeviceInfo(device_id, + CL_DEVICE_BOARD_NAME_AMD, + sizeof(board_name), + &board_name, + &length) == CL_SUCCESS) + { + if(length != 0 && board_name[0] != '\0') { + name = board_name; + } + } + + /* Fallback to standard device name API. */ + if(name.empty()) { + name = get_device_name(device_id); + } + + /* Distinguish from our native CPU device. */ + if(get_device_type(device_id) & CL_DEVICE_TYPE_CPU) { + name += " (OpenCL)"; + } + + return name; +} + +bool OpenCLInfo::get_driver_version(cl_device_id device_id, + int *major, + int *minor, + cl_int* error) +{ + char buffer[1024]; + cl_int err; + if((err = clGetDeviceInfo(device_id, + CL_DRIVER_VERSION, + sizeof(buffer), + &buffer, + NULL)) != CL_SUCCESS) + { + if(error != NULL) { + *error = err; + } + return false; + } + if(error != NULL) { + *error = CL_SUCCESS; + } + if(sscanf(buffer, "%d.%d", major, minor) < 2) { + VLOG(1) << string_printf("OpenCL: failed to parse driver version string (%s).", buffer); + return false; + } + return true; +} + +int OpenCLInfo::mem_sub_ptr_alignment(cl_device_id device_id) +{ + int base_align_bits; + if(clGetDeviceInfo(device_id, + CL_DEVICE_MEM_BASE_ADDR_ALIGN, + sizeof(int), + &base_align_bits, + NULL) == CL_SUCCESS) + { + return base_align_bits/8; + } + return 1; +} + CCL_NAMESPACE_END #endif diff --git a/intern/cycles/graph/CMakeLists.txt b/intern/cycles/graph/CMakeLists.txt index 4ea18728f1c..168ca0210e7 100644 --- a/intern/cycles/graph/CMakeLists.txt +++ b/intern/cycles/graph/CMakeLists.txt @@ -1,7 +1,6 @@ set(INC - . - ../util + .. ) set(SRC @@ -20,5 +19,5 @@ set(SRC_HEADERS include_directories(${INC}) include_directories(SYSTEM ${INC_SYS}) -add_library(cycles_graph ${SRC} ${SRC_HEADERS}) +cycles_add_library(cycles_graph ${SRC} ${SRC_HEADERS}) diff --git a/intern/cycles/graph/node.cpp b/intern/cycles/graph/node.cpp index 3c228a716d5..b16c6af5d0e 100644 --- a/intern/cycles/graph/node.cpp +++ b/intern/cycles/graph/node.cpp @@ -14,12 +14,13 @@ * limitations under the License. */ -#include "node.h" -#include "node_type.h" +#include "graph/node.h" +#include "graph/node_type.h" -#include "util_foreach.h" -#include "util_param.h" -#include "util_transform.h" +#include "util/util_foreach.h" +#include "util/util_md5.h" +#include "util/util_param.h" +#include "util/util_transform.h" CCL_NAMESPACE_BEGIN @@ -364,29 +365,48 @@ static bool is_array_equal(const Node *node, const Node *other, const SocketType return *a == *b; } +template<typename T> +static bool is_value_equal(const Node *node, const Node *other, const SocketType& socket) +{ + const T *a = (const T*)(((char*)node) + socket.struct_offset); + const T *b = (const T*)(((char*)other) + socket.struct_offset); + return *a == *b; +} + bool Node::equals_value(const Node& other, const SocketType& socket) const { - if(socket.is_array()) { - switch(socket.type) { - case SocketType::BOOLEAN_ARRAY: return is_array_equal<bool>(this, &other, socket); - case SocketType::FLOAT_ARRAY: return is_array_equal<float>(this, &other, socket); - case SocketType::INT_ARRAY: return is_array_equal<int>(this, &other, socket); - case SocketType::COLOR_ARRAY: return is_array_equal<float3>(this, &other, socket); - case SocketType::VECTOR_ARRAY: return is_array_equal<float3>(this, &other, socket); - case SocketType::POINT_ARRAY: return is_array_equal<float3>(this, &other, socket); - case SocketType::NORMAL_ARRAY: return is_array_equal<float3>(this, &other, socket); - case SocketType::POINT2_ARRAY: return is_array_equal<float2>(this, &other, socket); - case SocketType::STRING_ARRAY: return is_array_equal<ustring>(this, &other, socket); - case SocketType::TRANSFORM_ARRAY: return is_array_equal<Transform>(this, &other, socket); - case SocketType::NODE_ARRAY: return is_array_equal<void*>(this, &other, socket); - default: assert(0); return true; - } - } - else { - const void *a = ((char*)this) + socket.struct_offset; - const void *b = ((char*)&other) + socket.struct_offset; - return (memcmp(a, b, socket.size()) == 0); + switch(socket.type) { + case SocketType::BOOLEAN: return is_value_equal<bool>(this, &other, socket); + case SocketType::FLOAT: return is_value_equal<float>(this, &other, socket); + case SocketType::INT: return is_value_equal<int>(this, &other, socket); + case SocketType::UINT: return is_value_equal<uint>(this, &other, socket); + case SocketType::COLOR: return is_value_equal<float3>(this, &other, socket); + case SocketType::VECTOR: return is_value_equal<float3>(this, &other, socket); + case SocketType::POINT: return is_value_equal<float3>(this, &other, socket); + case SocketType::NORMAL: return is_value_equal<float3>(this, &other, socket); + case SocketType::POINT2: return is_value_equal<float2>(this, &other, socket); + case SocketType::CLOSURE: return true; + case SocketType::STRING: return is_value_equal<ustring>(this, &other, socket); + case SocketType::ENUM: return is_value_equal<int>(this, &other, socket); + case SocketType::TRANSFORM: return is_value_equal<Transform>(this, &other, socket); + case SocketType::NODE: return is_value_equal<void*>(this, &other, socket); + + case SocketType::BOOLEAN_ARRAY: return is_array_equal<bool>(this, &other, socket); + case SocketType::FLOAT_ARRAY: return is_array_equal<float>(this, &other, socket); + case SocketType::INT_ARRAY: return is_array_equal<int>(this, &other, socket); + case SocketType::COLOR_ARRAY: return is_array_equal<float3>(this, &other, socket); + case SocketType::VECTOR_ARRAY: return is_array_equal<float3>(this, &other, socket); + case SocketType::POINT_ARRAY: return is_array_equal<float3>(this, &other, socket); + case SocketType::NORMAL_ARRAY: return is_array_equal<float3>(this, &other, socket); + case SocketType::POINT2_ARRAY: return is_array_equal<float2>(this, &other, socket); + case SocketType::STRING_ARRAY: return is_array_equal<ustring>(this, &other, socket); + case SocketType::TRANSFORM_ARRAY: return is_array_equal<Transform>(this, &other, socket); + case SocketType::NODE_ARRAY: return is_array_equal<void*>(this, &other, socket); + + case SocketType::UNDEFINED: return true; } + + return true; } /* equals */ @@ -403,5 +423,77 @@ bool Node::equals(const Node& other) const return true; } +/* Hash */ + +template<typename T> +static void value_hash(const Node *node, const SocketType& socket, MD5Hash& md5) +{ + md5.append(((uint8_t*)node) + socket.struct_offset, socket.size()); +} + +static void float3_hash(const Node *node, const SocketType& socket, MD5Hash& md5) +{ + /* Don't compare 4th element used for padding. */ + md5.append(((uint8_t*)node) + socket.struct_offset, sizeof(float) * 3); +} + +template<typename T> +static void array_hash(const Node *node, const SocketType& socket, MD5Hash& md5) +{ + const array<T>& a = *(const array<T>*)(((char*)node) + socket.struct_offset); + for (size_t i = 0; i < a.size(); i++) { + md5.append((uint8_t*)&a[i], sizeof(T)); + } +} + +static void float3_array_hash(const Node *node, const SocketType& socket, MD5Hash& md5) +{ + /* Don't compare 4th element used for padding. */ + const array<float3>& a = *(const array<float3>*)(((char*)node) + socket.struct_offset); + for (size_t i = 0; i < a.size(); i++) { + md5.append((uint8_t*)&a[i], sizeof(float) * 3); + } +} + +void Node::hash(MD5Hash& md5) +{ + md5.append(type->name.string()); + + foreach(const SocketType& socket, type->inputs) { + md5.append(socket.name.string()); + + switch(socket.type) { + case SocketType::BOOLEAN: value_hash<bool>(this, socket, md5); break; + case SocketType::FLOAT: value_hash<float>(this, socket, md5); break; + case SocketType::INT: value_hash<int>(this, socket, md5); break; + case SocketType::UINT: value_hash<uint>(this, socket, md5); break; + case SocketType::COLOR: float3_hash(this, socket, md5); break; + case SocketType::VECTOR: float3_hash(this, socket, md5); break; + case SocketType::POINT: float3_hash(this, socket, md5); break; + case SocketType::NORMAL: float3_hash(this, socket, md5); break; + case SocketType::POINT2: value_hash<float2>(this, socket, md5); break; + case SocketType::CLOSURE: break; + case SocketType::STRING: value_hash<ustring>(this, socket, md5); break; + case SocketType::ENUM: value_hash<int>(this, socket, md5); break; + case SocketType::TRANSFORM: value_hash<Transform>(this, socket, md5); break; + case SocketType::NODE: value_hash<void*>(this, socket, md5); break; + + case SocketType::BOOLEAN_ARRAY: array_hash<bool>(this, socket, md5); break; + case SocketType::FLOAT_ARRAY: array_hash<float>(this, socket, md5); break; + case SocketType::INT_ARRAY: array_hash<int>(this, socket, md5); break; + case SocketType::COLOR_ARRAY: float3_array_hash(this, socket, md5); break; + case SocketType::VECTOR_ARRAY: float3_array_hash(this, socket, md5); break; + case SocketType::POINT_ARRAY: float3_array_hash(this, socket, md5); break; + case SocketType::NORMAL_ARRAY: float3_array_hash(this, socket, md5); break; + case SocketType::POINT2_ARRAY: array_hash<float2>(this, socket, md5); break; + case SocketType::STRING_ARRAY: array_hash<ustring>(this, socket, md5); break; + case SocketType::TRANSFORM_ARRAY: array_hash<Transform>(this, socket, md5); break; + case SocketType::NODE_ARRAY: array_hash<void*>(this, socket, md5); break; + + case SocketType::UNDEFINED: break; + } + } +} + CCL_NAMESPACE_END diff --git a/intern/cycles/graph/node.h b/intern/cycles/graph/node.h index 64410f4539b..d198c38be32 100644 --- a/intern/cycles/graph/node.h +++ b/intern/cycles/graph/node.h @@ -16,14 +16,15 @@ #pragma once -#include "node_type.h" +#include "graph/node_type.h" -#include "util_map.h" -#include "util_param.h" -#include "util_vector.h" +#include "util/util_map.h" +#include "util/util_param.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN +class MD5Hash; struct Node; struct NodeType; struct Transform; @@ -88,6 +89,9 @@ struct Node /* equals */ bool equals(const Node& other) const; + /* compute hash of node and its socket values */ + void hash(MD5Hash& md5); + ustring name; const NodeType *type; }; diff --git a/intern/cycles/graph/node_enum.h b/intern/cycles/graph/node_enum.h index 2bae531c036..4e40c294f4f 100644 --- a/intern/cycles/graph/node_enum.h +++ b/intern/cycles/graph/node_enum.h @@ -16,8 +16,8 @@ #pragma once -#include "util_map.h" -#include "util_param.h" +#include "util/util_map.h" +#include "util/util_param.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/graph/node_type.cpp b/intern/cycles/graph/node_type.cpp index 5b98de778ad..37aae211e93 100644 --- a/intern/cycles/graph/node_type.cpp +++ b/intern/cycles/graph/node_type.cpp @@ -14,9 +14,9 @@ * limitations under the License. */ -#include "node_type.h" -#include "util_foreach.h" -#include "util_transform.h" +#include "graph/node_type.h" +#include "util/util_foreach.h" +#include "util/util_transform.h" CCL_NAMESPACE_BEGIN @@ -77,7 +77,7 @@ size_t SocketType::max_size() void *SocketType::zero_default_value() { - static Transform zero_transform = {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}}; + static Transform zero_transform = {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}}; return &zero_transform; } diff --git a/intern/cycles/graph/node_type.h b/intern/cycles/graph/node_type.h index 1fb135f6d22..7d46e31ce24 100644 --- a/intern/cycles/graph/node_type.h +++ b/intern/cycles/graph/node_type.h @@ -16,12 +16,12 @@ #pragma once -#include "node_enum.h" +#include "graph/node_enum.h" -#include "util_map.h" -#include "util_param.h" -#include "util_string.h" -#include "util_vector.h" +#include "util/util_map.h" +#include "util/util_param.h" +#include "util/util_string.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/graph/node_xml.cpp b/intern/cycles/graph/node_xml.cpp index 590e09645ed..f4599e22d40 100644 --- a/intern/cycles/graph/node_xml.cpp +++ b/intern/cycles/graph/node_xml.cpp @@ -14,11 +14,11 @@ * limitations under the License. */ -#include "node_xml.h" +#include "graph/node_xml.h" -#include "util_foreach.h" -#include "util_string.h" -#include "util_transform.h" +#include "util/util_foreach.h" +#include "util/util_string.h" +#include "util/util_transform.h" CCL_NAMESPACE_BEGIN @@ -33,7 +33,7 @@ static const char *xml_write_boolean(bool value) } template<int VECTOR_SIZE, typename T> -static void xml_read_float_array(T& value, pugi::xml_attribute attr) +static void xml_read_float_array(T& value, xml_attribute attr) { vector<string> tokens; string_split(tokens, attr.value()); @@ -51,9 +51,9 @@ static void xml_read_float_array(T& value, pugi::xml_attribute attr) } } -void xml_read_node(XMLReader& reader, Node *node, pugi::xml_node xml_node) +void xml_read_node(XMLReader& reader, Node *node, xml_node xml_node) { - pugi::xml_attribute name_attr = xml_node.attribute("name"); + xml_attribute name_attr = xml_node.attribute("name"); if(name_attr) { node->name = ustring(name_attr.value()); } @@ -66,7 +66,7 @@ void xml_read_node(XMLReader& reader, Node *node, pugi::xml_node xml_node) continue; } - pugi::xml_attribute attr = xml_node.attribute(socket.name.c_str()); + xml_attribute attr = xml_node.attribute(socket.name.c_str()); if(!attr) { continue; @@ -196,7 +196,7 @@ void xml_read_node(XMLReader& reader, Node *node, pugi::xml_node xml_node) case SocketType::TRANSFORM: { array<Transform> value; - xml_read_float_array<16>(value, attr); + xml_read_float_array<12>(value, attr); if(value.size() == 1) { node->set(socket, value[0]); } @@ -205,7 +205,7 @@ void xml_read_node(XMLReader& reader, Node *node, pugi::xml_node xml_node) case SocketType::TRANSFORM_ARRAY: { array<Transform> value; - xml_read_float_array<16>(value, attr); + xml_read_float_array<12>(value, attr); node->set(socket, value); break; } @@ -254,9 +254,9 @@ void xml_read_node(XMLReader& reader, Node *node, pugi::xml_node xml_node) reader.node_map[node->name] = node; } -pugi::xml_node xml_write_node(Node *node, pugi::xml_node xml_root) +xml_node xml_write_node(Node *node, xml_node xml_root) { - pugi::xml_node xml_node = xml_root.append_child(node->type->name.c_str()); + xml_node xml_node = xml_root.append_child(node->type->name.c_str()); xml_node.append_attribute("name") = node->name.c_str(); @@ -271,7 +271,7 @@ pugi::xml_node xml_write_node(Node *node, pugi::xml_node xml_root) continue; } - pugi::xml_attribute attr = xml_node.append_attribute(socket.name.c_str()); + xml_attribute attr = xml_node.append_attribute(socket.name.c_str()); switch(socket.type) { @@ -400,12 +400,10 @@ pugi::xml_node xml_write_node(Node *node, pugi::xml_node xml_root) { Transform tfm = node->get_transform(socket); std::stringstream ss; - for(int i = 0; i < 4; i++) { - ss << string_printf("%g %g %g %g", (double)tfm[i][0], (double)tfm[i][1], (double)tfm[i][2], (double)tfm[i][3]); - if(i != 3) { - ss << " "; - } + for(int i = 0; i < 3; i++) { + ss << string_printf("%g %g %g %g ", (double)tfm[i][0], (double)tfm[i][1], (double)tfm[i][2], (double)tfm[i][3]); } + ss << string_printf("%g %g %g %g", 0.0, 0.0, 0.0, 1.0); attr = ss.str().c_str(); break; } @@ -416,11 +414,12 @@ pugi::xml_node xml_write_node(Node *node, pugi::xml_node xml_root) for(size_t j = 0; j < value.size(); j++) { const Transform& tfm = value[j]; - for(int i = 0; i < 4; i++) { - ss << string_printf("%g %g %g %g", (double)tfm[i][0], (double)tfm[i][1], (double)tfm[i][2], (double)tfm[i][3]); - if(j != value.size() - 1 || i != 3) { - ss << " "; - } + for(int i = 0; i < 3; i++) { + ss << string_printf("%g %g %g %g ", (double)tfm[i][0], (double)tfm[i][1], (double)tfm[i][2], (double)tfm[i][3]); + } + ss << string_printf("%g %g %g %g", 0.0, 0.0, 0.0, 1.0); + if(j != value.size() - 1) { + ss << " "; } } attr = ss.str().c_str(); diff --git a/intern/cycles/graph/node_xml.h b/intern/cycles/graph/node_xml.h index 7494c5e6e55..b648c9666c1 100644 --- a/intern/cycles/graph/node_xml.h +++ b/intern/cycles/graph/node_xml.h @@ -16,11 +16,11 @@ #pragma once -#include "node.h" +#include "graph/node.h" -#include "util_map.h" -#include "util_string.h" -#include "util_xml.h" +#include "util/util_map.h" +#include "util/util_string.h" +#include "util/util_xml.h" CCL_NAMESPACE_BEGIN @@ -28,8 +28,8 @@ struct XMLReader { map<ustring, Node*> node_map; }; -void xml_read_node(XMLReader& reader, Node *node, pugi::xml_node xml_node); -pugi::xml_node xml_write_node(Node *node, pugi::xml_node xml_root); +void xml_read_node(XMLReader& reader, Node *node, xml_node xml_node); +xml_node xml_write_node(Node *node, xml_node xml_root); CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 29e0f44841e..9b7f4e00084 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -1,45 +1,78 @@ remove_extra_strict_flags() set(INC - . - ../util - osl - svm + .. ) set(INC_SYS ) -set(SRC +set(SRC_CPU_KERNELS kernels/cpu/kernel.cpp + kernels/cpu/kernel_sse2.cpp + kernels/cpu/kernel_sse3.cpp + kernels/cpu/kernel_sse41.cpp + kernels/cpu/kernel_avx.cpp + kernels/cpu/kernel_avx2.cpp + kernels/cpu/kernel_split.cpp + kernels/cpu/kernel_split_sse2.cpp + kernels/cpu/kernel_split_sse3.cpp + kernels/cpu/kernel_split_sse41.cpp + kernels/cpu/kernel_split_avx.cpp + kernels/cpu/kernel_split_avx2.cpp + kernels/cpu/filter.cpp + kernels/cpu/filter_sse2.cpp + kernels/cpu/filter_sse3.cpp + kernels/cpu/filter_sse41.cpp + kernels/cpu/filter_avx.cpp + kernels/cpu/filter_avx2.cpp +) + +set(SRC_CUDA_KERNELS + kernels/cuda/kernel.cu + kernels/cuda/kernel_split.cu + kernels/cuda/filter.cu +) + +set(SRC_OPENCL_KERNELS kernels/opencl/kernel.cl + kernels/opencl/kernel_state_buffer_size.cl + kernels/opencl/kernel_split.cl kernels/opencl/kernel_data_init.cl + kernels/opencl/kernel_path_init.cl kernels/opencl/kernel_queue_enqueue.cl kernels/opencl/kernel_scene_intersect.cl kernels/opencl/kernel_lamp_emission.cl - kernels/opencl/kernel_background_buffer_update.cl + kernels/opencl/kernel_do_volume.cl + kernels/opencl/kernel_indirect_background.cl + kernels/opencl/kernel_shader_setup.cl + kernels/opencl/kernel_shader_sort.cl kernels/opencl/kernel_shader_eval.cl kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl + kernels/opencl/kernel_subsurface_scatter.cl kernels/opencl/kernel_direct_lighting.cl - kernels/opencl/kernel_shadow_blocked.cl + kernels/opencl/kernel_shadow_blocked_ao.cl + kernels/opencl/kernel_shadow_blocked_dl.cl + kernels/opencl/kernel_enqueue_inactive.cl kernels/opencl/kernel_next_iteration_setup.cl - kernels/opencl/kernel_sum_all_radiance.cl - kernels/cuda/kernel.cu + kernels/opencl/kernel_indirect_subsurface.cl + kernels/opencl/kernel_buffer_update.cl + kernels/opencl/filter.cl ) set(SRC_BVH_HEADERS bvh/bvh.h bvh/bvh_nodes.h bvh/bvh_shadow_all.h - bvh/bvh_subsurface.h + bvh/bvh_local.h bvh/bvh_traversal.h bvh/bvh_types.h bvh/bvh_volume.h bvh/bvh_volume_all.h bvh/qbvh_nodes.h bvh/qbvh_shadow_all.h - bvh/qbvh_subsurface.h + bvh/qbvh_local.h bvh/qbvh_traversal.h bvh/qbvh_volume.h bvh/qbvh_volume_all.h @@ -52,12 +85,10 @@ set(SRC_HEADERS kernel_compat_cpu.h kernel_compat_cuda.h kernel_compat_opencl.h - kernel_debug.h kernel_differential.h kernel_emission.h kernel_film.h kernel_globals.h - kernel_image_opencl.h kernel_jitter.h kernel_light.h kernel_math.h @@ -68,6 +99,7 @@ set(SRC_HEADERS kernel_path_common.h kernel_path_state.h kernel_path_surface.h + kernel_path_subsurface.h kernel_path_volume.h kernel_projection.h kernel_queues.h @@ -86,6 +118,18 @@ set(SRC_KERNELS_CPU_HEADERS kernels/cpu/kernel_cpu.h kernels/cpu/kernel_cpu_impl.h kernels/cpu/kernel_cpu_image.h + kernels/cpu/filter_cpu.h + kernels/cpu/filter_cpu_impl.h +) + +set(SRC_KERNELS_CUDA_HEADERS + kernels/cuda/kernel_config.h + kernels/cuda/kernel_cuda_image.h +) + +set(SRC_KERNELS_OPENCL_HEADERS + kernels/opencl/kernel_split_function.h + kernels/opencl/kernel_opencl_image.h ) set(SRC_CLOSURE_HEADERS @@ -109,11 +153,14 @@ set(SRC_CLOSURE_HEADERS closure/bssrdf.h closure/emissive.h closure/volume.h + closure/bsdf_principled_diffuse.h + closure/bsdf_principled_sheen.h ) set(SRC_SVM_HEADERS svm/svm.h svm/svm_attribute.h + svm/svm_bevel.h svm/svm_blackbody.h svm/svm_bump.h svm/svm_camera.h @@ -162,6 +209,7 @@ set(SRC_GEOM_HEADERS geom/geom.h geom/geom_attribute.h geom/geom_curve.h + geom/geom_curve_intersect.h geom/geom_motion_curve.h geom/geom_motion_triangle.h geom/geom_motion_triangle_intersect.h @@ -175,42 +223,102 @@ set(SRC_GEOM_HEADERS geom/geom_volume.h ) +set(SRC_FILTER_HEADERS + filter/filter.h + filter/filter_defines.h + filter/filter_features.h + filter/filter_features_sse.h + filter/filter_kernel.h + filter/filter_nlm_cpu.h + filter/filter_nlm_gpu.h + filter/filter_prefilter.h + filter/filter_reconstruction.h + filter/filter_transform.h + filter/filter_transform_gpu.h + filter/filter_transform_sse.h +) + set(SRC_UTIL_HEADERS ../util/util_atomic.h ../util/util_color.h + ../util/util_defines.h ../util/util_half.h ../util/util_hash.h ../util/util_math.h ../util/util_math_fast.h + ../util/util_math_intersect.h + ../util/util_math_float2.h + ../util/util_math_float3.h + ../util/util_math_float4.h + ../util/util_math_int2.h + ../util/util_math_int3.h + ../util/util_math_int4.h + ../util/util_math_matrix.h + ../util/util_projection.h + ../util/util_rect.h ../util/util_static_assert.h ../util/util_transform.h ../util/util_texture.h ../util/util_types.h + ../util/util_types_float2.h + ../util/util_types_float2_impl.h + ../util/util_types_float3.h + ../util/util_types_float3_impl.h + ../util/util_types_float4.h + ../util/util_types_float4_impl.h + ../util/util_types_int2.h + ../util/util_types_int2_impl.h + ../util/util_types_int3.h + ../util/util_types_int3_impl.h + ../util/util_types_int4.h + ../util/util_types_int4_impl.h + ../util/util_types_uchar2.h + ../util/util_types_uchar2_impl.h + ../util/util_types_uchar3.h + ../util/util_types_uchar3_impl.h + ../util/util_types_uchar4.h + ../util/util_types_uchar4_impl.h + ../util/util_types_uint2.h + ../util/util_types_uint2_impl.h + ../util/util_types_uint3.h + ../util/util_types_uint3_impl.h + ../util/util_types_uint4.h + ../util/util_types_uint4_impl.h + ../util/util_types_vector3.h + ../util/util_types_vector3_impl.h ) set(SRC_SPLIT_HEADERS - split/kernel_background_buffer_update.h + split/kernel_branched.h + split/kernel_buffer_update.h split/kernel_data_init.h split/kernel_direct_lighting.h + split/kernel_do_volume.h + split/kernel_enqueue_inactive.h split/kernel_holdout_emission_blurring_pathtermination_ao.h + split/kernel_indirect_background.h + split/kernel_indirect_subsurface.h split/kernel_lamp_emission.h split/kernel_next_iteration_setup.h + split/kernel_path_init.h + split/kernel_queue_enqueue.h split/kernel_scene_intersect.h + split/kernel_shader_setup.h + split/kernel_shader_sort.h split/kernel_shader_eval.h - split/kernel_shadow_blocked.h + split/kernel_shadow_blocked_ao.h + split/kernel_shadow_blocked_dl.h split/kernel_split_common.h - split/kernel_sum_all_radiance.h + split/kernel_split_data.h + split/kernel_split_data_types.h + split/kernel_subsurface_scatter.h ) # CUDA module if(WITH_CYCLES_CUDA_BINARIES) - # 32 bit or 64 bit - if(CUDA_64_BIT_DEVICE_CODE) - set(CUDA_BITS 64) - else() - set(CUDA_BITS 32) - endif() + # 64 bit only + set(CUDA_BITS 64) # CUDA version execute_process(COMMAND ${CUDA_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT) @@ -219,7 +327,7 @@ if(WITH_CYCLES_CUDA_BINARIES) set(CUDA_VERSION "${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}") # warn for other versions - if(CUDA_VERSION MATCHES "80") + if(CUDA_VERSION MATCHES "80" OR CUDA_VERSION MATCHES "90") else() message(WARNING "CUDA version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, " @@ -227,74 +335,104 @@ if(WITH_CYCLES_CUDA_BINARIES) endif() # build for each arch - set(cuda_sources kernels/cuda/kernel.cu + set(cuda_sources kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu ${SRC_HEADERS} + ${SRC_KERNELS_CUDA_HEADERS} ${SRC_BVH_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS} ) + set(cuda_filter_sources kernels/cuda/filter.cu + ${SRC_HEADERS} + ${SRC_KERNELS_CUDA_HEADERS} + ${SRC_FILTER_HEADERS} + ${SRC_UTIL_HEADERS} + ) set(cuda_cubins) - macro(CYCLES_CUDA_KERNEL_ADD arch experimental) + macro(CYCLES_CUDA_KERNEL_ADD arch name flags sources experimental) + set(cuda_cubin ${name}_${arch}.cubin) + set(cuda_kernel_src "/kernels/cuda/${name}.cu") + + set(cuda_flags + -D CCL_NAMESPACE_BEGIN= + -D CCL_NAMESPACE_END= + -D NVCC + -m ${CUDA_BITS} + -I ${CMAKE_CURRENT_SOURCE_DIR}/.. + -I ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda + --use_fast_math + -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}) + if(${experimental}) - set(cuda_extra_flags "-D__KERNEL_EXPERIMENTAL__") - set(cuda_cubin kernel_experimental_${arch}.cubin) - else() - set(cuda_extra_flags "") - set(cuda_cubin kernel_${arch}.cubin) + set(cuda_flags ${cuda_flags} -D __KERNEL_EXPERIMENTAL__) + set(name ${name}_experimental) endif() if(WITH_CYCLES_DEBUG) - set(cuda_debug_flags "-D__KERNEL_DEBUG__") - else() - set(cuda_debug_flags "") + set(cuda_flags ${cuda_flags} -D __KERNEL_DEBUG__) endif() - set(cuda_nvcc_command ${CUDA_NVCC_EXECUTABLE}) - set(cuda_nvcc_version ${CUDA_VERSION}) - - set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}") - set(cuda_math_flags "--use_fast_math") - - add_custom_command( - OUTPUT ${cuda_cubin} - COMMAND ${cuda_nvcc_command} - -arch=${arch} - ${CUDA_NVCC_FLAGS} - -m${CUDA_BITS} - --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda/kernel.cu - -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} - --ptxas-options="-v" - ${cuda_arch_flags} - ${cuda_version_flags} - ${cuda_math_flags} - ${cuda_extra_flags} - ${cuda_debug_flags} - -I${CMAKE_CURRENT_SOURCE_DIR}/../util - -I${CMAKE_CURRENT_SOURCE_DIR}/svm - -DCCL_NAMESPACE_BEGIN= - -DCCL_NAMESPACE_END= - -DNVCC - DEPENDS ${cuda_sources}) - + if(WITH_CYCLES_CUBIN_COMPILER) + string(SUBSTRING ${arch} 3 -1 CUDA_ARCH) + + # Needed to find libnvrtc-builtins.so. Can't do it from inside + # cycles_cubin_cc since the env variable is read before main() + if(APPLE) + set(CUBIN_CC_ENV ${CMAKE_COMMAND} + -E env DYLD_LIBRARY_PATH="${CUDA_TOOLKIT_ROOT_DIR}/lib") + elseif(UNIX) + set(CUBIN_CC_ENV ${CMAKE_COMMAND} + -E env LD_LIBRARY_PATH="${CUDA_TOOLKIT_ROOT_DIR}/lib64") + endif() + + add_custom_command( + OUTPUT ${cuda_cubin} + COMMAND ${CUBIN_CC_ENV} + "$<TARGET_FILE:cycles_cubin_cc>" + -target ${CUDA_ARCH} + -i ${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src} + ${cuda_flags} + -v + -cuda-toolkit-dir "${CUDA_TOOLKIT_ROOT_DIR}" + DEPENDS ${sources} cycles_cubin_cc) + else() + add_custom_command( + OUTPUT ${cuda_cubin} + COMMAND ${CUDA_NVCC_EXECUTABLE} + -arch=${arch} + ${CUDA_NVCC_FLAGS} + --cubin + ${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src} + --ptxas-options="-v" + ${cuda_flags} + DEPENDS ${sources}) + endif() delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib) list(APPEND cuda_cubins ${cuda_cubin}) - unset(cuda_extra_flags) unset(cuda_debug_flags) - - unset(cuda_nvcc_command) - unset(cuda_nvcc_version) endmacro() foreach(arch ${CYCLES_CUDA_BINARIES_ARCH}) - # Compile regular kernel - CYCLES_CUDA_KERNEL_ADD(${arch} FALSE) + if(${arch} MATCHES "sm_2.") + message(STATUS "CUDA binaries for ${arch} are no longer supported, skipped.") + else() + # Compile regular kernel + CYCLES_CUDA_KERNEL_ADD(${arch} filter "" "${cuda_filter_sources}" FALSE) + CYCLES_CUDA_KERNEL_ADD(${arch} kernel "" "${cuda_sources}" FALSE) + endif() + + if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES) + # Compile split kernel + CYCLES_CUDA_KERNEL_ADD(${arch} kernel_split "-D __SPLIT__" ${cuda_sources} FALSE) + endif() endforeach() add_custom_target(cycles_kernel_cuda ALL DEPENDS ${cuda_cubins}) + cycles_set_solution_folder(cycles_kernel_cuda) endif() # OSL module @@ -309,38 +447,45 @@ endif() include_directories(${INC}) include_directories(SYSTEM ${INC_SYS}) -if(CXX_HAS_SSE) - list(APPEND SRC - kernels/cpu/kernel_sse2.cpp - kernels/cpu/kernel_sse3.cpp - kernels/cpu/kernel_sse41.cpp - ) +set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") +set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") +set_source_files_properties(kernels/cpu/filter.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") +if(CXX_HAS_SSE) set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX) - list(APPEND SRC - kernels/cpu/kernel_avx.cpp - ) set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX2) - list(APPEND SRC - kernels/cpu/kernel_avx2.cpp - ) set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") endif() -add_library(cycles_kernel - ${SRC} +cycles_add_library(cycles_kernel + ${SRC_CPU_KERNELS} + ${SRC_CUDA_KERNELS} + ${SRC_OPENCL_KERNELS} ${SRC_HEADERS} ${SRC_KERNELS_CPU_HEADERS} + ${SRC_KERNELS_CUDA_HEADERS} + ${SRC_KERNELS_OPENCL_HEADERS} ${SRC_BVH_HEADERS} ${SRC_CLOSURE_HEADERS} + ${SRC_FILTER_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_SPLIT_HEADERS} @@ -360,24 +505,16 @@ endif() #add_custom_target(cycles_kernel_preprocess ALL DEPENDS ${KERNEL_PREPROCESSED}) #delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_data_init.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_queue_enqueue.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_scene_intersect.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_background_buffer_update.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/bvh) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/geom) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/split) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_OPENCL_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CUDA_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_OPENCL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/split) diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h index 321983c1abc..d3e0b25a200 100644 --- a/intern/cycles/kernel/bvh/bvh.h +++ b/intern/cycles/kernel/bvh/bvh.h @@ -27,76 +27,76 @@ CCL_NAMESPACE_BEGIN -#include "bvh_types.h" +#include "kernel/bvh/bvh_types.h" /* Common QBVH functions. */ #ifdef __QBVH__ -# include "qbvh_nodes.h" +# include "kernel/bvh/qbvh_nodes.h" #endif /* Regular BVH traversal */ -#include "bvh_nodes.h" +#include "kernel/bvh/bvh_nodes.h" #define BVH_FUNCTION_NAME bvh_intersect #define BVH_FUNCTION_FEATURES 0 -#include "bvh_traversal.h" +#include "kernel/bvh/bvh_traversal.h" #if defined(__INSTANCING__) # define BVH_FUNCTION_NAME bvh_intersect_instancing # define BVH_FUNCTION_FEATURES BVH_INSTANCING -# include "bvh_traversal.h" +# include "kernel/bvh/bvh_traversal.h" #endif #if defined(__HAIR__) # define BVH_FUNCTION_NAME bvh_intersect_hair # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH -# include "bvh_traversal.h" +# include "kernel/bvh/bvh_traversal.h" #endif #if defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION -# include "bvh_traversal.h" +# include "kernel/bvh/bvh_traversal.h" #endif #if defined(__HAIR__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_hair_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION -# include "bvh_traversal.h" +# include "kernel/bvh/bvh_traversal.h" #endif /* Subsurface scattering BVH traversal */ -#if defined(__SUBSURFACE__) -# define BVH_FUNCTION_NAME bvh_intersect_subsurface +#if defined(__BVH_LOCAL__) +# define BVH_FUNCTION_NAME bvh_intersect_local # define BVH_FUNCTION_FEATURES BVH_HAIR -# include "bvh_subsurface.h" +# include "kernel/bvh/bvh_local.h" # if defined(__OBJECT_MOTION__) -# define BVH_FUNCTION_NAME bvh_intersect_subsurface_motion +# define BVH_FUNCTION_NAME bvh_intersect_local_motion # define BVH_FUNCTION_FEATURES BVH_MOTION|BVH_HAIR -# include "bvh_subsurface.h" +# include "kernel/bvh/bvh_local.h" # endif -#endif /* __SUBSURFACE__ */ +#endif /* __BVH_LOCAL__ */ /* Volume BVH traversal */ #if defined(__VOLUME__) # define BVH_FUNCTION_NAME bvh_intersect_volume # define BVH_FUNCTION_FEATURES BVH_HAIR -# include "bvh_volume.h" +# include "kernel/bvh/bvh_volume.h" # if defined(__INSTANCING__) # define BVH_FUNCTION_NAME bvh_intersect_volume_instancing # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR -# include "bvh_volume.h" +# include "kernel/bvh/bvh_volume.h" # endif # if defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_volume_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR -# include "bvh_volume.h" +# include "kernel/bvh/bvh_volume.h" # endif #endif /* __VOLUME__ */ @@ -105,30 +105,30 @@ CCL_NAMESPACE_BEGIN #if defined(__SHADOW_RECORD_ALL__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all # define BVH_FUNCTION_FEATURES 0 -# include "bvh_shadow_all.h" +# include "kernel/bvh/bvh_shadow_all.h" # if defined(__INSTANCING__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_instancing # define BVH_FUNCTION_FEATURES BVH_INSTANCING -# include "bvh_shadow_all.h" +# include "kernel/bvh/bvh_shadow_all.h" # endif # if defined(__HAIR__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR -# include "bvh_shadow_all.h" +# include "kernel/bvh/bvh_shadow_all.h" # endif # if defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION -# include "bvh_shadow_all.h" +# include "kernel/bvh/bvh_shadow_all.h" # endif # if defined(__HAIR__) && defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION -# include "bvh_shadow_all.h" +# include "kernel/bvh/bvh_shadow_all.h" # endif #endif /* __SHADOW_RECORD_ALL__ */ @@ -137,18 +137,18 @@ CCL_NAMESPACE_BEGIN #if defined(__VOLUME_RECORD_ALL__) # define BVH_FUNCTION_NAME bvh_intersect_volume_all # define BVH_FUNCTION_FEATURES BVH_HAIR -# include "bvh_volume_all.h" +# include "kernel/bvh/bvh_volume_all.h" # if defined(__INSTANCING__) # define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR -# include "bvh_volume_all.h" +# include "kernel/bvh/bvh_volume_all.h" # endif # if defined(__OBJECT_MOTION__) # define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion # define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR -# include "bvh_volume_all.h" +# include "kernel/bvh/bvh_volume_all.h" # endif #endif /* __VOLUME_RECORD_ALL__ */ @@ -201,58 +201,92 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg, #endif /* __KERNEL_CPU__ */ } -#ifdef __SUBSURFACE__ -ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg, - const Ray *ray, - SubsurfaceIntersection *ss_isect, - int subsurface_object, - uint *lcg_state, - int max_hits) +#ifdef __BVH_LOCAL__ +/* Note: ray is passed by value to work around a possible CUDA compiler bug. */ +ccl_device_intersect void scene_intersect_local(KernelGlobals *kg, + const Ray ray, + LocalIntersection *local_isect, + int local_object, + uint *lcg_state, + int max_hits) { #ifdef __OBJECT_MOTION__ if(kernel_data.bvh.have_motion) { - return bvh_intersect_subsurface_motion(kg, - ray, - ss_isect, - subsurface_object, - lcg_state, - max_hits); + return bvh_intersect_local_motion(kg, + &ray, + local_isect, + local_object, + lcg_state, + max_hits); } #endif /* __OBJECT_MOTION__ */ - return bvh_intersect_subsurface(kg, - ray, - ss_isect, - subsurface_object, - lcg_state, - max_hits); + return bvh_intersect_local(kg, + &ray, + local_isect, + local_object, + lcg_state, + max_hits); } #endif #ifdef __SHADOW_RECORD_ALL__ -ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection *isect, uint max_hits, uint *num_hits) +ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, + const Ray *ray, + Intersection *isect, + uint visibility, + uint max_hits, + uint *num_hits) { # ifdef __OBJECT_MOTION__ if(kernel_data.bvh.have_motion) { # ifdef __HAIR__ - if(kernel_data.bvh.have_curves) - return bvh_intersect_shadow_all_hair_motion(kg, ray, isect, max_hits, num_hits); + if(kernel_data.bvh.have_curves) { + return bvh_intersect_shadow_all_hair_motion(kg, + ray, + isect, + visibility, + max_hits, + num_hits); + } # endif /* __HAIR__ */ - return bvh_intersect_shadow_all_motion(kg, ray, isect, max_hits, num_hits); + return bvh_intersect_shadow_all_motion(kg, + ray, + isect, + visibility, + max_hits, + num_hits); } # endif /* __OBJECT_MOTION__ */ # ifdef __HAIR__ - if(kernel_data.bvh.have_curves) - return bvh_intersect_shadow_all_hair(kg, ray, isect, max_hits, num_hits); + if(kernel_data.bvh.have_curves) { + return bvh_intersect_shadow_all_hair(kg, + ray, + isect, + visibility, + max_hits, + num_hits); + } # endif /* __HAIR__ */ # ifdef __INSTANCING__ - if(kernel_data.bvh.have_instancing) - return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits); + if(kernel_data.bvh.have_instancing) { + return bvh_intersect_shadow_all_instancing(kg, + ray, + isect, + visibility, + max_hits, + num_hits); + } # endif /* __INSTANCING__ */ - return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits); + return bvh_intersect_shadow_all(kg, + ray, + isect, + visibility, + max_hits, + num_hits); } #endif /* __SHADOW_RECORD_ALL__ */ diff --git a/intern/cycles/kernel/bvh/bvh_subsurface.h b/intern/cycles/kernel/bvh/bvh_local.h index 889bbca21e2..9292cc76a5c 100644 --- a/intern/cycles/kernel/bvh/bvh_subsurface.h +++ b/intern/cycles/kernel/bvh/bvh_local.h @@ -18,7 +18,7 @@ */ #ifdef __QBVH__ -# include "qbvh_subsurface.h" +# include "kernel/bvh/qbvh_local.h" #endif #if BVH_FEATURE(BVH_HAIR) @@ -27,9 +27,10 @@ # define NODE_INTERSECT bvh_aligned_node_intersect #endif -/* This is a template BVH traversal function for subsurface scattering, where - * various features can be enabled/disabled. This way we can compile optimized - * versions for each case without new features slowing things down. +/* This is a template BVH traversal function for finding local intersections + * around the shading point, for subsurface scattering and bevel. We disable + * various features for performance, and for instanced objects avoid traversing + * other parts of the scene. * * BVH_MOTION: motion blur rendering * @@ -42,8 +43,8 @@ ccl_device_inline #endif void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, const Ray *ray, - SubsurfaceIntersection *ss_isect, - int subsurface_object, + LocalIntersection *local_isect, + int local_object, uint *lcg_state, int max_hits) { @@ -60,7 +61,7 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, /* traversal variables in registers */ int stack_ptr = 0; - int node_addr = kernel_tex_fetch(__object_node, subsurface_object); + int node_addr = kernel_tex_fetch(__object_node, local_object); /* ray parameters in registers */ float3 P = ray->P; @@ -69,24 +70,24 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, int object = OBJECT_NONE; float isect_t = ray->t; - ss_isect->num_hits = 0; + local_isect->num_hits = 0; - const int object_flag = kernel_tex_fetch(__object_flag, subsurface_object); + const int object_flag = kernel_tex_fetch(__object_flag, local_object); if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { #if BVH_FEATURE(BVH_MOTION) Transform ob_itfm; - bvh_instance_motion_push(kg, - subsurface_object, - ray, - &P, - &dir, - &idir, - &isect_t, - &ob_itfm); + isect_t = bvh_instance_motion_push(kg, + local_object, + ray, + &P, + &dir, + &idir, + isect_t, + &ob_itfm); #else - bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, &isect_t); + isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t); #endif - object = subsurface_object; + object = local_object; } #if defined(__KERNEL_SSE2__) @@ -109,9 +110,6 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - /* traversal loop */ do { do { @@ -196,15 +194,16 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, /* intersect ray against primitive */ for(; prim_addr < prim_addr2; prim_addr++) { kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - triangle_intersect_subsurface(kg, - &isect_precalc, - ss_isect, - P, - object, - prim_addr, - isect_t, - lcg_state, - max_hits); + triangle_intersect_local(kg, + local_isect, + P, + dir, + object, + local_object, + prim_addr, + isect_t, + lcg_state, + max_hits); } break; } @@ -213,16 +212,17 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, /* intersect ray against primitive */ for(; prim_addr < prim_addr2; prim_addr++) { kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - motion_triangle_intersect_subsurface(kg, - ss_isect, - P, - dir, - ray->time, - object, - prim_addr, - isect_t, - lcg_state, - max_hits); + motion_triangle_intersect_local(kg, + local_isect, + P, + dir, + ray->time, + object, + local_object, + prim_addr, + isect_t, + lcg_state, + max_hits); } break; } @@ -238,31 +238,30 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, ccl_device_inline void BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, - SubsurfaceIntersection *ss_isect, - int subsurface_object, + LocalIntersection *local_isect, + int local_object, uint *lcg_state, int max_hits) { + switch(kernel_data.bvh.bvh_layout) { #ifdef __QBVH__ - if(kernel_data.bvh.use_qbvh) { - return BVH_FUNCTION_FULL_NAME(QBVH)(kg, - ray, - ss_isect, - subsurface_object, - lcg_state, - max_hits); - } - else + case BVH_LAYOUT_BVH4: + return BVH_FUNCTION_FULL_NAME(QBVH)(kg, + ray, + local_isect, + local_object, + lcg_state, + max_hits); #endif - { - kernel_assert(kernel_data.bvh.use_qbvh == false); - return BVH_FUNCTION_FULL_NAME(BVH)(kg, - ray, - ss_isect, - subsurface_object, - lcg_state, - max_hits); + case BVH_LAYOUT_BVH2: + return BVH_FUNCTION_FULL_NAME(BVH)(kg, + ray, + local_isect, + local_object, + lcg_state, + max_hits); } + kernel_assert(!"Should not happen"); } #undef BVH_FUNCTION_NAME diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h index 726bef1794c..060b3934a41 100644 --- a/intern/cycles/kernel/bvh/bvh_nodes.h +++ b/intern/cycles/kernel/bvh/bvh_nodes.h @@ -17,26 +17,25 @@ // TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and // 3-vector which might be faster. ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg, - int node_addr, - int child) + int node_addr, + int child) { Transform space; const int child_addr = node_addr + child * 3; space.x = kernel_tex_fetch(__bvh_nodes, child_addr+1); space.y = kernel_tex_fetch(__bvh_nodes, child_addr+2); space.z = kernel_tex_fetch(__bvh_nodes, child_addr+3); - space.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f); return space; } #if !defined(__KERNEL_SSE2__) ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg, - const float3 P, - const float3 idir, - const float t, - const int node_addr, - const uint visibility, - float dist[2]) + const float3 P, + const float3 idir, + const float t, + const int node_addr, + const uint visibility, + float dist[2]) { /* fetch node data */ @@ -52,8 +51,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg, float c0hiy = (node1.z - P.y) * idir.y; float c0loz = (node2.x - P.z) * idir.z; float c0hiz = (node2.z - P.z) * idir.z; - float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); + float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz)); + float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz)); float c1lox = (node0.y - P.x) * idir.x; float c1hix = (node0.w - P.x) * idir.x; @@ -61,8 +60,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg, float c1hiy = (node1.w - P.y) * idir.y; float c1loz = (node2.y - P.z) * idir.z; float c1hiz = (node2.w - P.z) * idir.z; - float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); + float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz)); + float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz)); dist[0] = c0min; dist[1] = c1min; @@ -78,14 +77,14 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg, } ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg, - const float3 P, - const float3 idir, - const float t, - const float difl, - const float extmax, - const int node_addr, - const uint visibility, - float dist[2]) + const float3 P, + const float3 idir, + const float t, + const float difl, + const float extmax, + const int node_addr, + const uint visibility, + float dist[2]) { /* fetch node data */ @@ -101,8 +100,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg, float c0hiy = (node1.z - P.y) * idir.y; float c0loz = (node2.x - P.z) * idir.z; float c0hiz = (node2.z - P.z) * idir.z; - float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f); - float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t); + float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz)); + float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz)); float c1lox = (node0.y - P.x) * idir.x; float c1hix = (node0.w - P.x) * idir.x; @@ -110,8 +109,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg, float c1hiy = (node1.w - P.y) * idir.y; float c1loz = (node2.y - P.z) * idir.z; float c1hiz = (node2.w - P.z) * idir.z; - float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f); - float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t); + float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz)); + float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz)); if(difl != 0.0f) { float hdiff = 1.0f + difl; @@ -203,13 +202,13 @@ ccl_device_forceinline bool bvh_unaligned_node_intersect_child_robust( } ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, - const float3 P, - const float3 dir, - const float3 idir, - const float t, - const int node_addr, - const uint visibility, - float dist[2]) + const float3 P, + const float3 dir, + const float3 idir, + const float t, + const int node_addr, + const uint visibility, + float dist[2]) { int mask = 0; float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); @@ -233,15 +232,15 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, } ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg, - const float3 P, - const float3 dir, - const float3 idir, - const float t, - const float difl, - const float extmax, - const int node_addr, - const uint visibility, - float dist[2]) + const float3 P, + const float3 dir, + const float3 idir, + const float t, + const float difl, + const float extmax, + const int node_addr, + const uint visibility, + float dist[2]) { int mask = 0; float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); @@ -265,13 +264,13 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg } ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg, - const float3 P, - const float3 dir, - const float3 idir, - const float t, - const int node_addr, - const uint visibility, - float dist[2]) + const float3 P, + const float3 dir, + const float3 idir, + const float t, + const int node_addr, + const uint visibility, + float dist[2]) { float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { @@ -296,15 +295,15 @@ ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg, } ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg, - const float3 P, - const float3 dir, - const float3 idir, - const float t, - const float difl, - const float extmax, - const int node_addr, - const uint visibility, - float dist[2]) + const float3 P, + const float3 dir, + const float3 idir, + const float t, + const float difl, + const float extmax, + const int node_addr, + const uint visibility, + float dist[2]) { float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { @@ -442,19 +441,19 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust( } ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, - const float3 P, - const float3 dir, - const ssef& isect_near, - const ssef& isect_far, - const int node_addr, - const uint visibility, - float dist[2]) + const float3 P, + const float3 dir, + const ssef& isect_near, + const ssef& isect_far, + const int node_addr, + const uint visibility, + float dist[2]) { Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0); Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1); float3 aligned_dir0 = transform_direction(&space0, dir), - aligned_dir1 = transform_direction(&space1, dir);; + aligned_dir1 = transform_direction(&space1, dir); float3 aligned_P0 = transform_point(&space0, P), aligned_P1 = transform_point(&space1, P); float3 nrdir0 = -bvh_inverse_direction(aligned_dir0), @@ -483,8 +482,8 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, ssef tfar_y = max(lower_y, upper_y); ssef tfar_z = max(lower_z, upper_z); - const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); - const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); sseb vmask = tnear <= tfar; dist[0] = tnear.f[0]; dist[1] = tnear.f[1]; @@ -503,20 +502,20 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg, } ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg, - const float3 P, - const float3 dir, - const ssef& isect_near, - const ssef& isect_far, - const float difl, - const int node_addr, - const uint visibility, - float dist[2]) + const float3 P, + const float3 dir, + const ssef& isect_near, + const ssef& isect_far, + const float difl, + const int node_addr, + const uint visibility, + float dist[2]) { Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0); Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1); float3 aligned_dir0 = transform_direction(&space0, dir), - aligned_dir1 = transform_direction(&space1, dir);; + aligned_dir1 = transform_direction(&space1, dir); float3 aligned_P0 = transform_point(&space0, P), aligned_P1 = transform_point(&space1, P); float3 nrdir0 = -bvh_inverse_direction(aligned_dir0), @@ -545,8 +544,8 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg ssef tfar_y = max(lower_y, upper_y); ssef tfar_z = max(lower_z, upper_z); - const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); - const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); sseb vmask; if(difl != 0.0f) { const float round_down = 1.0f - difl; @@ -574,17 +573,17 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg } ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg, - const float3& P, - const float3& dir, - const ssef& isect_near, - const ssef& isect_far, - const ssef& tsplat, - const ssef Psplat[3], - const ssef idirsplat[3], - const shuffle_swap_t shufflexyz[3], - const int node_addr, - const uint visibility, - float dist[2]) + const float3& P, + const float3& dir, + const ssef& isect_near, + const ssef& isect_far, + const ssef& tsplat, + const ssef Psplat[3], + const ssef idirsplat[3], + const shuffle_swap_t shufflexyz[3], + const int node_addr, + const uint visibility, + float dist[2]) { float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { @@ -612,19 +611,19 @@ ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg, } ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg, - const float3& P, - const float3& dir, - const ssef& isect_near, - const ssef& isect_far, - const ssef& tsplat, - const ssef Psplat[3], - const ssef idirsplat[3], - const shuffle_swap_t shufflexyz[3], - const float difl, - const float extmax, - const int node_addr, - const uint visibility, - float dist[2]) + const float3& P, + const float3& dir, + const ssef& isect_near, + const ssef& isect_far, + const ssef& tsplat, + const ssef Psplat[3], + const ssef idirsplat[3], + const shuffle_swap_t shufflexyz[3], + const float difl, + const float extmax, + const int node_addr, + const uint visibility, + float dist[2]) { float4 node = kernel_tex_fetch(__bvh_nodes, node_addr); if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) { diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h index df33a86bb18..cfc567ff9ca 100644 --- a/intern/cycles/kernel/bvh/bvh_shadow_all.h +++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h @@ -18,7 +18,7 @@ */ #ifdef __QBVH__ -# include "qbvh_shadow_all.h" +# include "kernel/bvh/qbvh_shadow_all.h" #endif #if BVH_FEATURE(BVH_HAIR) @@ -45,6 +45,7 @@ ccl_device_inline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, + const uint visibility, const uint max_hits, uint *num_hits) { @@ -100,9 +101,6 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif /* __KERNEL_SSE2__ */ - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - /* traversal loop */ do { do { @@ -121,7 +119,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, idir, isect_t, node_addr, - PATH_RAY_SHADOW, + visibility, dist); #else // __KERNEL_SSE2__ traverse_mask = NODE_INTERSECT(kg, @@ -136,7 +134,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, idirsplat, shufflexyz, node_addr, - PATH_RAY_SHADOW, + visibility, dist); #endif // __KERNEL_SSE2__ @@ -188,7 +186,6 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, /* primitive intersection */ while(prim_addr < prim_addr2) { kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type); - bool hit; /* todo: specialized intersect functions which don't fill in @@ -198,10 +195,10 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, switch(p_type) { case PRIMITIVE_TRIANGLE: { hit = triangle_intersect(kg, - &isect_precalc, isect_array, P, - PATH_RAY_SHADOW, + dir, + visibility, object, prim_addr); break; @@ -213,7 +210,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, P, dir, ray->time, - PATH_RAY_SHADOW, + visibility, object, prim_addr); break; @@ -224,30 +221,30 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, case PRIMITIVE_MOTION_CURVE: { const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { - hit = bvh_cardinal_curve_intersect(kg, - isect_array, - P, - dir, - PATH_RAY_SHADOW, - object, - prim_addr, - ray->time, - curve_type, - NULL, - 0, 0); + hit = cardinal_curve_intersect(kg, + isect_array, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + NULL, + 0, 0); } else { - hit = bvh_curve_intersect(kg, - isect_array, - P, - dir, - PATH_RAY_SHADOW, - object, - prim_addr, - ray->time, - curve_type, - NULL, - 0, 0); + hit = curve_intersect(kg, + isect_array, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + NULL, + 0, 0); } break; } @@ -279,7 +276,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, shader = __float_as_int(str.z); } #endif - int flag = kernel_tex_fetch(__shader_flag, (shader & SHADER_MASK)*SHADER_SIZE); + int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags; /* if no transparent shadows, all light is blocked */ if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) { @@ -309,12 +306,11 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, object = kernel_tex_fetch(__prim_object, -prim_addr-1); # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm); + isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); # else - bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t); + isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); # endif - triangle_intersect_precalc(dir, &isect_precalc); num_hits_in_instance = 0; isect_array->t = isect_t; @@ -354,22 +350,17 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); # endif - triangle_intersect_precalc(dir, &isect_precalc); - /* scale isect->t to adjust for instancing */ for(int i = 0; i < num_hits_in_instance; i++) { (isect_array-i-1)->t *= t_fac; } } else { - float ignore_t = FLT_MAX; - # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm); + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t); + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); # endif - triangle_intersect_precalc(dir, &isect_precalc); } isect_t = tmax; @@ -400,27 +391,30 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, + const uint visibility, const uint max_hits, uint *num_hits) { + switch(kernel_data.bvh.bvh_layout) { #ifdef __QBVH__ - if(kernel_data.bvh.use_qbvh) { - return BVH_FUNCTION_FULL_NAME(QBVH)(kg, - ray, - isect_array, - max_hits, - num_hits); - } - else + case BVH_LAYOUT_BVH4: + return BVH_FUNCTION_FULL_NAME(QBVH)(kg, + ray, + isect_array, + visibility, + max_hits, + num_hits); #endif - { - kernel_assert(kernel_data.bvh.use_qbvh == false); - return BVH_FUNCTION_FULL_NAME(BVH)(kg, - ray, - isect_array, - max_hits, - num_hits); + case BVH_LAYOUT_BVH2: + return BVH_FUNCTION_FULL_NAME(BVH)(kg, + ray, + isect_array, + visibility, + max_hits, + num_hits); } + kernel_assert(!"Should not happen"); + return false; } #undef BVH_FUNCTION_NAME diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h index 80c8f31473a..551625eae78 100644 --- a/intern/cycles/kernel/bvh/bvh_traversal.h +++ b/intern/cycles/kernel/bvh/bvh_traversal.h @@ -18,7 +18,7 @@ */ #ifdef __QBVH__ -# include "qbvh_traversal.h" +# include "kernel/bvh/qbvh_traversal.h" #endif #if BVH_FEATURE(BVH_HAIR) @@ -104,9 +104,6 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - /* traversal loop */ do { do { @@ -238,23 +235,23 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, BVH_DEBUG_NEXT_INTERSECTION(); kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); if(triangle_intersect(kg, - &isect_precalc, isect, P, + dir, visibility, object, prim_addr)) { /* shadow ray early termination */ #if defined(__KERNEL_SSE2__) - if(visibility == PATH_RAY_SHADOW_OPAQUE) + if(visibility & PATH_RAY_SHADOW_OPAQUE) return true; tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); # if BVH_FEATURE(BVH_HAIR) tfar = ssef(isect->t); # endif #else - if(visibility == PATH_RAY_SHADOW_OPAQUE) + if(visibility & PATH_RAY_SHADOW_OPAQUE) return true; #endif } @@ -277,14 +274,14 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, { /* shadow ray early termination */ # if defined(__KERNEL_SSE2__) - if(visibility == PATH_RAY_SHADOW_OPAQUE) + if(visibility & PATH_RAY_SHADOW_OPAQUE) return true; tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); # if BVH_FEATURE(BVH_HAIR) tfar = ssef(isect->t); # endif # else - if(visibility == PATH_RAY_SHADOW_OPAQUE) + if(visibility & PATH_RAY_SHADOW_OPAQUE) return true; # endif } @@ -301,44 +298,44 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL)); bool hit; if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { - hit = bvh_cardinal_curve_intersect(kg, - isect, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - curve_type, - lcg_state, - difl, - extmax); + hit = cardinal_curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + lcg_state, + difl, + extmax); } else { - hit = bvh_curve_intersect(kg, - isect, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - curve_type, - lcg_state, - difl, - extmax); + hit = curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + lcg_state, + difl, + extmax); } if(hit) { /* shadow ray early termination */ # if defined(__KERNEL_SSE2__) - if(visibility == PATH_RAY_SHADOW_OPAQUE) + if(visibility & PATH_RAY_SHADOW_OPAQUE) return true; tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t); # if BVH_FEATURE(BVH_HAIR) tfar = ssef(isect->t); # endif # else - if(visibility == PATH_RAY_SHADOW_OPAQUE) + if(visibility & PATH_RAY_SHADOW_OPAQUE) return true; # endif } @@ -354,11 +351,10 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, object = kernel_tex_fetch(__prim_object, -prim_addr-1); # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); + isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t); + isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); # endif - triangle_intersect_precalc(dir, &isect_precalc); # if defined(__KERNEL_SSE2__) Psplat[0] = ssef(P.x); @@ -391,11 +387,10 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, /* instance pop */ # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); + isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t); + isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); # endif - triangle_intersect_precalc(dir, &isect_precalc); # if defined(__KERNEL_SSE2__) Psplat[0] = ssef(P.x); @@ -431,34 +426,34 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, #endif ) { + switch(kernel_data.bvh.bvh_layout) { #ifdef __QBVH__ - if(kernel_data.bvh.use_qbvh) { - return BVH_FUNCTION_FULL_NAME(QBVH)(kg, - ray, - isect, - visibility -#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) - , lcg_state, - difl, - extmax -#endif - ); - } - else -#endif - { - kernel_assert(kernel_data.bvh.use_qbvh == false); - return BVH_FUNCTION_FULL_NAME(BVH)(kg, - ray, - isect, - visibility + case BVH_LAYOUT_BVH4: + return BVH_FUNCTION_FULL_NAME(QBVH)(kg, + ray, + isect, + visibility +# if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) + , lcg_state, + difl, + extmax +# endif + ); +#endif /* __QBVH__ */ + case BVH_LAYOUT_BVH2: + return BVH_FUNCTION_FULL_NAME(BVH)(kg, + ray, + isect, + visibility #if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) - , lcg_state, - difl, - extmax + , lcg_state, + difl, + extmax #endif - ); + ); } + kernel_assert(!"Should not happen"); + return false; } #undef BVH_FUNCTION_NAME diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h index 57e5b8d736d..ce5fc7be33d 100644 --- a/intern/cycles/kernel/bvh/bvh_volume.h +++ b/intern/cycles/kernel/bvh/bvh_volume.h @@ -18,7 +18,7 @@ */ #ifdef __QBVH__ -# include "qbvh_volume.h" +# include "kernel/bvh/qbvh_volume.h" #endif #if BVH_FEATURE(BVH_HAIR) @@ -97,9 +97,6 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - /* traversal loop */ do { do { @@ -194,9 +191,9 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, continue; } triangle_intersect(kg, - &isect_precalc, isect, P, + dir, visibility, object, prim_addr); @@ -238,13 +235,11 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, int object_flag = kernel_tex_fetch(__object_flag, object); if(object_flag & SD_OBJECT_HAS_VOLUME) { # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); + isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t); + isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); # endif - triangle_intersect_precalc(dir, &isect_precalc); - # if defined(__KERNEL_SSE2__) Psplat[0] = ssef(P.x); Psplat[1] = ssef(P.y); @@ -281,13 +276,11 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, /* instance pop */ # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); + isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t); + isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); # endif - triangle_intersect_precalc(dir, &isect_precalc); - # if defined(__KERNEL_SSE2__) Psplat[0] = ssef(P.x); Psplat[1] = ssef(P.y); @@ -316,22 +309,22 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg, Intersection *isect, const uint visibility) { + switch(kernel_data.bvh.bvh_layout) { #ifdef __QBVH__ - if(kernel_data.bvh.use_qbvh) { - return BVH_FUNCTION_FULL_NAME(QBVH)(kg, - ray, - isect, - visibility); - } - else + case BVH_LAYOUT_BVH4: + return BVH_FUNCTION_FULL_NAME(QBVH)(kg, + ray, + isect, + visibility); #endif - { - kernel_assert(kernel_data.bvh.use_qbvh == false); - return BVH_FUNCTION_FULL_NAME(BVH)(kg, - ray, - isect, - visibility); + case BVH_LAYOUT_BVH2: + return BVH_FUNCTION_FULL_NAME(BVH)(kg, + ray, + isect, + visibility); } + kernel_assert(!"Should not happen"); + return false; } #undef BVH_FUNCTION_NAME diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h index 5a1accebaa0..f2379efc656 100644 --- a/intern/cycles/kernel/bvh/bvh_volume_all.h +++ b/intern/cycles/kernel/bvh/bvh_volume_all.h @@ -18,7 +18,7 @@ */ #ifdef __QBVH__ -# include "qbvh_volume_all.h" +# include "kernel/bvh/qbvh_volume_all.h" #endif #if BVH_FEATURE(BVH_HAIR) @@ -101,9 +101,6 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz); #endif /* __KERNEL_SSE2__ */ - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - /* traversal loop */ do { do { @@ -199,9 +196,9 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, continue; } hit = triangle_intersect(kg, - &isect_precalc, isect_array, P, + dir, visibility, object, prim_addr); @@ -288,14 +285,12 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, object = kernel_tex_fetch(__prim_object, -prim_addr-1); int object_flag = kernel_tex_fetch(__object_flag, object); if(object_flag & SD_OBJECT_HAS_VOLUME) { - # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm); + isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); # else - bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t); + isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); # endif - triangle_intersect_precalc(dir, &isect_precalc); num_hits_in_instance = 0; isect_array->t = isect_t; @@ -341,20 +336,17 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg, # else bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac); # endif - triangle_intersect_precalc(dir, &isect_precalc); /* Scale isect->t to adjust for instancing. */ for(int i = 0; i < num_hits_in_instance; i++) { (isect_array-i-1)->t *= t_fac; } } else { - float ignore_t = FLT_MAX; # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm); + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t); + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); # endif - triangle_intersect_precalc(dir, &isect_precalc); } isect_t = tmax; @@ -389,24 +381,24 @@ ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg, const uint max_hits, const uint visibility) { + switch(kernel_data.bvh.bvh_layout) { #ifdef __QBVH__ - if(kernel_data.bvh.use_qbvh) { - return BVH_FUNCTION_FULL_NAME(QBVH)(kg, - ray, - isect_array, - max_hits, - visibility); - } - else + case BVH_LAYOUT_BVH4: + return BVH_FUNCTION_FULL_NAME(QBVH)(kg, + ray, + isect_array, + max_hits, + visibility); #endif - { - kernel_assert(kernel_data.bvh.use_qbvh == false); - return BVH_FUNCTION_FULL_NAME(BVH)(kg, - ray, - isect_array, - max_hits, - visibility); + case BVH_LAYOUT_BVH2: + return BVH_FUNCTION_FULL_NAME(BVH)(kg, + ray, + isect_array, + max_hits, + visibility); } + kernel_assert(!"Should not happen"); + return 0; } #undef BVH_FUNCTION_NAME diff --git a/intern/cycles/kernel/bvh/qbvh_subsurface.h b/intern/cycles/kernel/bvh/qbvh_local.h index 84dc4003133..2386fa1a1e8 100644 --- a/intern/cycles/kernel/bvh/qbvh_subsurface.h +++ b/intern/cycles/kernel/bvh/qbvh_local.h @@ -14,9 +14,10 @@ * limitations under the License. */ -/* This is a template BVH traversal function for subsurface scattering, where - * various features can be enabled/disabled. This way we can compile optimized - * versions for each case without new features slowing things down. +/* This is a template BVH traversal function for finding local intersections + * around the shading point, for subsurface scattering and bevel. We disable + * various features for performance, and for instanced objects avoid traversing + * other parts of the scene. * * BVH_MOTION: motion blur rendering * @@ -30,8 +31,8 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, const Ray *ray, - SubsurfaceIntersection *ss_isect, - int subsurface_object, + LocalIntersection *local_isect, + int local_object, uint *lcg_state, int max_hits) { @@ -49,7 +50,7 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Traversal variables in registers. */ int stack_ptr = 0; - int node_addr = kernel_tex_fetch(__object_node, subsurface_object); + int node_addr = kernel_tex_fetch(__object_node, local_object); /* Ray parameters in registers. */ float3 P = ray->P; @@ -58,24 +59,24 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, int object = OBJECT_NONE; float isect_t = ray->t; - ss_isect->num_hits = 0; + local_isect->num_hits = 0; - const int object_flag = kernel_tex_fetch(__object_flag, subsurface_object); + const int object_flag = kernel_tex_fetch(__object_flag, local_object); if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { #if BVH_FEATURE(BVH_MOTION) Transform ob_itfm; - bvh_instance_motion_push(kg, - subsurface_object, - ray, - &P, - &dir, - &idir, - &isect_t, - &ob_itfm); + isect_t = bvh_instance_motion_push(kg, + local_object, + ray, + &P, + &dir, + &idir, + isect_t, + &ob_itfm); #else - bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, &isect_t); + isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t); #endif - object = subsurface_object; + object = local_object; } #ifndef __KERNEL_SSE41__ @@ -105,9 +106,6 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - /* Traversal loop. */ do { do { @@ -252,15 +250,16 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Intersect ray against primitive, */ for(; prim_addr < prim_addr2; prim_addr++) { kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - triangle_intersect_subsurface(kg, - &isect_precalc, - ss_isect, - P, - object, - prim_addr, - isect_t, - lcg_state, - max_hits); + triangle_intersect_local(kg, + local_isect, + P, + dir, + object, + local_object, + prim_addr, + isect_t, + lcg_state, + max_hits); } break; } @@ -269,16 +268,17 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Intersect ray against primitive. */ for(; prim_addr < prim_addr2; prim_addr++) { kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); - motion_triangle_intersect_subsurface(kg, - ss_isect, - P, - dir, - ray->time, - object, - prim_addr, - isect_t, - lcg_state, - max_hits); + motion_triangle_intersect_local(kg, + local_isect, + P, + dir, + ray->time, + object, + local_object, + prim_addr, + isect_t, + lcg_state, + max_hits); } break; } diff --git a/intern/cycles/kernel/bvh/qbvh_nodes.h b/intern/cycles/kernel/bvh/qbvh_nodes.h index 6d22f0b0d6a..3036efd4198 100644 --- a/intern/cycles/kernel/bvh/qbvh_nodes.h +++ b/intern/cycles/kernel/bvh/qbvh_nodes.h @@ -126,8 +126,8 @@ ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg const sseb vmask = cast(tnear) > cast(tfar); int mask = (int)movemask(vmask)^0xf; #else - const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); - const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); const sseb vmask = tnear <= tfar; int mask = (int)movemask(vmask); #endif @@ -174,8 +174,8 @@ ccl_device_inline int qbvh_aligned_node_intersect_robust( const float round_down = 1.0f - difl; const float round_up = 1.0f + difl; - const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near); - const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far); + const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z); + const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z); const sseb vmask = round_down*tnear <= round_up*tfar; *dist = tnear; return (int)movemask(vmask); diff --git a/intern/cycles/kernel/bvh/qbvh_shadow_all.h b/intern/cycles/kernel/bvh/qbvh_shadow_all.h index 607295f9ed5..46fd178aed6 100644 --- a/intern/cycles/kernel/bvh/qbvh_shadow_all.h +++ b/intern/cycles/kernel/bvh/qbvh_shadow_all.h @@ -33,6 +33,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, + const uint visibility, const uint max_hits, uint *num_hits) { @@ -96,19 +97,17 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - /* Traversal loop. */ do { do { /* Traverse internal nodes. */ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + (void)inodes; if(false #ifdef __VISIBILITY_FLAG__ - || ((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0) + || ((__float_as_uint(inodes.x) & visibility) == 0) #endif #if BVH_FEATURE(BVH_MOTION) || UNLIKELY(ray->time < inodes.y) @@ -245,7 +244,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, if(node_addr < 0) { float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1)); #ifdef __VISIBILITY_FLAG__ - if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) { + if((__float_as_uint(leaf.z) & visibility) == 0) { /* Pop. */ node_addr = traversal_stack[stack_ptr].addr; --stack_ptr; @@ -269,7 +268,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Primitive intersection. */ while(prim_addr < prim_addr2) { kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type); - bool hit; /* todo: specialized intersect functions which don't fill in @@ -279,10 +277,10 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, switch(p_type) { case PRIMITIVE_TRIANGLE: { hit = triangle_intersect(kg, - &isect_precalc, isect_array, P, - PATH_RAY_SHADOW, + dir, + visibility, object, prim_addr); break; @@ -294,7 +292,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, P, dir, ray->time, - PATH_RAY_SHADOW, + visibility, object, prim_addr); break; @@ -305,30 +303,30 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, case PRIMITIVE_MOTION_CURVE: { const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr); if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { - hit = bvh_cardinal_curve_intersect(kg, - isect_array, - P, - dir, - PATH_RAY_SHADOW, - object, - prim_addr, - ray->time, - curve_type, - NULL, - 0, 0); + hit = cardinal_curve_intersect(kg, + isect_array, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + NULL, + 0, 0); } else { - hit = bvh_curve_intersect(kg, - isect_array, - P, - dir, - PATH_RAY_SHADOW, - object, - prim_addr, - ray->time, - curve_type, - NULL, - 0, 0); + hit = curve_intersect(kg, + isect_array, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + NULL, + 0, 0); } break; } @@ -360,7 +358,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, shader = __float_as_int(str.z); } #endif - int flag = kernel_tex_fetch(__shader_flag, (shader & SHADER_MASK)*SHADER_SIZE); + int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags; /* if no transparent shadows, all light is blocked */ if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) { @@ -390,9 +388,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, object = kernel_tex_fetch(__prim_object, -prim_addr-1); # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm); + isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); # else - bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t); + isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); # endif num_hits_in_instance = 0; @@ -414,8 +412,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); - ++stack_ptr; kernel_assert(stack_ptr < BVH_QSTACK_SIZE); traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; @@ -445,11 +441,10 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, } } else { - float ignore_t = FLT_MAX; # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm); + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t); + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); # endif } @@ -472,8 +467,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); - object = OBJECT_NONE; node_addr = traversal_stack[stack_ptr].addr; --stack_ptr; diff --git a/intern/cycles/kernel/bvh/qbvh_traversal.h b/intern/cycles/kernel/bvh/qbvh_traversal.h index 10ae7bee852..335a4afd47a 100644 --- a/intern/cycles/kernel/bvh/qbvh_traversal.h +++ b/intern/cycles/kernel/bvh/qbvh_traversal.h @@ -106,15 +106,13 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - /* Traversal loop. */ do { do { /* Traverse internal nodes. */ while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) { float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0); + (void)inodes; if(UNLIKELY(node_dist > isect->t) #if BVH_FEATURE(BVH_MOTION) @@ -122,8 +120,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, || UNLIKELY(ray->time > inodes.z) #endif #ifdef __VISIBILITY_FLAG__ - || (__float_as_uint(inodes.x) & visibility) == 0) + || (__float_as_uint(inodes.x) & visibility) == 0 #endif + ) { /* Pop. */ node_addr = traversal_stack[stack_ptr].addr; @@ -333,15 +332,15 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, BVH_DEBUG_NEXT_INTERSECTION(); kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type); if(triangle_intersect(kg, - &isect_precalc, isect, P, + dir, visibility, object, prim_addr)) { tfar = ssef(isect->t); /* Shadow ray early termination. */ - if(visibility == PATH_RAY_SHADOW_OPAQUE) { + if(visibility & PATH_RAY_SHADOW_OPAQUE) { return true; } } @@ -363,7 +362,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, prim_addr)) { tfar = ssef(isect->t); /* Shadow ray early termination. */ - if(visibility == PATH_RAY_SHADOW_OPAQUE) { + if(visibility & PATH_RAY_SHADOW_OPAQUE) { return true; } } @@ -380,37 +379,37 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL)); bool hit; if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) { - hit = bvh_cardinal_curve_intersect(kg, - isect, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - curve_type, - lcg_state, - difl, - extmax); + hit = cardinal_curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + lcg_state, + difl, + extmax); } else { - hit = bvh_curve_intersect(kg, - isect, - P, - dir, - visibility, - object, - prim_addr, - ray->time, - curve_type, - lcg_state, - difl, - extmax); + hit = curve_intersect(kg, + isect, + P, + dir, + visibility, + object, + prim_addr, + ray->time, + curve_type, + lcg_state, + difl, + extmax); } if(hit) { tfar = ssef(isect->t); /* Shadow ray early termination. */ - if(visibility == PATH_RAY_SHADOW_OPAQUE) { + if(visibility & PATH_RAY_SHADOW_OPAQUE) { return true; } } @@ -447,8 +446,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); - ++stack_ptr; kernel_assert(stack_ptr < BVH_QSTACK_SIZE); traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; @@ -468,9 +465,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Instance pop. */ # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); + isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t); + isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); # endif qbvh_near_far_idx_calc(idir, @@ -489,8 +486,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); - object = OBJECT_NONE; node_addr = traversal_stack[stack_ptr].addr; node_dist = traversal_stack[stack_ptr].dist; diff --git a/intern/cycles/kernel/bvh/qbvh_volume.h b/intern/cycles/kernel/bvh/qbvh_volume.h index dc6627e2dbb..192ce009524 100644 --- a/intern/cycles/kernel/bvh/qbvh_volume.h +++ b/intern/cycles/kernel/bvh/qbvh_volume.h @@ -91,9 +91,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - /* Traversal loop. */ do { do { @@ -266,7 +263,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, continue; } /* Intersect ray against primitive. */ - triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, prim_addr); + triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr); } break; } @@ -295,9 +292,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, int object_flag = kernel_tex_fetch(__object_flag, object); if(object_flag & SD_OBJECT_HAS_VOLUME) { # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); + isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t); + isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t); # endif qbvh_near_far_idx_calc(idir, @@ -316,8 +313,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); - ++stack_ptr; kernel_assert(stack_ptr < BVH_QSTACK_SIZE); traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL; @@ -341,9 +336,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, /* Instance pop. */ # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm); + isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t); + isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t); # endif qbvh_near_far_idx_calc(idir, @@ -362,8 +357,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); - object = OBJECT_NONE; node_addr = traversal_stack[stack_ptr].addr; --stack_ptr; diff --git a/intern/cycles/kernel/bvh/qbvh_volume_all.h b/intern/cycles/kernel/bvh/qbvh_volume_all.h index ff1fa92af6e..ac5f58a9a51 100644 --- a/intern/cycles/kernel/bvh/qbvh_volume_all.h +++ b/intern/cycles/kernel/bvh/qbvh_volume_all.h @@ -95,9 +95,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, &near_x, &near_y, &near_z, &far_x, &far_y, &far_z); - IsectPrecalc isect_precalc; - triangle_intersect_precalc(dir, &isect_precalc); - /* Traversal loop. */ do { do { @@ -271,7 +268,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, continue; } /* Intersect ray against primitive. */ - hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, prim_addr); + hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr); if(hit) { /* Move on to next entry in intersections array. */ isect_array++; @@ -346,9 +343,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, int object_flag = kernel_tex_fetch(__object_flag, object); if(object_flag & SD_OBJECT_HAS_VOLUME) { # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm); + isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm); # else - bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t); + isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t); # endif qbvh_near_far_idx_calc(idir, @@ -367,7 +364,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); num_hits_in_instance = 0; isect_array->t = isect_t; @@ -406,11 +402,10 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, } } else { - float ignore_t = FLT_MAX; # if BVH_FEATURE(BVH_MOTION) - bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm); + bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm); # else - bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t); + bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX); # endif } @@ -433,8 +428,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg, org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z)); # endif - triangle_intersect_precalc(dir, &isect_precalc); - object = OBJECT_NONE; node_addr = traversal_stack[stack_ptr].addr; --stack_ptr; diff --git a/intern/cycles/kernel/closure/alloc.h b/intern/cycles/kernel/closure/alloc.h index b7abc1ec507..b903aeb8073 100644 --- a/intern/cycles/kernel/closure/alloc.h +++ b/intern/cycles/kernel/closure/alloc.h @@ -20,17 +20,16 @@ ccl_device ShaderClosure *closure_alloc(ShaderData *sd, int size, ClosureType ty { kernel_assert(size <= sizeof(ShaderClosure)); - int num_closure = ccl_fetch(sd, num_closure); - int num_closure_extra = ccl_fetch(sd, num_closure_extra); - if(num_closure + num_closure_extra >= MAX_CLOSURE) + if(sd->num_closure_left == 0) return NULL; - ShaderClosure *sc = &ccl_fetch(sd, closure)[num_closure]; + ShaderClosure *sc = &sd->closure[sd->num_closure]; sc->type = type; sc->weight = weight; - ccl_fetch(sd, num_closure)++; + sd->num_closure++; + sd->num_closure_left--; return sc; } @@ -44,25 +43,23 @@ ccl_device ccl_addr_space void *closure_alloc_extra(ShaderData *sd, int size) * This lets us keep the same fast array iteration over closures, as we * found linked list iteration and iteration with skipping to be slower. */ int num_extra = ((size + sizeof(ShaderClosure) - 1) / sizeof(ShaderClosure)); - int num_closure = ccl_fetch(sd, num_closure); - int num_closure_extra = ccl_fetch(sd, num_closure_extra) + num_extra; - if(num_closure + num_closure_extra > MAX_CLOSURE) { - /* Remove previous closure. */ - ccl_fetch(sd, num_closure)--; - ccl_fetch(sd, num_closure_extra)++; + if(num_extra > sd->num_closure_left) { + /* Remove previous closure if it was allocated. */ + sd->num_closure--; + sd->num_closure_left++; return NULL; } - ccl_fetch(sd, num_closure_extra) = num_closure_extra; - return (ccl_addr_space void*)(ccl_fetch(sd, closure) + MAX_CLOSURE - num_closure_extra); + sd->num_closure_left -= num_extra; + return (ccl_addr_space void*)(sd->closure + sd->num_closure + sd->num_closure_left); } ccl_device_inline ShaderClosure *bsdf_alloc(ShaderData *sd, int size, float3 weight) { ShaderClosure *sc = closure_alloc(sd, size, CLOSURE_NONE_ID, weight); - if(!sc) + if(sc == NULL) return NULL; float sample_weight = fabsf(average(weight)); diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h index 7e4d5fe2e37..d8ff69ca241 100644 --- a/intern/cycles/kernel/closure/bsdf.h +++ b/intern/cycles/kernel/closure/bsdf.h @@ -14,126 +14,178 @@ * limitations under the License. */ -#include "../closure/bsdf_ashikhmin_velvet.h" -#include "../closure/bsdf_diffuse.h" -#include "../closure/bsdf_oren_nayar.h" -#include "../closure/bsdf_phong_ramp.h" -#include "../closure/bsdf_diffuse_ramp.h" -#include "../closure/bsdf_microfacet.h" -#include "../closure/bsdf_microfacet_multi.h" -#include "../closure/bsdf_reflection.h" -#include "../closure/bsdf_refraction.h" -#include "../closure/bsdf_transparent.h" -#include "../closure/bsdf_ashikhmin_shirley.h" -#include "../closure/bsdf_toon.h" -#include "../closure/bsdf_hair.h" -#ifdef __SUBSURFACE__ -# include "../closure/bssrdf.h" -#endif -#ifdef __VOLUME__ -# include "../closure/volume.h" -#endif +#include "kernel/closure/bsdf_ashikhmin_velvet.h" +#include "kernel/closure/bsdf_diffuse.h" +#include "kernel/closure/bsdf_oren_nayar.h" +#include "kernel/closure/bsdf_phong_ramp.h" +#include "kernel/closure/bsdf_diffuse_ramp.h" +#include "kernel/closure/bsdf_microfacet.h" +#include "kernel/closure/bsdf_microfacet_multi.h" +#include "kernel/closure/bsdf_reflection.h" +#include "kernel/closure/bsdf_refraction.h" +#include "kernel/closure/bsdf_transparent.h" +#include "kernel/closure/bsdf_ashikhmin_shirley.h" +#include "kernel/closure/bsdf_toon.h" +#include "kernel/closure/bsdf_hair.h" +#include "kernel/closure/bsdf_principled_diffuse.h" +#include "kernel/closure/bsdf_principled_sheen.h" +#include "kernel/closure/bssrdf.h" +#include "kernel/closure/volume.h" CCL_NAMESPACE_BEGIN +/* Returns the square of the roughness of the closure if it has roughness, + * 0 for singular closures and 1 otherwise. */ +ccl_device_inline float bsdf_get_specular_roughness_squared(const ShaderClosure *sc) +{ + if(CLOSURE_IS_BSDF_SINGULAR(sc->type)) { + return 0.0f; + } + + if(CLOSURE_IS_BSDF_MICROFACET(sc->type)) { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)sc; + return bsdf->alpha_x*bsdf->alpha_y; + } + + return 1.0f; +} + +ccl_device_inline float bsdf_get_roughness_squared(const ShaderClosure *sc) +{ + /* This version includes diffuse, mainly for baking Principled BSDF + * where specular and metallic zero otherwise does not bake the + * specified roughness parameter. */ + if(sc->type == CLOSURE_BSDF_OREN_NAYAR_ID) { + OrenNayarBsdf *bsdf = (OrenNayarBsdf*)sc; + return sqr(sqr(bsdf->roughness)); + } + + if(sc->type == CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID) { + PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)sc; + return sqr(sqr(bsdf->roughness)); + } + + if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) { + return 0.0f; + } + + return bsdf_get_specular_roughness_squared(sc); +} + ccl_device_forceinline int bsdf_sample(KernelGlobals *kg, - ShaderData *sd, - const ShaderClosure *sc, - float randu, - float randv, - float3 *eval, - float3 *omega_in, - differential3 *domega_in, - float *pdf) + ShaderData *sd, + const ShaderClosure *sc, + float randu, + float randv, + float3 *eval, + float3 *omega_in, + differential3 *domega_in, + float *pdf) { int label; switch(sc->type) { case CLOSURE_BSDF_DIFFUSE_ID: case CLOSURE_BSDF_BSSRDF_ID: - label = bsdf_diffuse_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #ifdef __SVM__ case CLOSURE_BSDF_OREN_NAYAR_ID: - label = bsdf_oren_nayar_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_oren_nayar_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #ifdef __OSL__ case CLOSURE_BSDF_PHONG_RAMP_ID: - label = bsdf_phong_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_phong_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_DIFFUSE_RAMP_ID: - label = bsdf_diffuse_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_diffuse_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #endif case CLOSURE_BSDF_TRANSLUCENT_ID: - label = bsdf_translucent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_translucent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_REFLECTION_ID: - label = bsdf_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_REFRACTION_ID: - label = bsdf_refraction_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_refraction_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_TRANSPARENT_ID: - label = bsdf_transparent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_transparent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: + case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: - label = bsdf_microfacet_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_microfacet_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: - label = bsdf_microfacet_multi_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, - eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &ccl_fetch(sd, lcg_state)); + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: + label = bsdf_microfacet_multi_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: - label = bsdf_microfacet_multi_ggx_glass_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, - eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &ccl_fetch(sd, lcg_state)); + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: + label = bsdf_microfacet_multi_ggx_glass_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: - label = bsdf_microfacet_beckmann_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_microfacet_beckmann_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: - label = bsdf_ashikhmin_shirley_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_ashikhmin_shirley_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: - label = bsdf_ashikhmin_velvet_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_ashikhmin_velvet_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_DIFFUSE_TOON_ID: - label = bsdf_diffuse_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_diffuse_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_GLOSSY_TOON_ID: - label = bsdf_glossy_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_glossy_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: - label = bsdf_hair_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_hair_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: - label = bsdf_hair_transmission_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_hair_transmission_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); + break; +#ifdef __PRINCIPLED__ + case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID: + case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID: + label = bsdf_principled_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); + break; + case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID: + label = bsdf_principled_sheen_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; +#endif /* __PRINCIPLED__ */ #endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - label = volume_henyey_greenstein_sample(sc, ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); + label = volume_henyey_greenstein_sample(sc, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #endif default: @@ -141,6 +193,17 @@ ccl_device_forceinline int bsdf_sample(KernelGlobals *kg, break; } + /* Test if BSDF sample should be treated as transparent for background. */ + if(label & LABEL_TRANSMIT) { + float threshold_squared = kernel_data.background.transparent_roughness_squared_threshold; + + if(threshold_squared >= 0.0f) { + if(bsdf_get_specular_roughness_squared(sc) <= threshold_squared) { + label |= LABEL_TRANSMIT_TRANSPARENT; + } + } + } + return label; } @@ -157,75 +220,89 @@ float3 bsdf_eval(KernelGlobals *kg, { float3 eval; - if(dot(ccl_fetch(sd, Ng), omega_in) >= 0.0f) { + if(dot(sd->Ng, omega_in) >= 0.0f) { switch(sc->type) { case CLOSURE_BSDF_DIFFUSE_ID: case CLOSURE_BSDF_BSSRDF_ID: - eval = bsdf_diffuse_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_diffuse_eval_reflect(sc, sd->I, omega_in, pdf); break; #ifdef __SVM__ case CLOSURE_BSDF_OREN_NAYAR_ID: - eval = bsdf_oren_nayar_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_oren_nayar_eval_reflect(sc, sd->I, omega_in, pdf); break; #ifdef __OSL__ case CLOSURE_BSDF_PHONG_RAMP_ID: - eval = bsdf_phong_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_phong_ramp_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_DIFFUSE_RAMP_ID: - eval = bsdf_diffuse_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_diffuse_ramp_eval_reflect(sc, sd->I, omega_in, pdf); break; #endif case CLOSURE_BSDF_TRANSLUCENT_ID: - eval = bsdf_translucent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_translucent_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_REFLECTION_ID: - eval = bsdf_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_reflection_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_REFRACTION_ID: - eval = bsdf_refraction_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_refraction_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_TRANSPARENT_ID: - eval = bsdf_transparent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_transparent_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: + case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: - eval = bsdf_microfacet_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: - eval = bsdf_microfacet_multi_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state)); + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: + eval = bsdf_microfacet_multi_ggx_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: - eval = bsdf_microfacet_multi_ggx_glass_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state)); + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: + eval = bsdf_microfacet_multi_ggx_glass_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: - eval = bsdf_microfacet_beckmann_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_microfacet_beckmann_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: - eval = bsdf_ashikhmin_shirley_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_ashikhmin_shirley_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: - eval = bsdf_ashikhmin_velvet_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_ashikhmin_velvet_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_DIFFUSE_TOON_ID: - eval = bsdf_diffuse_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_diffuse_toon_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_GLOSSY_TOON_ID: - eval = bsdf_glossy_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_glossy_toon_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: - eval = bsdf_hair_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_hair_reflection_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: - eval = bsdf_hair_transmission_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_hair_transmission_eval_reflect(sc, sd->I, omega_in, pdf); + break; +#ifdef __PRINCIPLED__ + case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID: + case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID: + eval = bsdf_principled_diffuse_eval_reflect(sc, sd->I, omega_in, pdf); break; + case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID: + eval = bsdf_principled_sheen_eval_reflect(sc, sd->I, omega_in, pdf); + break; +#endif /* __PRINCIPLED__ */ #endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf); break; #endif default: @@ -237,63 +314,77 @@ float3 bsdf_eval(KernelGlobals *kg, switch(sc->type) { case CLOSURE_BSDF_DIFFUSE_ID: case CLOSURE_BSDF_BSSRDF_ID: - eval = bsdf_diffuse_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_diffuse_eval_transmit(sc, sd->I, omega_in, pdf); break; #ifdef __SVM__ case CLOSURE_BSDF_OREN_NAYAR_ID: - eval = bsdf_oren_nayar_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_oren_nayar_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_TRANSLUCENT_ID: - eval = bsdf_translucent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_translucent_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_REFLECTION_ID: - eval = bsdf_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_reflection_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_REFRACTION_ID: - eval = bsdf_refraction_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_refraction_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_TRANSPARENT_ID: - eval = bsdf_transparent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_transparent_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: + case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: - eval = bsdf_microfacet_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_microfacet_ggx_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: - eval = bsdf_microfacet_multi_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state)); + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: + eval = bsdf_microfacet_multi_ggx_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: - eval = bsdf_microfacet_multi_ggx_glass_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state)); + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: + eval = bsdf_microfacet_multi_ggx_glass_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: - eval = bsdf_microfacet_beckmann_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_microfacet_beckmann_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: - eval = bsdf_ashikhmin_shirley_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_ashikhmin_shirley_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: - eval = bsdf_ashikhmin_velvet_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_ashikhmin_velvet_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_DIFFUSE_TOON_ID: - eval = bsdf_diffuse_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_diffuse_toon_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_GLOSSY_TOON_ID: - eval = bsdf_glossy_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_glossy_toon_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: - eval = bsdf_hair_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_hair_reflection_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: - eval = bsdf_hair_transmission_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_hair_transmission_eval_transmit(sc, sd->I, omega_in, pdf); + break; +#ifdef __PRINCIPLED__ + case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID: + case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID: + eval = bsdf_principled_diffuse_eval_transmit(sc, sd->I, omega_in, pdf); break; + case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID: + eval = bsdf_principled_sheen_eval_transmit(sc, sd->I, omega_in, pdf); + break; +#endif /* __PRINCIPLED__ */ #endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf); break; #endif default: @@ -311,11 +402,16 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness) #ifdef __SVM__ switch(sc->type) { case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: bsdf_microfacet_multi_ggx_blur(sc, roughness); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: + case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: bsdf_microfacet_ggx_blur(sc, roughness); break; @@ -349,10 +445,15 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b) case CLOSURE_BSDF_REFLECTION_ID: case CLOSURE_BSDF_REFRACTION_ID: case CLOSURE_BSDF_MICROFACET_GGX_ID: + case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID: + case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: + case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: + case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: @@ -367,6 +468,11 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b) case CLOSURE_BSDF_HAIR_REFLECTION_ID: case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: return bsdf_hair_merge(a, b); +#ifdef __PRINCIPLED__ + case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID: + case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID: + return bsdf_principled_diffuse_merge(a, b); +#endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: return volume_henyey_greenstein_merge(a, b); diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h index 7e0f5a7ec75..a5ba2cb2972 100644 --- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h +++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h @@ -40,7 +40,6 @@ typedef ccl_addr_space struct VelvetBsdf { float sigma; float invsigma2; - float3 N; } VelvetBsdf; ccl_device int bsdf_ashikhmin_velvet_setup(VelvetBsdf *bsdf) diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h index dcd187f9305..ec6f1f20996 100644 --- a/intern/cycles/kernel/closure/bsdf_diffuse.h +++ b/intern/cycles/kernel/closure/bsdf_diffuse.h @@ -37,7 +37,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct DiffuseBsdf { SHADER_CLOSURE_BASE; - float3 N; } DiffuseBsdf; /* DIFFUSE */ diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h index 2d982a95fe4..24f40af46a3 100644 --- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h +++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h @@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct DiffuseRampBsdf { SHADER_CLOSURE_BASE; - float3 N; float3 *colors; } DiffuseRampBsdf; diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h index 4a1316fd2a9..2dd59354058 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet.h @@ -36,7 +36,8 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct MicrofacetExtra { - float3 color; + float3 color, cspec0; + float clearcoat; } MicrofacetExtra; typedef ccl_addr_space struct MicrofacetBsdf { @@ -45,7 +46,6 @@ typedef ccl_addr_space struct MicrofacetBsdf { float alpha_x, alpha_y, ior; MicrofacetExtra *extra; float3 T; - float3 N; } MicrofacetBsdf; /* Beckmann and GGX microfacet importance sampling. */ @@ -233,6 +233,36 @@ ccl_device_forceinline float3 microfacet_sample_stretched( return normalize(make_float3(-slope_x, -slope_y, 1.0f)); } +/* Calculate the reflection color + * + * If fresnel is used, the color is an interpolation of the F0 color and white + * with respect to the fresnel + * + * Else it is simply white + */ +ccl_device_forceinline float3 reflection_color(const MicrofacetBsdf *bsdf, float3 L, float3 H) { + float3 F = make_float3(1.0f, 1.0f, 1.0f); + bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID + || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID + || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID); + + if(use_fresnel) { + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + + F = interpolate_fresnel_color(L, H, bsdf->ior, F0, bsdf->extra->cspec0); + } + + return F; +} + +ccl_device_forceinline float D_GTR1(float NdotH, float alpha) +{ + if(alpha >= 1.0f) return M_1_PI_F; + float alpha2 = alpha*alpha; + float t = 1.0f + (alpha2 - 1.0f) * NdotH*NdotH; + return (alpha2 - 1.0f) / (M_PI_F * logf(alpha2) * t); +} + /* GGX microfacet with Smith shadow-masking from: * * Microfacet Models for Refraction through Rough Surfaces @@ -248,14 +278,52 @@ ccl_device_forceinline float3 microfacet_sample_stretched( ccl_device int bsdf_microfacet_ggx_setup(MicrofacetBsdf *bsdf) { + bsdf->extra = NULL; + bsdf->alpha_x = saturate(bsdf->alpha_x); bsdf->alpha_y = bsdf->alpha_x; - + bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ID; return SD_BSDF|SD_BSDF_HAS_EVAL; } +ccl_device int bsdf_microfacet_ggx_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x); + bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y); + bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z); + + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + + bsdf->alpha_x = saturate(bsdf->alpha_x); + bsdf->alpha_y = bsdf->alpha_x; + + bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID; + + return SD_BSDF|SD_BSDF_HAS_EVAL; +} + +ccl_device int bsdf_microfacet_ggx_clearcoat_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x); + bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y); + bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z); + + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= 0.25f * bsdf->extra->clearcoat * F; + + bsdf->alpha_x = saturate(bsdf->alpha_x); + bsdf->alpha_y = bsdf->alpha_x; + + bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID; + + return SD_BSDF|SD_BSDF_HAS_EVAL; +} + ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosure *b) { const MicrofacetBsdf *bsdf_a = (const MicrofacetBsdf*)a; @@ -266,23 +334,47 @@ ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosur (bsdf_a->alpha_y == bsdf_b->alpha_y) && (isequal_float3(bsdf_a->T, bsdf_b->T)) && (bsdf_a->ior == bsdf_b->ior) && - ((!bsdf_a->extra && !bsdf_b->extra) || + ((bsdf_a->extra == NULL && bsdf_b->extra == NULL) || ((bsdf_a->extra && bsdf_b->extra) && - (isequal_float3(bsdf_a->extra->color, bsdf_b->extra->color)))); + (isequal_float3(bsdf_a->extra->color, bsdf_b->extra->color)) && + (isequal_float3(bsdf_a->extra->cspec0, bsdf_b->extra->cspec0)) && + (bsdf_a->extra->clearcoat == bsdf_b->extra->clearcoat))); } ccl_device int bsdf_microfacet_ggx_aniso_setup(MicrofacetBsdf *bsdf) { + bsdf->extra = NULL; + bsdf->alpha_x = saturate(bsdf->alpha_x); bsdf->alpha_y = saturate(bsdf->alpha_y); - + bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID; return SD_BSDF|SD_BSDF_HAS_EVAL; } +ccl_device int bsdf_microfacet_ggx_aniso_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x); + bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y); + bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z); + + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + + bsdf->alpha_x = saturate(bsdf->alpha_x); + bsdf->alpha_y = saturate(bsdf->alpha_y); + + bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID; + + return SD_BSDF|SD_BSDF_HAS_EVAL; +} + ccl_device int bsdf_microfacet_ggx_refraction_setup(MicrofacetBsdf *bsdf) { + bsdf->extra = NULL; + bsdf->alpha_x = saturate(bsdf->alpha_x); bsdf->alpha_y = bsdf->alpha_x; @@ -327,7 +419,18 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons float cosThetaM2 = cosThetaM * cosThetaM; float cosThetaM4 = cosThetaM2 * cosThetaM2; float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2; - D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); + + if(bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID) { + /* use GTR1 for clearcoat */ + D = D_GTR1(cosThetaM, bsdf->alpha_x); + + /* the alpha value for clearcoat is a fixed 0.25 => alpha2 = 0.25 * 0.25 */ + alpha2 = 0.0625f; + } + else { + /* use GTR2 otherwise */ + D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); + } /* eq. 34: now calculate G1(i,m) and G1(o,m) */ G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO))); @@ -374,7 +477,13 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons /* eq. 20 */ float common = D * 0.25f / cosNO; - float out = G * common; + + float3 F = reflection_color(bsdf, omega_in, m); + if(bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID) { + F *= 0.25f * bsdf->extra->clearcoat; + } + + float3 out = F * G * common; /* eq. 2 in distribution of visible normals sampling * pm = Dw = G1o * dot(m, I) * D / dot(N, I); */ @@ -384,7 +493,7 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons * pdf = pm * 0.25 / dot(m, I); */ *pdf = G1o * common; - return make_float3(out, out, out); + return out; } return make_float3(0.0f, 0.0f, 0.0f); @@ -489,6 +598,16 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure /* some high number for MIS */ *pdf = 1e6f; *eval = make_float3(1e6f, 1e6f, 1e6f); + + bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID + || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID + || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID); + + /* if fresnel is used, calculate the color with reflection_color(...) */ + if(use_fresnel) { + *eval *= reflection_color(bsdf, *omega_in, m); + } + label = LABEL_REFLECT | LABEL_SINGULAR; } else { @@ -502,13 +621,27 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure float cosThetaM2 = cosThetaM * cosThetaM; float cosThetaM4 = cosThetaM2 * cosThetaM2; float tanThetaM2 = 1/(cosThetaM2) - 1; - D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); /* eval BRDF*cosNI */ float cosNI = dot(N, *omega_in); + if(bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID) { + /* use GTR1 for clearcoat */ + D = D_GTR1(cosThetaM, bsdf->alpha_x); + + /* the alpha value for clearcoat is a fixed 0.25 => alpha2 = 0.25 * 0.25 */ + alpha2 = 0.0625f; + + /* recalculate G1o */ + G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO))); + } + else { + /* use GTR2 otherwise */ + D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2)); + } + /* eq. 34: now calculate G1(i,m) */ - G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); + G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); } else { /* anisotropic distribution */ @@ -538,10 +671,15 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure /* see eval function for derivation */ float common = (G1o * D) * 0.25f / cosNO; - float out = G1i * common; *pdf = common; - *eval = make_float3(out, out, out); + float3 F = reflection_color(bsdf, *omega_in, m); + + *eval = G1i * common * F; + } + + if(bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID) { + *eval *= 0.25f * bsdf->extra->clearcoat; } #ifdef __RAY_DIFFERENTIALS__ diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h index cea59adfebe..2f2c35d5d1f 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h @@ -40,20 +40,20 @@ ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha) } /* Sample slope distribution (based on page 14 of the supplemental implementation). */ -ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU) +ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float randx, const float randy) { - if(cosI > 0.9999f || cosI < 1e-6f) { - const float r = sqrtf(randU.x / (1.0f - randU.x)); - const float phi = M_2PI_F * randU.y; + if(cosI > 0.9999f || fabsf(cosI) < 1e-6f) { + const float r = sqrtf(randx / max(1.0f - randx, 1e-7f)); + const float phi = M_2PI_F * randy; return make_float2(r*cosf(phi), r*sinf(phi)); } - const float sinI = sqrtf(1.0f - cosI*cosI); + const float sinI = safe_sqrtf(1.0f - cosI*cosI); const float tanI = sinI/cosI; const float projA = 0.5f * (cosI + 1.0f); if(projA < 0.0001f) return make_float2(0.0f, 0.0f); - const float A = 2.0f*randU.x*projA / cosI - 1.0f; + const float A = 2.0f*randx*projA / cosI - 1.0f; float tmp = A*A-1.0f; if(fabsf(tmp) < 1e-7f) return make_float2(0.0f, 0.0f); @@ -64,26 +64,26 @@ ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 ran const float slopeX = (A < 0.0f || slopeX2 > 1.0f/tanI)? (tanI*tmp - D) : slopeX2; float U2; - if(randU.y >= 0.5f) - U2 = 2.0f*(randU.y - 0.5f); + if(randy >= 0.5f) + U2 = 2.0f*(randy - 0.5f); else - U2 = 2.0f*(0.5f - randU.y); + U2 = 2.0f*(0.5f - randy); const float z = (U2*(U2*(U2*0.27385f-0.73369f)+0.46341f)) / (U2*(U2*(U2*0.093073f+0.309420f)-1.0f)+0.597999f); const float slopeY = z * sqrtf(1.0f + slopeX*slopeX); - if(randU.y >= 0.5f) + if(randy >= 0.5f) return make_float2(slopeX, slopeY); else return make_float2(slopeX, -slopeY); } /* Visible normal sampling for the GGX distribution (based on page 7 of the supplemental implementation). */ -ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float2 randU) +ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float randx, const float randy) { const float3 wi_11 = normalize(make_float3(alpha.x*wi.x, alpha.y*wi.y, wi.z)); - const float2 slope_11 = mf_sampleP22_11(wi_11.z, randU); + const float2 slope_11 = mf_sampleP22_11(wi_11.z, randx, randy); - const float2 cossin_phi = normalize(make_float2(wi_11.x, wi_11.y)); + const float3 cossin_phi = safe_normalize(make_float3(wi_11.x, wi_11.y, 0.0f)); const float slope_x = alpha.x*(cossin_phi.x * slope_11.x - cossin_phi.y * slope_11.y); const float slope_y = alpha.y*(cossin_phi.y * slope_11.x + cossin_phi.x * slope_11.y); @@ -91,18 +91,15 @@ ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha return normalize(make_float3(-slope_x, -slope_y, 1.0f)); } -/* === Phase functions: Glossy, Diffuse and Glass === */ +/* === Phase functions: Glossy and Glass === */ -/* Phase function for reflective materials, either without a fresnel term (for compatibility) or with the conductive fresnel term. */ -ccl_device_forceinline float3 mf_sample_phase_glossy(const float3 wi, float3 *n, float3 *k, float3 *weight, const float3 wm) +/* Phase function for reflective materials. */ +ccl_device_forceinline float3 mf_sample_phase_glossy(const float3 wi, float3 *weight, const float3 wm) { - if(n && k) - *weight *= fresnel_conductor(dot(wi, wm), *n, *k); - return -wi + 2.0f * wm * dot(wi, wm); } -ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha, float3 *n, float3 *k) +ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha) { if(w.z > 0.9999f) return make_float3(0.0f, 0.0f, 0.0f); @@ -123,30 +120,9 @@ ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float l else phase *= D_ggx_aniso(wh, alpha); - if(n && k) { - /* Apply conductive fresnel term. */ - return phase * fresnel_conductor(dotW_WH, *n, *k); - } - return make_float3(phase, phase, phase); } -/* Phase function for rough lambertian diffuse surfaces. */ -ccl_device_forceinline float3 mf_sample_phase_diffuse(const float3 wm, const float randu, const float randv) -{ - float3 tm, bm; - make_orthonormals(wm, &tm, &bm); - - float2 disk = concentric_sample_disk(randu, randv); - return disk.x*tm + disk.y*bm + safe_sqrtf(1.0f - disk.x*disk.x - disk.y*disk.y)*wm; -} - -ccl_device_forceinline float3 mf_eval_phase_diffuse(const float3 w, const float3 wm) -{ - const float v = max(0.0f, dot(w, wm)) * M_1_PI_F; - return make_float3(v, v, v); -} - /* Phase function for dielectric transmissive materials, including both reflection and refraction according to the dielectric fresnel term. */ ccl_device_forceinline float3 mf_sample_phase_glass(const float3 wi, const float eta, const float3 wm, const float randV, bool *outside) { @@ -269,40 +245,69 @@ ccl_device_forceinline float mf_ggx_albedo(float r) return saturate(albedo); } +ccl_device_inline float mf_ggx_transmission_albedo(float a, float ior) +{ + if(ior < 1.0f) { + ior = 1.0f/ior; + } + a = saturate(a); + ior = clamp(ior, 1.0f, 3.0f); + float I_1 = 0.0476898f*expf(-0.978352f*(ior-0.65657f)*(ior-0.65657f)) - 0.033756f*ior + 0.993261f; + float R_1 = (((0.116991f*a - 0.270369f)*a + 0.0501366f)*a - 0.00411511f)*a + 1.00008f; + float I_2 = (((-2.08704f*ior + 26.3298f)*ior - 127.906f)*ior + 292.958f)*ior - 287.946f + 199.803f/(ior*ior) - 101.668f/(ior*ior*ior); + float R_2 = ((((5.3725f*a -24.9307f)*a + 22.7437f)*a - 3.40751f)*a + 0.0986325f)*a + 0.00493504f; + + return saturate(1.0f + I_2*R_2*0.0019127f - (1.0f - I_1)*(1.0f - R_1)*9.3205f); +} + ccl_device_forceinline float mf_ggx_pdf(const float3 wi, const float3 wo, const float alpha) { float D = D_ggx(normalize(wi+wo), alpha); float lambda = mf_lambda(wi, make_float2(alpha, alpha)); + float singlescatter = 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f); + + float multiscatter = wo.z * M_1_PI_F; + float albedo = mf_ggx_albedo(alpha); - return 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f) + (1.0f - albedo) * wo.z; + return albedo*singlescatter + (1.0f - albedo)*multiscatter; } ccl_device_forceinline float mf_ggx_aniso_pdf(const float3 wi, const float3 wo, const float2 alpha) { - return 0.25f * D_ggx_aniso(normalize(wi+wo), alpha) / ((1.0f + mf_lambda(wi, alpha)) * wi.z) + (1.0f - mf_ggx_albedo(sqrtf(alpha.x*alpha.y))) * wo.z; -} + float D = D_ggx_aniso(normalize(wi+wo), alpha); + float lambda = mf_lambda(wi, alpha); + float singlescatter = 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f); -ccl_device_forceinline float mf_diffuse_pdf(const float3 wo) -{ - return M_1_PI_F * wo.z; + float multiscatter = wo.z * M_1_PI_F; + + float albedo = mf_ggx_albedo(sqrtf(alpha.x*alpha.y)); + return albedo*singlescatter + (1.0f - albedo)*multiscatter; } ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, const float alpha, const float eta) { - float3 wh; - float fresnel; - if(wi.z*wo.z > 0.0f) { - wh = normalize(wi + wo); - fresnel = fresnel_dielectric_cos(dot(wi, wh), eta); - } - else { - wh = normalize(wi + wo*eta); - fresnel = 1.0f - fresnel_dielectric_cos(dot(wi, wh), eta); - } + bool reflective = (wi.z*wo.z > 0.0f); + + float wh_len; + float3 wh = normalize_len(wi + (reflective? wo : (wo*eta)), &wh_len); if(wh.z < 0.0f) wh = -wh; float3 r_wi = (wi.z < 0.0f)? -wi: wi; - return fresnel * max(0.0f, dot(r_wi, wh)) * D_ggx(wh, alpha) / ((1.0f + mf_lambda(r_wi, make_float2(alpha, alpha))) * r_wi.z) + fabsf(wo.z); + float lambda = mf_lambda(r_wi, make_float2(alpha, alpha)); + float D = D_ggx(wh, alpha); + float fresnel = fresnel_dielectric_cos(dot(r_wi, wh), eta); + + float multiscatter = fabsf(wo.z * M_1_PI_F); + if(reflective) { + float singlescatter = 0.25f * D / max((1.0f + lambda) * r_wi.z, 1e-7f); + float albedo = mf_ggx_albedo(alpha); + return fresnel * (albedo*singlescatter + (1.0f - albedo)*multiscatter); + } + else { + float singlescatter = fabsf(dot(r_wi, wh)*dot(wo, wh) * D * eta*eta / max((1.0f + lambda) * r_wi.z * wh_len*wh_len, 1e-7f)); + float albedo = mf_ggx_transmission_albedo(alpha, eta); + return (1.0f - fresnel) * (albedo*singlescatter + (1.0f - albedo)*multiscatter); + } } /* === Actual random walk implementations, one version of mf_eval and mf_sample per phase function. === */ @@ -313,18 +318,11 @@ ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, cons #define MF_PHASE_FUNCTION glass #define MF_MULTI_GLASS -#include "bsdf_microfacet_multi_impl.h" - -/* The diffuse phase function is not implemented as a node yet. */ -#if 0 -#define MF_PHASE_FUNCTION diffuse -#define MF_MULTI_DIFFUSE -#include "bsdf_microfacet_multi_impl.h" -#endif +#include "kernel/closure/bsdf_microfacet_multi_impl.h" #define MF_PHASE_FUNCTION glossy #define MF_MULTI_GLOSSY -#include "bsdf_microfacet_multi_impl.h" +#include "kernel/closure/bsdf_microfacet_multi_impl.h" ccl_device void bsdf_microfacet_multi_ggx_blur(ShaderClosure *sc, float roughness) { @@ -345,8 +343,9 @@ ccl_device int bsdf_microfacet_multi_ggx_common_setup(MicrofacetBsdf *bsdf) bsdf->extra->color.x = saturate(bsdf->extra->color.x); bsdf->extra->color.y = saturate(bsdf->extra->color.y); bsdf->extra->color.z = saturate(bsdf->extra->color.z); - - bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID; + bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x); + bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y); + bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z); return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG; } @@ -356,6 +355,22 @@ ccl_device int bsdf_microfacet_multi_ggx_aniso_setup(MicrofacetBsdf *bsdf) if(is_zero(bsdf->T)) bsdf->T = make_float3(1.0f, 0.0f, 0.0f); + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID; + + return bsdf_microfacet_multi_ggx_common_setup(bsdf); +} + +ccl_device int bsdf_microfacet_multi_ggx_aniso_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + if(is_zero(bsdf->T)) + bsdf->T = make_float3(1.0f, 0.0f, 0.0f); + + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID; + + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + return bsdf_microfacet_multi_ggx_common_setup(bsdf); } @@ -363,6 +378,30 @@ ccl_device int bsdf_microfacet_multi_ggx_setup(MicrofacetBsdf *bsdf) { bsdf->alpha_y = bsdf->alpha_x; + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID; + + return bsdf_microfacet_multi_ggx_common_setup(bsdf); +} + +ccl_device int bsdf_microfacet_multi_ggx_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + bsdf->alpha_y = bsdf->alpha_x; + + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID; + + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + + return bsdf_microfacet_multi_ggx_common_setup(bsdf); +} + +ccl_device int bsdf_microfacet_multi_ggx_refraction_setup(MicrofacetBsdf *bsdf) +{ + bsdf->alpha_y = bsdf->alpha_x; + + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID; + return bsdf_microfacet_multi_ggx_common_setup(bsdf); } @@ -378,6 +417,8 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(const ShaderClosure *sc return make_float3(0.0f, 0.0f, 0.0f); } + bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID); + bool is_aniso = (bsdf->alpha_x != bsdf->alpha_y); float3 X, Y, Z; Z = bsdf->N; @@ -393,7 +434,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(const ShaderClosure *sc *pdf = mf_ggx_aniso_pdf(localI, localO, make_float2(bsdf->alpha_x, bsdf->alpha_y)); else *pdf = mf_ggx_pdf(localI, localO, bsdf->alpha_x); - return mf_eval_glossy(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, NULL, NULL); + return mf_eval_glossy(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0); } ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf, ccl_addr_space uint *lcg_state) @@ -407,9 +448,15 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC *omega_in = 2*dot(Z, I)*Z - I; *pdf = 1e6f; *eval = make_float3(1e6f, 1e6f, 1e6f); +#ifdef __RAY_DIFFERENTIALS__ + *domega_in_dx = (2 * dot(Z, dIdx)) * Z - dIdx; + *domega_in_dy = (2 * dot(Z, dIdy)) * Z - dIdy; +#endif return LABEL_REFLECT|LABEL_SINGULAR; } + bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID); + bool is_aniso = (bsdf->alpha_x != bsdf->alpha_y); if(is_aniso) make_orthonormals_tangent(Z, bsdf->T, &X, &Y); @@ -419,7 +466,7 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC float3 localI = make_float3(dot(I, X), dot(I, Y), dot(I, Z)); float3 localO; - *eval = mf_sample_glossy(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, NULL, NULL); + *eval = mf_sample_glossy(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0); if(is_aniso) *pdf = mf_ggx_aniso_pdf(localI, localO, make_float2(bsdf->alpha_x, bsdf->alpha_y)); else @@ -427,6 +474,7 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC *eval *= *pdf; *omega_in = X*localO.x + Y*localO.y + Z*localO.z; + #ifdef __RAY_DIFFERENTIALS__ *domega_in_dx = (2 * dot(Z, dIdx)) * Z - dIdx; *domega_in_dy = (2 * dot(Z, dIdy)) * Z - dIdy; @@ -450,6 +498,27 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_setup(MicrofacetBsdf *bsdf) return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG; } +ccl_device int bsdf_microfacet_multi_ggx_glass_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd) +{ + bsdf->alpha_x = clamp(bsdf->alpha_x, 1e-4f, 1.0f); + bsdf->alpha_y = bsdf->alpha_x; + bsdf->ior = max(0.0f, bsdf->ior); + bsdf->extra->color.x = saturate(bsdf->extra->color.x); + bsdf->extra->color.y = saturate(bsdf->extra->color.y); + bsdf->extra->color.z = saturate(bsdf->extra->color.z); + bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x); + bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y); + bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z); + + bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID; + + float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior); + float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0)); + bsdf->sample_weight *= F; + + return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG; +} + ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf, ccl_addr_space uint *lcg_state) { const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc; @@ -465,7 +534,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_transmit(const ShaderClos float3 localO = make_float3(dot(omega_in, X), dot(omega_in, Y), dot(omega_in, Z)); *pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior); - return mf_eval_glass(localI, localO, false, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior); + return mf_eval_glass(localI, localO, false, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, false, bsdf->extra->color); } ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf, ccl_addr_space uint *lcg_state) { @@ -475,6 +544,8 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosu return make_float3(0.0f, 0.0f, 0.0f); } + bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID); + float3 X, Y, Z; Z = bsdf->N; make_orthonormals(Z, &X, &Y); @@ -483,7 +554,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosu float3 localO = make_float3(dot(omega_in, X), dot(omega_in, Y), dot(omega_in, Z)); *pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior); - return mf_eval_glass(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior); + return mf_eval_glass(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0); } ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf, ccl_addr_space uint *lcg_state) @@ -525,12 +596,14 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals *kg, const S } } + bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID); + make_orthonormals(Z, &X, &Y); float3 localI = make_float3(dot(I, X), dot(I, Y), dot(I, Z)); float3 localO; - *eval = mf_sample_glass(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior); + *eval = mf_sample_glass(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0); *pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior); *eval *= *pdf; diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h index 8054fa8e849..e73915dbda7 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h @@ -26,19 +26,16 @@ * the balance heuristic isn't necessarily optimal anymore. */ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( - float3 wi, - float3 wo, - const bool wo_outside, - const float3 color, - const float alpha_x, - const float alpha_y, - ccl_addr_space uint *lcg_state -#ifdef MF_MULTI_GLASS - , const float eta -#elif defined(MF_MULTI_GLOSSY) - , float3 *n, float3 *k -#endif -) + float3 wi, + float3 wo, + const bool wo_outside, + const float3 color, + const float alpha_x, + const float alpha_y, + ccl_addr_space uint *lcg_state, + const float eta, + bool use_fresnel, + const float3 cspec0) { /* Evaluating for a shallower incoming direction produces less noise, and the properties of the BSDF guarantee reciprocity. */ bool swapped = false; @@ -71,50 +68,57 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( /* Analytically compute single scattering for lower noise. */ float3 eval; + float3 throughput = make_float3(1.0f, 1.0f, 1.0f); + const float3 wh = normalize(wi+wo); #ifdef MF_MULTI_GLASS eval = mf_eval_phase_glass(-wi, lambda_r, wo, wo_outside, alpha, eta); if(wo_outside) eval *= -lambda_r / (shadowing_lambda - lambda_r); else eval *= -lambda_r * beta(-lambda_r, shadowing_lambda+1.0f); -#elif defined(MF_MULTI_DIFFUSE) - /* Diffuse has no special closed form for the single scattering bounce */ - eval = make_float3(0.0f, 0.0f, 0.0f); #else /* MF_MULTI_GLOSSY */ - const float3 wh = normalize(wi+wo); const float G2 = 1.0f / (1.0f - (lambda_r + 1.0f) + shadowing_lambda); float val = G2 * 0.25f / wi.z; if(alpha.x == alpha.y) val *= D_ggx(wh, alpha.x); else val *= D_ggx_aniso(wh, alpha); - if(n && k) { - eval = fresnel_conductor(dot(wh, wi), *n, *k) * val; - } - else { - eval = make_float3(val, val, val); - } + eval = make_float3(val, val, val); #endif + float F0 = fresnel_dielectric_cos(1.0f, eta); + if(use_fresnel) { + throughput = interpolate_fresnel_color(wi, wh, eta, F0, cspec0); + + eval *= throughput; + } + float3 wr = -wi; float hr = 1.0f; float C1_r = 1.0f; float G1_r = 0.0f; bool outside = true; - float3 throughput = make_float3(1.0f, 1.0f, 1.0f); for(int order = 0; order < 10; order++) { - /* Sample microfacet height and normal */ - if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, lcg_step_float_addrspace(lcg_state))) + /* Sample microfacet height. */ + float height_rand = lcg_step_float_addrspace(lcg_state); + if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, height_rand)) break; - float3 wm = mf_sample_vndf(-wr, alpha, make_float2(lcg_step_float_addrspace(lcg_state), - lcg_step_float_addrspace(lcg_state))); - -#ifdef MF_MULTI_DIFFUSE - if(order == 0) { - /* Compute single-scattering for diffuse. */ - const float G2_G1 = -lambda_r / (shadowing_lambda - lambda_r); - eval += throughput * G2_G1 * mf_eval_phase_diffuse(wo, wm); + /* Sample microfacet normal. */ + float vndf_rand_y = lcg_step_float_addrspace(lcg_state); + float vndf_rand_x = lcg_step_float_addrspace(lcg_state); + float3 wm = mf_sample_vndf(-wr, alpha, vndf_rand_x, vndf_rand_y); + +#ifdef MF_MULTI_GLASS + if(order == 0 && use_fresnel) { + /* Evaluate amount of scattering towards wo on this microfacet. */ + float3 phase; + if(outside) + phase = mf_eval_phase_glass(wr, lambda_r, wo, wo_outside, alpha, eta); + else + phase = mf_eval_phase_glass(wr, lambda_r, -wo, !wo_outside, alpha, 1.0f / eta); + + eval = throughput * phase * mf_G1(wo_outside ? wo : -wo, mf_C1((outside == wo_outside) ? hr : -hr), shadowing_lambda); } #endif if(order > 0) { @@ -125,10 +129,8 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( phase = mf_eval_phase_glass(wr, lambda_r, wo, wo_outside, alpha, eta); else phase = mf_eval_phase_glass(wr, lambda_r, -wo, !wo_outside, alpha, 1.0f/eta); -#elif defined(MF_MULTI_DIFFUSE) - phase = mf_eval_phase_diffuse(wo, wm); #else /* MF_MULTI_GLOSSY */ - phase = mf_eval_phase_glossy(wr, lambda_r, wo, alpha, n, k) * throughput; + phase = mf_eval_phase_glossy(wr, lambda_r, wo, alpha) * throughput; #endif eval += throughput * phase * mf_G1(wo_outside? wo: -wo, mf_C1((outside == wo_outside)? hr: -hr), shadowing_lambda); } @@ -136,23 +138,32 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( /* Bounce from the microfacet. */ #ifdef MF_MULTI_GLASS bool next_outside; - wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, lcg_step_float_addrspace(lcg_state), &next_outside); + float3 wi_prev = -wr; + float phase_rand = lcg_step_float_addrspace(lcg_state); + wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, phase_rand, &next_outside); if(!next_outside) { outside = !outside; wr = -wr; hr = -hr; } -#elif defined(MF_MULTI_DIFFUSE) - wr = mf_sample_phase_diffuse(wm, - lcg_step_float_addrspace(lcg_state), - lcg_step_float_addrspace(lcg_state)); + + if(use_fresnel && !next_outside) { + throughput *= color; + } + else if(use_fresnel && order > 0) { + throughput *= interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0); + } #else /* MF_MULTI_GLOSSY */ - wr = mf_sample_phase_glossy(-wr, n, k, &throughput, wm); + if(use_fresnel && order > 0) { + throughput *= interpolate_fresnel_color(-wr, wm, eta, F0, cspec0); + } + wr = mf_sample_phase_glossy(-wr, &throughput, wm); #endif lambda_r = mf_lambda(wr, alpha); - throughput *= color; + if(!use_fresnel) + throughput *= color; C1_r = mf_C1(hr); G1_r = mf_G1(wr, C1_r, lambda_r); @@ -168,13 +179,16 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)( * escaped the surface in wo. The function returns the throughput between wi and wo. * Without reflection losses due to coloring or fresnel absorption in conductors, the sampling is optimal. */ -ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 *wo, const float3 color, const float alpha_x, const float alpha_y, ccl_addr_space uint *lcg_state -#ifdef MF_MULTI_GLASS - , const float eta -#elif defined(MF_MULTI_GLOSSY) - , float3 *n, float3 *k -#endif -) +ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)( + float3 wi, + float3 *wo, + const float3 color, + const float alpha_x, + const float alpha_y, + ccl_addr_space uint *lcg_state, + const float eta, + bool use_fresnel, + const float3 cspec0) { const float2 alpha = make_float2(alpha_x, alpha_y); @@ -186,37 +200,64 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 float G1_r = 0.0f; bool outside = true; + float F0 = fresnel_dielectric_cos(1.0f, eta); + if(use_fresnel) { + throughput = interpolate_fresnel_color(wi, normalize(wi + wr), eta, F0, cspec0); + } + int order; for(order = 0; order < 10; order++) { /* Sample microfacet height. */ - if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, lcg_step_float_addrspace(lcg_state))) { + float height_rand = lcg_step_float_addrspace(lcg_state); + if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, height_rand)) { /* The random walk has left the surface. */ *wo = outside? wr: -wr; return throughput; } /* Sample microfacet normal. */ - float3 wm = mf_sample_vndf(-wr, alpha, make_float2(lcg_step_float_addrspace(lcg_state), - lcg_step_float_addrspace(lcg_state))); + float vndf_rand_y = lcg_step_float_addrspace(lcg_state); + float vndf_rand_x = lcg_step_float_addrspace(lcg_state); + float3 wm = mf_sample_vndf(-wr, alpha, vndf_rand_x, vndf_rand_y); /* First-bounce color is already accounted for in mix weight. */ - if(order > 0) + if(!use_fresnel && order > 0) throughput *= color; /* Bounce from the microfacet. */ #ifdef MF_MULTI_GLASS bool next_outside; - wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, lcg_step_float_addrspace(lcg_state), &next_outside); + float3 wi_prev = -wr; + float phase_rand = lcg_step_float_addrspace(lcg_state); + wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, phase_rand, &next_outside); if(!next_outside) { hr = -hr; wr = -wr; outside = !outside; } -#elif defined(MF_MULTI_DIFFUSE) - wr = mf_sample_phase_diffuse(wm, - lcg_step_float_addrspace(lcg_state), - lcg_step_float_addrspace(lcg_state)); + + if(use_fresnel) { + if(!next_outside) { + throughput *= color; + } + else { + float3 t_color = interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0); + + if(order == 0) + throughput = t_color; + else + throughput *= t_color; + } + } #else /* MF_MULTI_GLOSSY */ - wr = mf_sample_phase_glossy(-wr, n, k, &throughput, wm); + if(use_fresnel) { + float3 t_color = interpolate_fresnel_color(-wr, wm, eta, F0, cspec0); + + if(order == 0) + throughput = t_color; + else + throughput *= t_color; + } + wr = mf_sample_phase_glossy(-wr, &throughput, wm); #endif /* Update random walk parameters. */ @@ -228,6 +269,5 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 } #undef MF_MULTI_GLASS -#undef MF_MULTI_DIFFUSE #undef MF_MULTI_GLOSSY #undef MF_PHASE_FUNCTION diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h index cb342a026ef..6b770fc0c16 100644 --- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h +++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h @@ -22,7 +22,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct OrenNayarBsdf { SHADER_CLOSURE_BASE; - float3 N; float roughness; float a; float b; diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h index e152a8780db..420f94755ee 100644 --- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h +++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h @@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct PhongRampBsdf { SHADER_CLOSURE_BASE; - float3 N; float exponent; float3 *colors; } PhongRampBsdf; diff --git a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h new file mode 100644 index 00000000000..f8ca64293b0 --- /dev/null +++ b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h @@ -0,0 +1,127 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __BSDF_PRINCIPLED_DIFFUSE_H__ +#define __BSDF_PRINCIPLED_DIFFUSE_H__ + +/* DISNEY PRINCIPLED DIFFUSE BRDF + * + * Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012) + */ + +CCL_NAMESPACE_BEGIN + +typedef ccl_addr_space struct PrincipledDiffuseBsdf { + SHADER_CLOSURE_BASE; + + float roughness; +} PrincipledDiffuseBsdf; + +ccl_device float3 calculate_principled_diffuse_brdf(const PrincipledDiffuseBsdf *bsdf, + float3 N, float3 V, float3 L, float3 H, float *pdf) +{ + float NdotL = max(dot(N, L), 0.0f); + float NdotV = max(dot(N, V), 0.0f); + + if(NdotL < 0 || NdotV < 0) { + *pdf = 0.0f; + return make_float3(0.0f, 0.0f, 0.0f); + } + + float LdotH = dot(L, H); + + float FL = schlick_fresnel(NdotL), FV = schlick_fresnel(NdotV); + const float Fd90 = 0.5f + 2.0f * LdotH*LdotH * bsdf->roughness; + float Fd = (1.0f * (1.0f - FL) + Fd90 * FL) * (1.0f * (1.0f - FV) + Fd90 * FV); + + float value = M_1_PI_F * NdotL * Fd; + + return make_float3(value, value, value); +} + +ccl_device int bsdf_principled_diffuse_setup(PrincipledDiffuseBsdf *bsdf) +{ + bsdf->type = CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID; + return SD_BSDF|SD_BSDF_HAS_EVAL; +} + +ccl_device bool bsdf_principled_diffuse_merge(const ShaderClosure *a, const ShaderClosure *b) +{ + const PrincipledDiffuseBsdf *bsdf_a = (const PrincipledDiffuseBsdf*)a; + const PrincipledDiffuseBsdf *bsdf_b = (const PrincipledDiffuseBsdf*)b; + + return (isequal_float3(bsdf_a->N, bsdf_b->N) && bsdf_a->roughness == bsdf_b->roughness); +} + +ccl_device float3 bsdf_principled_diffuse_eval_reflect(const ShaderClosure *sc, const float3 I, + const float3 omega_in, float *pdf) +{ + const PrincipledDiffuseBsdf *bsdf = (const PrincipledDiffuseBsdf *)sc; + + float3 N = bsdf->N; + float3 V = I; // outgoing + float3 L = omega_in; // incoming + float3 H = normalize(L + V); + + if(dot(N, omega_in) > 0.0f) { + *pdf = fmaxf(dot(N, omega_in), 0.0f) * M_1_PI_F; + return calculate_principled_diffuse_brdf(bsdf, N, V, L, H, pdf); + } + else { + *pdf = 0.0f; + return make_float3(0.0f, 0.0f, 0.0f); + } +} + +ccl_device float3 bsdf_principled_diffuse_eval_transmit(const ShaderClosure *sc, const float3 I, + const float3 omega_in, float *pdf) +{ + return make_float3(0.0f, 0.0f, 0.0f); +} + +ccl_device int bsdf_principled_diffuse_sample(const ShaderClosure *sc, + float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, + float3 *eval, float3 *omega_in, float3 *domega_in_dx, + float3 *domega_in_dy, float *pdf) +{ + const PrincipledDiffuseBsdf *bsdf = (const PrincipledDiffuseBsdf *)sc; + + float3 N = bsdf->N; + + sample_cos_hemisphere(N, randu, randv, omega_in, pdf); + + if(dot(Ng, *omega_in) > 0) { + float3 H = normalize(I + *omega_in); + + *eval = calculate_principled_diffuse_brdf(bsdf, N, I, *omega_in, H, pdf); + +#ifdef __RAY_DIFFERENTIALS__ + // TODO: find a better approximation for the diffuse bounce + *domega_in_dx = -((2 * dot(N, dIdx)) * N - dIdx); + *domega_in_dy = -((2 * dot(N, dIdy)) * N - dIdy); +#endif + } + else { + *pdf = 0.0f; + } + return LABEL_REFLECT|LABEL_DIFFUSE; +} + +CCL_NAMESPACE_END + +#endif /* __BSDF_PRINCIPLED_DIFFUSE_H__ */ + + diff --git a/intern/cycles/kernel/closure/bsdf_principled_sheen.h b/intern/cycles/kernel/closure/bsdf_principled_sheen.h new file mode 100644 index 00000000000..f4476bfecd0 --- /dev/null +++ b/intern/cycles/kernel/closure/bsdf_principled_sheen.h @@ -0,0 +1,113 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __BSDF_PRINCIPLED_SHEEN_H__ +#define __BSDF_PRINCIPLED_SHEEN_H__ + +/* DISNEY PRINCIPLED SHEEN BRDF + * + * Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012) + */ + +CCL_NAMESPACE_BEGIN + +typedef ccl_addr_space struct PrincipledSheenBsdf { + SHADER_CLOSURE_BASE; +} PrincipledSheenBsdf; + +ccl_device float3 calculate_principled_sheen_brdf(const PrincipledSheenBsdf *bsdf, + float3 N, float3 V, float3 L, float3 H, float *pdf) +{ + float NdotL = dot(N, L); + float NdotV = dot(N, V); + + if(NdotL < 0 || NdotV < 0) { + *pdf = 0.0f; + return make_float3(0.0f, 0.0f, 0.0f); + } + + float LdotH = dot(L, H); + + float value = schlick_fresnel(LdotH) * NdotL; + + return make_float3(value, value, value); +} + +ccl_device int bsdf_principled_sheen_setup(PrincipledSheenBsdf *bsdf) +{ + bsdf->type = CLOSURE_BSDF_PRINCIPLED_SHEEN_ID; + return SD_BSDF|SD_BSDF_HAS_EVAL; +} + +ccl_device float3 bsdf_principled_sheen_eval_reflect(const ShaderClosure *sc, const float3 I, + const float3 omega_in, float *pdf) +{ + const PrincipledSheenBsdf *bsdf = (const PrincipledSheenBsdf *)sc; + + float3 N = bsdf->N; + float3 V = I; // outgoing + float3 L = omega_in; // incoming + float3 H = normalize(L + V); + + if(dot(N, omega_in) > 0.0f) { + *pdf = fmaxf(dot(N, omega_in), 0.0f) * M_1_PI_F; + return calculate_principled_sheen_brdf(bsdf, N, V, L, H, pdf); + } + else { + *pdf = 0.0f; + return make_float3(0.0f, 0.0f, 0.0f); + } +} + +ccl_device float3 bsdf_principled_sheen_eval_transmit(const ShaderClosure *sc, const float3 I, + const float3 omega_in, float *pdf) +{ + return make_float3(0.0f, 0.0f, 0.0f); +} + +ccl_device int bsdf_principled_sheen_sample(const ShaderClosure *sc, + float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, + float3 *eval, float3 *omega_in, float3 *domega_in_dx, + float3 *domega_in_dy, float *pdf) +{ + const PrincipledSheenBsdf *bsdf = (const PrincipledSheenBsdf *)sc; + + float3 N = bsdf->N; + + sample_cos_hemisphere(N, randu, randv, omega_in, pdf); + + if(dot(Ng, *omega_in) > 0) { + float3 H = normalize(I + *omega_in); + + *eval = calculate_principled_sheen_brdf(bsdf, N, I, *omega_in, H, pdf); + +#ifdef __RAY_DIFFERENTIALS__ + // TODO: find a better approximation for the diffuse bounce + *domega_in_dx = -((2 * dot(N, dIdx)) * N - dIdx); + *domega_in_dy = -((2 * dot(N, dIdy)) * N - dIdy); +#endif + } + else { + *pdf = 0.0f; + } + return LABEL_REFLECT|LABEL_DIFFUSE; +} + +CCL_NAMESPACE_END + +#endif /* __BSDF_PRINCIPLED_SHEEN_H__ */ + + diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h index 28e775bcbc8..d8b6d8ddead 100644 --- a/intern/cycles/kernel/closure/bsdf_toon.h +++ b/intern/cycles/kernel/closure/bsdf_toon.h @@ -38,7 +38,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct ToonBsdf { SHADER_CLOSURE_BASE; - float3 N; float size; float smooth; } ToonBsdf; diff --git a/intern/cycles/kernel/closure/bsdf_transparent.h b/intern/cycles/kernel/closure/bsdf_transparent.h index 3c2fd8004df..79ee9dc4537 100644 --- a/intern/cycles/kernel/closure/bsdf_transparent.h +++ b/intern/cycles/kernel/closure/bsdf_transparent.h @@ -35,10 +35,39 @@ CCL_NAMESPACE_BEGIN -ccl_device int bsdf_transparent_setup(ShaderClosure *sc) +ccl_device void bsdf_transparent_setup(ShaderData *sd, const float3 weight, int path_flag) { - sc->type = CLOSURE_BSDF_TRANSPARENT_ID; - return SD_BSDF|SD_TRANSPARENT; + if(sd->flag & SD_TRANSPARENT) { + sd->closure_transparent_extinction += weight; + + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + + if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) { + sc->weight += weight; + sc->sample_weight += fabsf(average(weight)); + break; + } + } + } + else { + sd->flag |= SD_BSDF|SD_TRANSPARENT; + sd->closure_transparent_extinction = weight; + + if(path_flag & PATH_RAY_TERMINATE) { + /* In this case the number of closures is set to zero to disable + * all others, but we still want to get transparency so increase + * the number just for this. */ + sd->num_closure_left = 1; + } + + ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight); + + if(bsdf) { + bsdf->N = sd->N; + bsdf->type = CLOSURE_BSDF_TRANSPARENT_ID; + } + } } ccl_device float3 bsdf_transparent_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf) diff --git a/intern/cycles/kernel/closure/bsdf_util.h b/intern/cycles/kernel/closure/bsdf_util.h index b0c5280b6cb..3dc15d5791c 100644 --- a/intern/cycles/kernel/closure/bsdf_util.h +++ b/intern/cycles/kernel/closure/bsdf_util.h @@ -124,6 +124,13 @@ ccl_device float3 fresnel_conductor(float cosi, const float3 eta, const float3 k return(Rparl2 + Rperp2) * 0.5f; } +ccl_device float schlick_fresnel(float u) +{ + float m = clamp(1.0f - u, 0.0f, 1.0f); + float m2 = m * m; + return m2 * m2 * m; // pow(m, 5) +} + ccl_device float smooth_step(float edge0, float edge1, float x) { float result; @@ -136,6 +143,19 @@ ccl_device float smooth_step(float edge0, float edge1, float x) return result; } +/* Calculate the fresnel color which is a blend between white and the F0 color (cspec0) */ +ccl_device_forceinline float3 interpolate_fresnel_color(float3 L, float3 H, float ior, float F0, float3 cspec0) { + /* Calculate the fresnel interpolation factor + * The value from fresnel_dielectric_cos(...) has to be normalized because + * the cspec0 keeps the F0 color + */ + float F0_norm = 1.0f / (1.0f - F0); + float FH = (fresnel_dielectric_cos(dot(L, H), ior) - F0) * F0_norm; + + /* Blend between white and a specular color with respect to the fresnel */ + return cspec0 * (1.0f - FH) + make_float3(1.0f, 1.0f, 1.0f) * FH; +} + CCL_NAMESPACE_END #endif /* __BSDF_UTIL_H__ */ diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h index af0bbd861a9..8578767b07e 100644 --- a/intern/cycles/kernel/closure/bssrdf.h +++ b/intern/cycles/kernel/closure/bssrdf.h @@ -22,12 +22,12 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct Bssrdf { SHADER_CLOSURE_BASE; - float radius; + float3 radius; + float3 albedo; float sharpness; - float d; float texture_blur; - float albedo; - float3 N; + float roughness; + float channels; } Bssrdf; /* Planar Truncated Gaussian @@ -39,12 +39,11 @@ typedef ccl_addr_space struct Bssrdf { /* paper suggests 1/12.46 which is much too small, suspect it's *12.46 */ #define GAUSS_TRUNCATE 12.46f -ccl_device float bssrdf_gaussian_eval(const ShaderClosure *sc, float r) +ccl_device float bssrdf_gaussian_eval(const float radius, float r) { /* integrate (2*pi*r * exp(-r*r/(2*v)))/(2*pi*v)) from 0 to Rm * = 1 - exp(-Rm*Rm/(2*v)) */ - const Bssrdf *bssrdf = (const Bssrdf*)sc; - const float v = bssrdf->radius*bssrdf->radius*(0.25f*0.25f); + const float v = radius*radius*(0.25f*0.25f); const float Rm = sqrtf(v*GAUSS_TRUNCATE); if(r >= Rm) @@ -53,20 +52,19 @@ ccl_device float bssrdf_gaussian_eval(const ShaderClosure *sc, float r) return expf(-r*r/(2.0f*v))/(2.0f*M_PI_F*v); } -ccl_device float bssrdf_gaussian_pdf(const ShaderClosure *sc, float r) +ccl_device float bssrdf_gaussian_pdf(const float radius, float r) { /* 1.0 - expf(-Rm*Rm/(2*v)) simplified */ const float area_truncated = 1.0f - expf(-0.5f*GAUSS_TRUNCATE); - return bssrdf_gaussian_eval(sc, r) * (1.0f/(area_truncated)); + return bssrdf_gaussian_eval(radius, r) * (1.0f/(area_truncated)); } -ccl_device void bssrdf_gaussian_sample(const ShaderClosure *sc, float xi, float *r, float *h) +ccl_device void bssrdf_gaussian_sample(const float radius, float xi, float *r, float *h) { /* xi = integrate (2*pi*r * exp(-r*r/(2*v)))/(2*pi*v)) = -exp(-r^2/(2*v)) * r = sqrt(-2*v*logf(xi)) */ - const Bssrdf *bssrdf = (const Bssrdf*)sc; - const float v = bssrdf->radius*bssrdf->radius*(0.25f*0.25f); + const float v = radius*radius*(0.25f*0.25f); const float Rm = sqrtf(v*GAUSS_TRUNCATE); /* 1.0 - expf(-Rm*Rm/(2*v)) simplified */ @@ -87,13 +85,10 @@ ccl_device void bssrdf_gaussian_sample(const ShaderClosure *sc, float xi, float * far as I can tell has no closed form solution. So we get an iterative solution * instead with newton-raphson. */ -ccl_device float bssrdf_cubic_eval(const ShaderClosure *sc, float r) +ccl_device float bssrdf_cubic_eval(const float radius, const float sharpness, float r) { - const Bssrdf *bssrdf = (const Bssrdf*)sc; - const float sharpness = bssrdf->sharpness; - if(sharpness == 0.0f) { - const float Rm = bssrdf->radius; + const float Rm = radius; if(r >= Rm) return 0.0f; @@ -107,7 +102,7 @@ ccl_device float bssrdf_cubic_eval(const ShaderClosure *sc, float r) } else { - float Rm = bssrdf->radius*(1.0f + sharpness); + float Rm = radius*(1.0f + sharpness); if(r >= Rm) return 0.0f; @@ -124,7 +119,7 @@ ccl_device float bssrdf_cubic_eval(const ShaderClosure *sc, float r) else { Rmy = powf(Rm, y); ry = powf(r, y); - ryinv = (r > 0.0f)? powf(r, 2.0f*y - 2.0f): 0.0f; + ryinv = (r > 0.0f)? powf(r, y - 1.0f): 0.0f; } const float Rmy5 = (Rmy*Rmy) * (Rmy*Rmy) * Rmy; @@ -135,9 +130,9 @@ ccl_device float bssrdf_cubic_eval(const ShaderClosure *sc, float r) } } -ccl_device float bssrdf_cubic_pdf(const ShaderClosure *sc, float r) +ccl_device float bssrdf_cubic_pdf(const float radius, const float sharpness, float r) { - return bssrdf_cubic_eval(sc, r); + return bssrdf_cubic_eval(radius, sharpness, r); } /* solve 10x^2 - 20x^3 + 15x^4 - 4x^5 - xi == 0 */ @@ -168,11 +163,9 @@ ccl_device_forceinline float bssrdf_cubic_quintic_root_find(float xi) return x; } -ccl_device void bssrdf_cubic_sample(const ShaderClosure *sc, float xi, float *r, float *h) +ccl_device void bssrdf_cubic_sample(const float radius, const float sharpness, float xi, float *r, float *h) { - const Bssrdf *bssrdf = (const Bssrdf*)sc; - const float sharpness = bssrdf->sharpness; - float Rm = bssrdf->radius; + float Rm = radius; float r_ = bssrdf_cubic_quintic_root_find(xi); if(sharpness != 0.0f) { @@ -207,7 +200,7 @@ ccl_device_inline float bssrdf_burley_fitting(float A) /* Scale mean free path length so it gives similar looking result * to Cubic and Gaussian models. */ -ccl_device_inline float bssrdf_burley_compatible_mfp(float r) +ccl_device_inline float3 bssrdf_burley_compatible_mfp(float3 r) { return 0.25f * M_1_PI_F * r; } @@ -215,19 +208,18 @@ ccl_device_inline float bssrdf_burley_compatible_mfp(float r) ccl_device void bssrdf_burley_setup(Bssrdf *bssrdf) { /* Mean free path length. */ - const float l = bssrdf_burley_compatible_mfp(bssrdf->radius); + const float3 l = bssrdf_burley_compatible_mfp(bssrdf->radius); /* Surface albedo. */ - const float A = bssrdf->albedo; - const float s = bssrdf_burley_fitting(A); - const float d = l / s; + const float3 A = bssrdf->albedo; + const float3 s = make_float3(bssrdf_burley_fitting(A.x), + bssrdf_burley_fitting(A.y), + bssrdf_burley_fitting(A.z)); - bssrdf->d = d; + bssrdf->radius = l / s; } -ccl_device float bssrdf_burley_eval(const ShaderClosure *sc, float r) +ccl_device float bssrdf_burley_eval(const float d, float r) { - const Bssrdf *bssrdf = (const Bssrdf*)sc; - const float d = bssrdf->d; const float Rm = BURLEY_TRUNCATE * d; if(r >= Rm) @@ -246,9 +238,9 @@ ccl_device float bssrdf_burley_eval(const ShaderClosure *sc, float r) return (exp_r_d + exp_r_3_d) / (4.0f*d); } -ccl_device float bssrdf_burley_pdf(const ShaderClosure *sc, float r) +ccl_device float bssrdf_burley_pdf(const float d, float r) { - return bssrdf_burley_eval(sc, r) * (1.0f/BURLEY_TRUNCATE_CDF); + return bssrdf_burley_eval(d, r) * (1.0f/BURLEY_TRUNCATE_CDF); } /* Find the radius for desired CDF value. @@ -291,13 +283,11 @@ ccl_device_forceinline float bssrdf_burley_root_find(float xi) return r; } -ccl_device void bssrdf_burley_sample(const ShaderClosure *sc, +ccl_device void bssrdf_burley_sample(const float d, float xi, float *r, float *h) { - const Bssrdf *bssrdf = (const Bssrdf*)sc; - const float d = bssrdf->d; const float Rm = BURLEY_TRUNCATE * d; const float r_ = bssrdf_burley_root_find(xi * BURLEY_TRUNCATE_CDF) * d; @@ -311,29 +301,26 @@ ccl_device void bssrdf_burley_sample(const ShaderClosure *sc, * * Samples distributed over disk with no falloff, for reference. */ -ccl_device float bssrdf_none_eval(const ShaderClosure *sc, float r) +ccl_device float bssrdf_none_eval(const float radius, float r) { - const Bssrdf *bssrdf = (const Bssrdf*)sc; - const float Rm = bssrdf->radius; + const float Rm = radius; return (r < Rm)? 1.0f: 0.0f; } -ccl_device float bssrdf_none_pdf(const ShaderClosure *sc, float r) +ccl_device float bssrdf_none_pdf(const float radius, float r) { /* integrate (2*pi*r)/(pi*Rm*Rm) from 0 to Rm = 1 */ - const Bssrdf *bssrdf = (const Bssrdf*)sc; - const float Rm = bssrdf->radius; + const float Rm = radius; const float area = (M_PI_F*Rm*Rm); - return bssrdf_none_eval(sc, r) / area; + return bssrdf_none_eval(radius, r) / area; } -ccl_device void bssrdf_none_sample(const ShaderClosure *sc, float xi, float *r, float *h) +ccl_device void bssrdf_none_sample(const float radius, float xi, float *r, float *h) { /* xi = integrate (2*pi*r)/(pi*Rm*Rm) = r^2/Rm^2 * r = sqrt(xi)*Rm */ - const Bssrdf *bssrdf = (const Bssrdf*)sc; - const float Rm = bssrdf->radius; + const float Rm = radius; const float r_ = sqrtf(xi)*Rm; *r = r_; @@ -348,55 +335,166 @@ ccl_device_inline Bssrdf *bssrdf_alloc(ShaderData *sd, float3 weight) { Bssrdf *bssrdf = (Bssrdf*)closure_alloc(sd, sizeof(Bssrdf), CLOSURE_NONE_ID, weight); - if(!bssrdf) + if(bssrdf == NULL) { return NULL; + } float sample_weight = fabsf(average(weight)); bssrdf->sample_weight = sample_weight; return (sample_weight >= CLOSURE_WEIGHT_CUTOFF) ? bssrdf : NULL; } -ccl_device int bssrdf_setup(Bssrdf *bssrdf, ClosureType type) +ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type) { - if(bssrdf->radius < BSSRDF_MIN_RADIUS) { - /* revert to diffuse BSDF if radius too small */ - DiffuseBsdf *bsdf = (DiffuseBsdf*)bssrdf; - bsdf->N = bssrdf->N; - int flag = bsdf_diffuse_setup(bsdf); - bsdf->type = CLOSURE_BSDF_BSSRDF_ID; - return flag; + int flag = 0; + int bssrdf_channels = 3; + float3 diffuse_weight = make_float3(0.0f, 0.0f, 0.0f); + + /* Verify if the radii are large enough to sample without precision issues. */ + if(bssrdf->radius.x < BSSRDF_MIN_RADIUS) { + diffuse_weight.x = bssrdf->weight.x; + bssrdf->weight.x = 0.0f; + bssrdf->radius.x = 0.0f; + bssrdf_channels--; } - else { + if(bssrdf->radius.y < BSSRDF_MIN_RADIUS) { + diffuse_weight.y = bssrdf->weight.y; + bssrdf->weight.y = 0.0f; + bssrdf->radius.y = 0.0f; + bssrdf_channels--; + } + if(bssrdf->radius.z < BSSRDF_MIN_RADIUS) { + diffuse_weight.z = bssrdf->weight.z; + bssrdf->weight.z = 0.0f; + bssrdf->radius.z = 0.0f; + bssrdf_channels--; + } + + if(bssrdf_channels < 3) { + /* Add diffuse BSDF if any radius too small. */ +#ifdef __PRINCIPLED__ + if(type == CLOSURE_BSSRDF_PRINCIPLED_ID || + type == CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID) + { + float roughness = bssrdf->roughness; + float3 N = bssrdf->N; + + PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), diffuse_weight); + + if(bsdf) { + bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID; + bsdf->N = N; + bsdf->roughness = roughness; + flag |= bsdf_principled_diffuse_setup(bsdf); + } + } + else +#endif /* __PRINCIPLED__ */ + { + DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), diffuse_weight); + + if(bsdf) { + bsdf->type = CLOSURE_BSDF_BSSRDF_ID; + bsdf->N = bssrdf->N; + flag |= bsdf_diffuse_setup(bsdf); + } + } + } + + /* Setup BSSRDF if radius is large enough. */ + if(bssrdf_channels > 0) { + bssrdf->type = type; + bssrdf->channels = bssrdf_channels; + bssrdf->sample_weight = fabsf(average(bssrdf->weight)) * bssrdf->channels; bssrdf->texture_blur = saturate(bssrdf->texture_blur); bssrdf->sharpness = saturate(bssrdf->sharpness); - bssrdf->type = type; - if(type == CLOSURE_BSSRDF_BURLEY_ID) { + if(type == CLOSURE_BSSRDF_BURLEY_ID || + type == CLOSURE_BSSRDF_PRINCIPLED_ID || + type == CLOSURE_BSSRDF_RANDOM_WALK_ID || + type == CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID) + { bssrdf_burley_setup(bssrdf); } - return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF; + flag |= SD_BSSRDF; + } + else { + bssrdf->type = type; + bssrdf->sample_weight = 0.0f; } + + return flag; } ccl_device void bssrdf_sample(const ShaderClosure *sc, float xi, float *r, float *h) { - if(sc->type == CLOSURE_BSSRDF_CUBIC_ID) - bssrdf_cubic_sample(sc, xi, r, h); - else if(sc->type == CLOSURE_BSSRDF_GAUSSIAN_ID) - bssrdf_gaussian_sample(sc, xi, r, h); - else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID)*/ - bssrdf_burley_sample(sc, xi, r, h); + const Bssrdf *bssrdf = (const Bssrdf*)sc; + float radius; + + /* Sample color channel and reuse random number. Only a subset of channels + * may be used if their radius was too small to handle as BSSRDF. */ + xi *= bssrdf->channels; + + if(xi < 1.0f) { + radius = (bssrdf->radius.x > 0.0f)? bssrdf->radius.x: + (bssrdf->radius.y > 0.0f)? bssrdf->radius.y: + bssrdf->radius.z; + } + else if(xi < 2.0f) { + xi -= 1.0f; + radius = (bssrdf->radius.x > 0.0f)? bssrdf->radius.y: + bssrdf->radius.z; + } + else { + xi -= 2.0f; + radius = bssrdf->radius.z; + } + + /* Sample BSSRDF. */ + if(bssrdf->type == CLOSURE_BSSRDF_CUBIC_ID) { + bssrdf_cubic_sample(radius, bssrdf->sharpness, xi, r, h); + } + else if(bssrdf->type == CLOSURE_BSSRDF_GAUSSIAN_ID){ + bssrdf_gaussian_sample(radius, xi, r, h); + } + else { /*if(bssrdf->type == CLOSURE_BSSRDF_BURLEY_ID || bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID)*/ + bssrdf_burley_sample(radius, xi, r, h); + } +} + +ccl_device float bssrdf_channel_pdf(const Bssrdf *bssrdf, float radius, float r) +{ + if(radius == 0.0f) { + return 0.0f; + } + else if(bssrdf->type == CLOSURE_BSSRDF_CUBIC_ID) { + return bssrdf_cubic_pdf(radius, bssrdf->sharpness, r); + } + else if(bssrdf->type == CLOSURE_BSSRDF_GAUSSIAN_ID) { + return bssrdf_gaussian_pdf(radius, r); + } + else { /*if(bssrdf->type == CLOSURE_BSSRDF_BURLEY_ID || bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID)*/ + return bssrdf_burley_pdf(radius, r); + } +} + +ccl_device_forceinline float3 bssrdf_eval(const ShaderClosure *sc, float r) +{ + const Bssrdf *bssrdf = (const Bssrdf*)sc; + + return make_float3( + bssrdf_channel_pdf(bssrdf, bssrdf->radius.x, r), + bssrdf_channel_pdf(bssrdf, bssrdf->radius.y, r), + bssrdf_channel_pdf(bssrdf, bssrdf->radius.z, r)); } ccl_device_forceinline float bssrdf_pdf(const ShaderClosure *sc, float r) { - if(sc->type == CLOSURE_BSSRDF_CUBIC_ID) - return bssrdf_cubic_pdf(sc, r); - else if(sc->type == CLOSURE_BSSRDF_GAUSSIAN_ID) - return bssrdf_gaussian_pdf(sc, r); - else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID)*/ - return bssrdf_burley_pdf(sc, r); + const Bssrdf *bssrdf = (const Bssrdf*)sc; + float3 pdf = bssrdf_eval(sc, r); + + return (pdf.x + pdf.y + pdf.z) / bssrdf->channels; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/closure/emissive.h b/intern/cycles/kernel/closure/emissive.h index c534df373bd..e709ca9a372 100644 --- a/intern/cycles/kernel/closure/emissive.h +++ b/intern/cycles/kernel/closure/emissive.h @@ -32,8 +32,32 @@ CCL_NAMESPACE_BEGIN +/* BACKGROUND CLOSURE */ + +ccl_device void background_setup(ShaderData *sd, const float3 weight) +{ + if(sd->flag & SD_EMISSION) { + sd->closure_emission_background += weight; + } + else { + sd->flag |= SD_EMISSION; + sd->closure_emission_background = weight; + } +} + /* EMISSION CLOSURE */ +ccl_device void emission_setup(ShaderData *sd, const float3 weight) +{ + if(sd->flag & SD_EMISSION) { + sd->closure_emission_background += weight; + } + else { + sd->flag |= SD_EMISSION; + sd->closure_emission_background = weight; + } +} + /* return the probability distribution function in the direction I, * given the parameters and the light's surface normal. This MUST match * the PDF computed by sample(). */ diff --git a/intern/cycles/kernel/closure/volume.h b/intern/cycles/kernel/closure/volume.h index 01e67c7c2fd..da791e9aa73 100644 --- a/intern/cycles/kernel/closure/volume.h +++ b/intern/cycles/kernel/closure/volume.h @@ -19,14 +19,27 @@ CCL_NAMESPACE_BEGIN +/* VOLUME EXTINCTION */ + +ccl_device void volume_extinction_setup(ShaderData *sd, float3 weight) +{ + if(sd->flag & SD_EXTINCTION) { + sd->closure_transparent_extinction += weight; + } + else { + sd->flag |= SD_EXTINCTION; + sd->closure_transparent_extinction = weight; + } +} + +/* HENYEY-GREENSTEIN CLOSURE */ + typedef ccl_addr_space struct HenyeyGreensteinVolume { SHADER_CLOSURE_BASE; float g; } HenyeyGreensteinVolume; -/* HENYEY-GREENSTEIN CLOSURE */ - /* Given cosine between rays, return probability density that a photon bounces * to that direction. The g parameter controls how different it is from the * uniform sphere. g=0 uniform diffuse-like, g=1 close to sharp single ray. */ @@ -70,35 +83,45 @@ ccl_device float3 volume_henyey_greenstein_eval_phase(const ShaderClosure *sc, c return make_float3(*pdf, *pdf, *pdf); } -ccl_device int volume_henyey_greenstein_sample(const ShaderClosure *sc, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, - float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf) +ccl_device float3 henyey_greenstrein_sample(float3 D, float g, float randu, float randv, float *pdf) { - const HenyeyGreensteinVolume *volume = (const HenyeyGreensteinVolume*)sc; - float g = volume->g; - float cos_phi, sin_phi, cos_theta; - /* match pdf for small g */ - if(fabsf(g) < 1e-3f) { + float cos_theta; + bool isotropic = fabsf(g) < 1e-3f; + + if(isotropic) { cos_theta = (1.0f - 2.0f * randu); - *pdf = M_1_PI_F * 0.25f; + if(pdf) { + *pdf = M_1_PI_F * 0.25f; + } } else { float k = (1.0f - g * g) / (1.0f - g + 2.0f * g * randu); cos_theta = (1.0f + g * g - k * k) / (2.0f * g); - *pdf = single_peaked_henyey_greenstein(cos_theta, g); + if(pdf) { + *pdf = single_peaked_henyey_greenstein(cos_theta, g); + } } float sin_theta = safe_sqrtf(1.0f - cos_theta * cos_theta); - float phi = M_2PI_F * randv; - cos_phi = cosf(phi); - sin_phi = sinf(phi); + float3 dir = make_float3(sin_theta * cosf(phi), sin_theta * sinf(phi), cos_theta); - /* note that I points towards the viewer and so is used negated */ float3 T, B; - make_orthonormals(-I, &T, &B); - *omega_in = sin_theta * cos_phi * T + sin_theta * sin_phi * B + cos_theta * (-I); + make_orthonormals(D, &T, &B); + dir = dir.x * T + dir.y * B + dir.z * D; + + return dir; +} + +ccl_device int volume_henyey_greenstein_sample(const ShaderClosure *sc, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, + float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf) +{ + const HenyeyGreensteinVolume *volume = (const HenyeyGreensteinVolume*)sc; + float g = volume->g; + /* note that I points towards the viewer and so is used negated */ + *omega_in = henyey_greenstrein_sample(-I, g, randu, randv, pdf); *eval = make_float3(*pdf, *pdf, *pdf); /* perfect importance sampling */ #ifdef __RAY_DIFFERENTIALS__ @@ -110,15 +133,6 @@ ccl_device int volume_henyey_greenstein_sample(const ShaderClosure *sc, float3 I return LABEL_VOLUME_SCATTER; } -/* ABSORPTION VOLUME CLOSURE */ - -ccl_device int volume_absorption_setup(ShaderClosure *sc) -{ - sc->type = CLOSURE_VOLUME_ABSORPTION_ID; - - return SD_ABSORPTION; -} - /* VOLUME CLOSURE */ ccl_device float3 volume_phase_eval(const ShaderData *sd, const ShaderClosure *sc, float3 omega_in, float *pdf) diff --git a/intern/cycles/kernel/filter/filter.h b/intern/cycles/kernel/filter/filter.h new file mode 100644 index 00000000000..f6e474d6702 --- /dev/null +++ b/intern/cycles/kernel/filter/filter.h @@ -0,0 +1,52 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __FILTER_H__ +#define __FILTER_H__ + +/* CPU Filter Kernel Interface */ + +#include "util/util_types.h" + +#include "kernel/filter/filter_defines.h" + +CCL_NAMESPACE_BEGIN + +#define KERNEL_NAME_JOIN(x, y, z) x ## _ ## y ## _ ## z +#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name) +#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name) + +#define KERNEL_ARCH cpu +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/filter_cpu.h" + +CCL_NAMESPACE_END + +#endif /* __FILTER_H__ */ diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h new file mode 100644 index 00000000000..ce96f733aff --- /dev/null +++ b/intern/cycles/kernel/filter/filter_defines.h @@ -0,0 +1,38 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __FILTER_DEFINES_H__ +#define __FILTER_DEFINES_H__ + +#define DENOISE_FEATURES 10 +#define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES) +#define XTWX_SIZE (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2) +#define XTWY_SIZE (DENOISE_FEATURES+1) + +typedef struct TilesInfo { + int offsets[9]; + int strides[9]; + int x[4]; + int y[4]; + /* TODO(lukas): CUDA doesn't have uint64_t... */ +#ifdef __KERNEL_OPENCL__ + ccl_global float *buffers[9]; +#else + long long int buffers[9]; +#endif +} TilesInfo; + +#endif /* __FILTER_DEFINES_H__*/ diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h new file mode 100644 index 00000000000..6226ed2c2ef --- /dev/null +++ b/intern/cycles/kernel/filter/filter_features.h @@ -0,0 +1,124 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + CCL_NAMESPACE_BEGIN + +#define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride] + +/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y). + * pixel_buffer always points to the current pixel in the first pass. */ +#define FOR_PIXEL_WINDOW pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \ + for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ + for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) { + +#define END_FOR_PIXEL_WINDOW } \ + pixel_buffer += buffer_w - (high.x - low.x); \ + } + +ccl_device_inline void filter_get_features(int2 pixel, + const ccl_global float *ccl_restrict buffer, + float *features, + const float *ccl_restrict mean, + int pass_stride) +{ + features[0] = pixel.x; + features[1] = pixel.y; + features[2] = fabsf(ccl_get_feature(buffer, 0)); + features[3] = ccl_get_feature(buffer, 1); + features[4] = ccl_get_feature(buffer, 2); + features[5] = ccl_get_feature(buffer, 3); + features[6] = ccl_get_feature(buffer, 4); + features[7] = ccl_get_feature(buffer, 5); + features[8] = ccl_get_feature(buffer, 6); + features[9] = ccl_get_feature(buffer, 7); + if(mean) { + for(int i = 0; i < DENOISE_FEATURES; i++) + features[i] -= mean[i]; + } +} + +ccl_device_inline void filter_get_feature_scales(int2 pixel, + const ccl_global float *ccl_restrict buffer, + float *scales, + const float *ccl_restrict mean, + int pass_stride) +{ + scales[0] = fabsf(pixel.x - mean[0]); + scales[1] = fabsf(pixel.y - mean[1]); + scales[2] = fabsf(fabsf(ccl_get_feature(buffer, 0)) - mean[2]); + scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3], + ccl_get_feature(buffer, 2) - mean[4], + ccl_get_feature(buffer, 3) - mean[5])); + scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]); + scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7], + ccl_get_feature(buffer, 6) - mean[8], + ccl_get_feature(buffer, 7) - mean[9])); +} + +ccl_device_inline void filter_calculate_scale(float *scale) +{ + scale[0] = 1.0f/max(scale[0], 0.01f); + scale[1] = 1.0f/max(scale[1], 0.01f); + scale[2] = 1.0f/max(scale[2], 0.01f); + scale[6] = 1.0f/max(scale[4], 0.01f); + scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f); + scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f); +} + +ccl_device_inline float3 filter_get_color(const ccl_global float *ccl_restrict buffer, + int pass_stride) +{ + return make_float3(ccl_get_feature(buffer, 8), ccl_get_feature(buffer, 9), ccl_get_feature(buffer, 10)); +} + +ccl_device_inline void design_row_add(float *design_row, + int rank, + const ccl_global float *ccl_restrict transform, + int stride, + int row, + float feature) +{ + for(int i = 0; i < rank; i++) { + design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature; + } +} + +/* Fill the design row. */ +ccl_device_inline void filter_get_design_row_transform(int2 p_pixel, + const ccl_global float *ccl_restrict p_buffer, + int2 q_pixel, + const ccl_global float *ccl_restrict q_buffer, + int pass_stride, + int rank, + float *design_row, + const ccl_global float *ccl_restrict transform, + int stride) +{ + design_row[0] = 1.0f; + math_vector_zero(design_row+1, rank); + design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x); + design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y); + design_row_add(design_row, rank, transform, stride, 2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0))); + design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1)); + design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2)); + design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3)); + design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4)); + design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5)); + design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6)); + design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7)); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h new file mode 100644 index 00000000000..3ddd8712266 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_features_sse.h @@ -0,0 +1,93 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +#define ccl_get_feature_sse(pass) load_float4(buffer + (pass)*pass_stride) + +/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time. + * pixel_buffer always points to the first of the 4 current pixel in the first pass. + * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. */ + +#define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \ + for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ + float4 y4 = make_float4(pixel.y); \ + for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \ + float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \ + int4 active_pixels = x4 < make_float4(high.x); + +#define END_FOR_PIXEL_WINDOW_SSE } \ + pixel_buffer += buffer_w - (pixel.x - low.x); \ + } + +ccl_device_inline void filter_get_features_sse(float4 x, float4 y, + int4 active_pixels, + const float *ccl_restrict buffer, + float4 *features, + const float4 *ccl_restrict mean, + int pass_stride) +{ + features[0] = x; + features[1] = y; + features[2] = fabs(ccl_get_feature_sse(0)); + features[3] = ccl_get_feature_sse(1); + features[4] = ccl_get_feature_sse(2); + features[5] = ccl_get_feature_sse(3); + features[6] = ccl_get_feature_sse(4); + features[7] = ccl_get_feature_sse(5); + features[8] = ccl_get_feature_sse(6); + features[9] = ccl_get_feature_sse(7); + if(mean) { + for(int i = 0; i < DENOISE_FEATURES; i++) + features[i] = features[i] - mean[i]; + } + for(int i = 0; i < DENOISE_FEATURES; i++) + features[i] = mask(active_pixels, features[i]); +} + +ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, + int4 active_pixels, + const float *ccl_restrict buffer, + float4 *scales, + const float4 *ccl_restrict mean, + int pass_stride) +{ + scales[0] = fabs(x - mean[0]); + scales[1] = fabs(y - mean[1]); + scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]); + scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) + + sqr(ccl_get_feature_sse(2) - mean[4]) + + sqr(ccl_get_feature_sse(3) - mean[5]); + scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]); + scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) + + sqr(ccl_get_feature_sse(6) - mean[8]) + + sqr(ccl_get_feature_sse(7) - mean[9]); + for(int i = 0; i < 6; i++) + scales[i] = mask(active_pixels, scales[i]); +} + +ccl_device_inline void filter_calculate_scale_sse(float4 *scale) +{ + scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f))); + scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f))); + scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f))); + scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f))); + scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f))); + scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f))); +} + + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_kernel.h b/intern/cycles/kernel/filter/filter_kernel.h new file mode 100644 index 00000000000..2ef03dc0a02 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_kernel.h @@ -0,0 +1,50 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "util/util_color.h" +#include "util/util_math.h" +#include "util/util_math_fast.h" +#include "util/util_texture.h" + +#include "util/util_atomic.h" +#include "util/util_math_matrix.h" + +#include "kernel/filter/filter_defines.h" + +#include "kernel/filter/filter_features.h" +#ifdef __KERNEL_SSE3__ +# include "kernel/filter/filter_features_sse.h" +#endif + +#include "kernel/filter/filter_prefilter.h" + +#ifdef __KERNEL_GPU__ +# include "kernel/filter/filter_transform_gpu.h" +#else +# ifdef __KERNEL_SSE3__ +# include "kernel/filter/filter_transform_sse.h" +# else +# include "kernel/filter/filter_transform.h" +# endif +#endif + +#include "kernel/filter/filter_reconstruction.h" + +#ifdef __KERNEL_CPU__ +# include "kernel/filter/filter_nlm_cpu.h" +#else +# include "kernel/filter/filter_nlm_gpu.h" +#endif diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h new file mode 100644 index 00000000000..e2da0fd872b --- /dev/null +++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h @@ -0,0 +1,180 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, + const float *ccl_restrict weight_image, + const float *ccl_restrict variance_image, + float *difference_image, + int4 rect, + int stride, + int channel_offset, + float a, + float k_2) +{ + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + float diff = 0.0f; + int numChannels = channel_offset? 3 : 1; + for(int c = 0; c < numChannels; c++) { + float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)]; + float pvar = variance_image[c*channel_offset + y*stride + x]; + float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)]; + diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar)); + } + if(numChannels > 1) { + diff *= 1.0f/numChannels; + } + difference_image[y*stride + x] = diff; + } + } +} + +ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict difference_image, + float *out_image, + int4 rect, + int stride, + int f) +{ + int aligned_lowx = rect.x / 4; + int aligned_highx = (rect.z + 3) / 4; + for(int y = rect.y; y < rect.w; y++) { + const int low = max(rect.y, y-f); + const int high = min(rect.w, y+f+1); + for(int x = rect.x; x < rect.z; x++) { + out_image[y*stride + x] = 0.0f; + } + for(int y1 = low; y1 < high; y1++) { + float4* out_image4 = (float4*)(out_image + y*stride); + float4* difference_image4 = (float4*)(difference_image + y1*stride); + for(int x = aligned_lowx; x < aligned_highx; x++) { + out_image4[x] += difference_image4[x]; + } + } + for(int x = rect.x; x < rect.z; x++) { + out_image[y*stride + x] *= 1.0f/(high - low); + } + } +} + +ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image, + float *out_image, + int4 rect, + int stride, + int f) +{ + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + out_image[y*stride + x] = 0.0f; + } + } + for(int dx = -f; dx <= f; dx++) { + int pos_dx = max(0, dx); + int neg_dx = min(0, dx); + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) { + out_image[y*stride + x] += difference_image[y*stride + x+dx]; + } + } + } + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + out_image[y*stride + x] = fast_expf(-max(out_image[y*stride + x] * (1.0f/(high - low)), 0.0f)); + } + } +} + +ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, + const float *ccl_restrict difference_image, + const float *ccl_restrict image, + float *out_image, + float *accum_image, + int4 rect, + int stride, + int f) +{ + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + float sum = 0.0f; + for(int x1 = low; x1 < high; x1++) { + sum += difference_image[y*stride + x1]; + } + float weight = sum * (1.0f/(high - low)); + accum_image[y*stride + x] += weight; + out_image[y*stride + x] += weight*image[(y+dy)*stride + (x+dx)]; + } + } +} + +ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, + const float *ccl_restrict difference_image, + const float *ccl_restrict buffer, + float *transform, + int *rank, + float *XtWX, + float3 *XtWY, + int4 rect, + int4 filter_window, + int stride, int f, + int pass_stride) +{ + int4 clip_area = rect_clip(rect, filter_window); + /* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */ + for(int y = clip_area.y; y < clip_area.w; y++) { + for(int x = clip_area.x; x < clip_area.z; x++) { + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + float sum = 0.0f; + for(int x1 = low; x1 < high; x1++) { + sum += difference_image[y*stride + x1]; + } + float weight = sum * (1.0f/(high - low)); + + int storage_ofs = coord_to_local_index(filter_window, x, y); + float *l_transform = transform + storage_ofs*TRANSFORM_SIZE; + float *l_XtWX = XtWX + storage_ofs*XTWX_SIZE; + float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE; + int *l_rank = rank + storage_ofs; + + kernel_filter_construct_gramian(x, y, 1, + dx, dy, + stride, + pass_stride, + buffer, + l_transform, l_rank, + weight, l_XtWX, l_XtWY, 0); + } + } +} + +ccl_device_inline void kernel_filter_nlm_normalize(float *out_image, + const float *ccl_restrict accum_image, + int4 rect, + int w) +{ + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + out_image[y*w+x] /= accum_image[y*w+x]; + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h new file mode 100644 index 00000000000..4ca49ea6733 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h @@ -0,0 +1,202 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* Determines pixel coordinates and offset for the current thread. + * Returns whether the thread should do any work. + * + * All coordinates are relative to the denoising buffer! + * + * Window is the rect that should be processed. + * co is filled with (x, y, dx, dy). + */ +ccl_device_inline bool get_nlm_coords_window(int w, int h, int r, int stride, + int4 *rect, int4 *co, int *ofs, + int4 window) +{ + /* Determine the pixel offset that this thread should apply. */ + int s = 2*r+1; + int si = ccl_global_id(1); + int sx = si % s; + int sy = si / s; + if(sy >= s) { + return false; + } + co->z = sx-r; + co->w = sy-r; + + /* Pixels still need to lie inside the denoising buffer after applying the offset, + * so determine the area for which this is the case. */ + *rect = make_int4(max(0, -co->z), max(0, -co->w), + w - max(0, co->z), h - max(0, co->w)); + + /* Find the intersection of the area that we want to process (window) and the area + * that can be processed (rect) to get the final area for this offset. */ + int4 clip_area = rect_clip(window, *rect); + + /* If the radius is larger than one of the sides of the window, + * there will be shifts for which there is no usable pixel at all. */ + if(!rect_is_valid(clip_area)) { + return false; + } + + /* Map the linear thread index to pixels inside the clip area. */ + int x, y; + if(!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) { + return false; + } + co->x = x; + co->y = y; + + *ofs = (sy*s + sx) * stride; + + return true; +} + +ccl_device_inline bool get_nlm_coords(int w, int h, int r, int stride, + int4 *rect, int4 *co, int *ofs) +{ + return get_nlm_coords_window(w, h, r, stride, rect, co, ofs, make_int4(0, 0, w, h)); +} + +ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y, + int dx, int dy, + const ccl_global float *ccl_restrict weight_image, + const ccl_global float *ccl_restrict variance_image, + ccl_global float *difference_image, + int4 rect, int stride, + int channel_offset, + float a, float k_2) +{ + float diff = 0.0f; + int numChannels = channel_offset? 3 : 1; + for(int c = 0; c < numChannels; c++) { + float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)]; + float pvar = variance_image[c*channel_offset + y*stride + x]; + float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)]; + diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar)); + } + if(numChannels > 1) { + diff *= 1.0f/numChannels; + } + difference_image[y*stride + x] = diff; +} + +ccl_device_inline void kernel_filter_nlm_blur(int x, int y, + const ccl_global float *ccl_restrict difference_image, + ccl_global float *out_image, + int4 rect, int stride, int f) +{ + float sum = 0.0f; + const int low = max(rect.y, y-f); + const int high = min(rect.w, y+f+1); + for(int y1 = low; y1 < high; y1++) { + sum += difference_image[y1*stride + x]; + } + sum *= 1.0f/(high-low); + out_image[y*stride + x] = sum; +} + +ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y, + const ccl_global float *ccl_restrict difference_image, + ccl_global float *out_image, + int4 rect, int stride, int f) +{ + float sum = 0.0f; + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + for(int x1 = low; x1 < high; x1++) { + sum += difference_image[y*stride + x1]; + } + sum *= 1.0f/(high-low); + out_image[y*stride + x] = fast_expf(-max(sum, 0.0f)); +} + +ccl_device_inline void kernel_filter_nlm_update_output(int x, int y, + int dx, int dy, + const ccl_global float *ccl_restrict difference_image, + const ccl_global float *ccl_restrict image, + ccl_global float *out_image, + ccl_global float *accum_image, + int4 rect, int stride, int f) +{ + float sum = 0.0f; + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + for(int x1 = low; x1 < high; x1++) { + sum += difference_image[y*stride + x1]; + } + sum *= 1.0f/(high-low); + if(out_image) { + atomic_add_and_fetch_float(accum_image + y*stride + x, sum); + atomic_add_and_fetch_float(out_image + y*stride + x, sum*image[(y+dy)*stride + (x+dx)]); + } + else { + accum_image[y*stride + x] = sum; + } +} + +ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y, + int dx, int dy, + const ccl_global float *ccl_restrict difference_image, + const ccl_global float *ccl_restrict buffer, + const ccl_global float *ccl_restrict transform, + ccl_global int *rank, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 rect, + int4 filter_window, + int stride, int f, + int pass_stride, + int localIdx) +{ + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + float sum = 0.0f; + for(int x1 = low; x1 < high; x1++) { + sum += difference_image[y*stride + x1]; + } + float weight = sum * (1.0f/(high - low)); + + /* Reconstruction data is only stored for pixels inside the filter window, + * so compute the pixels's index in there. */ + int storage_ofs = coord_to_local_index(filter_window, x, y); + transform += storage_ofs; + rank += storage_ofs; + XtWX += storage_ofs; + XtWY += storage_ofs; + + kernel_filter_construct_gramian(x, y, + rect_size(filter_window), + dx, dy, + stride, + pass_stride, + buffer, + transform, rank, + weight, XtWX, XtWY, + localIdx); +} + +ccl_device_inline void kernel_filter_nlm_normalize(int x, int y, + ccl_global float *out_image, + const ccl_global float *ccl_restrict accum_image, + int stride) +{ + out_image[y*stride + x] /= accum_image[y*stride + x]; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h new file mode 100644 index 00000000000..4af209341f6 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_prefilter.h @@ -0,0 +1,212 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* First step of the shadow prefiltering, performs the shadow division and stores all data + * in a nice and easy rectangular array that can be passed to the NLM filter. + * + * Calculates: + * unfiltered: Contains the two half images of the shadow feature pass + * sampleVariance: The sample-based variance calculated in the kernel. Note: This calculation is biased in general, and especially here since the variance of the ratio can only be approximated. + * sampleVarianceV: Variance of the sample variance estimation, quite noisy (since it's essentially the buffer variance of the two variance halves) + * bufferVariance: The buffer-based variance of the shadow feature. Unbiased, but quite noisy. + */ +ccl_device void kernel_filter_divide_shadow(int sample, + ccl_global TilesInfo *tiles, + int x, int y, + ccl_global float *unfilteredA, + ccl_global float *unfilteredB, + ccl_global float *sampleVariance, + ccl_global float *sampleVarianceV, + ccl_global float *bufferVariance, + int4 rect, + int buffer_pass_stride, + int buffer_denoising_offset) +{ + int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2); + int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2); + int tile = ytile*3+xtile; + + int offset = tiles->offsets[tile]; + int stride = tiles->strides[tile]; + const ccl_global float *ccl_restrict center_buffer = (ccl_global float*) tiles->buffers[tile]; + center_buffer += (y*stride + x + offset)*buffer_pass_stride; + center_buffer += buffer_denoising_offset + 14; + + int buffer_w = align_up(rect.z - rect.x, 4); + int idx = (y-rect.y)*buffer_w + (x - rect.x); + unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f); + unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f); + + float varA = center_buffer[2]; + float varB = center_buffer[5]; + int odd_sample = (sample+1)/2; + int even_sample = sample/2; + + /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance + * update does not work efficiently with atomics in the kernel. */ + varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample); + varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample); + + varA /= max(odd_sample - 1, 1); + varB /= max(even_sample - 1, 1); + + sampleVariance[idx] = 0.5f*(varA + varB) / sample; + sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample*sample); + bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) * (unfilteredA[idx] - unfilteredB[idx]); +} + +/* Load a regular feature from the render buffers into the denoise buffer. + * Parameters: + * - sample: The sample amount in the buffer, used to normalize the buffer. + * - m_offset, v_offset: Render Buffer Pass offsets of mean and variance of the feature. + * - x, y: Current pixel + * - mean, variance: Target denoise buffers. + * - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive). + */ +ccl_device void kernel_filter_get_feature(int sample, + ccl_global TilesInfo *tiles, + int m_offset, int v_offset, + int x, int y, + ccl_global float *mean, + ccl_global float *variance, + int4 rect, int buffer_pass_stride, + int buffer_denoising_offset) +{ + int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2); + int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2); + int tile = ytile*3+xtile; + ccl_global float *center_buffer = ((ccl_global float*) tiles->buffers[tile]) + (tiles->offsets[tile] + y*tiles->strides[tile] + x)*buffer_pass_stride + buffer_denoising_offset; + + int buffer_w = align_up(rect.z - rect.x, 4); + int idx = (y-rect.y)*buffer_w + (x - rect.x); + + mean[idx] = center_buffer[m_offset] / sample; + if(sample > 1) { + /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance + * update does not work efficiently with atomics in the kernel. */ + variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1))); + } + else { + /* Can't compute variance with single sample, just set it very high. */ + variance[idx] = 1e10f; + } +} + +ccl_device void kernel_filter_detect_outliers(int x, int y, + ccl_global float *image, + ccl_global float *variance, + ccl_global float *depth, + ccl_global float *out, + int4 rect, + int pass_stride) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + + int n = 0; + float values[25]; + for(int y1 = max(y-2, rect.y); y1 < min(y+3, rect.w); y1++) { + for(int x1 = max(x-2, rect.x); x1 < min(x+3, rect.z); x1++) { + int idx = (y1-rect.y)*buffer_w + (x1-rect.x); + float3 color = make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]); + color = max(color, make_float3(0.0f, 0.0f, 0.0f)); + float L = average(color); + + /* Find the position of L. */ + int i; + for(i = 0; i < n; i++) { + if(values[i] > L) break; + } + /* Make space for L by shifting all following values to the right. */ + for(int j = n; j > i; j--) { + values[j] = values[j-1]; + } + /* Insert L. */ + values[i] = L; + n++; + } + } + + int idx = (y-rect.y)*buffer_w + (x-rect.x); + float3 color = make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]); + color = max(color, make_float3(0.0f, 0.0f, 0.0f)); + float L = average(color); + + float ref = 2.0f*values[(int)(n*0.75f)]; + if(L > ref) { + /* The pixel appears to be an outlier. + * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is that the pixel + * should actually be at the reference value: + * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier. + * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight. + */ + float stddev = sqrtf(average(make_float3(variance[idx], variance[idx+pass_stride], variance[idx+2*pass_stride]))); + if(L - 3*stddev < ref) { + /* The pixel is an outlier, so negate the depth value to mark it as one. + * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */ + depth[idx] = -depth[idx]; + float fac = ref/L; + color *= fac; + variance[idx ] *= fac*fac; + variance[idx + pass_stride] *= fac*fac; + variance[idx+2*pass_stride] *= fac*fac; + } + } + out[idx ] = color.x; + out[idx + pass_stride] = color.y; + out[idx+2*pass_stride] = color.z; +} + +/* Combine A/B buffers. + * Calculates the combined mean and the buffer variance. */ +ccl_device void kernel_filter_combine_halves(int x, int y, + ccl_global float *mean, + ccl_global float *variance, + ccl_global float *a, + ccl_global float *b, + int4 rect, int r) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + int idx = (y-rect.y)*buffer_w + (x - rect.x); + + if(mean) mean[idx] = 0.5f * (a[idx]+b[idx]); + if(variance) { + if(r == 0) variance[idx] = 0.25f * (a[idx]-b[idx])*(a[idx]-b[idx]); + else { + variance[idx] = 0.0f; + float values[25]; + int numValues = 0; + for(int py = max(y-r, rect.y); py < min(y+r+1, rect.w); py++) { + for(int px = max(x-r, rect.x); px < min(x+r+1, rect.z); px++) { + int pidx = (py-rect.y)*buffer_w + (px-rect.x); + values[numValues++] = 0.25f * (a[pidx]-b[pidx])*(a[pidx]-b[pidx]); + } + } + /* Insertion-sort the variances (fast enough for 25 elements). */ + for(int i = 1; i < numValues; i++) { + float v = values[i]; + int j; + for(j = i-1; j >= 0 && values[j] > v; j--) + values[j+1] = values[j]; + values[j+1] = v; + } + variance[idx] = values[(7*numValues)/8]; + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h new file mode 100644 index 00000000000..b7bf322f9ce --- /dev/null +++ b/intern/cycles/kernel/filter/filter_reconstruction.h @@ -0,0 +1,117 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device_inline void kernel_filter_construct_gramian(int x, int y, + int storage_stride, + int dx, int dy, + int buffer_stride, + int pass_stride, + const ccl_global float *ccl_restrict buffer, + const ccl_global float *ccl_restrict transform, + ccl_global int *rank, + float weight, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int localIdx) +{ + if(weight < 1e-3f) { + return; + } + + int p_offset = y * buffer_stride + x; + int q_offset = (y+dy) * buffer_stride + (x+dx); + +#ifdef __KERNEL_GPU__ + const int stride = storage_stride; +#else + const int stride = 1; + (void) storage_stride; +#endif + +#ifdef __KERNEL_CUDA__ + ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE]; + ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1); +#else + float design_row[DENOISE_FEATURES+1]; +#endif + + float3 q_color = filter_get_color(buffer + q_offset, pass_stride); + + /* If the pixel was flagged as an outlier during prefiltering, skip it. */ + if(ccl_get_feature(buffer + q_offset, 0) < 0.0f) { + return; + } + + filter_get_design_row_transform(make_int2(x, y), buffer + p_offset, + make_int2(x+dx, y+dy), buffer + q_offset, + pass_stride, *rank, design_row, transform, stride); + + math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride); + math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride); +} + +ccl_device_inline void kernel_filter_finalize(int x, int y, + ccl_global float *buffer, + ccl_global int *rank, + int storage_stride, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 buffer_params, + int sample) +{ +#ifdef __KERNEL_GPU__ + const int stride = storage_stride; +#else + const int stride = 1; + (void) storage_stride; +#endif + + if(XtWX[0] < 1e-3f) { + /* There is not enough information to determine a denoised result. + * As a fallback, keep the original value of the pixel. */ + return; + } + + /* The weighted average of pixel colors (essentially, the NLM-filtered image). + * In case the solution of the linear model fails due to numerical issues, + * fall back to this value. */ + float3 mean_color = XtWY[0]/XtWX[0]; + + math_trimatrix_vec3_solve(XtWX, XtWY, (*rank)+1, stride); + + float3 final_color = XtWY[0]; + if(!isfinite3_safe(final_color)) { + final_color = mean_color; + } + + /* Clamp pixel value to positive values. */ + final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f)); + + ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z; + final_color *= sample; + if(buffer_params.w) { + final_color.x += combined_buffer[buffer_params.w+0]; + final_color.y += combined_buffer[buffer_params.w+1]; + final_color.z += combined_buffer[buffer_params.w+2]; + } + combined_buffer[0] = final_color.x; + combined_buffer[1] = final_color.y; + combined_buffer[2] = final_color.z; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h new file mode 100644 index 00000000000..a5f87c05ec0 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_transform.h @@ -0,0 +1,108 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer, + int x, int y, int4 rect, + int pass_stride, + float *transform, int *rank, + int radius, float pca_threshold) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + + float features[DENOISE_FEATURES]; + + /* Temporary storage, used in different steps of the algorithm. */ + float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES]; + float tempvector[2*DENOISE_FEATURES]; + const float *ccl_restrict pixel_buffer; + int2 pixel; + + /* === Calculate denoising window. === */ + int2 low = make_int2(max(rect.x, x - radius), + max(rect.y, y - radius)); + int2 high = make_int2(min(rect.z, x + radius + 1), + min(rect.w, y + radius + 1)); + int num_pixels = (high.y - low.y) * (high.x - low.x); + + /* === Shift feature passes to have mean 0. === */ + float feature_means[DENOISE_FEATURES]; + math_vector_zero(feature_means, DENOISE_FEATURES); + FOR_PIXEL_WINDOW { + filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride); + math_vector_add(feature_means, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW + + math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES); + + /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */ + float *feature_scale = tempvector; + math_vector_zero(feature_scale, DENOISE_FEATURES); + + FOR_PIXEL_WINDOW { + filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride); + math_vector_max(feature_scale, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW + + filter_calculate_scale(feature_scale); + + /* === Generate the feature transformation. === + * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space + * which generally has fewer dimensions. This mainly helps to prevent overfitting. */ + float* feature_matrix = tempmatrix; + math_matrix_zero(feature_matrix, DENOISE_FEATURES); + FOR_PIXEL_WINDOW { + filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride); + math_vector_mul(features, feature_scale, DENOISE_FEATURES); + math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f); + } END_FOR_PIXEL_WINDOW + + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1); + *rank = 0; + /* Prevent overfitting when a small window is used. */ + int max_rank = min(DENOISE_FEATURES, num_pixels/3); + if(pca_threshold < 0.0f) { + float threshold_energy = 0.0f; + for(int i = 0; i < DENOISE_FEATURES; i++) { + threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + } + threshold_energy *= 1.0f - (-pca_threshold); + + float reduced_energy = 0.0f; + for(int i = 0; i < max_rank; i++, (*rank)++) { + if(i >= 2 && reduced_energy >= threshold_energy) + break; + float s = feature_matrix[i*DENOISE_FEATURES+i]; + reduced_energy += s; + } + } + else { + for(int i = 0; i < max_rank; i++, (*rank)++) { + float s = feature_matrix[i*DENOISE_FEATURES+i]; + if(i >= 2 && sqrtf(s) < pca_threshold) + break; + } + } + + /* Bake the feature scaling into the transformation matrix. */ + for(int i = 0; i < (*rank); i++) { + math_vector_mul(transform + i*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES); + } + math_matrix_transpose(transform, DENOISE_FEATURES, 1); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h new file mode 100644 index 00000000000..83a1222bbdb --- /dev/null +++ b/intern/cycles/kernel/filter/filter_transform_gpu.h @@ -0,0 +1,119 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer, + int x, int y, int4 rect, + int pass_stride, + ccl_global float *transform, + ccl_global int *rank, + int radius, float pca_threshold, + int transform_stride, int localIdx) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + +#ifdef __KERNEL_CUDA__ + ccl_local float shared_features[DENOISE_FEATURES*CCL_MAX_LOCAL_SIZE]; + ccl_local_param float *features = shared_features + localIdx*DENOISE_FEATURES; +#else + float features[DENOISE_FEATURES]; +#endif + + /* === Calculate denoising window. === */ + int2 low = make_int2(max(rect.x, x - radius), + max(rect.y, y - radius)); + int2 high = make_int2(min(rect.z, x + radius + 1), + min(rect.w, y + radius + 1)); + int num_pixels = (high.y - low.y) * (high.x - low.x); + const ccl_global float *ccl_restrict pixel_buffer; + int2 pixel; + + + + + /* === Shift feature passes to have mean 0. === */ + float feature_means[DENOISE_FEATURES]; + math_vector_zero(feature_means, DENOISE_FEATURES); + FOR_PIXEL_WINDOW { + filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride); + math_vector_add(feature_means, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW + + math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES); + + /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */ + float feature_scale[DENOISE_FEATURES]; + math_vector_zero(feature_scale, DENOISE_FEATURES); + + FOR_PIXEL_WINDOW { + filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride); + math_vector_max(feature_scale, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW + + filter_calculate_scale(feature_scale); + + + + /* === Generate the feature transformation. === + * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space + * which generally has fewer dimensions. This mainly helps to prevent overfitting. */ + float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; + math_matrix_zero(feature_matrix, DENOISE_FEATURES); + FOR_PIXEL_WINDOW { + filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride); + math_vector_mul(features, feature_scale, DENOISE_FEATURES); + math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f); + } END_FOR_PIXEL_WINDOW + + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride); + *rank = 0; + /* Prevent overfitting when a small window is used. */ + int max_rank = min(DENOISE_FEATURES, num_pixels/3); + if(pca_threshold < 0.0f) { + float threshold_energy = 0.0f; + for(int i = 0; i < DENOISE_FEATURES; i++) { + threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + } + threshold_energy *= 1.0f - (-pca_threshold); + + float reduced_energy = 0.0f; + for(int i = 0; i < max_rank; i++, (*rank)++) { + if(i >= 2 && reduced_energy >= threshold_energy) + break; + float s = feature_matrix[i*DENOISE_FEATURES+i]; + reduced_energy += s; + } + } + else { + for(int i = 0; i < max_rank; i++, (*rank)++) { + float s = feature_matrix[i*DENOISE_FEATURES+i]; + if(i >= 2 && sqrtf(s) < pca_threshold) + break; + } + } + + math_matrix_transpose(transform, DENOISE_FEATURES, transform_stride); + + /* Bake the feature scaling into the transformation matrix. */ + for(int i = 0; i < DENOISE_FEATURES; i++) { + for(int j = 0; j < (*rank); j++) { + transform[(i*DENOISE_FEATURES + j)*transform_stride] *= feature_scale[i]; + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h new file mode 100644 index 00000000000..9e65f61664b --- /dev/null +++ b/intern/cycles/kernel/filter/filter_transform_sse.h @@ -0,0 +1,105 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer, + int x, int y, int4 rect, + int pass_stride, + float *transform, int *rank, + int radius, float pca_threshold) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + + float4 features[DENOISE_FEATURES]; + const float *ccl_restrict pixel_buffer; + int2 pixel; + + int2 low = make_int2(max(rect.x, x - radius), + max(rect.y, y - radius)); + int2 high = make_int2(min(rect.z, x + radius + 1), + min(rect.w, y + radius + 1)); + int num_pixels = (high.y - low.y) * (high.x - low.x); + + float4 feature_means[DENOISE_FEATURES]; + math_vector_zero_sse(feature_means, DENOISE_FEATURES); + FOR_PIXEL_WINDOW_SSE { + filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride); + math_vector_add_sse(feature_means, DENOISE_FEATURES, features); + } END_FOR_PIXEL_WINDOW_SSE + + float4 pixel_scale = make_float4(1.0f / num_pixels); + for(int i = 0; i < DENOISE_FEATURES; i++) { + feature_means[i] = reduce_add(feature_means[i]) * pixel_scale; + } + + float4 feature_scale[DENOISE_FEATURES]; + math_vector_zero_sse(feature_scale, DENOISE_FEATURES); + FOR_PIXEL_WINDOW_SSE { + filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); + math_vector_max_sse(feature_scale, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW_SSE + + filter_calculate_scale_sse(feature_scale); + + float4 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES]; + math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES); + FOR_PIXEL_WINDOW_SSE { + filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); + math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale); + math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, make_float4(1.0f)); + } END_FOR_PIXEL_WINDOW_SSE + + float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; + math_matrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse); + + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1); + + *rank = 0; + /* Prevent overfitting when a small window is used. */ + int max_rank = min(DENOISE_FEATURES, num_pixels/3); + if(pca_threshold < 0.0f) { + float threshold_energy = 0.0f; + for(int i = 0; i < DENOISE_FEATURES; i++) { + threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + } + threshold_energy *= 1.0f - (-pca_threshold); + + float reduced_energy = 0.0f; + for(int i = 0; i < max_rank; i++, (*rank)++) { + if(i >= 2 && reduced_energy >= threshold_energy) + break; + float s = feature_matrix[i*DENOISE_FEATURES+i]; + reduced_energy += s; + } + } + else { + for(int i = 0; i < max_rank; i++, (*rank)++) { + float s = feature_matrix[i*DENOISE_FEATURES+i]; + if(i >= 2 && sqrtf(s) < pca_threshold) + break; + } + } + + math_matrix_transpose(transform, DENOISE_FEATURES, 1); + + /* Bake the feature scaling into the transformation matrix. */ + for(int i = 0; i < DENOISE_FEATURES; i++) { + math_vector_scale(transform + i*DENOISE_FEATURES, feature_scale[i][0], *rank); + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h index 6838e26c242..f34b77ebc07 100644 --- a/intern/cycles/kernel/geom/geom.h +++ b/intern/cycles/kernel/geom/geom.h @@ -14,19 +14,20 @@ * limitations under the License. */ -#include "geom_attribute.h" -#include "geom_object.h" +#include "kernel/geom/geom_attribute.h" +#include "kernel/geom/geom_object.h" #ifdef __PATCH_EVAL__ -# include "geom_patch.h" +# include "kernel/geom/geom_patch.h" #endif -#include "geom_triangle.h" -#include "geom_subd_triangle.h" -#include "geom_triangle_intersect.h" -#include "geom_motion_triangle.h" -#include "geom_motion_triangle_intersect.h" -#include "geom_motion_triangle_shader.h" -#include "geom_motion_curve.h" -#include "geom_curve.h" -#include "geom_volume.h" -#include "geom_primitive.h" +#include "kernel/geom/geom_triangle.h" +#include "kernel/geom/geom_subd_triangle.h" +#include "kernel/geom/geom_triangle_intersect.h" +#include "kernel/geom/geom_motion_triangle.h" +#include "kernel/geom/geom_motion_triangle_intersect.h" +#include "kernel/geom/geom_motion_triangle_shader.h" +#include "kernel/geom/geom_motion_curve.h" +#include "kernel/geom/geom_curve.h" +#include "kernel/geom/geom_curve_intersect.h" +#include "kernel/geom/geom_volume.h" +#include "kernel/geom/geom_primitive.h" diff --git a/intern/cycles/kernel/geom/geom_attribute.h b/intern/cycles/kernel/geom/geom_attribute.h index 08ccee56335..42c053704d5 100644 --- a/intern/cycles/kernel/geom/geom_attribute.h +++ b/intern/cycles/kernel/geom/geom_attribute.h @@ -30,7 +30,7 @@ ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData * ccl_device_inline uint attribute_primitive_type(KernelGlobals *kg, const ShaderData *sd) { #ifdef __HAIR__ - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + if(sd->type & PRIMITIVE_ALL_CURVE) { return ATTR_PRIM_CURVE; } else @@ -51,14 +51,19 @@ ccl_device_inline AttributeDescriptor attribute_not_found() /* Find attribute based on ID */ +ccl_device_inline uint object_attribute_map_offset(KernelGlobals *kg, int object) +{ + return kernel_tex_fetch(__objects, object).attribute_map_offset; +} + ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id) { - if(ccl_fetch(sd, object) == PRIM_NONE) { + if(sd->object == OBJECT_NONE) { return attribute_not_found(); } /* for SVM, find attribute by unique id */ - uint attr_offset = ccl_fetch(sd, object)*kernel_data.bvh.attributes_map_stride; + uint attr_offset = object_attribute_map_offset(kg, sd->object); attr_offset += attribute_primitive_type(kg, sd); uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset); @@ -73,7 +78,7 @@ ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg, const Sh AttributeDescriptor desc; desc.element = (AttributeElement)attr_map.y; - if(ccl_fetch(sd, prim) == PRIM_NONE && + if(sd->prim == PRIM_NONE && desc.element != ATTR_ELEMENT_MESH && desc.element != ATTR_ELEMENT_VOXEL && desc.element != ATTR_ELEMENT_OBJECT) @@ -98,7 +103,6 @@ ccl_device Transform primitive_attribute_matrix(KernelGlobals *kg, const ShaderD tfm.x = kernel_tex_fetch(__attributes_float3, desc.offset + 0); tfm.y = kernel_tex_fetch(__attributes_float3, desc.offset + 1); tfm.z = kernel_tex_fetch(__attributes_float3, desc.offset + 2); - tfm.w = kernel_tex_fetch(__attributes_float3, desc.offset + 3); return tfm; } diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h index 712b67a1b55..e35267f02bf 100644 --- a/intern/cycles/kernel/geom/geom_curve.h +++ b/intern/cycles/kernel/geom/geom_curve.h @@ -16,9 +16,10 @@ CCL_NAMESPACE_BEGIN /* Curve Primitive * - * Curve primitive for rendering hair and fur. These can be render as flat ribbons - * or curves with actual thickness. The curve can also be rendered as line segments - * rather than curves for better performance */ + * Curve primitive for rendering hair and fur. These can be render as flat + * ribbons or curves with actual thickness. The curve can also be rendered as + * line segments rather than curves for better performance. + */ #ifdef __HAIR__ @@ -32,22 +33,22 @@ ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd, if(dy) *dy = 0.0f; #endif - return kernel_tex_fetch(__attributes_float, desc.offset + ccl_fetch(sd, prim)); + return kernel_tex_fetch(__attributes_float, desc.offset + sd->prim); } else if(desc.element == ATTR_ELEMENT_CURVE_KEY || desc.element == ATTR_ELEMENT_CURVE_KEY_MOTION) { - float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); - int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); + float4 curvedata = kernel_tex_fetch(__curves, sd->prim); + int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); int k1 = k0 + 1; float f0 = kernel_tex_fetch(__attributes_float, desc.offset + k0); float f1 = kernel_tex_fetch(__attributes_float, desc.offset + k1); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0); + if(dx) *dx = sd->du.dx*(f1 - f0); if(dy) *dy = 0.0f; #endif - return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1; + return (1.0f - sd->u)*f0 + sd->u*f1; } else { #ifdef __RAY_DIFFERENTIALS__ @@ -71,22 +72,22 @@ ccl_device float3 curve_attribute_float3(KernelGlobals *kg, const ShaderData *sd if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f); #endif - return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + ccl_fetch(sd, prim))); + return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + sd->prim)); } else if(desc.element == ATTR_ELEMENT_CURVE_KEY || desc.element == ATTR_ELEMENT_CURVE_KEY_MOTION) { - float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); - int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); + float4 curvedata = kernel_tex_fetch(__curves, sd->prim); + int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); int k1 = k0 + 1; float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k0)); float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k1)); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0); + if(dx) *dx = sd->du.dx*(f1 - f0); if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f); #endif - return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1; + return (1.0f - sd->u)*f0 + sd->u*f1; } else { #ifdef __RAY_DIFFERENTIALS__ @@ -104,22 +105,22 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd) { float r = 0.0f; - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { - float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); - int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); + if(sd->type & PRIMITIVE_ALL_CURVE) { + float4 curvedata = kernel_tex_fetch(__curves, sd->prim); + int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); int k1 = k0 + 1; float4 P_curve[2]; - if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) { + if(sd->type & PRIMITIVE_CURVE) { P_curve[0]= kernel_tex_fetch(__curve_keys, k0); P_curve[1]= kernel_tex_fetch(__curve_keys, k1); } else { - motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve); + motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve); } - r = (P_curve[1].w - P_curve[0].w) * ccl_fetch(sd, u) + P_curve[0].w; + r = (P_curve[1].w - P_curve[0].w) * sd->u + P_curve[0].w; } return r*2.0f; @@ -130,8 +131,8 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd) ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd) { - float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); - int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); + float4 curvedata = kernel_tex_fetch(__curves, sd->prim); + int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); int k1 = k0 + 1; float4 P_curve[2]; @@ -139,23 +140,23 @@ ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd P_curve[0]= kernel_tex_fetch(__curve_keys, k0); P_curve[1]= kernel_tex_fetch(__curve_keys, k1); - return float4_to_float3(P_curve[1]) * ccl_fetch(sd, u) + float4_to_float3(P_curve[0]) * (1.0f - ccl_fetch(sd, u)); + return float4_to_float3(P_curve[1]) * sd->u + float4_to_float3(P_curve[0]) * (1.0f - sd->u); } /* Curve tangent normal */ ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd) -{ +{ float3 tgN = make_float3(0.0f,0.0f,0.0f); - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + if(sd->type & PRIMITIVE_ALL_CURVE) { - tgN = -(-ccl_fetch(sd, I) - ccl_fetch(sd, dPdu) * (dot(ccl_fetch(sd, dPdu),-ccl_fetch(sd, I)) / len_squared(ccl_fetch(sd, dPdu)))); + tgN = -(-sd->I - sd->dPdu * (dot(sd->dPdu,-sd->I) / len_squared(sd->dPdu))); tgN = normalize(tgN); /* need to find suitable scaled gd for corrected normal */ #if 0 - tgN = normalize(tgN - gd * ccl_fetch(sd, dPdu)); + tgN = normalize(tgN - gd * sd->dPdu); #endif } @@ -213,893 +214,6 @@ ccl_device_inline void curvebounds(float *lower, float *upper, float *extremta, } } -#ifdef __KERNEL_SSE2__ -ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a) -{ - return madd(shuffle<0>(a), t[0], madd(shuffle<1>(a), t[1], shuffle<2>(a) * t[2])); -} -#endif - -#ifdef __KERNEL_SSE2__ -/* Pass P and dir by reference to aligned vector */ -ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect, - const float3 &P, const float3 &dir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax) -#else -ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect, - float3 P, float3 dir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax) -#endif -{ - const bool is_curve_primitive = (type & PRIMITIVE_CURVE); - - if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) { - const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr); - if(time < prim_time.x || time > prim_time.y) { - return false; - } - } - - int segment = PRIMITIVE_UNPACK_SEGMENT(type); - float epsilon = 0.0f; - float r_st, r_en; - - int depth = kernel_data.curve.subdivisions; - int flags = kernel_data.curve.curveflags; - int prim = kernel_tex_fetch(__prim_index, curveAddr); - -#ifdef __KERNEL_SSE2__ - ssef vdir = load4f(dir); - ssef vcurve_coef[4]; - const float3 *curve_coef = (float3 *)vcurve_coef; - - { - ssef dtmp = vdir * vdir; - ssef d_ss = mm_sqrt(dtmp + shuffle<2>(dtmp)); - ssef rd_ss = load1f_first(1.0f) / d_ss; - - ssei v00vec = load4i((ssei *)&kg->__curves.data[prim]); - int2 &v00 = (int2 &)v00vec; - - int k0 = v00.x + segment; - int k1 = k0 + 1; - int ka = max(k0 - 1, v00.x); - int kb = min(k1 + 1, v00.x + v00.y - 1); - -#ifdef __KERNEL_AVX2__ - avxf P_curve_0_1, P_curve_2_3; - if(is_curve_primitive) { - P_curve_0_1 = _mm256_loadu2_m128(&kg->__curve_keys.data[k0].x, &kg->__curve_keys.data[ka].x); - P_curve_2_3 = _mm256_loadu2_m128(&kg->__curve_keys.data[kb].x, &kg->__curve_keys.data[k1].x); - } - else { - int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object; - motion_cardinal_curve_keys_avx(kg, fobject, prim, time, ka, k0, k1, kb, &P_curve_0_1,&P_curve_2_3); - } -#else /* __KERNEL_AVX2__ */ - ssef P_curve[4]; - - if(is_curve_primitive) { - P_curve[0] = load4f(&kg->__curve_keys.data[ka].x); - P_curve[1] = load4f(&kg->__curve_keys.data[k0].x); - P_curve[2] = load4f(&kg->__curve_keys.data[k1].x); - P_curve[3] = load4f(&kg->__curve_keys.data[kb].x); - } - else { - int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object; - motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4*)&P_curve); - } -#endif /* __KERNEL_AVX2__ */ - - ssef rd_sgn = set_sign_bit<0, 1, 1, 1>(shuffle<0>(rd_ss)); - ssef mul_zxxy = shuffle<2, 0, 0, 1>(vdir) * rd_sgn; - ssef mul_yz = shuffle<1, 2, 1, 2>(vdir) * mul_zxxy; - ssef mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz); - ssef vdir0 = vdir & cast(ssei(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0)); - - ssef htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0); - ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0); - ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0); - -#ifdef __KERNEL_AVX2__ - const avxf vPP = _mm256_broadcast_ps(&P.m128); - const avxf htfm00 = avxf(htfm0.m128, htfm0.m128); - const avxf htfm11 = avxf(htfm1.m128, htfm1.m128); - const avxf htfm22 = avxf(htfm2.m128, htfm2.m128); - - const avxf p01 = madd(shuffle<0>(P_curve_0_1 - vPP), - htfm00, - madd(shuffle<1>(P_curve_0_1 - vPP), - htfm11, - shuffle<2>(P_curve_0_1 - vPP) * htfm22)); - const avxf p23 = madd(shuffle<0>(P_curve_2_3 - vPP), - htfm00, - madd(shuffle<1>(P_curve_2_3 - vPP), - htfm11, - shuffle<2>(P_curve_2_3 - vPP)*htfm22)); - - const ssef p0 = _mm256_castps256_ps128(p01); - const ssef p1 = _mm256_extractf128_ps(p01, 1); - const ssef p2 = _mm256_castps256_ps128(p23); - const ssef p3 = _mm256_extractf128_ps(p23, 1); - - const ssef P_curve_1 = _mm256_extractf128_ps(P_curve_0_1, 1); - r_st = ((float4 &)P_curve_1).w; - const ssef P_curve_2 = _mm256_castps256_ps128(P_curve_2_3); - r_en = ((float4 &)P_curve_2).w; -#else /* __KERNEL_AVX2__ */ - ssef htfm[] = { htfm0, htfm1, htfm2 }; - ssef vP = load4f(P); - ssef p0 = transform_point_T3(htfm, P_curve[0] - vP); - ssef p1 = transform_point_T3(htfm, P_curve[1] - vP); - ssef p2 = transform_point_T3(htfm, P_curve[2] - vP); - ssef p3 = transform_point_T3(htfm, P_curve[3] - vP); - - r_st = ((float4 &)P_curve[1]).w; - r_en = ((float4 &)P_curve[2]).w; -#endif /* __KERNEL_AVX2__ */ - - float fc = 0.71f; - ssef vfc = ssef(fc); - ssef vfcxp3 = vfc * p3; - - vcurve_coef[0] = p1; - vcurve_coef[1] = vfc * (p2 - p0); - vcurve_coef[2] = madd(ssef(fc * 2.0f), p0, madd(ssef(fc - 3.0f), p1, msub(ssef(3.0f - 2.0f * fc), p2, vfcxp3))); - vcurve_coef[3] = msub(ssef(fc - 2.0f), p2 - p1, msub(vfc, p0, vfcxp3)); - - } -#else - float3 curve_coef[4]; - - /* curve Intersection check */ - /* obtain curve parameters */ - { - /* ray transform created - this should be created at beginning of intersection loop */ - Transform htfm; - float d = sqrtf(dir.x * dir.x + dir.z * dir.z); - htfm = make_transform( - dir.z / d, 0, -dir.x /d, 0, - -dir.x * dir.y /d, d, -dir.y * dir.z /d, 0, - dir.x, dir.y, dir.z, 0, - 0, 0, 0, 1); - - float4 v00 = kernel_tex_fetch(__curves, prim); - - int k0 = __float_as_int(v00.x) + segment; - int k1 = k0 + 1; - - int ka = max(k0 - 1,__float_as_int(v00.x)); - int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1); - - float4 P_curve[4]; - - if(is_curve_primitive) { - P_curve[0] = kernel_tex_fetch(__curve_keys, ka); - P_curve[1] = kernel_tex_fetch(__curve_keys, k0); - P_curve[2] = kernel_tex_fetch(__curve_keys, k1); - P_curve[3] = kernel_tex_fetch(__curve_keys, kb); - } - else { - int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object; - motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, P_curve); - } - - float3 p0 = transform_point(&htfm, float4_to_float3(P_curve[0]) - P); - float3 p1 = transform_point(&htfm, float4_to_float3(P_curve[1]) - P); - float3 p2 = transform_point(&htfm, float4_to_float3(P_curve[2]) - P); - float3 p3 = transform_point(&htfm, float4_to_float3(P_curve[3]) - P); - - float fc = 0.71f; - curve_coef[0] = p1; - curve_coef[1] = -fc*p0 + fc*p2; - curve_coef[2] = 2.0f * fc * p0 + (fc - 3.0f) * p1 + (3.0f - 2.0f * fc) * p2 - fc * p3; - curve_coef[3] = -fc * p0 + (2.0f - fc) * p1 + (fc - 2.0f) * p2 + fc * p3; - r_st = P_curve[1].w; - r_en = P_curve[2].w; - } -#endif - - float r_curr = max(r_st, r_en); - - if((flags & CURVE_KN_RIBBONS) || !(flags & CURVE_KN_BACKFACING)) - epsilon = 2 * r_curr; - - /* find bounds - this is slow for cubic curves */ - float upper, lower; - - float zextrem[4]; - curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z); - if(lower - r_curr > isect->t || upper + r_curr < epsilon) - return false; - - /* minimum width extension */ - float mw_extension = min(difl * fabsf(upper), extmax); - float r_ext = mw_extension + r_curr; - - float xextrem[4]; - curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x); - if(lower > r_ext || upper < -r_ext) - return false; - - float yextrem[4]; - curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y); - if(lower > r_ext || upper < -r_ext) - return false; - - /* setup recurrent loop */ - int level = 1 << depth; - int tree = 0; - float resol = 1.0f / (float)level; - bool hit = false; - - /* begin loop */ - while(!(tree >> (depth))) { - const float i_st = tree * resol; - const float i_en = i_st + (level * resol); - -#ifdef __KERNEL_SSE2__ - ssef vi_st = ssef(i_st), vi_en = ssef(i_en); - ssef vp_st = madd(madd(madd(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]); - ssef vp_en = madd(madd(madd(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]); - - ssef vbmin = min(vp_st, vp_en); - ssef vbmax = max(vp_st, vp_en); - - float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax; - float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z; - float &bmaxx = bmax.x, &bmaxy = bmax.y, &bmaxz = bmax.z; - float3 &p_st = (float3 &)vp_st, &p_en = (float3 &)vp_en; -#else - float3 p_st = ((curve_coef[3] * i_st + curve_coef[2]) * i_st + curve_coef[1]) * i_st + curve_coef[0]; - float3 p_en = ((curve_coef[3] * i_en + curve_coef[2]) * i_en + curve_coef[1]) * i_en + curve_coef[0]; - - float bminx = min(p_st.x, p_en.x); - float bmaxx = max(p_st.x, p_en.x); - float bminy = min(p_st.y, p_en.y); - float bmaxy = max(p_st.y, p_en.y); - float bminz = min(p_st.z, p_en.z); - float bmaxz = max(p_st.z, p_en.z); -#endif - - if(xextrem[0] >= i_st && xextrem[0] <= i_en) { - bminx = min(bminx,xextrem[1]); - bmaxx = max(bmaxx,xextrem[1]); - } - if(xextrem[2] >= i_st && xextrem[2] <= i_en) { - bminx = min(bminx,xextrem[3]); - bmaxx = max(bmaxx,xextrem[3]); - } - if(yextrem[0] >= i_st && yextrem[0] <= i_en) { - bminy = min(bminy,yextrem[1]); - bmaxy = max(bmaxy,yextrem[1]); - } - if(yextrem[2] >= i_st && yextrem[2] <= i_en) { - bminy = min(bminy,yextrem[3]); - bmaxy = max(bmaxy,yextrem[3]); - } - if(zextrem[0] >= i_st && zextrem[0] <= i_en) { - bminz = min(bminz,zextrem[1]); - bmaxz = max(bmaxz,zextrem[1]); - } - if(zextrem[2] >= i_st && zextrem[2] <= i_en) { - bminz = min(bminz,zextrem[3]); - bmaxz = max(bmaxz,zextrem[3]); - } - - float r1 = r_st + (r_en - r_st) * i_st; - float r2 = r_st + (r_en - r_st) * i_en; - r_curr = max(r1, r2); - - mw_extension = min(difl * fabsf(bmaxz), extmax); - float r_ext = mw_extension + r_curr; - float coverage = 1.0f; - - if(bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) { - /* the bounding box does not overlap the square centered at O */ - tree += level; - level = tree & -tree; - } - else if(level == 1) { - - /* the maximum recursion depth is reached. - * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0. - * dP* is reversed if necessary.*/ - float t = isect->t; - float u = 0.0f; - float gd = 0.0f; - - if(flags & CURVE_KN_RIBBONS) { - float3 tg = (p_en - p_st); -#ifdef __KERNEL_SSE__ - const float3 tg_sq = tg * tg; - float w = tg_sq.x + tg_sq.y; -#else - float w = tg.x * tg.x + tg.y * tg.y; -#endif - if(w == 0) { - tree++; - level = tree & -tree; - continue; - } -#ifdef __KERNEL_SSE__ - const float3 p_sttg = p_st * tg; - w = -(p_sttg.x + p_sttg.y) / w; -#else - w = -(p_st.x * tg.x + p_st.y * tg.y) / w; -#endif - w = saturate(w); - - /* compute u on the curve segment */ - u = i_st * (1 - w) + i_en * w; - r_curr = r_st + (r_en - r_st) * u; - /* compare x-y distances */ - float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + curve_coef[0]; - - float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1]; - if(dot(tg, dp_st)< 0) - dp_st *= -1; - if(dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) { - tree++; - level = tree & -tree; - continue; - } - float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1]; - if(dot(tg, dp_en) < 0) - dp_en *= -1; - if(dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) { - tree++; - level = tree & -tree; - continue; - } - - /* compute coverage */ - float r_ext = r_curr; - coverage = 1.0f; - if(difl != 0.0f) { - mw_extension = min(difl * fabsf(bmaxz), extmax); - r_ext = mw_extension + r_curr; -#ifdef __KERNEL_SSE__ - const float3 p_curr_sq = p_curr * p_curr; - const float3 dxxx = _mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128)); - float d = dxxx.x; -#else - float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y); -#endif - float d0 = d - r_curr; - float d1 = d + r_curr; - float inv_mw_extension = 1.0f/mw_extension; - if(d0 >= 0) - coverage = (min(d1 * inv_mw_extension, 1.0f) - min(d0 * inv_mw_extension, 1.0f)) * 0.5f; - else // inside - coverage = (min(d1 * inv_mw_extension, 1.0f) + min(-d0 * inv_mw_extension, 1.0f)) * 0.5f; - } - - if(p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) { - tree++; - level = tree & -tree; - continue; - } - - t = p_curr.z; - - /* stochastic fade from minimum width */ - if(difl != 0.0f && lcg_state) { - if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage)) - return hit; - } - } - else { - float l = len(p_en - p_st); - /* minimum width extension */ - float or1 = r1; - float or2 = r2; - - if(difl != 0.0f) { - mw_extension = min(len(p_st - P) * difl, extmax); - or1 = r1 < mw_extension ? mw_extension : r1; - mw_extension = min(len(p_en - P) * difl, extmax); - or2 = r2 < mw_extension ? mw_extension : r2; - } - /* --- */ - float invl = 1.0f/l; - float3 tg = (p_en - p_st) * invl; - gd = (or2 - or1) * invl; - float difz = -dot(p_st,tg); - float cyla = 1.0f - (tg.z * tg.z * (1 + gd*gd)); - float invcyla = 1.0f/cyla; - float halfb = (-p_st.z - tg.z*(difz + gd*(difz*gd + or1))); - float tcentre = -halfb*invcyla; - float zcentre = difz + (tg.z * tcentre); - float3 tdif = - p_st; - tdif.z += tcentre; - float tdifz = dot(tdif,tg); - float tb = 2*(tdif.z - tg.z*(tdifz + gd*(tdifz*gd + or1))); - float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - or1*or1 - 2*or1*tdifz*gd; - float td = tb*tb - 4*cyla*tc; - if(td < 0.0f) { - tree++; - level = tree & -tree; - continue; - } - - float rootd = sqrtf(td); - float correction = (-tb - rootd) * 0.5f * invcyla; - t = tcentre + correction; - - float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1]; - if(dot(tg, dp_st)< 0) - dp_st *= -1; - float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1]; - if(dot(tg, dp_en) < 0) - dp_en *= -1; - - if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) { - correction = (-tb + rootd) * 0.5f * invcyla; - t = tcentre + correction; - } - - if(dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) { - tree++; - level = tree & -tree; - continue; - } - - float w = (zcentre + (tg.z * correction)) * invl; - w = saturate(w); - /* compute u on the curve segment */ - u = i_st * (1 - w) + i_en * w; - - /* stochastic fade from minimum width */ - if(difl != 0.0f && lcg_state) { - r_curr = r1 + (r2 - r1) * w; - r_ext = or1 + (or2 - or1) * w; - coverage = r_curr/r_ext; - - if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage)) - return hit; - } - } - /* we found a new intersection */ - -#ifdef __VISIBILITY_FLAG__ - /* visibility flag test. we do it here under the assumption - * that most triangles are culled by node flags */ - if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility) -#endif - { - /* record intersection */ - isect->t = t; - isect->u = u; - isect->v = gd; - isect->prim = curveAddr; - isect->object = object; - isect->type = type; - hit = true; - } - - tree++; - level = tree & -tree; - } - else { - /* split the curve into two curves and process */ - level = level >> 1; - } - } - - return hit; -} - -ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect, - float3 P, float3 direction, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax) -{ - /* define few macros to minimize code duplication for SSE */ -#ifndef __KERNEL_SSE2__ -# define len3_squared(x) len_squared(x) -# define len3(x) len(x) -# define dot3(x, y) dot(x, y) -#endif - - const bool is_curve_primitive = (type & PRIMITIVE_CURVE); - - if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) { - const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr); - if(time < prim_time.x || time > prim_time.y) { - return false; - } - } - - int segment = PRIMITIVE_UNPACK_SEGMENT(type); - /* curve Intersection check */ - int flags = kernel_data.curve.curveflags; - - int prim = kernel_tex_fetch(__prim_index, curveAddr); - float4 v00 = kernel_tex_fetch(__curves, prim); - - int cnum = __float_as_int(v00.x); - int k0 = cnum + segment; - int k1 = k0 + 1; - -#ifndef __KERNEL_SSE2__ - float4 P_curve[2]; - - if(is_curve_primitive) { - P_curve[0] = kernel_tex_fetch(__curve_keys, k0); - P_curve[1] = kernel_tex_fetch(__curve_keys, k1); - } - else { - int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object; - motion_curve_keys(kg, fobject, prim, time, k0, k1, P_curve); - } - - float or1 = P_curve[0].w; - float or2 = P_curve[1].w; - float3 p1 = float4_to_float3(P_curve[0]); - float3 p2 = float4_to_float3(P_curve[1]); - - /* minimum width extension */ - float r1 = or1; - float r2 = or2; - float3 dif = P - p1; - float3 dif_second = P - p2; - if(difl != 0.0f) { - float pixelsize = min(len3(dif) * difl, extmax); - r1 = or1 < pixelsize ? pixelsize : or1; - pixelsize = min(len3(dif_second) * difl, extmax); - r2 = or2 < pixelsize ? pixelsize : or2; - } - /* --- */ - - float3 p21_diff = p2 - p1; - float3 sphere_dif1 = (dif + dif_second) * 0.5f; - float3 dir = direction; - float sphere_b_tmp = dot3(dir, sphere_dif1); - float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir; -#else - ssef P_curve[2]; - - if(is_curve_primitive) { - P_curve[0] = load4f(&kg->__curve_keys.data[k0].x); - P_curve[1] = load4f(&kg->__curve_keys.data[k1].x); - } - else { - int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object; - motion_curve_keys(kg, fobject, prim, time, k0, k1, (float4*)&P_curve); - } - - const ssef or12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]); - - ssef r12 = or12; - const ssef vP = load4f(P); - const ssef dif = vP - P_curve[0]; - const ssef dif_second = vP - P_curve[1]; - if(difl != 0.0f) { - const ssef len1_sq = len3_squared_splat(dif); - const ssef len2_sq = len3_squared_splat(dif_second); - const ssef len12 = mm_sqrt(shuffle<0, 0, 0, 0>(len1_sq, len2_sq)); - const ssef pixelsize12 = min(len12 * difl, ssef(extmax)); - r12 = max(or12, pixelsize12); - } - float or1 = extract<0>(or12), or2 = extract<0>(shuffle<2>(or12)); - float r1 = extract<0>(r12), r2 = extract<0>(shuffle<2>(r12)); - - const ssef p21_diff = P_curve[1] - P_curve[0]; - const ssef sphere_dif1 = (dif + dif_second) * 0.5f; - const ssef dir = load4f(direction); - const ssef sphere_b_tmp = dot3_splat(dir, sphere_dif1); - const ssef sphere_dif2 = nmadd(sphere_b_tmp, dir, sphere_dif1); -#endif - - float mr = max(r1, r2); - float l = len3(p21_diff); - float invl = 1.0f / l; - float sp_r = mr + 0.5f * l; - - float sphere_b = dot3(dir, sphere_dif2); - float sdisc = sphere_b * sphere_b - len3_squared(sphere_dif2) + sp_r * sp_r; - - if(sdisc < 0.0f) - return false; - - /* obtain parameters and test midpoint distance for suitable modes */ -#ifndef __KERNEL_SSE2__ - float3 tg = p21_diff * invl; -#else - const ssef tg = p21_diff * invl; -#endif - float gd = (r2 - r1) * invl; - - float dirz = dot3(dir, tg); - float difz = dot3(dif, tg); - - float a = 1.0f - (dirz*dirz*(1 + gd*gd)); - - float halfb = dot3(dir, dif) - dirz*(difz + gd*(difz*gd + r1)); - - float tcentre = -halfb/a; - float zcentre = difz + (dirz * tcentre); - - if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE)) - return false; - if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION)) - return false; - - /* test minimum separation */ -#ifndef __KERNEL_SSE2__ - float3 cprod = cross(tg, dir); - float cprod2sq = len3_squared(cross(tg, dif)); -#else - const ssef cprod = cross(tg, dir); - float cprod2sq = len3_squared(cross_zxy(tg, dif)); -#endif - float cprodsq = len3_squared(cprod); - float distscaled = dot3(cprod, dif); - - if(cprodsq == 0) - distscaled = cprod2sq; - else - distscaled = (distscaled*distscaled)/cprodsq; - - if(distscaled > mr*mr) - return false; - - /* calculate true intersection */ -#ifndef __KERNEL_SSE2__ - float3 tdif = dif + tcentre * dir; -#else - const ssef tdif = madd(ssef(tcentre), dir, dif); -#endif - float tdifz = dot3(tdif, tg); - float tdifma = tdifz*gd + r1; - float tb = 2*(dot3(dir, tdif) - dirz*(tdifz + gd*tdifma)); - float tc = dot3(tdif, tdif) - tdifz*tdifz - tdifma*tdifma; - float td = tb*tb - 4*a*tc; - - if(td < 0.0f) - return false; - - float rootd = 0.0f; - float correction = 0.0f; - if(flags & CURVE_KN_ACCURATE) { - rootd = sqrtf(td); - correction = ((-tb - rootd)/(2*a)); - } - - float t = tcentre + correction; - - if(t < isect->t) { - - if(flags & CURVE_KN_INTERSECTCORRECTION) { - rootd = sqrtf(td); - correction = ((-tb - rootd)/(2*a)); - t = tcentre + correction; - } - - float z = zcentre + (dirz * correction); - // bool backface = false; - - if(flags & CURVE_KN_BACKFACING && (t < 0.0f || z < 0 || z > l)) { - // backface = true; - correction = ((-tb + rootd)/(2*a)); - t = tcentre + correction; - z = zcentre + (dirz * correction); - } - - /* stochastic fade from minimum width */ - float adjradius = or1 + z * (or2 - or1) * invl; - adjradius = adjradius / (r1 + z * gd); - if(lcg_state && adjradius != 1.0f) { - if(lcg_step_float(lcg_state) > adjradius) - return false; - } - /* --- */ - - if(t > 0.0f && t < isect->t && z >= 0 && z <= l) { - - if(flags & CURVE_KN_ENCLOSEFILTER) { - float enc_ratio = 1.01f; - if((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) { - float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio)); - float c2 = dot3(dif, dif) - difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio; - if(a2*c2 < 0.0f) - return false; - } - } - -#ifdef __VISIBILITY_FLAG__ - /* visibility flag test. we do it here under the assumption - * that most triangles are culled by node flags */ - if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility) -#endif - { - /* record intersection */ - isect->t = t; - isect->u = z*invl; - isect->v = gd; - isect->prim = curveAddr; - isect->object = object; - isect->type = type; - - return true; - } - } - } - - return false; - -#ifndef __KERNEL_SSE2__ -# undef len3_squared -# undef len3 -# undef dot3 -#endif -} - -ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3) -{ - float fc = 0.71f; - float data[4]; - float t2 = t * t; - data[0] = -3.0f * fc * t2 + 4.0f * fc * t - fc; - data[1] = 3.0f * (2.0f - fc) * t2 + 2.0f * (fc - 3.0f) * t; - data[2] = 3.0f * (fc - 2.0f) * t2 + 2.0f * (3.0f - 2.0f * fc) * t + fc; - data[3] = 3.0f * fc * t2 - 2.0f * fc * t; - return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3; -} - -ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3) -{ - float data[4]; - float fc = 0.71f; - float t2 = t * t; - float t3 = t2 * t; - data[0] = -fc * t3 + 2.0f * fc * t2 - fc * t; - data[1] = (2.0f - fc) * t3 + (fc - 3.0f) * t2 + 1.0f; - data[2] = (fc - 2.0f) * t3 + (3.0f - 2.0f * fc) * t2 + fc * t; - data[3] = fc * t3 - fc * t2; - return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3; -} - -ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray) -{ - int flag = kernel_data.curve.curveflags; - float t = isect->t; - float3 P = ray->P; - float3 D = ray->D; - - if(isect->object != OBJECT_NONE) { -#ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_itfm); -#else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); -#endif - - P = transform_point(&tfm, P); - D = transform_direction(&tfm, D*t); - D = normalize_len(D, &t); - } - - int prim = kernel_tex_fetch(__prim_index, isect->prim); - float4 v00 = kernel_tex_fetch(__curves, prim); - - int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); - int k1 = k0 + 1; - - float3 tg; - - if(flag & CURVE_KN_INTERPOLATE) { - int ka = max(k0 - 1,__float_as_int(v00.x)); - int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1); - - float4 P_curve[4]; - - if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) { - P_curve[0] = kernel_tex_fetch(__curve_keys, ka); - P_curve[1] = kernel_tex_fetch(__curve_keys, k0); - P_curve[2] = kernel_tex_fetch(__curve_keys, k1); - P_curve[3] = kernel_tex_fetch(__curve_keys, kb); - } - else { - motion_cardinal_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), ka, k0, k1, kb, P_curve); - } - - float3 p[4]; - p[0] = float4_to_float3(P_curve[0]); - p[1] = float4_to_float3(P_curve[1]); - p[2] = float4_to_float3(P_curve[2]); - p[3] = float4_to_float3(P_curve[3]); - - P = P + D*t; - -#ifdef __UV__ - ccl_fetch(sd, u) = isect->u; - ccl_fetch(sd, v) = 0.0f; -#endif - - tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3])); - - if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) { - ccl_fetch(sd, Ng) = normalize(-(D - tg * (dot(tg, D)))); - } - else { - /* direction from inside to surface of curve */ - float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]); - ccl_fetch(sd, Ng) = normalize(P - p_curr); - - /* adjustment for changing radius */ - float gd = isect->v; - - if(gd != 0.0f) { - ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg; - ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng)); - } - } - - /* todo: sometimes the normal is still so that this is detected as - * backfacing even if cull backfaces is enabled */ - - ccl_fetch(sd, N) = ccl_fetch(sd, Ng); - } - else { - float4 P_curve[2]; - - if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) { - P_curve[0]= kernel_tex_fetch(__curve_keys, k0); - P_curve[1]= kernel_tex_fetch(__curve_keys, k1); - } - else { - motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve); - } - - float l = 1.0f; - tg = normalize_len(float4_to_float3(P_curve[1] - P_curve[0]), &l); - - P = P + D*t; - - float3 dif = P - float4_to_float3(P_curve[0]); - -#ifdef __UV__ - ccl_fetch(sd, u) = dot(dif,tg)/l; - ccl_fetch(sd, v) = 0.0f; -#endif - - if(flag & CURVE_KN_TRUETANGENTGNORMAL) { - ccl_fetch(sd, Ng) = -(D - tg * dot(tg, D)); - ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng)); - } - else { - float gd = isect->v; - - /* direction from inside to surface of curve */ - ccl_fetch(sd, Ng) = (dif - tg * ccl_fetch(sd, u) * l) / (P_curve[0].w + ccl_fetch(sd, u) * l * gd); - - /* adjustment for changing radius */ - if(gd != 0.0f) { - ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg; - ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng)); - } - } - - ccl_fetch(sd, N) = ccl_fetch(sd, Ng); - } - -#ifdef __DPDU__ - /* dPdu/dPdv */ - ccl_fetch(sd, dPdu) = tg; - ccl_fetch(sd, dPdv) = cross(tg, ccl_fetch(sd, Ng)); -#endif - - if(isect->object != OBJECT_NONE) { -#ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_tfm); -#else - Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); -#endif - - P = transform_point(&tfm, P); - } - - return P; -} - -#endif +#endif /* __HAIR__ */ CCL_NAMESPACE_END - diff --git a/intern/cycles/kernel/geom/geom_curve_intersect.h b/intern/cycles/kernel/geom/geom_curve_intersect.h new file mode 100644 index 00000000000..46c3f408f0b --- /dev/null +++ b/intern/cycles/kernel/geom/geom_curve_intersect.h @@ -0,0 +1,927 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* Curve primitive intersection functions. */ + +#ifdef __HAIR__ + +#ifdef __KERNEL_SSE2__ +ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a) +{ + return madd(shuffle<0>(a), t[0], madd(shuffle<1>(a), t[1], shuffle<2>(a) * t[2])); +} +#endif + +/* On CPU pass P and dir by reference to aligned vector. */ +ccl_device_forceinline bool cardinal_curve_intersect( + KernelGlobals *kg, + Intersection *isect, + const float3 ccl_ref P, + const float3 ccl_ref dir, + uint visibility, + int object, + int curveAddr, + float time, + int type, + uint *lcg_state, + float difl, + float extmax) +{ + const bool is_curve_primitive = (type & PRIMITIVE_CURVE); + + if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) { + const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr); + if(time < prim_time.x || time > prim_time.y) { + return false; + } + } + + int segment = PRIMITIVE_UNPACK_SEGMENT(type); + float epsilon = 0.0f; + float r_st, r_en; + + int depth = kernel_data.curve.subdivisions; + int flags = kernel_data.curve.curveflags; + int prim = kernel_tex_fetch(__prim_index, curveAddr); + +#ifdef __KERNEL_SSE2__ + ssef vdir = load4f(dir); + ssef vcurve_coef[4]; + const float3 *curve_coef = (float3 *)vcurve_coef; + + { + ssef dtmp = vdir * vdir; + ssef d_ss = mm_sqrt(dtmp + shuffle<2>(dtmp)); + ssef rd_ss = load1f_first(1.0f) / d_ss; + + ssei v00vec = load4i((ssei *)&kg->__curves.data[prim]); + int2 &v00 = (int2 &)v00vec; + + int k0 = v00.x + segment; + int k1 = k0 + 1; + int ka = max(k0 - 1, v00.x); + int kb = min(k1 + 1, v00.x + v00.y - 1); + +#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && (!defined(_MSC_VER) || _MSC_VER > 1800) + avxf P_curve_0_1, P_curve_2_3; + if(is_curve_primitive) { + P_curve_0_1 = _mm256_loadu2_m128(&kg->__curve_keys.data[k0].x, &kg->__curve_keys.data[ka].x); + P_curve_2_3 = _mm256_loadu2_m128(&kg->__curve_keys.data[kb].x, &kg->__curve_keys.data[k1].x); + } + else { + int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object; + motion_cardinal_curve_keys_avx(kg, fobject, prim, time, ka, k0, k1, kb, &P_curve_0_1,&P_curve_2_3); + } +#else /* __KERNEL_AVX2__ */ + ssef P_curve[4]; + + if(is_curve_primitive) { + P_curve[0] = load4f(&kg->__curve_keys.data[ka].x); + P_curve[1] = load4f(&kg->__curve_keys.data[k0].x); + P_curve[2] = load4f(&kg->__curve_keys.data[k1].x); + P_curve[3] = load4f(&kg->__curve_keys.data[kb].x); + } + else { + int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object; + motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4*)&P_curve); + } +#endif /* __KERNEL_AVX2__ */ + + ssef rd_sgn = set_sign_bit<0, 1, 1, 1>(shuffle<0>(rd_ss)); + ssef mul_zxxy = shuffle<2, 0, 0, 1>(vdir) * rd_sgn; + ssef mul_yz = shuffle<1, 2, 1, 2>(vdir) * mul_zxxy; + ssef mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz); + ssef vdir0 = vdir & cast(ssei(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0)); + + ssef htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0); + ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0); + ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0); + +#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && (!defined(_MSC_VER) || _MSC_VER > 1800) + const avxf vPP = _mm256_broadcast_ps(&P.m128); + const avxf htfm00 = avxf(htfm0.m128, htfm0.m128); + const avxf htfm11 = avxf(htfm1.m128, htfm1.m128); + const avxf htfm22 = avxf(htfm2.m128, htfm2.m128); + + const avxf p01 = madd(shuffle<0>(P_curve_0_1 - vPP), + htfm00, + madd(shuffle<1>(P_curve_0_1 - vPP), + htfm11, + shuffle<2>(P_curve_0_1 - vPP) * htfm22)); + const avxf p23 = madd(shuffle<0>(P_curve_2_3 - vPP), + htfm00, + madd(shuffle<1>(P_curve_2_3 - vPP), + htfm11, + shuffle<2>(P_curve_2_3 - vPP)*htfm22)); + + const ssef p0 = _mm256_castps256_ps128(p01); + const ssef p1 = _mm256_extractf128_ps(p01, 1); + const ssef p2 = _mm256_castps256_ps128(p23); + const ssef p3 = _mm256_extractf128_ps(p23, 1); + + const ssef P_curve_1 = _mm256_extractf128_ps(P_curve_0_1, 1); + r_st = ((float4 &)P_curve_1).w; + const ssef P_curve_2 = _mm256_castps256_ps128(P_curve_2_3); + r_en = ((float4 &)P_curve_2).w; +#else /* __KERNEL_AVX2__ */ + ssef htfm[] = { htfm0, htfm1, htfm2 }; + ssef vP = load4f(P); + ssef p0 = transform_point_T3(htfm, P_curve[0] - vP); + ssef p1 = transform_point_T3(htfm, P_curve[1] - vP); + ssef p2 = transform_point_T3(htfm, P_curve[2] - vP); + ssef p3 = transform_point_T3(htfm, P_curve[3] - vP); + + r_st = ((float4 &)P_curve[1]).w; + r_en = ((float4 &)P_curve[2]).w; +#endif /* __KERNEL_AVX2__ */ + + float fc = 0.71f; + ssef vfc = ssef(fc); + ssef vfcxp3 = vfc * p3; + + vcurve_coef[0] = p1; + vcurve_coef[1] = vfc * (p2 - p0); + vcurve_coef[2] = madd(ssef(fc * 2.0f), p0, madd(ssef(fc - 3.0f), p1, msub(ssef(3.0f - 2.0f * fc), p2, vfcxp3))); + vcurve_coef[3] = msub(ssef(fc - 2.0f), p2 - p1, msub(vfc, p0, vfcxp3)); + + } +#else + float3 curve_coef[4]; + + /* curve Intersection check */ + /* obtain curve parameters */ + { + /* ray transform created - this should be created at beginning of intersection loop */ + Transform htfm; + float d = sqrtf(dir.x * dir.x + dir.z * dir.z); + htfm = make_transform( + dir.z / d, 0, -dir.x /d, 0, + -dir.x * dir.y /d, d, -dir.y * dir.z /d, 0, + dir.x, dir.y, dir.z, 0); + + float4 v00 = kernel_tex_fetch(__curves, prim); + + int k0 = __float_as_int(v00.x) + segment; + int k1 = k0 + 1; + + int ka = max(k0 - 1,__float_as_int(v00.x)); + int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1); + + float4 P_curve[4]; + + if(is_curve_primitive) { + P_curve[0] = kernel_tex_fetch(__curve_keys, ka); + P_curve[1] = kernel_tex_fetch(__curve_keys, k0); + P_curve[2] = kernel_tex_fetch(__curve_keys, k1); + P_curve[3] = kernel_tex_fetch(__curve_keys, kb); + } + else { + int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object; + motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, P_curve); + } + + float3 p0 = transform_point(&htfm, float4_to_float3(P_curve[0]) - P); + float3 p1 = transform_point(&htfm, float4_to_float3(P_curve[1]) - P); + float3 p2 = transform_point(&htfm, float4_to_float3(P_curve[2]) - P); + float3 p3 = transform_point(&htfm, float4_to_float3(P_curve[3]) - P); + + float fc = 0.71f; + curve_coef[0] = p1; + curve_coef[1] = -fc*p0 + fc*p2; + curve_coef[2] = 2.0f * fc * p0 + (fc - 3.0f) * p1 + (3.0f - 2.0f * fc) * p2 - fc * p3; + curve_coef[3] = -fc * p0 + (2.0f - fc) * p1 + (fc - 2.0f) * p2 + fc * p3; + r_st = P_curve[1].w; + r_en = P_curve[2].w; + } +#endif + + float r_curr = max(r_st, r_en); + + if((flags & CURVE_KN_RIBBONS) || !(flags & CURVE_KN_BACKFACING)) + epsilon = 2 * r_curr; + + /* find bounds - this is slow for cubic curves */ + float upper, lower; + + float zextrem[4]; + curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z); + if(lower - r_curr > isect->t || upper + r_curr < epsilon) + return false; + + /* minimum width extension */ + float mw_extension = min(difl * fabsf(upper), extmax); + float r_ext = mw_extension + r_curr; + + float xextrem[4]; + curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x); + if(lower > r_ext || upper < -r_ext) + return false; + + float yextrem[4]; + curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y); + if(lower > r_ext || upper < -r_ext) + return false; + + /* setup recurrent loop */ + int level = 1 << depth; + int tree = 0; + float resol = 1.0f / (float)level; + bool hit = false; + + /* begin loop */ + while(!(tree >> (depth))) { + const float i_st = tree * resol; + const float i_en = i_st + (level * resol); + +#ifdef __KERNEL_SSE2__ + ssef vi_st = ssef(i_st), vi_en = ssef(i_en); + ssef vp_st = madd(madd(madd(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]); + ssef vp_en = madd(madd(madd(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]); + + ssef vbmin = min(vp_st, vp_en); + ssef vbmax = max(vp_st, vp_en); + + float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax; + float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z; + float &bmaxx = bmax.x, &bmaxy = bmax.y, &bmaxz = bmax.z; + float3 &p_st = (float3 &)vp_st, &p_en = (float3 &)vp_en; +#else + float3 p_st = ((curve_coef[3] * i_st + curve_coef[2]) * i_st + curve_coef[1]) * i_st + curve_coef[0]; + float3 p_en = ((curve_coef[3] * i_en + curve_coef[2]) * i_en + curve_coef[1]) * i_en + curve_coef[0]; + + float bminx = min(p_st.x, p_en.x); + float bmaxx = max(p_st.x, p_en.x); + float bminy = min(p_st.y, p_en.y); + float bmaxy = max(p_st.y, p_en.y); + float bminz = min(p_st.z, p_en.z); + float bmaxz = max(p_st.z, p_en.z); +#endif + + if(xextrem[0] >= i_st && xextrem[0] <= i_en) { + bminx = min(bminx,xextrem[1]); + bmaxx = max(bmaxx,xextrem[1]); + } + if(xextrem[2] >= i_st && xextrem[2] <= i_en) { + bminx = min(bminx,xextrem[3]); + bmaxx = max(bmaxx,xextrem[3]); + } + if(yextrem[0] >= i_st && yextrem[0] <= i_en) { + bminy = min(bminy,yextrem[1]); + bmaxy = max(bmaxy,yextrem[1]); + } + if(yextrem[2] >= i_st && yextrem[2] <= i_en) { + bminy = min(bminy,yextrem[3]); + bmaxy = max(bmaxy,yextrem[3]); + } + if(zextrem[0] >= i_st && zextrem[0] <= i_en) { + bminz = min(bminz,zextrem[1]); + bmaxz = max(bmaxz,zextrem[1]); + } + if(zextrem[2] >= i_st && zextrem[2] <= i_en) { + bminz = min(bminz,zextrem[3]); + bmaxz = max(bmaxz,zextrem[3]); + } + + float r1 = r_st + (r_en - r_st) * i_st; + float r2 = r_st + (r_en - r_st) * i_en; + r_curr = max(r1, r2); + + mw_extension = min(difl * fabsf(bmaxz), extmax); + float r_ext = mw_extension + r_curr; + float coverage = 1.0f; + + if(bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) { + /* the bounding box does not overlap the square centered at O */ + tree += level; + level = tree & -tree; + } + else if(level == 1) { + + /* the maximum recursion depth is reached. + * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0. + * dP* is reversed if necessary.*/ + float t = isect->t; + float u = 0.0f; + float gd = 0.0f; + + if(flags & CURVE_KN_RIBBONS) { + float3 tg = (p_en - p_st); +#ifdef __KERNEL_SSE__ + const float3 tg_sq = tg * tg; + float w = tg_sq.x + tg_sq.y; +#else + float w = tg.x * tg.x + tg.y * tg.y; +#endif + if(w == 0) { + tree++; + level = tree & -tree; + continue; + } +#ifdef __KERNEL_SSE__ + const float3 p_sttg = p_st * tg; + w = -(p_sttg.x + p_sttg.y) / w; +#else + w = -(p_st.x * tg.x + p_st.y * tg.y) / w; +#endif + w = saturate(w); + + /* compute u on the curve segment */ + u = i_st * (1 - w) + i_en * w; + r_curr = r_st + (r_en - r_st) * u; + /* compare x-y distances */ + float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + curve_coef[0]; + + float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1]; + if(dot(tg, dp_st)< 0) + dp_st *= -1; + if(dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) { + tree++; + level = tree & -tree; + continue; + } + float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1]; + if(dot(tg, dp_en) < 0) + dp_en *= -1; + if(dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) { + tree++; + level = tree & -tree; + continue; + } + + /* compute coverage */ + float r_ext = r_curr; + coverage = 1.0f; + if(difl != 0.0f) { + mw_extension = min(difl * fabsf(bmaxz), extmax); + r_ext = mw_extension + r_curr; +#ifdef __KERNEL_SSE__ + const float3 p_curr_sq = p_curr * p_curr; + const float3 dxxx(_mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128))); + float d = dxxx.x; +#else + float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y); +#endif + float d0 = d - r_curr; + float d1 = d + r_curr; + float inv_mw_extension = 1.0f/mw_extension; + if(d0 >= 0) + coverage = (min(d1 * inv_mw_extension, 1.0f) - min(d0 * inv_mw_extension, 1.0f)) * 0.5f; + else // inside + coverage = (min(d1 * inv_mw_extension, 1.0f) + min(-d0 * inv_mw_extension, 1.0f)) * 0.5f; + } + + if(p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) { + tree++; + level = tree & -tree; + continue; + } + + t = p_curr.z; + + /* stochastic fade from minimum width */ + if(difl != 0.0f && lcg_state) { + if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage)) + return hit; + } + } + else { + float l = len(p_en - p_st); + /* minimum width extension */ + float or1 = r1; + float or2 = r2; + + if(difl != 0.0f) { + mw_extension = min(len(p_st - P) * difl, extmax); + or1 = r1 < mw_extension ? mw_extension : r1; + mw_extension = min(len(p_en - P) * difl, extmax); + or2 = r2 < mw_extension ? mw_extension : r2; + } + /* --- */ + float invl = 1.0f/l; + float3 tg = (p_en - p_st) * invl; + gd = (or2 - or1) * invl; + float difz = -dot(p_st,tg); + float cyla = 1.0f - (tg.z * tg.z * (1 + gd*gd)); + float invcyla = 1.0f/cyla; + float halfb = (-p_st.z - tg.z*(difz + gd*(difz*gd + or1))); + float tcentre = -halfb*invcyla; + float zcentre = difz + (tg.z * tcentre); + float3 tdif = - p_st; + tdif.z += tcentre; + float tdifz = dot(tdif,tg); + float tb = 2*(tdif.z - tg.z*(tdifz + gd*(tdifz*gd + or1))); + float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - or1*or1 - 2*or1*tdifz*gd; + float td = tb*tb - 4*cyla*tc; + if(td < 0.0f) { + tree++; + level = tree & -tree; + continue; + } + + float rootd = sqrtf(td); + float correction = (-tb - rootd) * 0.5f * invcyla; + t = tcentre + correction; + + float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1]; + if(dot(tg, dp_st)< 0) + dp_st *= -1; + float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1]; + if(dot(tg, dp_en) < 0) + dp_en *= -1; + + if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) { + correction = (-tb + rootd) * 0.5f * invcyla; + t = tcentre + correction; + } + + if(dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) { + tree++; + level = tree & -tree; + continue; + } + + float w = (zcentre + (tg.z * correction)) * invl; + w = saturate(w); + /* compute u on the curve segment */ + u = i_st * (1 - w) + i_en * w; + + /* stochastic fade from minimum width */ + if(difl != 0.0f && lcg_state) { + r_curr = r1 + (r2 - r1) * w; + r_ext = or1 + (or2 - or1) * w; + coverage = r_curr/r_ext; + + if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage)) + return hit; + } + } + /* we found a new intersection */ + +#ifdef __VISIBILITY_FLAG__ + /* visibility flag test. we do it here under the assumption + * that most triangles are culled by node flags */ + if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility) +#endif + { + /* record intersection */ + isect->t = t; + isect->u = u; + isect->v = gd; + isect->prim = curveAddr; + isect->object = object; + isect->type = type; + hit = true; + } + + tree++; + level = tree & -tree; + } + else { + /* split the curve into two curves and process */ + level = level >> 1; + } + } + + return hit; +} + +ccl_device_forceinline bool curve_intersect(KernelGlobals *kg, + Intersection *isect, + float3 P, + float3 direction, + uint visibility, + int object, + int curveAddr, + float time, + int type, + uint *lcg_state, + float difl, + float extmax) +{ + /* define few macros to minimize code duplication for SSE */ +#ifndef __KERNEL_SSE2__ +# define len3_squared(x) len_squared(x) +# define len3(x) len(x) +# define dot3(x, y) dot(x, y) +#endif + + const bool is_curve_primitive = (type & PRIMITIVE_CURVE); + + if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) { + const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr); + if(time < prim_time.x || time > prim_time.y) { + return false; + } + } + + int segment = PRIMITIVE_UNPACK_SEGMENT(type); + /* curve Intersection check */ + int flags = kernel_data.curve.curveflags; + + int prim = kernel_tex_fetch(__prim_index, curveAddr); + float4 v00 = kernel_tex_fetch(__curves, prim); + + int cnum = __float_as_int(v00.x); + int k0 = cnum + segment; + int k1 = k0 + 1; + +#ifndef __KERNEL_SSE2__ + float4 P_curve[2]; + + if(is_curve_primitive) { + P_curve[0] = kernel_tex_fetch(__curve_keys, k0); + P_curve[1] = kernel_tex_fetch(__curve_keys, k1); + } + else { + int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object; + motion_curve_keys(kg, fobject, prim, time, k0, k1, P_curve); + } + + float or1 = P_curve[0].w; + float or2 = P_curve[1].w; + float3 p1 = float4_to_float3(P_curve[0]); + float3 p2 = float4_to_float3(P_curve[1]); + + /* minimum width extension */ + float r1 = or1; + float r2 = or2; + float3 dif = P - p1; + float3 dif_second = P - p2; + if(difl != 0.0f) { + float pixelsize = min(len3(dif) * difl, extmax); + r1 = or1 < pixelsize ? pixelsize : or1; + pixelsize = min(len3(dif_second) * difl, extmax); + r2 = or2 < pixelsize ? pixelsize : or2; + } + /* --- */ + + float3 p21_diff = p2 - p1; + float3 sphere_dif1 = (dif + dif_second) * 0.5f; + float3 dir = direction; + float sphere_b_tmp = dot3(dir, sphere_dif1); + float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir; +#else + ssef P_curve[2]; + + if(is_curve_primitive) { + P_curve[0] = load4f(&kg->__curve_keys.data[k0].x); + P_curve[1] = load4f(&kg->__curve_keys.data[k1].x); + } + else { + int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object; + motion_curve_keys(kg, fobject, prim, time, k0, k1, (float4*)&P_curve); + } + + const ssef or12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]); + + ssef r12 = or12; + const ssef vP = load4f(P); + const ssef dif = vP - P_curve[0]; + const ssef dif_second = vP - P_curve[1]; + if(difl != 0.0f) { + const ssef len1_sq = len3_squared_splat(dif); + const ssef len2_sq = len3_squared_splat(dif_second); + const ssef len12 = mm_sqrt(shuffle<0, 0, 0, 0>(len1_sq, len2_sq)); + const ssef pixelsize12 = min(len12 * difl, ssef(extmax)); + r12 = max(or12, pixelsize12); + } + float or1 = extract<0>(or12), or2 = extract<0>(shuffle<2>(or12)); + float r1 = extract<0>(r12), r2 = extract<0>(shuffle<2>(r12)); + + const ssef p21_diff = P_curve[1] - P_curve[0]; + const ssef sphere_dif1 = (dif + dif_second) * 0.5f; + const ssef dir = load4f(direction); + const ssef sphere_b_tmp = dot3_splat(dir, sphere_dif1); + const ssef sphere_dif2 = nmadd(sphere_b_tmp, dir, sphere_dif1); +#endif + + float mr = max(r1, r2); + float l = len3(p21_diff); + float invl = 1.0f / l; + float sp_r = mr + 0.5f * l; + + float sphere_b = dot3(dir, sphere_dif2); + float sdisc = sphere_b * sphere_b - len3_squared(sphere_dif2) + sp_r * sp_r; + + if(sdisc < 0.0f) + return false; + + /* obtain parameters and test midpoint distance for suitable modes */ +#ifndef __KERNEL_SSE2__ + float3 tg = p21_diff * invl; +#else + const ssef tg = p21_diff * invl; +#endif + float gd = (r2 - r1) * invl; + + float dirz = dot3(dir, tg); + float difz = dot3(dif, tg); + + float a = 1.0f - (dirz*dirz*(1 + gd*gd)); + + float halfb = dot3(dir, dif) - dirz*(difz + gd*(difz*gd + r1)); + + float tcentre = -halfb/a; + float zcentre = difz + (dirz * tcentre); + + if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE)) + return false; + if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION)) + return false; + + /* test minimum separation */ +#ifndef __KERNEL_SSE2__ + float3 cprod = cross(tg, dir); + float cprod2sq = len3_squared(cross(tg, dif)); +#else + const ssef cprod = cross(tg, dir); + float cprod2sq = len3_squared(cross_zxy(tg, dif)); +#endif + float cprodsq = len3_squared(cprod); + float distscaled = dot3(cprod, dif); + + if(cprodsq == 0) + distscaled = cprod2sq; + else + distscaled = (distscaled*distscaled)/cprodsq; + + if(distscaled > mr*mr) + return false; + + /* calculate true intersection */ +#ifndef __KERNEL_SSE2__ + float3 tdif = dif + tcentre * dir; +#else + const ssef tdif = madd(ssef(tcentre), dir, dif); +#endif + float tdifz = dot3(tdif, tg); + float tdifma = tdifz*gd + r1; + float tb = 2*(dot3(dir, tdif) - dirz*(tdifz + gd*tdifma)); + float tc = dot3(tdif, tdif) - tdifz*tdifz - tdifma*tdifma; + float td = tb*tb - 4*a*tc; + + if(td < 0.0f) + return false; + + float rootd = 0.0f; + float correction = 0.0f; + if(flags & CURVE_KN_ACCURATE) { + rootd = sqrtf(td); + correction = ((-tb - rootd)/(2*a)); + } + + float t = tcentre + correction; + + if(t < isect->t) { + + if(flags & CURVE_KN_INTERSECTCORRECTION) { + rootd = sqrtf(td); + correction = ((-tb - rootd)/(2*a)); + t = tcentre + correction; + } + + float z = zcentre + (dirz * correction); + // bool backface = false; + + if(flags & CURVE_KN_BACKFACING && (t < 0.0f || z < 0 || z > l)) { + // backface = true; + correction = ((-tb + rootd)/(2*a)); + t = tcentre + correction; + z = zcentre + (dirz * correction); + } + + /* stochastic fade from minimum width */ + float adjradius = or1 + z * (or2 - or1) * invl; + adjradius = adjradius / (r1 + z * gd); + if(lcg_state && adjradius != 1.0f) { + if(lcg_step_float(lcg_state) > adjradius) + return false; + } + /* --- */ + + if(t > 0.0f && t < isect->t && z >= 0 && z <= l) { + + if(flags & CURVE_KN_ENCLOSEFILTER) { + float enc_ratio = 1.01f; + if((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) { + float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio)); + float c2 = dot3(dif, dif) - difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio; + if(a2*c2 < 0.0f) + return false; + } + } + +#ifdef __VISIBILITY_FLAG__ + /* visibility flag test. we do it here under the assumption + * that most triangles are culled by node flags */ + if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility) +#endif + { + /* record intersection */ + isect->t = t; + isect->u = z*invl; + isect->v = gd; + isect->prim = curveAddr; + isect->object = object; + isect->type = type; + + return true; + } + } + } + + return false; + +#ifndef __KERNEL_SSE2__ +# undef len3_squared +# undef len3 +# undef dot3 +#endif +} + +ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3) +{ + float fc = 0.71f; + float data[4]; + float t2 = t * t; + data[0] = -3.0f * fc * t2 + 4.0f * fc * t - fc; + data[1] = 3.0f * (2.0f - fc) * t2 + 2.0f * (fc - 3.0f) * t; + data[2] = 3.0f * (fc - 2.0f) * t2 + 2.0f * (3.0f - 2.0f * fc) * t + fc; + data[3] = 3.0f * fc * t2 - 2.0f * fc * t; + return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3; +} + +ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3) +{ + float data[4]; + float fc = 0.71f; + float t2 = t * t; + float t3 = t2 * t; + data[0] = -fc * t3 + 2.0f * fc * t2 - fc * t; + data[1] = (2.0f - fc) * t3 + (fc - 3.0f) * t2 + 1.0f; + data[2] = (fc - 2.0f) * t3 + (3.0f - 2.0f * fc) * t2 + fc * t; + data[3] = fc * t3 - fc * t2; + return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3; +} + +ccl_device_inline float3 curve_refine(KernelGlobals *kg, + ShaderData *sd, + const Intersection *isect, + const Ray *ray) +{ + int flag = kernel_data.curve.curveflags; + float t = isect->t; + float3 P = ray->P; + float3 D = ray->D; + + if(isect->object != OBJECT_NONE) { +#ifdef __OBJECT_MOTION__ + Transform tfm = sd->ob_itfm; +#else + Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); +#endif + + P = transform_point(&tfm, P); + D = transform_direction(&tfm, D*t); + D = normalize_len(D, &t); + } + + int prim = kernel_tex_fetch(__prim_index, isect->prim); + float4 v00 = kernel_tex_fetch(__curves, prim); + + int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); + int k1 = k0 + 1; + + float3 tg; + + if(flag & CURVE_KN_INTERPOLATE) { + int ka = max(k0 - 1,__float_as_int(v00.x)); + int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1); + + float4 P_curve[4]; + + if(sd->type & PRIMITIVE_CURVE) { + P_curve[0] = kernel_tex_fetch(__curve_keys, ka); + P_curve[1] = kernel_tex_fetch(__curve_keys, k0); + P_curve[2] = kernel_tex_fetch(__curve_keys, k1); + P_curve[3] = kernel_tex_fetch(__curve_keys, kb); + } + else { + motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve); + } + + float3 p[4]; + p[0] = float4_to_float3(P_curve[0]); + p[1] = float4_to_float3(P_curve[1]); + p[2] = float4_to_float3(P_curve[2]); + p[3] = float4_to_float3(P_curve[3]); + + P = P + D*t; + +#ifdef __UV__ + sd->u = isect->u; + sd->v = 0.0f; +#endif + + tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3])); + + if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) { + sd->Ng = normalize(-(D - tg * (dot(tg, D)))); + } + else { + /* direction from inside to surface of curve */ + float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]); + sd->Ng = normalize(P - p_curr); + + /* adjustment for changing radius */ + float gd = isect->v; + + if(gd != 0.0f) { + sd->Ng = sd->Ng - gd * tg; + sd->Ng = normalize(sd->Ng); + } + } + + /* todo: sometimes the normal is still so that this is detected as + * backfacing even if cull backfaces is enabled */ + + sd->N = sd->Ng; + } + else { + float4 P_curve[2]; + + if(sd->type & PRIMITIVE_CURVE) { + P_curve[0]= kernel_tex_fetch(__curve_keys, k0); + P_curve[1]= kernel_tex_fetch(__curve_keys, k1); + } + else { + motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve); + } + + float l = 1.0f; + tg = normalize_len(float4_to_float3(P_curve[1] - P_curve[0]), &l); + + P = P + D*t; + + float3 dif = P - float4_to_float3(P_curve[0]); + +#ifdef __UV__ + sd->u = dot(dif,tg)/l; + sd->v = 0.0f; +#endif + + if(flag & CURVE_KN_TRUETANGENTGNORMAL) { + sd->Ng = -(D - tg * dot(tg, D)); + sd->Ng = normalize(sd->Ng); + } + else { + float gd = isect->v; + + /* direction from inside to surface of curve */ + sd->Ng = (dif - tg * sd->u * l) / (P_curve[0].w + sd->u * l * gd); + + /* adjustment for changing radius */ + if(gd != 0.0f) { + sd->Ng = sd->Ng - gd * tg; + sd->Ng = normalize(sd->Ng); + } + } + + sd->N = sd->Ng; + } + +#ifdef __DPDU__ + /* dPdu/dPdv */ + sd->dPdu = tg; + sd->dPdv = cross(tg, sd->Ng); +#endif + + if(isect->object != OBJECT_NONE) { +#ifdef __OBJECT_MOTION__ + Transform tfm = sd->ob_tfm; +#else + Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); +#endif + + P = transform_point(&tfm, P); + } + + return P; +} + +#endif + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom_motion_curve.h b/intern/cycles/kernel/geom/geom_motion_curve.h index dc1388b6643..fad29e431ec 100644 --- a/intern/cycles/kernel/geom/geom_motion_curve.h +++ b/intern/cycles/kernel/geom/geom_motion_curve.h @@ -33,7 +33,7 @@ ccl_device_inline int find_attribute_curve_motion(KernelGlobals *kg, int object, * zero iterations and rendering is really slow with motion curves. For until other * areas are speed up it's probably not so crucial to optimize this out. */ - uint attr_offset = object*kernel_data.bvh.attributes_map_stride + ATTR_PRIM_CURVE; + uint attr_offset = object_attribute_map_offset(kg, object) + ATTR_PRIM_CURVE; uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset); while(attr_map.x != id) { @@ -152,7 +152,7 @@ ccl_device_inline void motion_cardinal_curve_keys(KernelGlobals *kg, keys[3] = (1.0f - t)*keys[3] + t*next_keys[3]; } -#ifdef __KERNEL_AVX2__ +#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) /* Similar to above, but returns keys as pair of two AVX registers with each * holding two float4. */ diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h index 4e84aa97776..7ac6807e749 100644 --- a/intern/cycles/kernel/geom/geom_motion_triangle.h +++ b/intern/cycles/kernel/geom/geom_motion_triangle.h @@ -32,7 +32,7 @@ CCL_NAMESPACE_BEGIN ccl_device_inline int find_attribute_motion(KernelGlobals *kg, int object, uint id, AttributeElement *elem) { /* todo: find a better (faster) solution for this, maybe store offset per object */ - uint attr_offset = object*kernel_data.bvh.attributes_map_stride; + uint attr_offset = object_attribute_map_offset(kg, object); uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset); while(attr_map.x != id) { @@ -117,4 +117,39 @@ ccl_device_inline void motion_triangle_vertices(KernelGlobals *kg, int object, i verts[2] = (1.0f - t)*verts[2] + t*next_verts[2]; } +ccl_device_inline float3 motion_triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int object, int prim, float u, float v, float time) +{ + /* get motion info */ + int numsteps, numverts; + object_motion_info(kg, object, &numsteps, &numverts, NULL); + + /* figure out which steps we need to fetch and their interpolation factor */ + int maxstep = numsteps*2; + int step = min((int)(time*maxstep), maxstep-1); + float t = time*maxstep - step; + + /* find attribute */ + AttributeElement elem; + int offset = find_attribute_motion(kg, object, ATTR_STD_MOTION_VERTEX_NORMAL, &elem); + kernel_assert(offset != ATTR_STD_NOT_FOUND); + + /* fetch normals */ + float3 normals[3], next_normals[3]; + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); + + motion_triangle_normals_for_step(kg, tri_vindex, offset, numverts, numsteps, step, normals); + motion_triangle_normals_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_normals); + + /* interpolate between steps */ + normals[0] = (1.0f - t)*normals[0] + t*next_normals[0]; + normals[1] = (1.0f - t)*normals[1] + t*next_normals[1]; + normals[2] = (1.0f - t)*normals[2] + t*next_normals[2]; + + /* interpolate between vertices */ + float w = 1.0f - u - v; + float3 N = safe_normalize(u*normals[0] + v*normals[1] + w*normals[2]); + + return is_zero(N)? Ng: N; +} + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h index d57d74ea882..542843edc84 100644 --- a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h +++ b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h @@ -48,7 +48,7 @@ ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg, return P; } # ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_itfm); + Transform tfm = sd->ob_itfm; # else Transform tfm = object_fetch_transform(kg, isect->object, @@ -77,7 +77,7 @@ ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg, if(isect->object != OBJECT_NONE) { # ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_tfm); + Transform tfm = sd->ob_tfm; # else Transform tfm = object_fetch_transform(kg, isect->object, @@ -97,17 +97,17 @@ ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg, * for instancing. */ -#ifdef __SUBSURFACE__ +#ifdef __BVH_LOCAL__ # if defined(__KERNEL_CUDA__) && (defined(i386) || defined(_M_IX86)) ccl_device_noinline # else ccl_device_inline # endif -float3 motion_triangle_refine_subsurface(KernelGlobals *kg, - ShaderData *sd, - const Intersection *isect, - const Ray *ray, - float3 verts[3]) +float3 motion_triangle_refine_local(KernelGlobals *kg, + ShaderData *sd, + const Intersection *isect, + const Ray *ray, + float3 verts[3]) { float3 P = ray->P; float3 D = ray->D; @@ -116,7 +116,7 @@ float3 motion_triangle_refine_subsurface(KernelGlobals *kg, # ifdef __INTERSECTION_REFINE__ if(isect->object != OBJECT_NONE) { # ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_itfm); + Transform tfm = sd->ob_itfm; # else Transform tfm = object_fetch_transform(kg, isect->object, @@ -144,7 +144,7 @@ float3 motion_triangle_refine_subsurface(KernelGlobals *kg, if(isect->object != OBJECT_NONE) { # ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_tfm); + Transform tfm = sd->ob_tfm; # else Transform tfm = object_fetch_transform(kg, isect->object, @@ -159,21 +159,22 @@ float3 motion_triangle_refine_subsurface(KernelGlobals *kg, return P + D*t; # endif /* __INTERSECTION_REFINE__ */ } -#endif /* __SUBSURFACE__ */ +#endif /* __BVH_LOCAL__ */ /* Ray intersection. We simply compute the vertex positions at the given ray * time and do a ray intersection with the resulting triangle. */ -ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, - Intersection *isect, - float3 P, - float3 dir, - float time, - uint visibility, - int object, - int prim_addr) +ccl_device_inline bool motion_triangle_intersect( + KernelGlobals *kg, + Intersection *isect, + float3 P, + float3 dir, + float time, + uint visibility, + int object, + int prim_addr) { /* Primitive index for vertex location lookup. */ int prim = kernel_tex_fetch(__prim_index, prim_addr); @@ -185,11 +186,15 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, motion_triangle_vertices(kg, fobject, prim, time, verts); /* Ray-triangle intersection, unoptimized. */ float t, u, v; - if(ray_triangle_intersect_uv(P, - dir, - isect->t, - verts[2], verts[0], verts[1], - &u, &v, &t)) + if(ray_triangle_intersect(P, + dir, + isect->t, +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + (ssef*)verts, +#else + verts[0], verts[1], verts[2], +#endif + &u, &v, &t)) { #ifdef __VISIBILITY_FLAG__ /* Visibility flag test. we do it here under the assumption @@ -210,71 +215,99 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, return false; } -/* Special ray intersection routines for subsurface scattering. In that case we +/* Special ray intersection routines for local intersections. In that case we * only want to intersect with primitives in the same object, and if case of * multiple hits we pick a single random primitive as the intersection point. */ -#ifdef __SUBSURFACE__ -ccl_device_inline void motion_triangle_intersect_subsurface( +#ifdef __BVH_LOCAL__ +ccl_device_inline void motion_triangle_intersect_local( KernelGlobals *kg, - SubsurfaceIntersection *ss_isect, + LocalIntersection *local_isect, float3 P, float3 dir, float time, int object, + int local_object, int prim_addr, float tmax, uint *lcg_state, int max_hits) { + /* Only intersect with matching object, for instanced objects we + * already know we are only intersecting the right object. */ + if(object == OBJECT_NONE) { + if(kernel_tex_fetch(__prim_object, prim_addr) != local_object) { + return; + } + } + /* Primitive index for vertex location lookup. */ int prim = kernel_tex_fetch(__prim_index, prim_addr); - int fobject = (object == OBJECT_NONE) - ? kernel_tex_fetch(__prim_object, prim_addr) - : object; /* Get vertex locations for intersection. */ float3 verts[3]; - motion_triangle_vertices(kg, fobject, prim, time, verts); + motion_triangle_vertices(kg, local_object, prim, time, verts); /* Ray-triangle intersection, unoptimized. */ float t, u, v; - if(ray_triangle_intersect_uv(P, - dir, - tmax, - verts[2], verts[0], verts[1], - &u, &v, &t)) + if(!ray_triangle_intersect(P, + dir, + tmax, +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + (ssef*)verts, +#else + verts[0], verts[1], verts[2], +#endif + &u, &v, &t)) { - for(int i = min(max_hits, ss_isect->num_hits) - 1; i >= 0; --i) { - if(ss_isect->hits[i].t == t) { + return; + } + + int hit; + if(lcg_state) { + /* Record up to max_hits intersections. */ + for(int i = min(max_hits, local_isect->num_hits) - 1; i >= 0; --i) { + if(local_isect->hits[i].t == t) { return; } } - ss_isect->num_hits++; - int hit; - if(ss_isect->num_hits <= max_hits) { - hit = ss_isect->num_hits - 1; + + local_isect->num_hits++; + + if(local_isect->num_hits <= max_hits) { + hit = local_isect->num_hits - 1; } else { /* Reservoir sampling: if we are at the maximum number of * hits, randomly replace element or skip it. */ - hit = lcg_step_uint(lcg_state) % ss_isect->num_hits; + hit = lcg_step_uint(lcg_state) % local_isect->num_hits; if(hit >= max_hits) return; } - /* Record intersection. */ - Intersection *isect = &ss_isect->hits[hit]; - isect->t = t; - isect->u = u; - isect->v = v; - isect->prim = prim_addr; - isect->object = object; - isect->type = PRIMITIVE_MOTION_TRIANGLE; - /* Record geometric normal. */ - ss_isect->Ng[hit] = normalize(cross(verts[1] - verts[0], - verts[2] - verts[0])); } + else { + /* Record closest intersection only. */ + if(local_isect->num_hits && t > local_isect->hits[0].t) { + return; + } + + hit = 0; + local_isect->num_hits = 1; + } + + /* Record intersection. */ + Intersection *isect = &local_isect->hits[hit]; + isect->t = t; + isect->u = u; + isect->v = v; + isect->prim = prim_addr; + isect->object = object; + isect->type = PRIMITIVE_MOTION_TRIANGLE; + + /* Record geometric normal. */ + local_isect->Ng[hit] = normalize(cross(verts[1] - verts[0], + verts[2] - verts[0])); } -#endif /* __SUBSURFACE__ */ +#endif /* __BVH_LOCAL__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h index 0e024a05db6..4789137d5b0 100644 --- a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h +++ b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h @@ -36,29 +36,29 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, - bool subsurface) + bool is_local) { /* Get shader. */ - ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim)); + sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); /* Get motion info. */ /* TODO(sergey): This logic is really similar to motion_triangle_vertices(), * can we de-duplicate something here? */ int numsteps, numverts; - object_motion_info(kg, ccl_fetch(sd, object), &numsteps, &numverts, NULL); + object_motion_info(kg, sd->object, &numsteps, &numverts, NULL); /* Figure out which steps we need to fetch and their interpolation factor. */ int maxstep = numsteps*2; - int step = min((int)(ccl_fetch(sd, time)*maxstep), maxstep-1); - float t = ccl_fetch(sd, time)*maxstep - step; + int step = min((int)(sd->time*maxstep), maxstep-1); + float t = sd->time*maxstep - step; /* Find attribute. */ AttributeElement elem; - int offset = find_attribute_motion(kg, ccl_fetch(sd, object), + int offset = find_attribute_motion(kg, sd->object, ATTR_STD_MOTION_VERTEX_POSITION, &elem); kernel_assert(offset != ATTR_STD_NOT_FOUND); /* Fetch vertex coordinates. */ float3 verts[3], next_verts[3]; - uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts); motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts); /* Interpolate between steps. */ @@ -66,40 +66,40 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, verts[1] = (1.0f - t)*verts[1] + t*next_verts[1]; verts[2] = (1.0f - t)*verts[2] + t*next_verts[2]; /* Compute refined position. */ -#ifdef __SUBSURFACE__ - if(subsurface) { - ccl_fetch(sd, P) = motion_triangle_refine_subsurface(kg, - sd, - isect, - ray, - verts); +#ifdef __BVH_LOCAL__ + if(is_local) { + sd->P = motion_triangle_refine_local(kg, + sd, + isect, + ray, + verts); } else -#endif /* __SUBSURFACE__*/ +#endif /* __BVH_LOCAL__*/ { - ccl_fetch(sd, P) = motion_triangle_refine(kg, sd, isect, ray, verts); + sd->P = motion_triangle_refine(kg, sd, isect, ray, verts); } /* Compute face normal. */ float3 Ng; - if(ccl_fetch(sd, object_flag) & SD_OBJECT_NEGATIVE_SCALE_APPLIED) { + if(sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) { Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0])); } else { Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0])); } - ccl_fetch(sd, Ng) = Ng; - ccl_fetch(sd, N) = Ng; + sd->Ng = Ng; + sd->N = Ng; /* Compute derivatives of P w.r.t. uv. */ #ifdef __DPDU__ - ccl_fetch(sd, dPdu) = (verts[0] - verts[2]); - ccl_fetch(sd, dPdv) = (verts[1] - verts[2]); + sd->dPdu = (verts[0] - verts[2]); + sd->dPdv = (verts[1] - verts[2]); #endif /* Compute smooth normal. */ - if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) { + if(sd->shader & SHADER_SMOOTH_NORMAL) { /* Find attribute. */ AttributeElement elem; int offset = find_attribute_motion(kg, - ccl_fetch(sd, object), + sd->object, ATTR_STD_MOTION_VERTEX_NORMAL, &elem); kernel_assert(offset != ATTR_STD_NOT_FOUND); @@ -112,10 +112,10 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, normals[1] = (1.0f - t)*normals[1] + t*next_normals[1]; normals[2] = (1.0f - t)*normals[2] + t*next_normals[2]; /* Interpolate between vertices. */ - float u = ccl_fetch(sd, u); - float v = ccl_fetch(sd, v); + float u = sd->u; + float v = sd->v; float w = 1.0f - u - v; - ccl_fetch(sd, N) = (u*normals[0] + v*normals[1] + w*normals[2]); + sd->N = (u*normals[0] + v*normals[1] + w*normals[2]); } } diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h index f51b2d18657..800649abf38 100644 --- a/intern/cycles/kernel/geom/geom_object.h +++ b/intern/cycles/kernel/geom/geom_object.h @@ -28,61 +28,44 @@ CCL_NAMESPACE_BEGIN enum ObjectTransform { OBJECT_TRANSFORM = 0, - OBJECT_TRANSFORM_MOTION_PRE = 0, - OBJECT_INVERSE_TRANSFORM = 4, - OBJECT_TRANSFORM_MOTION_POST = 4, - OBJECT_PROPERTIES = 8, - OBJECT_DUPLI = 9 + OBJECT_INVERSE_TRANSFORM = 1, }; enum ObjectVectorTransform { - OBJECT_VECTOR_MOTION_PRE = 0, - OBJECT_VECTOR_MOTION_POST = 3 + OBJECT_PASS_MOTION_PRE = 0, + OBJECT_PASS_MOTION_POST = 1 }; /* Object to world space transformation */ ccl_device_inline Transform object_fetch_transform(KernelGlobals *kg, int object, enum ObjectTransform type) { - int offset = object*OBJECT_SIZE + (int)type; - - Transform tfm; - tfm.x = kernel_tex_fetch(__objects, offset + 0); - tfm.y = kernel_tex_fetch(__objects, offset + 1); - tfm.z = kernel_tex_fetch(__objects, offset + 2); - tfm.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f); - - return tfm; + if(type == OBJECT_INVERSE_TRANSFORM) { + return kernel_tex_fetch(__objects, object).itfm; + } + else { + return kernel_tex_fetch(__objects, object).tfm; + } } /* Lamp to world space transformation */ ccl_device_inline Transform lamp_fetch_transform(KernelGlobals *kg, int lamp, bool inverse) { - int offset = lamp*LIGHT_SIZE + (inverse? 8 : 5); - - Transform tfm; - tfm.x = kernel_tex_fetch(__light_data, offset + 0); - tfm.y = kernel_tex_fetch(__light_data, offset + 1); - tfm.z = kernel_tex_fetch(__light_data, offset + 2); - tfm.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f); - - return tfm; + if(inverse) { + return kernel_tex_fetch(__lights, lamp).itfm; + } + else { + return kernel_tex_fetch(__lights, lamp).tfm; + } } /* Object to world space transformation for motion vectors */ -ccl_device_inline Transform object_fetch_vector_transform(KernelGlobals *kg, int object, enum ObjectVectorTransform type) +ccl_device_inline Transform object_fetch_motion_pass_transform(KernelGlobals *kg, int object, enum ObjectVectorTransform type) { - int offset = object*OBJECT_VECTOR_SIZE + (int)type; - - Transform tfm; - tfm.x = kernel_tex_fetch(__objects_vector, offset + 0); - tfm.y = kernel_tex_fetch(__objects_vector, offset + 1); - tfm.z = kernel_tex_fetch(__objects_vector, offset + 2); - tfm.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f); - - return tfm; + int offset = object*OBJECT_MOTION_PASS_SIZE + (int)type; + return kernel_tex_fetch(__object_motion_pass, offset); } /* Motion blurred object transformations */ @@ -90,22 +73,12 @@ ccl_device_inline Transform object_fetch_vector_transform(KernelGlobals *kg, int #ifdef __OBJECT_MOTION__ ccl_device_inline Transform object_fetch_transform_motion(KernelGlobals *kg, int object, float time) { - DecompMotionTransform motion; - - int offset = object*OBJECT_SIZE + (int)OBJECT_TRANSFORM_MOTION_PRE; - - motion.mid.x = kernel_tex_fetch(__objects, offset + 0); - motion.mid.y = kernel_tex_fetch(__objects, offset + 1); - motion.mid.z = kernel_tex_fetch(__objects, offset + 2); - motion.mid.w = kernel_tex_fetch(__objects, offset + 3); - - motion.pre_x = kernel_tex_fetch(__objects, offset + 4); - motion.pre_y = kernel_tex_fetch(__objects, offset + 5); - motion.post_x = kernel_tex_fetch(__objects, offset + 6); - motion.post_y = kernel_tex_fetch(__objects, offset + 7); + const uint motion_offset = kernel_tex_fetch(__objects, object).motion_offset; + const ccl_global DecomposedTransform *motion = &kernel_tex_fetch(__object_motion, motion_offset); + const uint num_steps = kernel_tex_fetch(__objects, object).numsteps * 2 + 1; Transform tfm; - transform_motion_interpolate(&tfm, &motion, time); + transform_motion_array_interpolate(&tfm, motion, num_steps, time); return tfm; } @@ -137,9 +110,9 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg ccl_device_inline void object_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P) { #ifdef __OBJECT_MOTION__ - *P = transform_point_auto(&ccl_fetch(sd, ob_tfm), *P); + *P = transform_point_auto(&sd->ob_tfm, *P); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); *P = transform_point(&tfm, *P); #endif } @@ -149,9 +122,9 @@ ccl_device_inline void object_position_transform(KernelGlobals *kg, const Shader ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P) { #ifdef __OBJECT_MOTION__ - *P = transform_point_auto(&ccl_fetch(sd, ob_itfm), *P); + *P = transform_point_auto(&sd->ob_itfm, *P); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); *P = transform_point(&tfm, *P); #endif } @@ -161,12 +134,16 @@ ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, cons ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N) { #ifdef __OBJECT_MOTION__ - if((ccl_fetch(sd, object) != OBJECT_NONE) || (ccl_fetch(sd, type) == PRIMITIVE_LAMP)) { - *N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_tfm), *N)); + if((sd->object != OBJECT_NONE) || (sd->type == PRIMITIVE_LAMP)) { + *N = normalize(transform_direction_transposed_auto(&sd->ob_tfm, *N)); } #else - if(ccl_fetch(sd, object) != OBJECT_NONE) { - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); + if(sd->object != OBJECT_NONE) { + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); + *N = normalize(transform_direction_transposed(&tfm, *N)); + } + else if(sd->type == PRIMITIVE_LAMP) { + Transform tfm = lamp_fetch_transform(kg, sd->lamp, false); *N = normalize(transform_direction_transposed(&tfm, *N)); } #endif @@ -177,9 +154,9 @@ ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N) { #ifdef __OBJECT_MOTION__ - *N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_itfm), *N)); + *N = normalize(transform_direction_transposed_auto(&sd->ob_itfm, *N)); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); *N = normalize(transform_direction_transposed(&tfm, *N)); #endif } @@ -189,9 +166,9 @@ ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderDa ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D) { #ifdef __OBJECT_MOTION__ - *D = transform_direction_auto(&ccl_fetch(sd, ob_tfm), *D); + *D = transform_direction_auto(&sd->ob_tfm, *D); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); *D = transform_direction(&tfm, *D); #endif } @@ -201,9 +178,9 @@ ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D) { #ifdef __OBJECT_MOTION__ - *D = transform_direction_auto(&ccl_fetch(sd, ob_itfm), *D); + *D = transform_direction_auto(&sd->ob_itfm, *D); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); *D = transform_direction(&tfm, *D); #endif } @@ -212,13 +189,13 @@ ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const Sha ccl_device_inline float3 object_location(KernelGlobals *kg, const ShaderData *sd) { - if(ccl_fetch(sd, object) == OBJECT_NONE) + if(sd->object == OBJECT_NONE) return make_float3(0.0f, 0.0f, 0.0f); #ifdef __OBJECT_MOTION__ - return make_float3(ccl_fetch(sd, ob_tfm).x.w, ccl_fetch(sd, ob_tfm).y.w, ccl_fetch(sd, ob_tfm).z.w); + return make_float3(sd->ob_tfm.x.w, sd->ob_tfm.y.w, sd->ob_tfm.z.w); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); return make_float3(tfm.x.w, tfm.y.w, tfm.z.w); #endif } @@ -227,9 +204,7 @@ ccl_device_inline float3 object_location(KernelGlobals *kg, const ShaderData *sd ccl_device_inline float object_surface_area(KernelGlobals *kg, int object) { - int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES; - float4 f = kernel_tex_fetch(__objects, offset); - return f.x; + return kernel_tex_fetch(__objects, object).surface_area; } /* Pass ID number of object */ @@ -239,9 +214,17 @@ ccl_device_inline float object_pass_id(KernelGlobals *kg, int object) if(object == OBJECT_NONE) return 0.0f; - int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES; - float4 f = kernel_tex_fetch(__objects, offset); - return f.y; + return kernel_tex_fetch(__objects, object).pass_id; +} + +/* Per lamp random number for shader variation */ + +ccl_device_inline float lamp_random_number(KernelGlobals *kg, int lamp) +{ + if(lamp == LAMP_NONE) + return 0.0f; + + return kernel_tex_fetch(__lights, lamp).random; } /* Per object random number for shader variation */ @@ -251,9 +234,7 @@ ccl_device_inline float object_random_number(KernelGlobals *kg, int object) if(object == OBJECT_NONE) return 0.0f; - int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES; - float4 f = kernel_tex_fetch(__objects, offset); - return f.z; + return kernel_tex_fetch(__objects, object).random_number; } /* Particle ID from which this object was generated */ @@ -263,9 +244,7 @@ ccl_device_inline int object_particle_id(KernelGlobals *kg, int object) if(object == OBJECT_NONE) return 0; - int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES; - float4 f = kernel_tex_fetch(__objects, offset); - return __float_as_uint(f.w); + return kernel_tex_fetch(__objects, object).particle_index; } /* Generated texture coordinate on surface from where object was instanced */ @@ -275,9 +254,10 @@ ccl_device_inline float3 object_dupli_generated(KernelGlobals *kg, int object) if(object == OBJECT_NONE) return make_float3(0.0f, 0.0f, 0.0f); - int offset = object*OBJECT_SIZE + OBJECT_DUPLI; - float4 f = kernel_tex_fetch(__objects, offset); - return make_float3(f.x, f.y, f.z); + const ccl_global KernelObject *kobject = &kernel_tex_fetch(__objects, object); + return make_float3(kobject->dupli_generated[0], + kobject->dupli_generated[1], + kobject->dupli_generated[2]); } /* UV texture coordinate on surface from where object was instanced */ @@ -287,27 +267,24 @@ ccl_device_inline float3 object_dupli_uv(KernelGlobals *kg, int object) if(object == OBJECT_NONE) return make_float3(0.0f, 0.0f, 0.0f); - int offset = object*OBJECT_SIZE + OBJECT_DUPLI; - float4 f = kernel_tex_fetch(__objects, offset + 1); - return make_float3(f.x, f.y, 0.0f); + const ccl_global KernelObject *kobject = &kernel_tex_fetch(__objects, object); + return make_float3(kobject->dupli_uv[0], + kobject->dupli_uv[1], + 0.0f); } /* Information about mesh for motion blurred triangles and curves */ ccl_device_inline void object_motion_info(KernelGlobals *kg, int object, int *numsteps, int *numverts, int *numkeys) { - int offset = object*OBJECT_SIZE + OBJECT_DUPLI; - if(numkeys) { - float4 f = kernel_tex_fetch(__objects, offset); - *numkeys = __float_as_int(f.w); + *numkeys = kernel_tex_fetch(__objects, object).numkeys; } - float4 f = kernel_tex_fetch(__objects, offset + 1); if(numsteps) - *numsteps = __float_as_int(f.z); + *numsteps = kernel_tex_fetch(__objects, object).numsteps; if(numverts) - *numverts = __float_as_int(f.w); + *numverts = kernel_tex_fetch(__objects, object).numverts; } /* Offset to an objects patch map */ @@ -317,76 +294,56 @@ ccl_device_inline uint object_patch_map_offset(KernelGlobals *kg, int object) if(object == OBJECT_NONE) return 0; - int offset = object*OBJECT_SIZE + 11; - float4 f = kernel_tex_fetch(__objects, offset); - return __float_as_uint(f.x); + return kernel_tex_fetch(__objects, object).patch_map_offset; } /* Pass ID for shader */ ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd) { - return kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE + 1); + return kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).pass_id; } /* Particle data from which object was instanced */ -ccl_device_inline float particle_index(KernelGlobals *kg, int particle) +ccl_device_inline uint particle_index(KernelGlobals *kg, int particle) { - int offset = particle*PARTICLE_SIZE; - float4 f = kernel_tex_fetch(__particles, offset + 0); - return f.x; + return kernel_tex_fetch(__particles, particle).index; } ccl_device float particle_age(KernelGlobals *kg, int particle) { - int offset = particle*PARTICLE_SIZE; - float4 f = kernel_tex_fetch(__particles, offset + 0); - return f.y; + return kernel_tex_fetch(__particles, particle).age; } ccl_device float particle_lifetime(KernelGlobals *kg, int particle) { - int offset = particle*PARTICLE_SIZE; - float4 f = kernel_tex_fetch(__particles, offset + 0); - return f.z; + return kernel_tex_fetch(__particles, particle).lifetime; } ccl_device float particle_size(KernelGlobals *kg, int particle) { - int offset = particle*PARTICLE_SIZE; - float4 f = kernel_tex_fetch(__particles, offset + 0); - return f.w; + return kernel_tex_fetch(__particles, particle).size; } ccl_device float4 particle_rotation(KernelGlobals *kg, int particle) { - int offset = particle*PARTICLE_SIZE; - float4 f = kernel_tex_fetch(__particles, offset + 1); - return f; + return kernel_tex_fetch(__particles, particle).rotation; } ccl_device float3 particle_location(KernelGlobals *kg, int particle) { - int offset = particle*PARTICLE_SIZE; - float4 f = kernel_tex_fetch(__particles, offset + 2); - return make_float3(f.x, f.y, f.z); + return float4_to_float3(kernel_tex_fetch(__particles, particle).location); } ccl_device float3 particle_velocity(KernelGlobals *kg, int particle) { - int offset = particle*PARTICLE_SIZE; - float4 f2 = kernel_tex_fetch(__particles, offset + 2); - float4 f3 = kernel_tex_fetch(__particles, offset + 3); - return make_float3(f2.w, f3.x, f3.y); + return float4_to_float3(kernel_tex_fetch(__particles, particle).velocity); } ccl_device float3 particle_angular_velocity(KernelGlobals *kg, int particle) { - int offset = particle*PARTICLE_SIZE; - float4 f3 = kernel_tex_fetch(__particles, offset + 3); - float4 f4 = kernel_tex_fetch(__particles, offset + 4); - return make_float3(f3.z, f3.w, f4.x); + return float4_to_float3(kernel_tex_fetch(__particles, particle).angular_velocity); } /* Object intersection in BVH */ @@ -415,17 +372,18 @@ ccl_device_inline float3 bvh_clamp_direction(float3 dir) ccl_device_inline float3 bvh_inverse_direction(float3 dir) { - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */ -#if defined(__KERNEL_SSE__) && 0 return rcp(dir); -#else - return 1.0f / dir; -#endif } /* Transform ray into object space to enter static object in BVH */ -ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t) +ccl_device_inline float bvh_instance_push(KernelGlobals *kg, + int object, + const Ray *ray, + float3 *P, + float3 *dir, + float3 *idir, + float t) { Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); @@ -435,8 +393,11 @@ ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ra *dir = bvh_clamp_direction(normalize_len(transform_direction(&tfm, ray->D), &len)); *idir = bvh_inverse_direction(*dir); - if(*t != FLT_MAX) - *t *= len; + if(t != FLT_MAX) { + t *= len; + } + + return t; } #ifdef __QBVH__ @@ -473,16 +434,24 @@ ccl_device_inline void qbvh_instance_push(KernelGlobals *kg, /* Transorm ray to exit static object in BVH */ -ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t) +ccl_device_inline float bvh_instance_pop(KernelGlobals *kg, + int object, + const Ray *ray, + float3 *P, + float3 *dir, + float3 *idir, + float t) { - if(*t != FLT_MAX) { + if(t != FLT_MAX) { Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); - *t /= len(transform_direction(&tfm, ray->D)); + t /= len(transform_direction(&tfm, ray->D)); } *P = ray->P; *dir = bvh_clamp_direction(ray->D); *idir = bvh_inverse_direction(*dir); + + return t; } /* Same as above, but returns scale factor to apply to multiple intersection distances */ @@ -501,13 +470,13 @@ ccl_device_inline void bvh_instance_pop_factor(KernelGlobals *kg, int object, co #ifdef __OBJECT_MOTION__ /* Transform ray into object space to enter motion blurred object in BVH */ -ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, +ccl_device_inline float bvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, - ccl_addr_space float *t, + float t, Transform *itfm) { object_fetch_transform_motion_test(kg, object, ray->time, itfm); @@ -518,8 +487,11 @@ ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, *dir = bvh_clamp_direction(normalize_len(transform_direction(itfm, ray->D), &len)); *idir = bvh_inverse_direction(*dir); - if(*t != FLT_MAX) - *t *= len; + if(t != FLT_MAX) { + t *= len; + } + + return t; } #ifdef __QBVH__ @@ -557,22 +529,24 @@ ccl_device_inline void qbvh_instance_motion_push(KernelGlobals *kg, /* Transorm ray to exit motion blurred object in BVH */ -ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, - int object, - const Ray *ray, - float3 *P, - float3 *dir, - float3 *idir, - ccl_addr_space float *t, - Transform *itfm) -{ - if(*t != FLT_MAX) { - *t /= len(transform_direction(itfm, ray->D)); +ccl_device_inline float bvh_instance_motion_pop(KernelGlobals *kg, + int object, + const Ray *ray, + float3 *P, + float3 *dir, + float3 *idir, + float t, + Transform *itfm) +{ + if(t != FLT_MAX) { + t /= len(transform_direction(itfm, ray->D)); } *P = ray->P; *dir = bvh_clamp_direction(ray->D); *idir = bvh_inverse_direction(*dir); + + return t; } /* Same as above, but returns scale factor to apply to multiple intersection distances */ diff --git a/intern/cycles/kernel/geom/geom_patch.h b/intern/cycles/kernel/geom/geom_patch.h index 6a0ff5a4a04..5663b598508 100644 --- a/intern/cycles/kernel/geom/geom_patch.h +++ b/intern/cycles/kernel/geom/geom_patch.h @@ -267,7 +267,7 @@ ccl_device float patch_eval_float(KernelGlobals *kg, const ShaderData *sd, int o float weights_du[PATCH_MAX_CONTROL_VERTS]; float weights_dv[PATCH_MAX_CONTROL_VERTS]; - int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel, + int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel, indices, weights, weights_du, weights_dv); float val = 0.0f; @@ -294,7 +294,7 @@ ccl_device float3 patch_eval_float3(KernelGlobals *kg, const ShaderData *sd, int float weights_du[PATCH_MAX_CONTROL_VERTS]; float weights_dv[PATCH_MAX_CONTROL_VERTS]; - int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel, + int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel, indices, weights, weights_du, weights_dv); float3 val = make_float3(0.0f, 0.0f, 0.0f); @@ -321,7 +321,7 @@ ccl_device float3 patch_eval_uchar4(KernelGlobals *kg, const ShaderData *sd, int float weights_du[PATCH_MAX_CONTROL_VERTS]; float weights_dv[PATCH_MAX_CONTROL_VERTS]; - int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel, + int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel, indices, weights, weights_du, weights_dv); float3 val = make_float3(0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h index 8a73bb2f78b..c159be92885 100644 --- a/intern/cycles/kernel/geom/geom_primitive.h +++ b/intern/cycles/kernel/geom/geom_primitive.h @@ -28,19 +28,19 @@ ccl_device_inline float primitive_attribute_float(KernelGlobals *kg, const AttributeDescriptor desc, float *dx, float *dy) { - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) { + if(sd->type & PRIMITIVE_ALL_TRIANGLE) { if(subd_triangle_patch(kg, sd) == ~0) return triangle_attribute_float(kg, sd, desc, dx, dy); else return subd_triangle_attribute_float(kg, sd, desc, dx, dy); } #ifdef __HAIR__ - else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + else if(sd->type & PRIMITIVE_ALL_CURVE) { return curve_attribute_float(kg, sd, desc, dx, dy); } #endif #ifdef __VOLUME__ - else if(ccl_fetch(sd, object) != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) { + else if(sd->object != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) { return volume_attribute_float(kg, sd, desc, dx, dy); } #endif @@ -56,19 +56,19 @@ ccl_device_inline float3 primitive_attribute_float3(KernelGlobals *kg, const AttributeDescriptor desc, float3 *dx, float3 *dy) { - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) { + if(sd->type & PRIMITIVE_ALL_TRIANGLE) { if(subd_triangle_patch(kg, sd) == ~0) return triangle_attribute_float3(kg, sd, desc, dx, dy); else return subd_triangle_attribute_float3(kg, sd, desc, dx, dy); } #ifdef __HAIR__ - else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + else if(sd->type & PRIMITIVE_ALL_CURVE) { return curve_attribute_float3(kg, sd, desc, dx, dy); } #endif #ifdef __VOLUME__ - else if(ccl_fetch(sd, object) != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) { + else if(sd->object != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) { return volume_attribute_float3(kg, sd, desc, dx, dy); } #endif @@ -118,9 +118,9 @@ ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, in ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd) { #ifdef __HAIR__ - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) + if(sd->type & PRIMITIVE_ALL_CURVE) # ifdef __DPDU__ - return normalize(ccl_fetch(sd, dPdu)); + return normalize(sd->dPdu); # else return make_float3(0.0f, 0.0f, 0.0f); # endif @@ -133,12 +133,12 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd) float3 data = primitive_attribute_float3(kg, sd, desc, NULL, NULL); data = make_float3(-(data.y - 0.5f), (data.x - 0.5f), 0.0f); object_normal_transform(kg, sd, &data); - return cross(ccl_fetch(sd, N), normalize(cross(data, ccl_fetch(sd, N)))); + return cross(sd->N, normalize(cross(data, sd->N))); } else { /* otherwise use surface derivatives */ #ifdef __DPDU__ - return normalize(ccl_fetch(sd, dPdu)); + return normalize(sd->dPdu); #else return make_float3(0.0f, 0.0f, 0.0f); #endif @@ -153,17 +153,17 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData * float3 center; #ifdef __HAIR__ - bool is_curve_primitive = ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE; + bool is_curve_primitive = sd->type & PRIMITIVE_ALL_CURVE; if(is_curve_primitive) { center = curve_motion_center_location(kg, sd); - if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) { + if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { object_position_transform(kg, sd, ¢er); } } else #endif - center = ccl_fetch(sd, P); + center = sd->P; float3 motion_pre = center, motion_post = center; @@ -173,16 +173,16 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData * if(desc.offset != ATTR_STD_NOT_FOUND) { /* get motion info */ int numverts, numkeys; - object_motion_info(kg, ccl_fetch(sd, object), NULL, &numverts, &numkeys); + object_motion_info(kg, sd->object, NULL, &numverts, &numkeys); /* lookup attributes */ motion_pre = primitive_attribute_float3(kg, sd, desc, NULL, NULL); - desc.offset += (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)? numverts: numkeys; + desc.offset += (sd->type & PRIMITIVE_ALL_TRIANGLE)? numverts: numkeys; motion_post = primitive_attribute_float3(kg, sd, desc, NULL, NULL); #ifdef __HAIR__ - if(is_curve_primitive && (ccl_fetch(sd, object_flag) & SD_OBJECT_HAS_VERTEX_MOTION) == 0) { + if(is_curve_primitive && (sd->object_flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) { object_position_transform(kg, sd, &motion_pre); object_position_transform(kg, sd, &motion_post); } @@ -193,10 +193,10 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData * * transformation was set match the world/object space of motion_pre/post */ Transform tfm; - tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_PRE); + tfm = object_fetch_motion_pass_transform(kg, sd->object, OBJECT_PASS_MOTION_PRE); motion_pre = transform_point(&tfm, motion_pre); - tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_POST); + tfm = object_fetch_motion_pass_transform(kg, sd->object, OBJECT_PASS_MOTION_POST); motion_post = transform_point(&tfm, motion_post); float3 motion_center; @@ -204,31 +204,31 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData * /* camera motion, for perspective/orthographic motion.pre/post will be a * world-to-raster matrix, for panorama it's world-to-camera */ if(kernel_data.cam.type != CAMERA_PANORAMA) { - tfm = kernel_data.cam.worldtoraster; - motion_center = transform_perspective(&tfm, center); + ProjectionTransform projection = kernel_data.cam.worldtoraster; + motion_center = transform_perspective(&projection, center); - tfm = kernel_data.cam.motion.pre; - motion_pre = transform_perspective(&tfm, motion_pre); + projection = kernel_data.cam.perspective_pre; + motion_pre = transform_perspective(&projection, motion_pre); - tfm = kernel_data.cam.motion.post; - motion_post = transform_perspective(&tfm, motion_post); + projection = kernel_data.cam.perspective_post; + motion_post = transform_perspective(&projection, motion_post); } else { tfm = kernel_data.cam.worldtocamera; motion_center = normalize(transform_point(&tfm, center)); - motion_center = float2_to_float3(direction_to_panorama(kg, motion_center)); + motion_center = float2_to_float3(direction_to_panorama(&kernel_data.cam, motion_center)); motion_center.x *= kernel_data.cam.width; motion_center.y *= kernel_data.cam.height; - tfm = kernel_data.cam.motion.pre; + tfm = kernel_data.cam.motion_pass_pre; motion_pre = normalize(transform_point(&tfm, motion_pre)); - motion_pre = float2_to_float3(direction_to_panorama(kg, motion_pre)); + motion_pre = float2_to_float3(direction_to_panorama(&kernel_data.cam, motion_pre)); motion_pre.x *= kernel_data.cam.width; motion_pre.y *= kernel_data.cam.height; - tfm = kernel_data.cam.motion.post; + tfm = kernel_data.cam.motion_pass_post; motion_post = normalize(transform_point(&tfm, motion_post)); - motion_post = float2_to_float3(direction_to_panorama(kg, motion_post)); + motion_post = float2_to_float3(direction_to_panorama(&kernel_data.cam, motion_post)); motion_post.x *= kernel_data.cam.width; motion_post.y *= kernel_data.cam.height; } diff --git a/intern/cycles/kernel/geom/geom_subd_triangle.h b/intern/cycles/kernel/geom/geom_subd_triangle.h index 647840dc696..044e82f03d4 100644 --- a/intern/cycles/kernel/geom/geom_subd_triangle.h +++ b/intern/cycles/kernel/geom/geom_subd_triangle.h @@ -22,14 +22,14 @@ CCL_NAMESPACE_BEGIN ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *sd) { - return (ccl_fetch(sd, prim) != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, ccl_fetch(sd, prim)) : ~0; + return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0; } /* UV coords of triangle within patch */ ccl_device_inline void subd_triangle_patch_uv(KernelGlobals *kg, const ShaderData *sd, float2 uv[3]) { - uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); uv[0] = kernel_tex_fetch(__tri_patch_uv, tri_vindex.x); uv[1] = kernel_tex_fetch(__tri_patch_uv, tri_vindex.y); @@ -110,7 +110,7 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const float2 dpdv = uv[1] - uv[2]; /* p is [s, t] */ - float2 p = dpdu * ccl_fetch(sd, u) + dpdv * ccl_fetch(sd, v) + uv[2]; + float2 p = dpdu * sd->u + dpdv * sd->v + uv[2]; float a, dads, dadt; a = patch_eval_float(kg, sd, desc.offset, patch, p.x, p.y, 0, &dads, &dadt); @@ -123,8 +123,8 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const float dtdv = dpdv.y; if(dx) { - float dudx = ccl_fetch(sd, du).dx; - float dvdx = ccl_fetch(sd, dv).dx; + float dudx = sd->du.dx; + float dvdx = sd->dv.dx; float dsdx = dsdu*dudx + dsdv*dvdx; float dtdx = dtdu*dudx + dtdv*dvdx; @@ -132,8 +132,8 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const *dx = dads*dsdx + dadt*dtdx; } if(dy) { - float dudy = ccl_fetch(sd, du).dy; - float dvdy = ccl_fetch(sd, dv).dy; + float dudy = sd->du.dy; + float dvdy = sd->dv.dy; float dsdy = dsdu*dudy + dsdv*dvdy; float dtdy = dtdu*dudy + dtdv*dvdy; @@ -174,11 +174,11 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const float c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c; - if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c; + if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c; + if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c; #endif - return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c; + return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c; } else if(desc.element == ATTR_ELEMENT_CORNER) { float2 uv[3]; @@ -202,11 +202,11 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const float c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c; - if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c; + if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c; + if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c; #endif - return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c; + return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c; } else { if(dx) *dx = 0.0f; @@ -229,7 +229,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con float2 dpdv = uv[1] - uv[2]; /* p is [s, t] */ - float2 p = dpdu * ccl_fetch(sd, u) + dpdv * ccl_fetch(sd, v) + uv[2]; + float2 p = dpdu * sd->u + dpdv * sd->v + uv[2]; float3 a, dads, dadt; @@ -248,8 +248,8 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con float dtdv = dpdv.y; if(dx) { - float dudx = ccl_fetch(sd, du).dx; - float dvdx = ccl_fetch(sd, dv).dx; + float dudx = sd->du.dx; + float dvdx = sd->dv.dx; float dsdx = dsdu*dudx + dsdv*dvdx; float dtdx = dtdu*dudx + dtdv*dvdx; @@ -257,8 +257,8 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con *dx = dads*dsdx + dadt*dtdx; } if(dy) { - float dudy = ccl_fetch(sd, du).dy; - float dvdy = ccl_fetch(sd, dv).dy; + float dudy = sd->du.dy; + float dvdy = sd->dv.dy; float dsdy = dsdu*dudy + dsdv*dvdy; float dtdy = dtdu*dudy + dtdv*dvdy; @@ -299,11 +299,11 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con float3 c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c; - if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c; + if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c; + if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c; #endif - return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c; + return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c; } else if(desc.element == ATTR_ELEMENT_CORNER || desc.element == ATTR_ELEMENT_CORNER_BYTE) { float2 uv[3]; @@ -337,11 +337,11 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con float3 c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c; - if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c; + if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c; + if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c; #endif - return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c; + return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c; } else { if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h index 3229091bbb0..105aee8da15 100644 --- a/intern/cycles/kernel/geom/geom_triangle.h +++ b/intern/cycles/kernel/geom/geom_triangle.h @@ -26,13 +26,13 @@ CCL_NAMESPACE_BEGIN ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd) { /* load triangle vertices */ - const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); const float3 v0 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+0)); const float3 v1 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+1)); const float3 v2 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+2)); /* return normal */ - if(ccl_fetch(sd, object_flag) & SD_OBJECT_NEGATIVE_SCALE_APPLIED) { + if(sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) { return normalize(cross(v2 - v0, v1 - v0)); } else { @@ -76,7 +76,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3 /* Interpolate smooth vertex normal from vertices */ -ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v) +ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v) { /* load triangle vertices */ const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); @@ -84,7 +84,9 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y)); float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z)); - return normalize((1.0f - u - v)*n2 + u*n0 + v*n1); + float3 N = safe_normalize((1.0f - u - v)*n2 + u*n0 + v*n1); + + return is_zero(N)? Ng: N; } /* Ray differentials on triangle */ @@ -110,34 +112,34 @@ ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *s if(dx) *dx = 0.0f; if(dy) *dy = 0.0f; - return kernel_tex_fetch(__attributes_float, desc.offset + ccl_fetch(sd, prim)); + return kernel_tex_fetch(__attributes_float, desc.offset + sd->prim); } else if(desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) { - uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); float f0 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.x); float f1 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.y); float f2 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.z); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; - if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2; + if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; + if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; #endif - return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2; + return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; } else if(desc.element == ATTR_ELEMENT_CORNER) { - int tri = desc.offset + ccl_fetch(sd, prim)*3; + int tri = desc.offset + sd->prim*3; float f0 = kernel_tex_fetch(__attributes_float, tri + 0); float f1 = kernel_tex_fetch(__attributes_float, tri + 1); float f2 = kernel_tex_fetch(__attributes_float, tri + 2); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; - if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2; + if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; + if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; #endif - return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2; + return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; } else { if(dx) *dx = 0.0f; @@ -153,24 +155,24 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f); - return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + ccl_fetch(sd, prim))); + return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + sd->prim)); } else if(desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) { - uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x)); float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y)); float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z)); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; - if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2; + if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; + if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; #endif - return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2; + return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; } else if(desc.element == ATTR_ELEMENT_CORNER || desc.element == ATTR_ELEMENT_CORNER_BYTE) { - int tri = desc.offset + ccl_fetch(sd, prim)*3; + int tri = desc.offset + sd->prim*3; float3 f0, f1, f2; if(desc.element == ATTR_ELEMENT_CORNER) { @@ -185,11 +187,11 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData } #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; - if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2; + if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; + if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; #endif - return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2; + return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; } else { if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h index 4db121d94f4..a3b23115ae4 100644 --- a/intern/cycles/kernel/geom/geom_triangle_intersect.h +++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h @@ -22,414 +22,153 @@ CCL_NAMESPACE_BEGIN -/* Workaround stupidness of CUDA/OpenCL which doesn't allow to access indexed - * component of float3 value. - */ -#ifndef __KERNEL_CPU__ -# define IDX(vec, idx) \ - ((idx == 0) ? ((vec).x) : ( (idx == 1) ? ((vec).y) : ((vec).z) )) -#else -# define IDX(vec, idx) ((vec)[idx]) -#endif - -/* Ray-Triangle intersection for BVH traversal - * - * Sven Woop - * Watertight Ray/Triangle Intersection - * - * http://jcgt.org/published/0002/01/05/paper.pdf - */ - -/* Precalculated data for the ray->tri intersection. */ -typedef struct IsectPrecalc { - /* Maximal dimension kz, and orthogonal dimensions. */ - int kx, ky, kz; - - /* Shear constants. */ - float Sx, Sy, Sz; -} IsectPrecalc; - -#if (defined(__KERNEL_OPENCL_APPLE__)) || \ - (defined(__KERNEL_CUDA__) && (defined(i386) || defined(_M_IX86))) -ccl_device_noinline -#else -ccl_device_inline -#endif -void triangle_intersect_precalc(float3 dir, - IsectPrecalc *isect_precalc) -{ - /* Calculate dimension where the ray direction is maximal. */ -#ifndef __KERNEL_SSE__ - int kz = util_max_axis(make_float3(fabsf(dir.x), - fabsf(dir.y), - fabsf(dir.z))); - int kx = kz + 1; if(kx == 3) kx = 0; - int ky = kx + 1; if(ky == 3) ky = 0; -#else - int kx, ky, kz; - /* Avoiding mispredicted branch on direction. */ - kz = util_max_axis(fabs(dir)); - static const char inc_xaxis[] = {1, 2, 0, 55}; - static const char inc_yaxis[] = {2, 0, 1, 55}; - kx = inc_xaxis[kz]; - ky = inc_yaxis[kz]; -#endif - - float dir_kz = IDX(dir, kz); - - /* Swap kx and ky dimensions to preserve winding direction of triangles. */ - if(dir_kz < 0.0f) { - int tmp = kx; - kx = ky; - ky = tmp; - } - - /* Calculate the shear constants. */ - float inv_dir_z = 1.0f / dir_kz; - isect_precalc->Sx = IDX(dir, kx) * inv_dir_z; - isect_precalc->Sy = IDX(dir, ky) * inv_dir_z; - isect_precalc->Sz = inv_dir_z; - - /* Store the dimensions. */ - isect_precalc->kx = kx; - isect_precalc->ky = ky; - isect_precalc->kz = kz; -} - -/* TODO(sergey): Make it general utility function. */ -ccl_device_inline float xor_signmask(float x, int y) -{ - return __int_as_float(__float_as_int(x) ^ y); -} - ccl_device_inline bool triangle_intersect(KernelGlobals *kg, - const IsectPrecalc *isect_precalc, Intersection *isect, float3 P, + float3 dir, uint visibility, int object, int prim_addr) { - const int kx = isect_precalc->kx; - const int ky = isect_precalc->ky; - const int kz = isect_precalc->kz; - const float Sx = isect_precalc->Sx; - const float Sy = isect_precalc->Sy; - const float Sz = isect_precalc->Sz; - - /* Calculate vertices relative to ray origin. */ const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr); - -#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) - const avxf avxf_P(P.m128, P.m128); - - const avxf tri_ab = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 0); - const avxf tri_bc = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 1); - - const avxf AB = tri_ab - avxf_P; - const avxf BC = tri_bc - avxf_P; - - const __m256i permute_mask = _mm256_set_epi32(0x3, kz, ky, kx, 0x3, kz, ky, kx); - - const avxf AB_k = shuffle(AB, permute_mask); - const avxf BC_k = shuffle(BC, permute_mask); - - /* Akz, Akz, Bkz, Bkz, Bkz, Bkz, Ckz, Ckz */ - const avxf ABBC_kz = shuffle<2>(AB_k, BC_k); - - /* Akx, Aky, Bkx, Bky, Bkx,Bky, Ckx, Cky */ - const avxf ABBC_kxy = shuffle<0,1,0,1>(AB_k, BC_k); - - const avxf Sxy(Sy, Sx, Sy, Sx); - - /* Ax, Ay, Bx, By, Bx, By, Cx, Cy */ - const avxf ABBC_xy = nmadd(ABBC_kz, Sxy, ABBC_kxy); - - float ABBC_kz_array[8]; - _mm256_storeu_ps((float*)&ABBC_kz_array, ABBC_kz); - - const float A_kz = ABBC_kz_array[0]; - const float B_kz = ABBC_kz_array[2]; - const float C_kz = ABBC_kz_array[6]; - - /* By, Bx, Cy, Cx, By, Bx, Ay, Ax */ - const avxf BCBA_yx = permute<3,2,7,6,3,2,1,0>(ABBC_xy); - - const avxf neg_mask(0,0,0,0,0x80000000, 0x80000000, 0x80000000, 0x80000000); - - /* W U V - * (AxBy-AyBx) (BxCy-ByCx) XX XX (BxBy-ByBx) (CxAy-CyAx) XX XX - */ - const avxf WUxxxxVxx_neg = _mm256_hsub_ps(ABBC_xy * BCBA_yx, neg_mask /* Dont care */); - - const avxf WUVWnegWUVW = permute<0,1,5,0,0,1,5,0>(WUxxxxVxx_neg) ^ neg_mask; - - /* Calculate scaled barycentric coordinates. */ - float WUVW_array[4]; - _mm_storeu_ps((float*)&WUVW_array, _mm256_castps256_ps128 (WUVWnegWUVW)); - - const float W = WUVW_array[0]; - const float U = WUVW_array[1]; - const float V = WUVW_array[2]; - - const int WUVW_mask = 0x7 & _mm256_movemask_ps(WUVWnegWUVW); - const int WUVW_zero = 0x7 & _mm256_movemask_ps(_mm256_cmp_ps(WUVWnegWUVW, - _mm256_setzero_ps(), 0)); - - if(!((WUVW_mask == 7) || (WUVW_mask == 0)) && ((WUVW_mask | WUVW_zero) != 7)) { - return false; - } +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + const ssef *ssef_verts = (ssef*)&kg->__prim_tri_verts.data[tri_vindex]; #else const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0), tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1), tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2); - const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z); - const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z); - const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z); - - const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz); - const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz); - const float C_kx = IDX(C, kx), C_ky = IDX(C, ky), C_kz = IDX(C, kz); - - /* Perform shear and scale of vertices. */ - const float Ax = A_kx - Sx * A_kz; - const float Ay = A_ky - Sy * A_kz; - const float Bx = B_kx - Sx * B_kz; - const float By = B_ky - Sy * B_kz; - const float Cx = C_kx - Sx * C_kz; - const float Cy = C_ky - Sy * C_kz; - - /* Calculate scaled barycentric coordinates. */ - float U = Cx * By - Cy * Bx; - float V = Ax * Cy - Ay * Cx; - float W = Bx * Ay - By * Ax; - if((U < 0.0f || V < 0.0f || W < 0.0f) && - (U > 0.0f || V > 0.0f || W > 0.0f)) - { - return false; - } #endif - - /* Calculate determinant. */ - float det = U + V + W; - if(UNLIKELY(det == 0.0f)) { - return false; - } - - /* Calculate scaled z-coordinates of vertices and use them to calculate - * the hit distance. - */ - const float T = (U * A_kz + V * B_kz + W * C_kz) * Sz; - const int sign_det = (__float_as_int(det) & 0x80000000); - const float sign_T = xor_signmask(T, sign_det); - if((sign_T < 0.0f) || - (sign_T > isect->t * xor_signmask(det, sign_det))) + float t, u, v; + if(ray_triangle_intersect(P, + dir, + isect->t, +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + ssef_verts, +#else + float4_to_float3(tri_a), + float4_to_float3(tri_b), + float4_to_float3(tri_c), +#endif + &u, &v, &t)) { - return false; - } - #ifdef __VISIBILITY_FLAG__ - /* visibility flag test. we do it here under the assumption - * that most triangles are culled by node flags */ - if(kernel_tex_fetch(__prim_visibility, prim_addr) & visibility) + /* Visibility flag test. we do it here under the assumption + * that most triangles are culled by node flags. + */ + if(kernel_tex_fetch(__prim_visibility, prim_addr) & visibility) #endif - { -#ifdef __KERNEL_CUDA__ - if(A == B && B == C) { - return false; + { + isect->prim = prim_addr; + isect->object = object; + isect->type = PRIMITIVE_TRIANGLE; + isect->u = u; + isect->v = v; + isect->t = t; + return true; } -#endif - /* Normalize U, V, W, and T. */ - const float inv_det = 1.0f / det; - isect->prim = prim_addr; - isect->object = object; - isect->type = PRIMITIVE_TRIANGLE; - isect->u = U * inv_det; - isect->v = V * inv_det; - isect->t = T * inv_det; - return true; } return false; } -/* Special ray intersection routines for subsurface scattering. In that case we +/* Special ray intersection routines for local intersection. In that case we * only want to intersect with primitives in the same object, and if case of * multiple hits we pick a single random primitive as the intersection point. */ -#ifdef __SUBSURFACE__ -ccl_device_inline void triangle_intersect_subsurface( +#ifdef __BVH_LOCAL__ +ccl_device_inline void triangle_intersect_local( KernelGlobals *kg, - const IsectPrecalc *isect_precalc, - SubsurfaceIntersection *ss_isect, + LocalIntersection *local_isect, float3 P, + float3 dir, int object, + int local_object, int prim_addr, float tmax, uint *lcg_state, int max_hits) { - const int kx = isect_precalc->kx; - const int ky = isect_precalc->ky; - const int kz = isect_precalc->kz; - const float Sx = isect_precalc->Sx; - const float Sy = isect_precalc->Sy; - const float Sz = isect_precalc->Sz; - - /* Calculate vertices relative to ray origin. */ - const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr); - const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0), - tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1), - tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2); - -#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) - const avxf avxf_P(P.m128, P.m128); - - const avxf tri_ab = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 0); - const avxf tri_bc = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 1); - - const avxf AB = tri_ab - avxf_P; - const avxf BC = tri_bc - avxf_P; - - const __m256i permuteMask = _mm256_set_epi32(0x3, kz, ky, kx, 0x3, kz, ky, kx); - - const avxf AB_k = shuffle(AB, permuteMask); - const avxf BC_k = shuffle(BC, permuteMask); - - /* Akz, Akz, Bkz, Bkz, Bkz, Bkz, Ckz, Ckz */ - const avxf ABBC_kz = shuffle<2>(AB_k, BC_k); - - /* Akx, Aky, Bkx, Bky, Bkx,Bky, Ckx, Cky */ - const avxf ABBC_kxy = shuffle<0,1,0,1>(AB_k, BC_k); - - const avxf Sxy(Sy, Sx, Sy, Sx); - - /* Ax, Ay, Bx, By, Bx, By, Cx, Cy */ - const avxf ABBC_xy = nmadd(ABBC_kz, Sxy, ABBC_kxy); - - float ABBC_kz_array[8]; - _mm256_storeu_ps((float*)&ABBC_kz_array, ABBC_kz); - - const float A_kz = ABBC_kz_array[0]; - const float B_kz = ABBC_kz_array[2]; - const float C_kz = ABBC_kz_array[6]; - - /* By, Bx, Cy, Cx, By, Bx, Ay, Ax */ - const avxf BCBA_yx = permute<3,2,7,6,3,2,1,0>(ABBC_xy); - - const avxf negMask(0,0,0,0,0x80000000, 0x80000000, 0x80000000, 0x80000000); - - /* W U V - * (AxBy-AyBx) (BxCy-ByCx) XX XX (BxBy-ByBx) (CxAy-CyAx) XX XX - */ - const avxf WUxxxxVxx_neg = _mm256_hsub_ps(ABBC_xy * BCBA_yx, negMask /* Dont care */); - - const avxf WUVWnegWUVW = permute<0,1,5,0,0,1,5,0>(WUxxxxVxx_neg) ^ negMask; - - /* Calculate scaled barycentric coordinates. */ - float WUVW_array[4]; - _mm_storeu_ps((float*)&WUVW_array, _mm256_castps256_ps128 (WUVWnegWUVW)); - - const float W = WUVW_array[0]; - const float U = WUVW_array[1]; - const float V = WUVW_array[2]; - - const int WUVW_mask = 0x7 & _mm256_movemask_ps(WUVWnegWUVW); - const int WUVW_zero = 0x7 & _mm256_movemask_ps(_mm256_cmp_ps(WUVWnegWUVW, - _mm256_setzero_ps(), 0)); - - if(!((WUVW_mask == 7) || (WUVW_mask == 0)) && ((WUVW_mask | WUVW_zero) != 7)) { - return; + /* Only intersect with matching object, for instanced objects we + * already know we are only intersecting the right object. */ + if(object == OBJECT_NONE) { + if(kernel_tex_fetch(__prim_object, prim_addr) != local_object) { + return; + } } + + const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr); +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + const ssef *ssef_verts = (ssef*)&kg->__prim_tri_verts.data[tri_vindex]; #else - const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z); - const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z); - const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z); - - const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz); - const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz); - const float C_kx = IDX(C, kx), C_ky = IDX(C, ky), C_kz = IDX(C, kz); - - /* Perform shear and scale of vertices. */ - const float Ax = A_kx - Sx * A_kz; - const float Ay = A_ky - Sy * A_kz; - const float Bx = B_kx - Sx * B_kz; - const float By = B_ky - Sy * B_kz; - const float Cx = C_kx - Sx * C_kz; - const float Cy = C_ky - Sy * C_kz; - - /* Calculate scaled barycentric coordinates. */ - float U = Cx * By - Cy * Bx; - float V = Ax * Cy - Ay * Cx; - float W = Bx * Ay - By * Ax; - - if((U < 0.0f || V < 0.0f || W < 0.0f) && - (U > 0.0f || V > 0.0f || W > 0.0f)) - { - return; - } + const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+0)), + tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+1)), + tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+2)); #endif - - /* Calculate determinant. */ - float det = U + V + W; - if(UNLIKELY(det == 0.0f)) { - return; - } - - /* Calculate scaled z−coordinates of vertices and use them to calculate - * the hit distance. - */ - const int sign_det = (__float_as_int(det) & 0x80000000); - const float T = (U * A_kz + V * B_kz + W * C_kz) * Sz; - const float sign_T = xor_signmask(T, sign_det); - if((sign_T < 0.0f) || - (sign_T > tmax * xor_signmask(det, sign_det))) + float t, u, v; + if(!ray_triangle_intersect(P, + dir, + tmax, +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + ssef_verts, +#else + tri_a, tri_b, tri_c, +#endif + &u, &v, &t)) { return; } - /* Normalize U, V, W, and T. */ - const float inv_det = 1.0f / det; - - const float t = T * inv_det; - for(int i = min(max_hits, ss_isect->num_hits) - 1; i >= 0; --i) { - if(ss_isect->hits[i].t == t) { - return; + int hit; + if(lcg_state) { + /* Record up to max_hits intersections. */ + for(int i = min(max_hits, local_isect->num_hits) - 1; i >= 0; --i) { + if(local_isect->hits[i].t == t) { + return; + } } - } - ss_isect->num_hits++; - int hit; + local_isect->num_hits++; - if(ss_isect->num_hits <= max_hits) { - hit = ss_isect->num_hits - 1; + if(local_isect->num_hits <= max_hits) { + hit = local_isect->num_hits - 1; + } + else { + /* reservoir sampling: if we are at the maximum number of + * hits, randomly replace element or skip it */ + hit = lcg_step_uint(lcg_state) % local_isect->num_hits; + + if(hit >= max_hits) + return; + } } else { - /* reservoir sampling: if we are at the maximum number of - * hits, randomly replace element or skip it */ - hit = lcg_step_uint(lcg_state) % ss_isect->num_hits; - - if(hit >= max_hits) + /* Record closest intersection only. */ + if(local_isect->num_hits && t > local_isect->hits[0].t) { return; + } + + hit = 0; + local_isect->num_hits = 1; } - /* record intersection */ - Intersection *isect = &ss_isect->hits[hit]; + /* Record intersection. */ + Intersection *isect = &local_isect->hits[hit]; isect->prim = prim_addr; isect->object = object; isect->type = PRIMITIVE_TRIANGLE; - isect->u = U * inv_det; - isect->v = V * inv_det; + isect->u = u; + isect->v = v; isect->t = t; /* Record geometric normal. */ - /* TODO(sergey): Use float4_to_float3() on just an edges. */ - const float3 v0 = float4_to_float3(tri_a); - const float3 v1 = float4_to_float3(tri_b); - const float3 v2 = float4_to_float3(tri_c); - ss_isect->Ng[hit] = normalize(cross(v1 - v0, v2 - v0)); -} +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+0)), + tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+1)), + tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+2)); #endif + local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a)); +} +#endif /* __BVH_LOCAL__ */ /* Refine triangle intersection to more precise hit point. For rays that travel * far the precision is often not so good, this reintersects the primitive from @@ -457,7 +196,7 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg, return P; } # ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_itfm); + Transform tfm = sd->ob_itfm; # else Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); # endif @@ -491,7 +230,7 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg, if(isect->object != OBJECT_NONE) { # ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_tfm); + Transform tfm = sd->ob_tfm; # else Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); # endif @@ -508,10 +247,10 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg, /* Same as above, except that isect->t is assumed to be in object space for * instancing. */ -ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, - ShaderData *sd, - const Intersection *isect, - const Ray *ray) +ccl_device_inline float3 triangle_refine_local(KernelGlobals *kg, + ShaderData *sd, + const Intersection *isect, + const Ray *ray) { float3 P = ray->P; float3 D = ray->D; @@ -519,7 +258,7 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, if(isect->object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_itfm); + Transform tfm = sd->ob_itfm; #else Transform tfm = object_fetch_transform(kg, isect->object, @@ -557,7 +296,7 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, if(isect->object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_tfm); + Transform tfm = sd->ob_tfm; #else Transform tfm = object_fetch_transform(kg, isect->object, @@ -570,6 +309,4 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, return P; } -#undef IDX - CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h index 03724c955be..346f228e961 100644 --- a/intern/cycles/kernel/geom/geom_volume.h +++ b/intern/cycles/kernel/geom/geom_volume.h @@ -29,21 +29,6 @@ CCL_NAMESPACE_BEGIN /* Return position normalized to 0..1 in mesh bounds */ -#if defined(__KERNEL_CUDA__) && __CUDA_ARCH__ < 300 -ccl_device float4 volume_image_texture_3d(int id, float x, float y, float z) -{ - float4 r; - switch(id) { - case 0: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_000, x, y, z); break; - case 1: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_001, x, y, z); break; - case 2: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_002, x, y, z); break; - case 3: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_003, x, y, z); break; - case 4: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_004, x, y, z); break; - } - return r; -} -#endif /* __KERNEL_CUDA__ */ - ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg, const ShaderData *sd, float3 P) @@ -64,24 +49,9 @@ ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg, ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy) { - float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P)); -#ifdef __KERNEL_CUDA__ -# if __CUDA_ARCH__ >= 300 - CUtexObject tex = kernel_tex_fetch(__bindless_mapping, desc.offset); - float f = kernel_tex_image_interp_3d_float(tex, P.x, P.y, P.z); - float4 r = make_float4(f, f, f, 1.0f); -# else - float4 r = volume_image_texture_3d(desc.offset, P.x, P.y, P.z); -# endif -#elif defined(__KERNEL_OPENCL__) - float4 r = kernel_tex_image_interp_3d(kg, desc.offset, P.x, P.y, P.z); -#else - float4 r; - if(sd->flag & SD_VOLUME_CUBIC) - r = kernel_tex_image_interp_3d_ex(desc.offset, P.x, P.y, P.z, INTERPOLATION_CUBIC); - else - r = kernel_tex_image_interp_3d(desc.offset, P.x, P.y, P.z); -#endif + float3 P = volume_normalized_position(kg, sd, sd->P); + InterpolationType interp = (sd->flag & SD_VOLUME_CUBIC)? INTERPOLATION_CUBIC: INTERPOLATION_NONE; + float4 r = kernel_tex_image_interp_3d(kg, desc.offset, P.x, P.y, P.z, interp); if(dx) *dx = 0.0f; if(dy) *dy = 0.0f; @@ -91,28 +61,20 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, float3 *dy) { - float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P)); -#ifdef __KERNEL_CUDA__ -# if __CUDA_ARCH__ >= 300 - CUtexObject tex = kernel_tex_fetch(__bindless_mapping, desc.offset); - float4 r = kernel_tex_image_interp_3d_float4(tex, P.x, P.y, P.z); -# else - float4 r = volume_image_texture_3d(desc.offset, P.x, P.y, P.z); -# endif -#elif defined(__KERNEL_OPENCL__) - float4 r = kernel_tex_image_interp_3d(kg, desc.offset, P.x, P.y, P.z); -#else - float4 r; - if(sd->flag & SD_VOLUME_CUBIC) - r = kernel_tex_image_interp_3d_ex(desc.offset, P.x, P.y, P.z, INTERPOLATION_CUBIC); - else - r = kernel_tex_image_interp_3d(desc.offset, P.x, P.y, P.z); -#endif + float3 P = volume_normalized_position(kg, sd, sd->P); + InterpolationType interp = (sd->flag & SD_VOLUME_CUBIC)? INTERPOLATION_CUBIC: INTERPOLATION_NONE; + float4 r = kernel_tex_image_interp_3d(kg, desc.offset, P.x, P.y, P.z, interp); if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f); - return float4_to_float3(r); + if(r.w > 1e-6f && r.w != 1.0f) { + /* For RGBA colors, unpremultiply after interpolation. */ + return float4_to_float3(r) / r.w; + } + else { + return float4_to_float3(r); + } } #endif diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h index 9279a94c13a..cfb35dd33f5 100644 --- a/intern/cycles/kernel/kernel.h +++ b/intern/cycles/kernel/kernel.h @@ -19,7 +19,8 @@ /* CPU Kernel Interface */ -#include "util_types.h" +#include "util/util_types.h" +#include "kernel/kernel_types.h" CCL_NAMESPACE_BEGIN @@ -28,6 +29,7 @@ CCL_NAMESPACE_BEGIN #define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name) struct KernelGlobals; +struct KernelData; KernelGlobals *kernel_globals_create(); void kernel_globals_free(KernelGlobals *kg); @@ -38,40 +40,26 @@ bool kernel_osl_use(KernelGlobals *kg); void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size); void kernel_tex_copy(KernelGlobals *kg, const char *name, - device_ptr mem, - size_t width, - size_t height, - size_t depth, - InterpolationType interpolation=INTERPOLATION_LINEAR, - ExtensionType extension = EXTENSION_REPEAT); + void *mem, + size_t size); #define KERNEL_ARCH cpu -#include "kernels/cpu/kernel_cpu.h" - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 -# define KERNEL_ARCH cpu_sse2 -# include "kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 -# define KERNEL_ARCH cpu_sse3 -# include "kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 -# define KERNEL_ARCH cpu_sse41 -# include "kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX -# define KERNEL_ARCH cpu_avx -# include "kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 -# define KERNEL_ARCH cpu_avx2 -# include "kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/kernel_cpu.h" CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h index 6c3ee6b8098..c0f281cae97 100644 --- a/intern/cycles/kernel/kernel_accumulate.h +++ b/intern/cycles/kernel/kernel_accumulate.h @@ -21,6 +21,9 @@ CCL_NAMESPACE_BEGIN * BSDF evaluation result, split per BSDF type. This is used to accumulate * render passes separately. */ +ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, + const ShaderData *sd); + ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 value, int use_light_pass) { #ifdef __PASSES__ @@ -52,10 +55,17 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v { eval->diffuse = value; } +#ifdef __SHADOW_TRICKS__ + eval->sum_no_mis = make_float3(0.0f, 0.0f, 0.0f); +#endif } -ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value) +ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value, float mis_weight) { +#ifdef __SHADOW_TRICKS__ + eval->sum_no_mis += value; +#endif + value *= mis_weight; #ifdef __PASSES__ if(eval->use_light_pass) { if(CLOSURE_IS_BSDF_DIFFUSE(type)) @@ -96,7 +106,7 @@ ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval) } } -ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float value) +ccl_device_inline void bsdf_eval_mis(BsdfEval *eval, float value) { #ifdef __PASSES__ if(eval->use_light_pass) { @@ -115,8 +125,19 @@ ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float value) } } +ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float value) +{ +#ifdef __SHADOW_TRICKS__ + eval->sum_no_mis *= value; +#endif + bsdf_eval_mis(eval, value); +} + ccl_device_inline void bsdf_eval_mul3(BsdfEval *eval, float3 value) { +#ifdef __SHADOW_TRICKS__ + eval->sum_no_mis *= value; +#endif #ifdef __PASSES__ if(eval->use_light_pass) { eval->diffuse *= value; @@ -134,7 +155,7 @@ ccl_device_inline void bsdf_eval_mul3(BsdfEval *eval, float3 value) #endif } -ccl_device_inline float3 bsdf_eval_sum(BsdfEval *eval) +ccl_device_inline float3 bsdf_eval_sum(const BsdfEval *eval) { #ifdef __PASSES__ if(eval->use_light_pass) { @@ -160,14 +181,12 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass) if(use_light_pass) { L->indirect = make_float3(0.0f, 0.0f, 0.0f); - L->direct_throughput = make_float3(0.0f, 0.0f, 0.0f); L->direct_emission = make_float3(0.0f, 0.0f, 0.0f); L->color_diffuse = make_float3(0.0f, 0.0f, 0.0f); L->color_glossy = make_float3(0.0f, 0.0f, 0.0f); L->color_transmission = make_float3(0.0f, 0.0f, 0.0f); L->color_subsurface = make_float3(0.0f, 0.0f, 0.0f); - L->color_scatter = make_float3(0.0f, 0.0f, 0.0f); L->direct_diffuse = make_float3(0.0f, 0.0f, 0.0f); L->direct_glossy = make_float3(0.0f, 0.0f, 0.0f); @@ -181,45 +200,78 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass) L->indirect_subsurface = make_float3(0.0f, 0.0f, 0.0f); L->indirect_scatter = make_float3(0.0f, 0.0f, 0.0f); - L->path_diffuse = make_float3(0.0f, 0.0f, 0.0f); - L->path_glossy = make_float3(0.0f, 0.0f, 0.0f); - L->path_transmission = make_float3(0.0f, 0.0f, 0.0f); - L->path_subsurface = make_float3(0.0f, 0.0f, 0.0f); - L->path_scatter = make_float3(0.0f, 0.0f, 0.0f); - + L->transparent = 0.0f; L->emission = make_float3(0.0f, 0.0f, 0.0f); L->background = make_float3(0.0f, 0.0f, 0.0f); L->ao = make_float3(0.0f, 0.0f, 0.0f); L->shadow = make_float4(0.0f, 0.0f, 0.0f, 0.0f); L->mist = 0.0f; + + L->state.diffuse = make_float3(0.0f, 0.0f, 0.0f); + L->state.glossy = make_float3(0.0f, 0.0f, 0.0f); + L->state.transmission = make_float3(0.0f, 0.0f, 0.0f); + L->state.subsurface = make_float3(0.0f, 0.0f, 0.0f); + L->state.scatter = make_float3(0.0f, 0.0f, 0.0f); + L->state.direct = make_float3(0.0f, 0.0f, 0.0f); } else #endif { + L->transparent = 0.0f; L->emission = make_float3(0.0f, 0.0f, 0.0f); } + +#ifdef __SHADOW_TRICKS__ + L->path_total = make_float3(0.0f, 0.0f, 0.0f); + L->path_total_shaded = make_float3(0.0f, 0.0f, 0.0f); + L->shadow_background_color = make_float3(0.0f, 0.0f, 0.0f); + L->shadow_throughput = 0.0f; + L->shadow_transparency = 1.0f; + L->has_shadow_catcher = 0; +#endif + +#ifdef __DENOISING_FEATURES__ + L->denoising_normal = make_float3(0.0f, 0.0f, 0.0f); + L->denoising_albedo = make_float3(0.0f, 0.0f, 0.0f); + L->denoising_depth = 0.0f; +#endif + +#ifdef __KERNEL_DEBUG__ + L->debug_data.num_bvh_traversed_nodes = 0; + L->debug_data.num_bvh_traversed_instances = 0; + L->debug_data.num_bvh_intersections = 0; + L->debug_data.num_ray_bounces = 0; +#endif } -ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput, - BsdfEval *bsdf_eval, float bsdf_pdf, int bounce, int bsdf_label) +ccl_device_inline void path_radiance_bsdf_bounce( + KernelGlobals *kg, + PathRadianceState *L_state, + ccl_addr_space float3 *throughput, + BsdfEval *bsdf_eval, + float bsdf_pdf, int bounce, int bsdf_label) { float inverse_pdf = 1.0f/bsdf_pdf; #ifdef __PASSES__ - if(L->use_light_pass) { + if(kernel_data.film.use_light_pass) { if(bounce == 0 && !(bsdf_label & LABEL_TRANSPARENT)) { /* first on directly visible surface */ float3 value = *throughput*inverse_pdf; - L->path_diffuse = bsdf_eval->diffuse*value; - L->path_glossy = bsdf_eval->glossy*value; - L->path_transmission = bsdf_eval->transmission*value; - L->path_subsurface = bsdf_eval->subsurface*value; - L->path_scatter = bsdf_eval->scatter*value; - - *throughput = L->path_diffuse + L->path_glossy + L->path_transmission + L->path_subsurface + L->path_scatter; + L_state->diffuse = bsdf_eval->diffuse*value; + L_state->glossy = bsdf_eval->glossy*value; + L_state->transmission = bsdf_eval->transmission*value; + L_state->subsurface = bsdf_eval->subsurface*value; + L_state->scatter = bsdf_eval->scatter*value; + + *throughput = L_state->diffuse + + L_state->glossy + + L_state->transmission + + L_state->subsurface + + L_state->scatter; - L->direct_throughput = *throughput; + L_state->direct = *throughput; } else { /* transparent bounce before first hit, or indirectly visible through BSDF */ @@ -234,13 +286,22 @@ ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space } } -ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 throughput, float3 value, int bounce) +ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, + ccl_addr_space PathState *state, + float3 throughput, + float3 value) { +#ifdef __SHADOW_TRICKS__ + if(state->flag & PATH_RAY_SHADOW_CATCHER) { + return; + } +#endif + #ifdef __PASSES__ if(L->use_light_pass) { - if(bounce == 0) + if(state->bounce == 0) L->emission += throughput*value; - else if(bounce == 1) + else if(state->bounce == 1) L->direct_emission += throughput*value; else L->indirect += throughput*value; @@ -252,11 +313,28 @@ ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 thro } } -ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, float3 throughput, float3 alpha, float3 bsdf, float3 ao, int bounce) +ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, + ccl_addr_space PathState *state, + float3 throughput, + float3 alpha, + float3 bsdf, + float3 ao) { +#ifdef __SHADOW_TRICKS__ + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + float3 light = throughput * bsdf; + L->path_total += light; + L->path_total_shaded += ao * light; + + if(state->flag & PATH_RAY_SHADOW_CATCHER) { + return; + } + } +#endif + #ifdef __PASSES__ if(L->use_light_pass) { - if(bounce == 0) { + if(state->bounce == 0) { /* directly visible lighting */ L->direct_diffuse += throughput*bsdf*ao; L->ao += alpha*throughput*ao; @@ -273,11 +351,47 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, float3 throughput } } -ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 throughput, BsdfEval *bsdf_eval, float3 shadow, float shadow_fac, int bounce, bool is_lamp) +ccl_device_inline void path_radiance_accum_total_ao( + PathRadiance *L, + ccl_addr_space PathState *state, + float3 throughput, + float3 bsdf) +{ +#ifdef __SHADOW_TRICKS__ + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + L->path_total += throughput * bsdf; + } +#else + (void) L; + (void) state; + (void) throughput; + (void) bsdf; +#endif +} + +ccl_device_inline void path_radiance_accum_light(PathRadiance *L, + ccl_addr_space PathState *state, + float3 throughput, + BsdfEval *bsdf_eval, + float3 shadow, + float shadow_fac, + bool is_lamp) { +#ifdef __SHADOW_TRICKS__ + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + float3 light = throughput * bsdf_eval->sum_no_mis; + L->path_total += light; + L->path_total_shaded += shadow * light; + + if(state->flag & PATH_RAY_SHADOW_CATCHER) { + return; + } + } +#endif + #ifdef __PASSES__ if(L->use_light_pass) { - if(bounce == 0) { + if(state->bounce == 0) { /* directly visible lighting */ L->direct_diffuse += throughput*bsdf_eval->diffuse*shadow; L->direct_glossy += throughput*bsdf_eval->glossy*shadow; @@ -303,13 +417,47 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through } } -ccl_device_inline void path_radiance_accum_background(PathRadiance *L, float3 throughput, float3 value, int bounce) +ccl_device_inline void path_radiance_accum_total_light( + PathRadiance *L, + ccl_addr_space PathState *state, + float3 throughput, + const BsdfEval *bsdf_eval) { +#ifdef __SHADOW_TRICKS__ + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + L->path_total += throughput * bsdf_eval->sum_no_mis; + } +#else + (void) L; + (void) state; + (void) throughput; + (void) bsdf_eval; +#endif +} + +ccl_device_inline void path_radiance_accum_background( + PathRadiance *L, + ccl_addr_space PathState *state, + float3 throughput, + float3 value) +{ + +#ifdef __SHADOW_TRICKS__ + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + L->path_total += throughput * value; + L->path_total_shaded += throughput * value * L->shadow_transparency; + + if(state->flag & PATH_RAY_SHADOW_CATCHER) { + return; + } + } +#endif + #ifdef __PASSES__ if(L->use_light_pass) { - if(bounce == 0) + if(state->flag & PATH_RAY_TRANSPARENT_BACKGROUND) L->background += throughput*value; - else if(bounce == 1) + else if(state->bounce == 1) L->direct_emission += throughput*value; else L->indirect += throughput*value; @@ -319,7 +467,31 @@ ccl_device_inline void path_radiance_accum_background(PathRadiance *L, float3 th { L->emission += throughput*value; } + +#ifdef __DENOISING_FEATURES__ + L->denoising_albedo += state->denoising_feature_weight * value; +#endif /* __DENOISING_FEATURES__ */ +} + +ccl_device_inline void path_radiance_accum_transparent( + PathRadiance *L, + ccl_addr_space PathState *state, + float3 throughput) +{ + L->transparent += average(throughput); +} + +#ifdef __SHADOW_TRICKS__ +ccl_device_inline void path_radiance_accum_shadowcatcher( + PathRadiance *L, + float3 throughput, + float3 background) +{ + L->shadow_throughput += average(throughput); + L->shadow_background_color += throughput * background; + L->has_shadow_catcher = 1; } +#endif ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L) { @@ -328,19 +500,19 @@ ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L) * only a single throughput further along the path, here we recover just * the indirect path that is not influenced by any particular BSDF type */ if(L->use_light_pass) { - L->direct_emission = safe_divide_color(L->direct_emission, L->direct_throughput); - L->direct_diffuse += L->path_diffuse*L->direct_emission; - L->direct_glossy += L->path_glossy*L->direct_emission; - L->direct_transmission += L->path_transmission*L->direct_emission; - L->direct_subsurface += L->path_subsurface*L->direct_emission; - L->direct_scatter += L->path_scatter*L->direct_emission; - - L->indirect = safe_divide_color(L->indirect, L->direct_throughput); - L->indirect_diffuse += L->path_diffuse*L->indirect; - L->indirect_glossy += L->path_glossy*L->indirect; - L->indirect_transmission += L->path_transmission*L->indirect; - L->indirect_subsurface += L->path_subsurface*L->indirect; - L->indirect_scatter += L->path_scatter*L->indirect; + L->direct_emission = safe_divide_color(L->direct_emission, L->state.direct); + L->direct_diffuse += L->state.diffuse*L->direct_emission; + L->direct_glossy += L->state.glossy*L->direct_emission; + L->direct_transmission += L->state.transmission*L->direct_emission; + L->direct_subsurface += L->state.subsurface*L->direct_emission; + L->direct_scatter += L->state.scatter*L->direct_emission; + + L->indirect = safe_divide_color(L->indirect, L->state.direct); + L->indirect_diffuse += L->state.diffuse*L->indirect; + L->indirect_glossy += L->state.glossy*L->indirect; + L->indirect_transmission += L->state.transmission*L->indirect; + L->indirect_subsurface += L->state.subsurface*L->indirect; + L->indirect_scatter += L->state.scatter*L->indirect; } #endif } @@ -349,11 +521,11 @@ ccl_device_inline void path_radiance_reset_indirect(PathRadiance *L) { #ifdef __PASSES__ if(L->use_light_pass) { - L->path_diffuse = make_float3(0.0f, 0.0f, 0.0f); - L->path_glossy = make_float3(0.0f, 0.0f, 0.0f); - L->path_transmission = make_float3(0.0f, 0.0f, 0.0f); - L->path_subsurface = make_float3(0.0f, 0.0f, 0.0f); - L->path_scatter = make_float3(0.0f, 0.0f, 0.0f); + L->state.diffuse = make_float3(0.0f, 0.0f, 0.0f); + L->state.glossy = make_float3(0.0f, 0.0f, 0.0f); + L->state.transmission = make_float3(0.0f, 0.0f, 0.0f); + L->state.subsurface = make_float3(0.0f, 0.0f, 0.0f); + L->state.scatter = make_float3(0.0f, 0.0f, 0.0f); L->direct_emission = make_float3(0.0f, 0.0f, 0.0f); L->indirect = make_float3(0.0f, 0.0f, 0.0f); @@ -366,11 +538,7 @@ ccl_device_inline void path_radiance_copy_indirect(PathRadiance *L, { #ifdef __PASSES__ if(L->use_light_pass) { - L->path_diffuse = L_src->path_diffuse; - L->path_glossy = L_src->path_glossy; - L->path_transmission = L_src->path_transmission; - L->path_subsurface = L_src->path_subsurface; - L->path_scatter = L_src->path_scatter; + L->state = L_src->state; L->direct_emission = L_src->direct_emission; L->indirect = L_src->indirect; @@ -378,7 +546,40 @@ ccl_device_inline void path_radiance_copy_indirect(PathRadiance *L, #endif } -ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadiance *L) +#ifdef __SHADOW_TRICKS__ +ccl_device_inline void path_radiance_sum_shadowcatcher(KernelGlobals *kg, + PathRadiance *L, + float3 *L_sum, + float *alpha) +{ + /* Calculate current shadow of the path. */ + float path_total = average(L->path_total); + float shadow; + + if(UNLIKELY(!isfinite_safe(path_total))) { + kernel_assert(!"Non-finite total radiance along the path"); + shadow = 0.0f; + } + else if(path_total == 0.0f) { + shadow = L->shadow_transparency; + } + else { + float path_total_shaded = average(L->path_total_shaded); + shadow = path_total_shaded / path_total; + } + + /* Calculate final light sum and transparency for shadow catcher object. */ + if(kernel_data.background.transparent) { + *alpha -= L->shadow_throughput * shadow; + } + else { + L->shadow_background_color *= shadow; + *L_sum += L->shadow_background_color; + } +} +#endif + +ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadiance *L, float *alpha) { float3 L_sum; /* Light Passes are used */ @@ -399,7 +600,7 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z); /* Reject invalid value */ - if(!isfinite(sum)) { + if(!isfinite_safe(sum)) { kernel_assert(!"Non-finite sum in path_radiance_clamp_and_sum!"); L_sum = make_float3(0.0f, 0.0f, 0.0f); @@ -455,8 +656,6 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi L_sum = L_direct + L_indirect; } #endif - - return L_sum; } /* No Light Passes */ @@ -464,42 +663,105 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi #endif { L_sum = L->emission; + + /* Reject invalid value */ + float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z); + if(!isfinite_safe(sum)) { + kernel_assert(!"Non-finite final sum in path_radiance_clamp_and_sum!"); + L_sum = make_float3(0.0f, 0.0f, 0.0f); + } } - /* Reject invalid value */ - float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z); - if(!isfinite(sum)) { - kernel_assert(!"Non-finite final sum in path_radiance_clamp_and_sum!"); - L_sum = make_float3(0.0f, 0.0f, 0.0f); + /* Compute alpha. */ + *alpha = 1.0f - L->transparent; + + /* Add shadow catcher contributions. */ +#ifdef __SHADOW_TRICKS__ + if(L->has_shadow_catcher) { + path_radiance_sum_shadowcatcher(kg, L, &L_sum, alpha); } +#endif /* __SHADOW_TRICKS__ */ return L_sum; } -ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample, int num_samples) +ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg, PathRadiance *L, float3 *noisy, float3 *clean) +{ +#ifdef __PASSES__ + kernel_assert(L->use_light_pass); + + *clean = L->emission + L->background; + *noisy = L->direct_scatter + L->indirect_scatter; + +# define ADD_COMPONENT(flag, component) \ + if(kernel_data.film.denoising_flags & flag) \ + *clean += component; \ + else \ + *noisy += component; + + ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_DIR, L->direct_diffuse); + ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_IND, L->indirect_diffuse); + ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_DIR, L->direct_glossy); + ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_IND, L->indirect_glossy); + ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_DIR, L->direct_transmission); + ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_IND, L->indirect_transmission); + ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_DIR, L->direct_subsurface); + ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_IND, L->indirect_subsurface); +# undef ADD_COMPONENT +#else + *noisy = L->emission; + *clean = make_float3(0.0f, 0.0f, 0.0f); +#endif + +#ifdef __SHADOW_TRICKS__ + if(L->has_shadow_catcher) { + *noisy += L->shadow_background_color; + } +#endif + + *noisy = ensure_finite3(*noisy); + *clean = ensure_finite3(*clean); +} + +ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample) { - float fac = 1.0f/num_samples; +#ifdef __SPLIT_KERNEL__ +# define safe_float3_add(f, v) \ + do { \ + ccl_global float *p = (ccl_global float*)(&(f)); \ + atomic_add_and_fetch_float(p+0, (v).x); \ + atomic_add_and_fetch_float(p+1, (v).y); \ + atomic_add_and_fetch_float(p+2, (v).z); \ + } while(0) +# define safe_float_add(f, v) \ + atomic_add_and_fetch_float(&(f), (v)) +#else +# define safe_float3_add(f, v) (f) += (v) +# define safe_float_add(f, v) (f) += (v) +#endif /* __SPLIT_KERNEL__ */ #ifdef __PASSES__ - L->direct_diffuse += L_sample->direct_diffuse*fac; - L->direct_glossy += L_sample->direct_glossy*fac; - L->direct_transmission += L_sample->direct_transmission*fac; - L->direct_subsurface += L_sample->direct_subsurface*fac; - L->direct_scatter += L_sample->direct_scatter*fac; - - L->indirect_diffuse += L_sample->indirect_diffuse*fac; - L->indirect_glossy += L_sample->indirect_glossy*fac; - L->indirect_transmission += L_sample->indirect_transmission*fac; - L->indirect_subsurface += L_sample->indirect_subsurface*fac; - L->indirect_scatter += L_sample->indirect_scatter*fac; - - L->background += L_sample->background*fac; - L->ao += L_sample->ao*fac; - L->shadow += L_sample->shadow*fac; - L->mist += L_sample->mist*fac; -#endif - L->emission += L_sample->emission * fac; + safe_float3_add(L->direct_diffuse, L_sample->direct_diffuse); + safe_float3_add(L->direct_glossy, L_sample->direct_glossy); + safe_float3_add(L->direct_transmission, L_sample->direct_transmission); + safe_float3_add(L->direct_subsurface, L_sample->direct_subsurface); + safe_float3_add(L->direct_scatter, L_sample->direct_scatter); + + safe_float3_add(L->indirect_diffuse, L_sample->indirect_diffuse); + safe_float3_add(L->indirect_glossy, L_sample->indirect_glossy); + safe_float3_add(L->indirect_transmission, L_sample->indirect_transmission); + safe_float3_add(L->indirect_subsurface, L_sample->indirect_subsurface); + safe_float3_add(L->indirect_scatter, L_sample->indirect_scatter); + + safe_float3_add(L->background, L_sample->background); + safe_float3_add(L->ao, L_sample->ao); + safe_float3_add(L->shadow, L_sample->shadow); + safe_float_add(L->mist, L_sample->mist); +#endif /* __PASSES__ */ + safe_float3_add(L->emission, L_sample->emission); + +#undef safe_float_add +#undef safe_float3_add } CCL_NAMESPACE_END - diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h index 5bcc57cdcdf..79e6d1b4862 100644 --- a/intern/cycles/kernel/kernel_bake.h +++ b/intern/cycles/kernel/kernel_bake.h @@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, - RNG rng, + uint rng_hash, int pass_filter, int sample) { @@ -48,13 +48,13 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, path_radiance_init(&L_sample, kernel_data.film.use_light_pass); /* init path state */ - path_state_init(kg, &emission_sd, &state, &rng, sample, NULL); + path_state_init(kg, &emission_sd, &state, rng_hash, sample, NULL); /* evaluate surface shader */ - float rbsdf = path_state_rng_1D(kg, &rng, &state, PRNG_BSDF); - shader_eval_surface(kg, sd, &rng, &state, rbsdf, state.flag, SHADER_CONTEXT_MAIN); + shader_eval_surface(kg, sd, &state, state.flag); - /* TODO, disable the closures we won't need */ + /* TODO, disable more closures we don't need besides transparent */ + shader_bsdf_disable_transparency(kg, sd); #ifdef __BRANCHED_PATH__ if(!kernel_data.integrator.branched) { @@ -63,13 +63,13 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, /* sample ambient occlusion */ if(pass_filter & BAKE_FILTER_AO) { - kernel_path_ao(kg, sd, &emission_sd, &L_sample, &state, &rng, throughput, shader_bsdf_alpha(kg, sd)); + kernel_path_ao(kg, sd, &emission_sd, &L_sample, &state, throughput, shader_bsdf_alpha(kg, sd)); } /* sample emission */ if((pass_filter & BAKE_FILTER_EMISSION) && (sd->flag & SD_EMISSION)) { float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf); - path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce); + path_radiance_accum_emission(&L_sample, &state, throughput, emission); } bool is_sss_sample = false; @@ -85,7 +85,6 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, &emission_sd, &L_sample, &state, - &rng, &ray, &throughput, &ss_indirect)) @@ -100,13 +99,10 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, kernel_path_indirect(kg, &indirect_sd, &emission_sd, - &rng, &ray, throughput, - state.num_samples, &state, &L_sample); - kernel_path_subsurface_accum_indirect(&ss_indirect, &L_sample); } is_sss_sample = true; } @@ -115,14 +111,14 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, /* sample light and BSDF */ if(!is_sss_sample && (pass_filter & (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT))) { - kernel_path_surface_connect_light(kg, &rng, sd, &emission_sd, throughput, &state, &L_sample); + kernel_path_surface_connect_light(kg, sd, &emission_sd, throughput, &state, &L_sample); - if(kernel_path_surface_bounce(kg, &rng, sd, &throughput, &state, &L_sample, &ray)) { + if(kernel_path_surface_bounce(kg, sd, &throughput, &state, &L_sample.state, &ray)) { #ifdef __LAMP_MIS__ state.ray_t = 0.0f; #endif /* compute indirect light */ - kernel_path_indirect(kg, &indirect_sd, &emission_sd, &rng, &ray, throughput, 1, &state, &L_sample); + kernel_path_indirect(kg, &indirect_sd, &emission_sd, &ray, throughput, &state, &L_sample); /* sum and reset indirect light pass variables for the next samples */ path_radiance_sum_indirect(&L_sample); @@ -136,13 +132,13 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, /* sample ambient occlusion */ if(pass_filter & BAKE_FILTER_AO) { - kernel_branched_path_ao(kg, sd, &emission_sd, &L_sample, &state, &rng, throughput); + kernel_branched_path_ao(kg, sd, &emission_sd, &L_sample, &state, throughput); } /* sample emission */ if((pass_filter & BAKE_FILTER_EMISSION) && (sd->flag & SD_EMISSION)) { float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf); - path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce); + path_radiance_accum_emission(&L_sample, &state, throughput, emission); } #ifdef __SUBSURFACE__ @@ -150,7 +146,7 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, if((pass_filter & BAKE_FILTER_SUBSURFACE) && (sd->flag & SD_BSSRDF)) { /* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */ kernel_branched_path_subsurface_scatter(kg, sd, &indirect_sd, - &emission_sd, &L_sample, &state, &rng, &ray, throughput); + &emission_sd, &L_sample, &state, &ray, throughput); } #endif @@ -160,31 +156,20 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, /* direct light */ if(kernel_data.integrator.use_direct_light) { int all = kernel_data.integrator.sample_all_lights_direct; - kernel_branched_path_surface_connect_light(kg, &rng, + kernel_branched_path_surface_connect_light(kg, sd, &emission_sd, &state, throughput, 1.0f, &L_sample, all); } #endif /* indirect light */ - kernel_branched_path_surface_indirect_light(kg, &rng, + kernel_branched_path_surface_indirect_light(kg, sd, &indirect_sd, &emission_sd, throughput, 1.0f, &state, &L_sample); } } #endif /* accumulate into master L */ - path_radiance_accum_sample(L, &L_sample, 1); -} - -ccl_device bool is_aa_pass(ShaderEvalType type) -{ - switch(type) { - case SHADER_EVAL_UV: - case SHADER_EVAL_NORMAL: - return false; - default: - return true; - } + path_radiance_accum_sample(L, &L_sample); } /* this helps with AA but it's not the real solution as it does not AA the geometry @@ -224,7 +209,6 @@ ccl_device_inline float3 kernel_bake_shader_bsdf(KernelGlobals *kg, ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg, ShaderData *sd, - RNG *rng, PathState *state, float3 direct, float3 indirect, @@ -244,12 +228,12 @@ ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg, } else { /* surface color of the pass only */ - shader_eval_surface(kg, sd, rng, state, 0.0f, 0, SHADER_CONTEXT_MAIN); + shader_eval_surface(kg, sd, state, 0); return kernel_bake_shader_bsdf(kg, sd, type); } } else { - shader_eval_surface(kg, sd, rng, state, 0.0f, 0, SHADER_CONTEXT_MAIN); + shader_eval_surface(kg, sd, state, 0); color = kernel_bake_shader_bsdf(kg, sd, type); } @@ -291,14 +275,14 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, int num_samples = kernel_data.integrator.aa_samples; /* random number generator */ - RNG rng = cmj_hash(offset + i, kernel_data.integrator.seed); + uint rng_hash = cmj_hash(offset + i, kernel_data.integrator.seed); float filter_x, filter_y; if(sample == 0) { filter_x = filter_y = 0.5f; } else { - path_rng_2D(kg, &rng, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_y); + path_rng_2D(kg, rng_hash, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_y); } /* subpixel u/v offset */ @@ -332,20 +316,44 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, sd.dv.dx = dvdx; sd.dv.dy = dvdy; + /* set RNG state for shaders that use sampling */ + state.rng_hash = rng_hash; + state.rng_offset = 0; + state.sample = sample; + state.num_samples = num_samples; + state.min_ray_pdf = FLT_MAX; + /* light passes if we need more than color */ if(pass_filter & ~BAKE_FILTER_COLOR) - compute_light_pass(kg, &sd, &L, rng, pass_filter, sample); + compute_light_pass(kg, &sd, &L, rng_hash, pass_filter, sample); switch(type) { /* data passes */ case SHADER_EVAL_NORMAL: + case SHADER_EVAL_ROUGHNESS: + case SHADER_EVAL_EMISSION: { - if((sd.flag & SD_HAS_BUMP)) { - shader_eval_surface(kg, &sd, &rng, &state, 0.f, 0, SHADER_CONTEXT_MAIN); + if(type != SHADER_EVAL_NORMAL || (sd.flag & SD_HAS_BUMP)) { + int path_flag = (type == SHADER_EVAL_EMISSION) ? PATH_RAY_EMISSION : 0; + shader_eval_surface(kg, &sd, &state, path_flag); } - /* compression: normal = (2 * color) - 1 */ - out = sd.N * 0.5f + make_float3(0.5f, 0.5f, 0.5f); + if(type == SHADER_EVAL_NORMAL) { + float3 N = sd.N; + if(sd.flag & SD_HAS_BUMP) { + N = shader_bsdf_average_normal(kg, &sd); + } + + /* encoding: normal = (2 * color) - 1 */ + out = N * 0.5f + make_float3(0.5f, 0.5f, 0.5f); + } + else if(type == SHADER_EVAL_ROUGHNESS) { + float roughness = shader_bsdf_average_roughness(&sd); + out = make_float3(roughness, roughness, roughness); + } + else { + out = shader_emissive_eval(kg, &sd); + } break; } case SHADER_EVAL_UV: @@ -353,13 +361,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, out = primitive_uv(kg, &sd); break; } - case SHADER_EVAL_EMISSION: - { - shader_eval_surface(kg, &sd, &rng, &state, 0.f, 0, SHADER_CONTEXT_EMISSION); - out = shader_emissive_eval(kg, &sd); - break; - } - #ifdef __PASSES__ /* light passes */ case SHADER_EVAL_AO: @@ -370,7 +371,8 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, case SHADER_EVAL_COMBINED: { if((pass_filter & BAKE_FILTER_COMBINED) == BAKE_FILTER_COMBINED) { - out = path_radiance_clamp_and_sum(kg, &L); + float alpha; + out = path_radiance_clamp_and_sum(kg, &L, &alpha); break; } @@ -408,7 +410,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, { out = kernel_bake_evaluate_direct_indirect(kg, &sd, - &rng, &state, L.direct_diffuse, L.indirect_diffuse, @@ -420,7 +421,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, { out = kernel_bake_evaluate_direct_indirect(kg, &sd, - &rng, &state, L.direct_glossy, L.indirect_glossy, @@ -432,7 +432,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, { out = kernel_bake_evaluate_direct_indirect(kg, &sd, - &rng, &state, L.direct_transmission, L.indirect_transmission, @@ -445,7 +444,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, #ifdef __SUBSURFACE__ out = kernel_bake_evaluate_direct_indirect(kg, &sd, - &rng, &state, L.direct_subsurface, L.indirect_subsurface, @@ -479,7 +477,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, /* evaluate */ int flag = 0; /* we can't know which type of BSDF this is for */ - out = shader_eval_background(kg, &sd, &state, flag, SHADER_CONTEXT_MAIN); + out = shader_eval_background(kg, &sd, &state, flag); break; } default: @@ -491,86 +489,77 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, } /* write output */ - const float output_fac = is_aa_pass(type)? 1.0f/num_samples: 1.0f; + const float output_fac = 1.0f/num_samples; const float4 scaled_result = make_float4(out.x, out.y, out.z, 1.0f) * output_fac; - output[i] = (sample == 0)? scaled_result: output[i] + scaled_result; + output[i] = (sample == 0)? scaled_result: output[i] + scaled_result; } #endif /* __BAKING__ */ -ccl_device void kernel_shader_evaluate(KernelGlobals *kg, - ccl_global uint4 *input, - ccl_global float4 *output, - ccl_global float *output_luma, - ShaderEvalType type, - int i, - int sample) +ccl_device void kernel_displace_evaluate(KernelGlobals *kg, + ccl_global uint4 *input, + ccl_global float4 *output, + int i) { ShaderData sd; PathState state = {0}; uint4 in = input[i]; - float3 out; - if(type == SHADER_EVAL_DISPLACE) { - /* setup shader data */ - int object = in.x; - int prim = in.y; - float u = __uint_as_float(in.z); - float v = __uint_as_float(in.w); + /* setup shader data */ + int object = in.x; + int prim = in.y; + float u = __uint_as_float(in.z); + float v = __uint_as_float(in.w); - shader_setup_from_displace(kg, &sd, object, prim, u, v); + shader_setup_from_displace(kg, &sd, object, prim, u, v); - /* evaluate */ - float3 P = sd.P; - shader_eval_displacement(kg, &sd, &state, SHADER_CONTEXT_MAIN); - out = sd.P - P; + /* evaluate */ + float3 P = sd.P; + shader_eval_displacement(kg, &sd, &state); + float3 D = sd.P - P; - object_inverse_dir_transform(kg, &sd, &out); - } - else { // SHADER_EVAL_BACKGROUND - /* setup ray */ - Ray ray; - float u = __uint_as_float(in.x); - float v = __uint_as_float(in.y); - - ray.P = make_float3(0.0f, 0.0f, 0.0f); - ray.D = equirectangular_to_direction(u, v); - ray.t = 0.0f; + object_inverse_dir_transform(kg, &sd, &D); + + /* write output */ + output[i] += make_float4(D.x, D.y, D.z, 0.0f); +} + +ccl_device void kernel_background_evaluate(KernelGlobals *kg, + ccl_global uint4 *input, + ccl_global float4 *output, + int i) +{ + ShaderData sd; + PathState state = {0}; + uint4 in = input[i]; + + /* setup ray */ + Ray ray; + float u = __uint_as_float(in.x); + float v = __uint_as_float(in.y); + + ray.P = make_float3(0.0f, 0.0f, 0.0f); + ray.D = equirectangular_to_direction(u, v); + ray.t = 0.0f; #ifdef __CAMERA_MOTION__ - ray.time = 0.5f; + ray.time = 0.5f; #endif #ifdef __RAY_DIFFERENTIALS__ - ray.dD = differential3_zero(); - ray.dP = differential3_zero(); + ray.dD = differential3_zero(); + ray.dP = differential3_zero(); #endif - /* setup shader data */ - shader_setup_from_background(kg, &sd, &ray); + /* setup shader data */ + shader_setup_from_background(kg, &sd, &ray); + + /* evaluate */ + int flag = 0; /* we can't know which type of BSDF this is for */ + float3 color = shader_eval_background(kg, &sd, &state, flag); - /* evaluate */ - int flag = 0; /* we can't know which type of BSDF this is for */ - out = shader_eval_background(kg, &sd, &state, flag, SHADER_CONTEXT_MAIN); - } - /* write output */ - if(sample == 0) { - if(output != NULL) { - output[i] = make_float4(out.x, out.y, out.z, 0.0f); - } - if(output_luma != NULL) { - output_luma[i] = average(out); - } - } - else { - if(output != NULL) { - output[i] += make_float4(out.x, out.y, out.z, 0.0f); - } - if(output_luma != NULL) { - output_luma[i] += average(out); - } - } + output[i] += make_float4(color.x, color.y, color.z, 0.0f); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h index dedac6b1465..b73ad47dad3 100644 --- a/intern/cycles/kernel/kernel_camera.h +++ b/intern/cycles/kernel/kernel_camera.h @@ -18,9 +18,9 @@ CCL_NAMESPACE_BEGIN /* Perspective Camera */ -ccl_device float2 camera_sample_aperture(KernelGlobals *kg, float u, float v) +ccl_device float2 camera_sample_aperture(ccl_constant KernelCamera *cam, float u, float v) { - float blades = kernel_data.cam.blades; + float blades = cam->blades; float2 bokeh; if(blades == 0.0f) { @@ -29,12 +29,12 @@ ccl_device float2 camera_sample_aperture(KernelGlobals *kg, float u, float v) } else { /* sample polygon */ - float rotation = kernel_data.cam.bladesrotation; + float rotation = cam->bladesrotation; bokeh = regular_polygon_sample(blades, rotation, u, v); } /* anamorphic lens bokeh */ - bokeh.x *= kernel_data.cam.inv_aperture_ratio; + bokeh.x *= cam->inv_aperture_ratio; return bokeh; } @@ -42,7 +42,7 @@ ccl_device float2 camera_sample_aperture(KernelGlobals *kg, float u, float v) ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray) { /* create ray form raster position */ - Transform rastertocamera = kernel_data.cam.rastertocamera; + ProjectionTransform rastertocamera = kernel_data.cam.rastertocamera; float3 raster = make_float3(raster_x, raster_y, 0.0f); float3 Pcamera = transform_perspective(&rastertocamera, raster); @@ -54,13 +54,13 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, flo * interpolated field of view. */ if(ray->time < 0.5f) { - Transform rastertocamera_pre = kernel_data.cam.perspective_motion.pre; + ProjectionTransform rastertocamera_pre = kernel_data.cam.perspective_pre; float3 Pcamera_pre = transform_perspective(&rastertocamera_pre, raster); Pcamera = interp(Pcamera_pre, Pcamera, ray->time * 2.0f); } else { - Transform rastertocamera_post = kernel_data.cam.perspective_motion.post; + ProjectionTransform rastertocamera_post = kernel_data.cam.perspective_post; float3 Pcamera_post = transform_perspective(&rastertocamera_post, raster); Pcamera = interp(Pcamera, Pcamera_post, (ray->time - 0.5f) * 2.0f); @@ -76,7 +76,7 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, flo if(aperturesize > 0.0f) { /* sample point on aperture */ - float2 lensuv = camera_sample_aperture(kg, lens_u, lens_v)*aperturesize; + float2 lensuv = camera_sample_aperture(&kernel_data.cam, lens_u, lens_v)*aperturesize; /* compute point on plane of focus */ float ft = kernel_data.cam.focaldistance/D.z; @@ -91,17 +91,12 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, flo Transform cameratoworld = kernel_data.cam.cameratoworld; #ifdef __CAMERA_MOTION__ - if(kernel_data.cam.have_motion) { -# ifdef __KERNEL_OPENCL__ - const MotionTransform tfm = kernel_data.cam.motion; - transform_motion_interpolate(&cameratoworld, - ((const DecompMotionTransform*)&tfm), - ray->time); -# else - transform_motion_interpolate(&cameratoworld, - ((const DecompMotionTransform*)&kernel_data.cam.motion), - ray->time); -# endif + if(kernel_data.cam.num_motion_steps) { + transform_motion_array_interpolate( + &cameratoworld, + kernel_tex_array(__camera_motion), + kernel_data.cam.num_motion_steps, + ray->time); } #endif @@ -124,7 +119,7 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, flo } else { /* Spherical stereo */ - spherical_stereo_transform(kg, &P, &D); + spherical_stereo_transform(&kernel_data.cam, &P, &D); ray->P = P; ray->D = D; @@ -138,12 +133,12 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, flo float3 Pcenter = Pnostereo; float3 Dcenter = Pcamera; Dcenter = normalize(transform_direction(&cameratoworld, Dcenter)); - spherical_stereo_transform(kg, &Pcenter, &Dcenter); + spherical_stereo_transform(&kernel_data.cam, &Pcenter, &Dcenter); float3 Px = Pnostereo; float3 Dx = transform_perspective(&rastertocamera, make_float3(raster_x + 1.0f, raster_y, 0.0f)); Dx = normalize(transform_direction(&cameratoworld, Dx)); - spherical_stereo_transform(kg, &Px, &Dx); + spherical_stereo_transform(&kernel_data.cam, &Px, &Dx); ray->dP.dx = Px - Pcenter; ray->dD.dx = Dx - Dcenter; @@ -151,7 +146,7 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, flo float3 Py = Pnostereo; float3 Dy = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y + 1.0f, 0.0f)); Dy = normalize(transform_direction(&cameratoworld, Dy)); - spherical_stereo_transform(kg, &Py, &Dy); + spherical_stereo_transform(&kernel_data.cam, &Py, &Dy); ray->dP.dy = Py - Pcenter; ray->dD.dy = Dy - Dcenter; @@ -175,7 +170,7 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, flo ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray) { /* create ray form raster position */ - Transform rastertocamera = kernel_data.cam.rastertocamera; + ProjectionTransform rastertocamera = kernel_data.cam.rastertocamera; float3 Pcamera = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y, 0.0f)); float3 P; @@ -186,7 +181,7 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, fl if(aperturesize > 0.0f) { /* sample point on aperture */ - float2 lensuv = camera_sample_aperture(kg, lens_u, lens_v)*aperturesize; + float2 lensuv = camera_sample_aperture(&kernel_data.cam, lens_u, lens_v)*aperturesize; /* compute point on plane of focus */ float3 Pfocus = D * kernel_data.cam.focaldistance; @@ -203,17 +198,12 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, fl Transform cameratoworld = kernel_data.cam.cameratoworld; #ifdef __CAMERA_MOTION__ - if(kernel_data.cam.have_motion) { -# ifdef __KERNEL_OPENCL__ - const MotionTransform tfm = kernel_data.cam.motion; - transform_motion_interpolate(&cameratoworld, - (const DecompMotionTransform*)&tfm, - ray->time); -# else - transform_motion_interpolate(&cameratoworld, - (const DecompMotionTransform*)&kernel_data.cam.motion, - ray->time); -# endif + if(kernel_data.cam.num_motion_steps) { + transform_motion_array_interpolate( + &cameratoworld, + kernel_tex_array(__camera_motion), + kernel_data.cam.num_motion_steps, + ray->time); } #endif @@ -238,17 +228,18 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, fl /* Panorama Camera */ -ccl_device_inline void camera_sample_panorama(KernelGlobals *kg, +ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam, + const ccl_global DecomposedTransform *cam_motion, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray) { - Transform rastertocamera = kernel_data.cam.rastertocamera; + ProjectionTransform rastertocamera = cam->rastertocamera; float3 Pcamera = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y, 0.0f)); /* create ray form raster position */ float3 P = make_float3(0.0f, 0.0f, 0.0f); - float3 D = panorama_to_direction(kg, Pcamera.x, Pcamera.y); + float3 D = panorama_to_direction(cam, Pcamera.x, Pcamera.y); /* indicates ray should not receive any light, outside of the lens */ if(is_zero(D)) { @@ -257,15 +248,15 @@ ccl_device_inline void camera_sample_panorama(KernelGlobals *kg, } /* modify ray for depth of field */ - float aperturesize = kernel_data.cam.aperturesize; + float aperturesize = cam->aperturesize; if(aperturesize > 0.0f) { /* sample point on aperture */ - float2 lensuv = camera_sample_aperture(kg, lens_u, lens_v)*aperturesize; + float2 lensuv = camera_sample_aperture(cam, lens_u, lens_v)*aperturesize; /* compute point on plane of focus */ float3 Dfocus = normalize(D); - float3 Pfocus = Dfocus * kernel_data.cam.focaldistance; + float3 Pfocus = Dfocus * cam->focaldistance; /* calculate orthonormal coordinates perpendicular to Dfocus */ float3 U, V; @@ -278,20 +269,15 @@ ccl_device_inline void camera_sample_panorama(KernelGlobals *kg, } /* transform ray from camera to world */ - Transform cameratoworld = kernel_data.cam.cameratoworld; + Transform cameratoworld = cam->cameratoworld; #ifdef __CAMERA_MOTION__ - if(kernel_data.cam.have_motion) { -# ifdef __KERNEL_OPENCL__ - const MotionTransform tfm = kernel_data.cam.motion; - transform_motion_interpolate(&cameratoworld, - (const DecompMotionTransform*)&tfm, - ray->time); -# else - transform_motion_interpolate(&cameratoworld, - (const DecompMotionTransform*)&kernel_data.cam.motion, - ray->time); -# endif + if(cam->num_motion_steps) { + transform_motion_array_interpolate( + &cameratoworld, + cam_motion, + cam->num_motion_steps, + ray->time); } #endif @@ -299,9 +285,9 @@ ccl_device_inline void camera_sample_panorama(KernelGlobals *kg, D = normalize(transform_direction(&cameratoworld, D)); /* Stereo transform */ - bool use_stereo = kernel_data.cam.interocular_offset != 0.0f; + bool use_stereo = cam->interocular_offset != 0.0f; if(use_stereo) { - spherical_stereo_transform(kg, &P, &D); + spherical_stereo_transform(cam, &P, &D); } ray->P = P; @@ -313,30 +299,30 @@ ccl_device_inline void camera_sample_panorama(KernelGlobals *kg, * ray origin and direction for the center and two neighbouring pixels * and simply take their differences. */ float3 Pcenter = Pcamera; - float3 Dcenter = panorama_to_direction(kg, Pcenter.x, Pcenter.y); + float3 Dcenter = panorama_to_direction(cam, Pcenter.x, Pcenter.y); Pcenter = transform_point(&cameratoworld, Pcenter); Dcenter = normalize(transform_direction(&cameratoworld, Dcenter)); if(use_stereo) { - spherical_stereo_transform(kg, &Pcenter, &Dcenter); + spherical_stereo_transform(cam, &Pcenter, &Dcenter); } float3 Px = transform_perspective(&rastertocamera, make_float3(raster_x + 1.0f, raster_y, 0.0f)); - float3 Dx = panorama_to_direction(kg, Px.x, Px.y); + float3 Dx = panorama_to_direction(cam, Px.x, Px.y); Px = transform_point(&cameratoworld, Px); Dx = normalize(transform_direction(&cameratoworld, Dx)); if(use_stereo) { - spherical_stereo_transform(kg, &Px, &Dx); + spherical_stereo_transform(cam, &Px, &Dx); } ray->dP.dx = Px - Pcenter; ray->dD.dx = Dx - Dcenter; float3 Py = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y + 1.0f, 0.0f)); - float3 Dy = panorama_to_direction(kg, Py.x, Py.y); + float3 Dy = panorama_to_direction(cam, Py.x, Py.y); Py = transform_point(&cameratoworld, Py); Dy = normalize(transform_direction(&cameratoworld, Dy)); if(use_stereo) { - spherical_stereo_transform(kg, &Py, &Dy); + spherical_stereo_transform(cam, &Py, &Dy); } ray->dP.dy = Py - Pcenter; @@ -345,11 +331,11 @@ ccl_device_inline void camera_sample_panorama(KernelGlobals *kg, #ifdef __CAMERA_CLIPPING__ /* clipping */ - float nearclip = kernel_data.cam.nearclip; + float nearclip = cam->nearclip; ray->P += nearclip * ray->D; ray->dP.dx += nearclip * ray->dD.dx; ray->dP.dy += nearclip * ray->dD.dy; - ray->t = kernel_data.cam.cliplength; + ray->t = cam->cliplength; #else ray->t = FLT_MAX; #endif @@ -410,12 +396,16 @@ ccl_device_inline void camera_sample(KernelGlobals *kg, #endif /* sample */ - if(kernel_data.cam.type == CAMERA_PERSPECTIVE) + if(kernel_data.cam.type == CAMERA_PERSPECTIVE) { camera_sample_perspective(kg, raster_x, raster_y, lens_u, lens_v, ray); - else if(kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) + } + else if(kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) { camera_sample_orthographic(kg, raster_x, raster_y, lens_u, lens_v, ray); - else - camera_sample_panorama(kg, raster_x, raster_y, lens_u, lens_v, ray); + } + else { + const ccl_global DecomposedTransform *cam_motion = kernel_tex_array(__camera_motion); + camera_sample_panorama(&kernel_data.cam, cam_motion, raster_x, raster_y, lens_u, lens_v, ray); + } } /* Utilities */ @@ -457,22 +447,22 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd, { if(kernel_data.cam.type != CAMERA_PANORAMA) { /* perspective / ortho */ - if(ccl_fetch(sd, object) == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE) + if(sd->object == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE) P += camera_position(kg); - Transform tfm = kernel_data.cam.worldtondc; + ProjectionTransform tfm = kernel_data.cam.worldtondc; return transform_perspective(&tfm, P); } else { /* panorama */ Transform tfm = kernel_data.cam.worldtocamera; - if(ccl_fetch(sd, object) != OBJECT_NONE) + if(sd->object != OBJECT_NONE) P = normalize(transform_point(&tfm, P)); else P = normalize(transform_direction(&tfm, P)); - float2 uv = direction_to_panorama(kg, P); + float2 uv = direction_to_panorama(&kernel_data.cam, P); return make_float3(uv.x, uv.y, 0.0f); } diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h index 9d1f3bdc918..d26b668cb11 100644 --- a/intern/cycles/kernel/kernel_compat_cpu.h +++ b/intern/cycles/kernel/kernel_compat_cpu.h @@ -35,15 +35,23 @@ # define __NODES_FEATURES__ NODE_FEATURE_ALL #endif -#include "util_debug.h" -#include "util_math.h" -#include "util_simd.h" -#include "util_half.h" -#include "util_types.h" -#include "util_texture.h" +#include "util/util_math.h" +#include "util/util_simd.h" +#include "util/util_half.h" +#include "util/util_types.h" +#include "util/util_texture.h" #define ccl_addr_space +#define ccl_local_id(d) 0 +#define ccl_global_id(d) (kg->global_id[d]) + +#define ccl_local_size(d) 1 +#define ccl_global_size(d) (kg->global_size[d]) + +#define ccl_group_id(d) ccl_global_id(d) +#define ccl_num_groups(d) ccl_global_size(d) + /* On x86_64, versions of glibc < 2.16 have an issue where expf is * much slower than the double version. This was fixed in glibc 2.16. */ @@ -65,7 +73,7 @@ CCL_NAMESPACE_BEGIN * pointer lookup. */ template<typename T> struct texture { - ccl_always_inline T fetch(int index) + ccl_always_inline const T& fetch(int index) { kernel_assert(index >= 0 && index < width); return data[index]; @@ -78,9 +86,9 @@ template<typename T> struct texture { ccl_always_inline avxf fetch_avxf(const int index) { kernel_assert(index >= 0 && (index+1) < width); - ssef *ssefData = (ssef*)data; - ssef *ssefNodeData = &ssefData[index]; - return _mm256_loadu_ps((float *)ssefNodeData); + ssef *ssef_data = (ssef*)data; + ssef *ssef_node_data = &ssef_data[index]; + return _mm256_loadu_ps((float *)ssef_node_data); } #endif @@ -103,420 +111,6 @@ template<typename T> struct texture { int width; }; -template<typename T> struct texture_image { -#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \ - { \ - u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \ - u[1] = (( 0.5f * t - 1.0f) * t ) * t + (2.0f/3.0f); \ - u[2] = (( -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \ - u[3] = (1.0f / 6.0f) * t * t * t; \ - } (void)0 - - ccl_always_inline float4 read(float4 r) - { - return r; - } - - ccl_always_inline float4 read(uchar4 r) - { - float f = 1.0f/255.0f; - return make_float4(r.x*f, r.y*f, r.z*f, r.w*f); - } - - ccl_always_inline float4 read(uchar r) - { - float f = r*(1.0f/255.0f); - return make_float4(f, f, f, 1.0f); - } - - ccl_always_inline float4 read(float r) - { - /* TODO(dingto): Optimize this, so interpolation - * happens on float instead of float4 */ - return make_float4(r, r, r, 1.0f); - } - - ccl_always_inline float4 read(half4 r) - { - return half4_to_float4(r); - } - - ccl_always_inline float4 read(half r) - { - float f = half_to_float(r); - return make_float4(f, f, f, 1.0f); - } - - ccl_always_inline int wrap_periodic(int x, int width) - { - x %= width; - if(x < 0) - x += width; - return x; - } - - ccl_always_inline int wrap_clamp(int x, int width) - { - return clamp(x, 0, width-1); - } - - ccl_always_inline float frac(float x, int *ix) - { - int i = float_to_int(x) - ((x < 0.0f)? 1: 0); - *ix = i; - return x - (float)i; - } - - ccl_always_inline float4 interp(float x, float y) - { - if(UNLIKELY(!data)) - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - - int ix, iy, nix, niy; - - if(interpolation == INTERPOLATION_CLOSEST) { - frac(x*(float)width, &ix); - frac(y*(float)height, &iy); - switch(extension) { - case EXTENSION_REPEAT: - ix = wrap_periodic(ix, width); - iy = wrap_periodic(iy, height); - break; - case EXTENSION_CLIP: - if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - /* Fall through. */ - case EXTENSION_EXTEND: - ix = wrap_clamp(ix, width); - iy = wrap_clamp(iy, height); - break; - default: - kernel_assert(0); - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - return read(data[ix + iy*width]); - } - else if(interpolation == INTERPOLATION_LINEAR) { - float tx = frac(x*(float)width - 0.5f, &ix); - float ty = frac(y*(float)height - 0.5f, &iy); - - switch(extension) { - case EXTENSION_REPEAT: - ix = wrap_periodic(ix, width); - iy = wrap_periodic(iy, height); - - nix = wrap_periodic(ix+1, width); - niy = wrap_periodic(iy+1, height); - break; - case EXTENSION_CLIP: - if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - /* Fall through. */ - case EXTENSION_EXTEND: - nix = wrap_clamp(ix+1, width); - niy = wrap_clamp(iy+1, height); - - ix = wrap_clamp(ix, width); - iy = wrap_clamp(iy, height); - break; - default: - kernel_assert(0); - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - - float4 r = (1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width]); - r += (1.0f - ty)*tx*read(data[nix + iy*width]); - r += ty*(1.0f - tx)*read(data[ix + niy*width]); - r += ty*tx*read(data[nix + niy*width]); - - return r; - } - else { - /* Bicubic b-spline interpolation. */ - float tx = frac(x*(float)width - 0.5f, &ix); - float ty = frac(y*(float)height - 0.5f, &iy); - int pix, piy, nnix, nniy; - switch(extension) { - case EXTENSION_REPEAT: - ix = wrap_periodic(ix, width); - iy = wrap_periodic(iy, height); - - pix = wrap_periodic(ix-1, width); - piy = wrap_periodic(iy-1, height); - - nix = wrap_periodic(ix+1, width); - niy = wrap_periodic(iy+1, height); - - nnix = wrap_periodic(ix+2, width); - nniy = wrap_periodic(iy+2, height); - break; - case EXTENSION_CLIP: - if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - /* Fall through. */ - case EXTENSION_EXTEND: - pix = wrap_clamp(ix-1, width); - piy = wrap_clamp(iy-1, height); - - nix = wrap_clamp(ix+1, width); - niy = wrap_clamp(iy+1, height); - - nnix = wrap_clamp(ix+2, width); - nniy = wrap_clamp(iy+2, height); - - ix = wrap_clamp(ix, width); - iy = wrap_clamp(iy, height); - break; - default: - kernel_assert(0); - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - - const int xc[4] = {pix, ix, nix, nnix}; - const int yc[4] = {width * piy, - width * iy, - width * niy, - width * nniy}; - float u[4], v[4]; - /* Some helper macro to keep code reasonable size, - * let compiler to inline all the matrix multiplications. - */ -#define DATA(x, y) (read(data[xc[x] + yc[y]])) -#define TERM(col) \ - (v[col] * (u[0] * DATA(0, col) + \ - u[1] * DATA(1, col) + \ - u[2] * DATA(2, col) + \ - u[3] * DATA(3, col))) - - SET_CUBIC_SPLINE_WEIGHTS(u, tx); - SET_CUBIC_SPLINE_WEIGHTS(v, ty); - - /* Actual interpolation. */ - return TERM(0) + TERM(1) + TERM(2) + TERM(3); - -#undef TERM -#undef DATA - } - } - - ccl_always_inline float4 interp_3d(float x, float y, float z) - { - return interp_3d_ex(x, y, z, interpolation); - } - - ccl_always_inline float4 interp_3d_ex(float x, float y, float z, - int interpolation = INTERPOLATION_LINEAR) - { - if(UNLIKELY(!data)) - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - - int ix, iy, iz, nix, niy, niz; - - if(interpolation == INTERPOLATION_CLOSEST) { - frac(x*(float)width, &ix); - frac(y*(float)height, &iy); - frac(z*(float)depth, &iz); - - switch(extension) { - case EXTENSION_REPEAT: - ix = wrap_periodic(ix, width); - iy = wrap_periodic(iy, height); - iz = wrap_periodic(iz, depth); - break; - case EXTENSION_CLIP: - if(x < 0.0f || y < 0.0f || z < 0.0f || - x > 1.0f || y > 1.0f || z > 1.0f) - { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - /* Fall through. */ - case EXTENSION_EXTEND: - ix = wrap_clamp(ix, width); - iy = wrap_clamp(iy, height); - iz = wrap_clamp(iz, depth); - break; - default: - kernel_assert(0); - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - - return read(data[ix + iy*width + iz*width*height]); - } - else if(interpolation == INTERPOLATION_LINEAR) { - float tx = frac(x*(float)width - 0.5f, &ix); - float ty = frac(y*(float)height - 0.5f, &iy); - float tz = frac(z*(float)depth - 0.5f, &iz); - - switch(extension) { - case EXTENSION_REPEAT: - ix = wrap_periodic(ix, width); - iy = wrap_periodic(iy, height); - iz = wrap_periodic(iz, depth); - - nix = wrap_periodic(ix+1, width); - niy = wrap_periodic(iy+1, height); - niz = wrap_periodic(iz+1, depth); - break; - case EXTENSION_CLIP: - if(x < 0.0f || y < 0.0f || z < 0.0f || - x > 1.0f || y > 1.0f || z > 1.0f) - { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - /* Fall through. */ - case EXTENSION_EXTEND: - nix = wrap_clamp(ix+1, width); - niy = wrap_clamp(iy+1, height); - niz = wrap_clamp(iz+1, depth); - - ix = wrap_clamp(ix, width); - iy = wrap_clamp(iy, height); - iz = wrap_clamp(iz, depth); - break; - default: - kernel_assert(0); - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - - float4 r; - - r = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + iz*width*height]); - r += (1.0f - tz)*(1.0f - ty)*tx*read(data[nix + iy*width + iz*width*height]); - r += (1.0f - tz)*ty*(1.0f - tx)*read(data[ix + niy*width + iz*width*height]); - r += (1.0f - tz)*ty*tx*read(data[nix + niy*width + iz*width*height]); - - r += tz*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + niz*width*height]); - r += tz*(1.0f - ty)*tx*read(data[nix + iy*width + niz*width*height]); - r += tz*ty*(1.0f - tx)*read(data[ix + niy*width + niz*width*height]); - r += tz*ty*tx*read(data[nix + niy*width + niz*width*height]); - - return r; - } - else { - /* Tricubic b-spline interpolation. */ - const float tx = frac(x*(float)width - 0.5f, &ix); - const float ty = frac(y*(float)height - 0.5f, &iy); - const float tz = frac(z*(float)depth - 0.5f, &iz); - int pix, piy, piz, nnix, nniy, nniz; - - switch(extension) { - case EXTENSION_REPEAT: - ix = wrap_periodic(ix, width); - iy = wrap_periodic(iy, height); - iz = wrap_periodic(iz, depth); - - pix = wrap_periodic(ix-1, width); - piy = wrap_periodic(iy-1, height); - piz = wrap_periodic(iz-1, depth); - - nix = wrap_periodic(ix+1, width); - niy = wrap_periodic(iy+1, height); - niz = wrap_periodic(iz+1, depth); - - nnix = wrap_periodic(ix+2, width); - nniy = wrap_periodic(iy+2, height); - nniz = wrap_periodic(iz+2, depth); - break; - case EXTENSION_CLIP: - if(x < 0.0f || y < 0.0f || z < 0.0f || - x > 1.0f || y > 1.0f || z > 1.0f) - { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - /* Fall through. */ - case EXTENSION_EXTEND: - pix = wrap_clamp(ix-1, width); - piy = wrap_clamp(iy-1, height); - piz = wrap_clamp(iz-1, depth); - - nix = wrap_clamp(ix+1, width); - niy = wrap_clamp(iy+1, height); - niz = wrap_clamp(iz+1, depth); - - nnix = wrap_clamp(ix+2, width); - nniy = wrap_clamp(iy+2, height); - nniz = wrap_clamp(iz+2, depth); - - ix = wrap_clamp(ix, width); - iy = wrap_clamp(iy, height); - iz = wrap_clamp(iz, depth); - break; - default: - kernel_assert(0); - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - - const int xc[4] = {pix, ix, nix, nnix}; - const int yc[4] = {width * piy, - width * iy, - width * niy, - width * nniy}; - const int zc[4] = {width * height * piz, - width * height * iz, - width * height * niz, - width * height * nniz}; - float u[4], v[4], w[4]; - - /* Some helper macro to keep code reasonable size, - * let compiler to inline all the matrix multiplications. - */ -#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]])) -#define COL_TERM(col, row) \ - (v[col] * (u[0] * DATA(0, col, row) + \ - u[1] * DATA(1, col, row) + \ - u[2] * DATA(2, col, row) + \ - u[3] * DATA(3, col, row))) -#define ROW_TERM(row) \ - (w[row] * (COL_TERM(0, row) + \ - COL_TERM(1, row) + \ - COL_TERM(2, row) + \ - COL_TERM(3, row))) - - SET_CUBIC_SPLINE_WEIGHTS(u, tx); - SET_CUBIC_SPLINE_WEIGHTS(v, ty); - SET_CUBIC_SPLINE_WEIGHTS(w, tz); - - /* Actual interpolation. */ - return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3); - -#undef COL_TERM -#undef ROW_TERM -#undef DATA - } - } - - ccl_always_inline void dimensions_set(int width_, int height_, int depth_) - { - width = width_; - height = height_; - depth = depth_; - } - - T *data; - int interpolation; - ExtensionType extension; - int width, height, depth; -#undef SET_CUBIC_SPLINE_WEIGHTS -}; - -typedef texture<float4> texture_float4; -typedef texture<float2> texture_float2; -typedef texture<float> texture_float; -typedef texture<uint> texture_uint; -typedef texture<int> texture_int; -typedef texture<uint4> texture_uint4; -typedef texture<uchar4> texture_uchar4; -typedef texture<uchar> texture_uchar; -typedef texture_image<float> texture_image_float; -typedef texture_image<uchar> texture_image_uchar; -typedef texture_image<half> texture_image_half; -typedef texture_image<float4> texture_image_float4; -typedef texture_image<uchar4> texture_image_uchar4; -typedef texture_image<half4> texture_image_half4; - /* Macros to handle different memory storage on different devices */ #define kernel_tex_fetch(tex, index) (kg->tex.fetch(index)) @@ -524,10 +118,7 @@ typedef texture_image<half4> texture_image_half4; #define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index)) #define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index)) #define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size)) - -#define kernel_tex_image_interp(tex,x,y) kernel_tex_image_interp_impl(kg,tex,x,y) -#define kernel_tex_image_interp_3d(tex, x, y, z) kernel_tex_image_interp_3d_impl(kg,tex,x,y,z) -#define kernel_tex_image_interp_3d_ex(tex, x, y, z, interpolation) kernel_tex_image_interp_3d_ex_impl(kg,tex, x, y, z, interpolation) +#define kernel_tex_array(tex) (kg->tex.data) #define kernel_data (kg->__data) diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index e0c7b17c6a0..ac63bcf7ac9 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -30,75 +30,114 @@ # define __NODES_FEATURES__ NODE_FEATURE_ALL #endif -#include <cuda.h> -#include <cuda_fp16.h> -#include <float.h> +/* Manual definitions so we can compile without CUDA toolkit. */ + +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; +typedef unsigned short half; +typedef unsigned long long CUtexObject; + +#define FLT_MIN 1.175494350822287507969e-38f +#define FLT_MAX 340282346638528859811704183484516925440.0f + +__device__ half __float2half(const float f) +{ + half val; + asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f)); + return val; +} /* Qualifier wrappers for different names on different devices */ #define ccl_device __device__ __inline__ -# define ccl_device_forceinline __device__ __forceinline__ -#if (__KERNEL_CUDA_VERSION__ == 80) && (__CUDA_ARCH__ < 500) +#if __CUDA_ARCH__ < 500 # define ccl_device_inline __device__ __forceinline__ +# define ccl_device_forceinline __device__ __forceinline__ #else # define ccl_device_inline __device__ __inline__ +# define ccl_device_forceinline __device__ __forceinline__ #endif #define ccl_device_noinline __device__ __noinline__ #define ccl_global -#define ccl_constant +#define ccl_static_constant __constant__ +#define ccl_constant const +#define ccl_local __shared__ +#define ccl_local_param +#define ccl_private #define ccl_may_alias #define ccl_addr_space #define ccl_restrict __restrict__ +/* TODO(sergey): In theory we might use references with CUDA, however + * performance impact yet to be investigated. + */ +#define ccl_ref #define ccl_align(n) __align__(n) +#define ATTR_FALLTHROUGH + +#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH) + + /* No assert supported for CUDA */ #define kernel_assert(cond) /* Types */ -#include "util_half.h" -#include "util_types.h" +#include "util/util_half.h" +#include "util/util_types.h" + +/* Work item functions */ + +ccl_device_inline uint ccl_local_id(uint d) +{ + switch(d) { + case 0: return threadIdx.x; + case 1: return threadIdx.y; + case 2: return threadIdx.z; + default: return 0; + } +} + +#define ccl_global_id(d) (ccl_group_id(d) * ccl_local_size(d) + ccl_local_id(d)) + +ccl_device_inline uint ccl_local_size(uint d) +{ + switch(d) { + case 0: return blockDim.x; + case 1: return blockDim.y; + case 2: return blockDim.z; + default: return 0; + } +} + +#define ccl_global_size(d) (ccl_num_groups(d) * ccl_local_size(d)) + +ccl_device_inline uint ccl_group_id(uint d) +{ + switch(d) { + case 0: return blockIdx.x; + case 1: return blockIdx.y; + case 2: return blockIdx.z; + default: return 0; + } +} + +ccl_device_inline uint ccl_num_groups(uint d) +{ + switch(d) { + case 0: return gridDim.x; + case 1: return gridDim.y; + case 2: return gridDim.z; + default: return 0; + } +} /* Textures */ -typedef texture<float4, 1> texture_float4; -typedef texture<float2, 1> texture_float2; -typedef texture<float, 1> texture_float; -typedef texture<uint, 1> texture_uint; -typedef texture<int, 1> texture_int; -typedef texture<uint4, 1> texture_uint4; -typedef texture<uchar, 1> texture_uchar; -typedef texture<uchar4, 1> texture_uchar4; -typedef texture<float4, 2> texture_image_float4; -typedef texture<float4, 3> texture_image3d_float4; -typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4; - -/* Macros to handle different memory storage on different devices */ - -/* On Fermi cards (4xx and 5xx), we use regular textures for both data and images. - * On Kepler (6xx) and above, we use Bindless Textures for images and arrays for data. - * - * Arrays are necessary in order to use the full VRAM on newer cards, and it's slightly faster. - * Using Arrays on Fermi turned out to be slower.*/ - -/* Fermi */ -#if __CUDA_ARCH__ < 300 -# define __KERNEL_CUDA_TEX_STORAGE__ -# define kernel_tex_fetch(t, index) tex1Dfetch(t, index) - -# define kernel_tex_image_interp(t, x, y) tex2D(t, x, y) -# define kernel_tex_image_interp_3d(t, x, y, z) tex3D(t, x, y, z) - -/* Kepler */ -#else -# define kernel_tex_fetch(t, index) t[(index)] - -# define kernel_tex_image_interp_float4(t, x, y) tex2D<float4>(t, x, y) -# define kernel_tex_image_interp_float(t, x, y) tex2D<float>(t, x, y) -# define kernel_tex_image_interp_3d_float4(t, x, y, z) tex3D<float4>(t, x, y, z) -# define kernel_tex_image_interp_3d_float(t, x, y, z) tex3D<float>(t, x, y, z) -#endif +/* Use arrays for regular data. */ +#define kernel_tex_fetch(t, index) t[(index)] +#define kernel_tex_array(t) (t) #define kernel_data __data diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h index f076e3a7d37..671c47e2225 100644 --- a/intern/cycles/kernel/kernel_compat_opencl.h +++ b/intern/cycles/kernel/kernel_compat_opencl.h @@ -36,11 +36,14 @@ #define ccl_device_forceinline ccl_device #define ccl_device_noinline ccl_device ccl_noinline #define ccl_may_alias +#define ccl_static_constant static __constant #define ccl_constant __constant #define ccl_global __global #define ccl_local __local +#define ccl_local_param __local #define ccl_private __private #define ccl_restrict restrict +#define ccl_ref #define ccl_align(n) __attribute__((aligned(n))) #ifdef __SPLIT_KERNEL__ @@ -49,6 +52,17 @@ # define ccl_addr_space #endif +#define ATTR_FALLTHROUGH + +#define ccl_local_id(d) get_local_id(d) +#define ccl_global_id(d) get_global_id(d) + +#define ccl_local_size(d) get_local_size(d) +#define ccl_global_size(d) get_global_size(d) + +#define ccl_group_id(d) get_group_id(d) +#define ccl_num_groups(d) get_num_groups(d) + /* Selective nodes compilation. */ #ifndef __NODES_MAX_GROUP__ # define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX @@ -117,6 +131,7 @@ # define expf(x) native_exp(((float)(x))) # define sqrtf(x) native_sqrt(((float)(x))) # define logf(x) native_log(((float)(x))) +# define rcp(x) native_recip(x) #else # define sinf(x) sin(((float)(x))) # define cosf(x) cos(((float)(x))) @@ -124,17 +139,19 @@ # define expf(x) exp(((float)(x))) # define sqrtf(x) sqrt(((float)(x))) # define logf(x) log(((float)(x))) +# define rcp(x) recip(x)) #endif /* data lookup defines */ #define kernel_data (*kg->data) -#define kernel_tex_fetch(t, index) kg->t[index] +#define kernel_tex_array(tex) ((const ccl_global tex##_t*)(kg->buffers[kg->tex.cl_buffer] + kg->tex.data)) +#define kernel_tex_fetch(tex, index) kernel_tex_array(tex)[(index)] /* define NULL */ #define NULL 0 -#include "util_half.h" -#include "util_types.h" +#include "util/util_half.h" +#include "util/util_types.h" #endif /* __KERNEL_COMPAT_OPENCL_H__ */ diff --git a/intern/cycles/kernel/kernel_debug.h b/intern/cycles/kernel/kernel_debug.h deleted file mode 100644 index 5647bbae5b5..00000000000 --- a/intern/cycles/kernel/kernel_debug.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright 2011-2014 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -ccl_device_inline void debug_data_init(DebugData *debug_data) -{ - debug_data->num_bvh_traversed_nodes = 0; - debug_data->num_bvh_traversed_instances = 0; - debug_data->num_bvh_intersections = 0; - debug_data->num_ray_bounces = 0; -} - -ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg, - ccl_global float *buffer, - ccl_addr_space PathState *state, - DebugData *debug_data, - int sample) -{ - int flag = kernel_data.film.pass_flag; - if(flag & PASS_BVH_TRAVERSED_NODES) { - kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_nodes, - sample, - debug_data->num_bvh_traversed_nodes); - } - if(flag & PASS_BVH_TRAVERSED_INSTANCES) { - kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_instances, - sample, - debug_data->num_bvh_traversed_instances); - } - if(flag & PASS_BVH_INTERSECTIONS) { - kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_intersections, - sample, - debug_data->num_bvh_intersections); - } - if(flag & PASS_RAY_BOUNCES) { - kernel_write_pass_float(buffer + kernel_data.film.pass_ray_bounces, - sample, - debug_data->num_ray_bounces); - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h index 8c7c651a053..a5556c3be8f 100644 --- a/intern/cycles/kernel/kernel_emission.h +++ b/intern/cycles/kernel/kernel_emission.h @@ -29,7 +29,7 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, /* setup shading at emitter */ float3 eval; - int shader_flag = kernel_tex_fetch(__shader_flag, (ls->shader & SHADER_MASK)*SHADER_SIZE); + int shader_flag = kernel_tex_fetch(__shaders, (ls->shader & SHADER_MASK)).flags; #ifdef __BACKGROUND_MIS__ if(ls->type == LIGHT_BACKGROUND) { @@ -37,25 +37,23 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, ray.D = ls->D; ray.P = ls->P; ray.t = 1.0f; -# ifdef __OBJECT_MOTION__ ray.time = time; -# endif ray.dP = differential3_zero(); ray.dD = dI; shader_setup_from_background(kg, emission_sd, &ray); path_state_modify_bounce(state, true); - eval = shader_eval_background(kg, emission_sd, state, 0, SHADER_CONTEXT_EMISSION); + eval = shader_eval_background(kg, emission_sd, state, 0); path_state_modify_bounce(state, false); } else #endif if(shader_flag & SD_HAS_CONSTANT_EMISSION) { - eval.x = __int_as_float(kernel_tex_fetch(__shader_flag, (ls->shader & SHADER_MASK)*SHADER_SIZE + 2)); - eval.y = __int_as_float(kernel_tex_fetch(__shader_flag, (ls->shader & SHADER_MASK)*SHADER_SIZE + 3)); - eval.z = __int_as_float(kernel_tex_fetch(__shader_flag, (ls->shader & SHADER_MASK)*SHADER_SIZE + 4)); + eval.x = kernel_tex_fetch(__shaders, (ls->shader & SHADER_MASK)).constant_emission[0]; + eval.y = kernel_tex_fetch(__shaders, (ls->shader & SHADER_MASK)).constant_emission[1]; + eval.z = kernel_tex_fetch(__shaders, (ls->shader & SHADER_MASK)).constant_emission[2]; if((ls->prim != PRIM_NONE) && dot(ls->Ng, I) < 0.0f) { ls->Ng = -ls->Ng; } @@ -67,19 +65,16 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, ls->shader, ls->object, ls->prim, ls->u, ls->v, t, time, false, ls->lamp); - ls->Ng = ccl_fetch(emission_sd, Ng); + ls->Ng = emission_sd->Ng; - /* no path flag, we're evaluating this for all closures. that's weak but - * we'd have to do multiple evaluations otherwise */ + /* No proper path flag, we're evaluating this for all closures. that's + * weak but we'd have to do multiple evaluations otherwise. */ path_state_modify_bounce(state, true); - shader_eval_surface(kg, emission_sd, NULL, state, 0.0f, 0, SHADER_CONTEXT_EMISSION); + shader_eval_surface(kg, emission_sd, state, PATH_RAY_EMISSION); path_state_modify_bounce(state, false); - /* evaluate emissive closure */ - if(ccl_fetch(emission_sd, flag) & SD_EMISSION) - eval = shader_emissive_eval(kg, emission_sd); - else - eval = make_float3(0.0f, 0.0f, 0.0f); + /* Evaluate emissive closure. */ + eval = shader_emissive_eval(kg, emission_sd); } eval *= ls->eval_fac; @@ -112,7 +107,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, -ls->D, dD, ls->t, - ccl_fetch(sd, time)); + sd->time); if(is_zero(light_eval)) return false; @@ -120,7 +115,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, /* evaluate BSDF at shading point */ #ifdef __VOLUME__ - if(ccl_fetch(sd, prim) != PRIM_NONE) + if(sd->prim != PRIM_NONE) shader_bsdf_eval(kg, sd, ls->D, eval, ls->pdf, ls->shader & SHADER_USE_MIS); else { float bsdf_pdf; @@ -156,8 +151,13 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, if(bsdf_eval_is_zero(eval)) return false; - if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) { - float probability = max3(bsdf_eval_sum(eval)) * kernel_data.integrator.light_inv_rr_threshold; + if(kernel_data.integrator.light_inv_rr_threshold > 0.0f +#ifdef __SHADOW_TRICKS__ + && (state->flag & PATH_RAY_SHADOW_CATCHER) == 0 +#endif + ) + { + float probability = max3(fabs(bsdf_eval_sum(eval))) * kernel_data.integrator.light_inv_rr_threshold; if(probability < 1.0f) { if(rand_terminate >= probability) { return false; @@ -168,8 +168,8 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, if(ls->shader & SHADER_CAST_SHADOW) { /* setup ray */ - bool transmit = (dot(ccl_fetch(sd, Ng), ls->D) < 0.0f); - ray->P = ray_offset(ccl_fetch(sd, P), (transmit)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng)); + bool transmit = (dot(sd->Ng, ls->D) < 0.0f); + ray->P = ray_offset(sd->P, (transmit)? -sd->Ng: sd->Ng); if(ls->t == FLT_MAX) { /* distant light */ @@ -182,7 +182,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ray->D = normalize_len(ray->D, &ray->t); } - ray->dP = ccl_fetch(sd, dP); + ray->dP = sd->dP; ray->dD = differential3_zero(); } else { @@ -204,14 +204,14 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader float3 L = shader_emissive_eval(kg, sd); #ifdef __HAIR__ - if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS) && (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)) + if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) && (sd->type & PRIMITIVE_ALL_TRIANGLE)) #else - if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS)) + if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS)) #endif { /* multiple importance sampling, get triangle light pdf, * and compute weight with respect to BSDF pdf */ - float pdf = triangle_light_pdf(kg, ccl_fetch(sd, Ng), ccl_fetch(sd, I), t); + float pdf = triangle_light_pdf(kg, sd, t); float mis_weight = power_heuristic(bsdf_pdf, pdf); return L*mis_weight; @@ -314,7 +314,7 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg, # endif path_state_modify_bounce(state, true); - float3 L = shader_eval_background(kg, emission_sd, state, state->flag, SHADER_CONTEXT_EMISSION); + float3 L = shader_eval_background(kg, emission_sd, state, state->flag); path_state_modify_bounce(state, false); #ifdef __BACKGROUND_MIS__ diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h index 2b52a2d2f48..74cfacb5bc1 100644 --- a/intern/cycles/kernel/kernel_globals.h +++ b/intern/cycles/kernel/kernel_globals.h @@ -16,6 +16,17 @@ /* Constant Globals */ +#ifndef __KERNEL_GLOBALS_H__ +#define __KERNEL_GLOBALS_H__ + +#ifdef __KERNEL_CPU__ +# include "util/util_vector.h" +#endif + +#ifdef __KERNEL_OPENCL__ +# include "util/util_atomic.h" +#endif + CCL_NAMESPACE_BEGIN /* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in @@ -35,16 +46,8 @@ struct Intersection; struct VolumeStep; typedef struct KernelGlobals { - texture_image_uchar4 texture_byte4_images[TEX_NUM_BYTE4_CPU]; - texture_image_float4 texture_float4_images[TEX_NUM_FLOAT4_CPU]; - texture_image_half4 texture_half4_images[TEX_NUM_HALF4_CPU]; - texture_image_float texture_float_images[TEX_NUM_FLOAT_CPU]; - texture_image_uchar texture_byte_images[TEX_NUM_BYTE_CPU]; - texture_image_half texture_half_images[TEX_NUM_HALF_CPU]; - -# define KERNEL_TEX(type, ttype, name) ttype name; -# define KERNEL_IMAGE_TEX(type, ttype, name) -# include "kernel_textures.h" +# define KERNEL_TEX(type, name) texture<type> name; +# include "kernel/kernel_textures.h" KernelData __data; @@ -64,6 +67,13 @@ typedef struct KernelGlobals { /* Storage for decoupled volume steps. */ VolumeStep *decoupled_volume_steps[2]; int decoupled_volume_steps_index; + + /* split kernel */ + SplitData split_data; + SplitParams split_param_data; + + int2 global_size; + int2 global_id; } KernelGlobals; #endif /* __KERNEL_CPU__ */ @@ -81,13 +91,8 @@ typedef struct KernelGlobals { Intersection hits_stack[64]; } KernelGlobals; -# ifdef __KERNEL_CUDA_TEX_STORAGE__ -# define KERNEL_TEX(type, ttype, name) ttype name; -# else -# define KERNEL_TEX(type, ttype, name) const __constant__ __device__ type *name; -# endif -# define KERNEL_IMAGE_TEX(type, ttype, name) ttype name; -# include "kernel_textures.h" +# define KERNEL_TEX(type, name) const __constant__ __device__ type *name; +# include "kernel/kernel_textures.h" #endif /* __KERNEL_CUDA__ */ @@ -95,19 +100,75 @@ typedef struct KernelGlobals { #ifdef __KERNEL_OPENCL__ +# define KERNEL_TEX(type, name) \ +typedef type name##_t; +# include "kernel/kernel_textures.h" + typedef ccl_addr_space struct KernelGlobals { ccl_constant KernelData *data; + ccl_global char *buffers[8]; -# define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name; -# include "kernel_textures.h" +# define KERNEL_TEX(type, name) \ + TextureInfo name; +# include "kernel/kernel_textures.h" # ifdef __SPLIT_KERNEL__ - ShaderData *sd_input; - Intersection *isect_shadow; + SplitData split_data; + SplitParams split_param_data; # endif } KernelGlobals; +#define KERNEL_BUFFER_PARAMS \ + ccl_global char *buffer0, \ + ccl_global char *buffer1, \ + ccl_global char *buffer2, \ + ccl_global char *buffer3, \ + ccl_global char *buffer4, \ + ccl_global char *buffer5, \ + ccl_global char *buffer6, \ + ccl_global char *buffer7 + +#define KERNEL_BUFFER_ARGS buffer0, buffer1, buffer2, buffer3, buffer4, buffer5, buffer6, buffer7 + +ccl_device_inline void kernel_set_buffer_pointers(KernelGlobals *kg, KERNEL_BUFFER_PARAMS) +{ +#ifdef __SPLIT_KERNEL__ + if(ccl_local_id(0) + ccl_local_id(1) == 0) +#endif + { + kg->buffers[0] = buffer0; + kg->buffers[1] = buffer1; + kg->buffers[2] = buffer2; + kg->buffers[3] = buffer3; + kg->buffers[4] = buffer4; + kg->buffers[5] = buffer5; + kg->buffers[6] = buffer6; + kg->buffers[7] = buffer7; + } + +# ifdef __SPLIT_KERNEL__ + ccl_barrier(CCL_LOCAL_MEM_FENCE); +# endif +} + +ccl_device_inline void kernel_set_buffer_info(KernelGlobals *kg) +{ +# ifdef __SPLIT_KERNEL__ + if(ccl_local_id(0) + ccl_local_id(1) == 0) +# endif + { + ccl_global TextureInfo *info = (ccl_global TextureInfo*)kg->buffers[0]; + +# define KERNEL_TEX(type, name) \ + kg->name = *(info++); +# include "kernel/kernel_textures.h" + } + +# ifdef __SPLIT_KERNEL__ + ccl_barrier(CCL_LOCAL_MEM_FENCE); +# endif +} + #endif /* __KERNEL_OPENCL__ */ /* Interpolated lookup table access */ @@ -146,3 +207,4 @@ ccl_device float lookup_table_read_2D(KernelGlobals *kg, float x, float y, int o CCL_NAMESPACE_END +#endif /* __KERNEL_GLOBALS_H__ */ diff --git a/intern/cycles/kernel/kernel_image_opencl.h b/intern/cycles/kernel/kernel_image_opencl.h deleted file mode 100644 index 0352c58037d..00000000000 --- a/intern/cycles/kernel/kernel_image_opencl.h +++ /dev/null @@ -1,231 +0,0 @@ -/* - * Copyright 2016 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/* For OpenCL all images are packed in a single array, and we do manual lookup - * and interpolation. */ - -ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int offset) -{ - /* Float4 */ - if(id < TEX_START_BYTE4_OPENCL) { - return kernel_tex_fetch(__tex_image_float4_packed, offset); - } - /* Byte4 */ - else if(id < TEX_START_FLOAT_OPENCL) { - uchar4 r = kernel_tex_fetch(__tex_image_byte4_packed, offset); - float f = 1.0f/255.0f; - return make_float4(r.x*f, r.y*f, r.z*f, r.w*f); - } - /* Float */ - else if(id < TEX_START_BYTE_OPENCL) { - float f = kernel_tex_fetch(__tex_image_float_packed, offset); - return make_float4(f, f, f, 1.0f); - } - /* Byte */ - else { - uchar r = kernel_tex_fetch(__tex_image_byte_packed, offset); - float f = r * (1.0f/255.0f); - return make_float4(f, f, f, 1.0f); - } -} - -ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width) -{ - x %= width; - if(x < 0) - x += width; - return x; -} - -ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width) -{ - return clamp(x, 0, width-1); -} - -ccl_device_inline float svm_image_texture_frac(float x, int *ix) -{ - int i = float_to_int(x) - ((x < 0.0f)? 1: 0); - *ix = i; - return x - (float)i; -} - -ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y) -{ - uint4 info = kernel_tex_fetch(__tex_image_packed_info, id*2); - uint width = info.x; - uint height = info.y; - uint offset = info.z; - - /* Image Options */ - uint interpolation = (info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR; - uint extension; - if(info.w & (1 << 1)) - extension = EXTENSION_REPEAT; - else if(info.w & (1 << 2)) - extension = EXTENSION_EXTEND; - else - extension = EXTENSION_CLIP; - - float4 r; - int ix, iy, nix, niy; - if(interpolation == INTERPOLATION_CLOSEST) { - svm_image_texture_frac(x*width, &ix); - svm_image_texture_frac(y*height, &iy); - - if(extension == EXTENSION_REPEAT) { - ix = svm_image_texture_wrap_periodic(ix, width); - iy = svm_image_texture_wrap_periodic(iy, height); - } - else { - if(extension == EXTENSION_CLIP) { - if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - } - /* Fall through. */ - /* EXTENSION_EXTEND */ - ix = svm_image_texture_wrap_clamp(ix, width); - iy = svm_image_texture_wrap_clamp(iy, height); - } - - r = svm_image_texture_read(kg, id, offset + ix + iy*width); - } - else { /* INTERPOLATION_LINEAR */ - float tx = svm_image_texture_frac(x*width - 0.5f, &ix); - float ty = svm_image_texture_frac(y*height - 0.5f, &iy); - - if(extension == EXTENSION_REPEAT) { - ix = svm_image_texture_wrap_periodic(ix, width); - iy = svm_image_texture_wrap_periodic(iy, height); - - nix = svm_image_texture_wrap_periodic(ix+1, width); - niy = svm_image_texture_wrap_periodic(iy+1, height); - } - else { - if(extension == EXTENSION_CLIP) { - if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - } - nix = svm_image_texture_wrap_clamp(ix+1, width); - niy = svm_image_texture_wrap_clamp(iy+1, height); - ix = svm_image_texture_wrap_clamp(ix, width); - iy = svm_image_texture_wrap_clamp(iy, height); - } - - r = (1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + iy*width); - r += (1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width); - r += ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width); - r += ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width); - } - - return r; -} - - -ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z) -{ - uint4 info = kernel_tex_fetch(__tex_image_packed_info, id*2); - uint width = info.x; - uint height = info.y; - uint offset = info.z; - uint depth = kernel_tex_fetch(__tex_image_packed_info, id*2+1).x; - - /* Image Options */ - uint interpolation = (info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR; - uint extension; - if(info.w & (1 << 1)) - extension = EXTENSION_REPEAT; - else if(info.w & (1 << 2)) - extension = EXTENSION_EXTEND; - else - extension = EXTENSION_CLIP; - - float4 r; - int ix, iy, iz, nix, niy, niz; - if(interpolation == INTERPOLATION_CLOSEST) { - svm_image_texture_frac(x*width, &ix); - svm_image_texture_frac(y*height, &iy); - svm_image_texture_frac(z*depth, &iz); - - if(extension == EXTENSION_REPEAT) { - ix = svm_image_texture_wrap_periodic(ix, width); - iy = svm_image_texture_wrap_periodic(iy, height); - iz = svm_image_texture_wrap_periodic(iz, depth); - } - else { - if(extension == EXTENSION_CLIP) { - if(x < 0.0f || y < 0.0f || z < 0.0f || - x > 1.0f || y > 1.0f || z > 1.0f) - { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - } - /* Fall through. */ - /* EXTENSION_EXTEND */ - ix = svm_image_texture_wrap_clamp(ix, width); - iy = svm_image_texture_wrap_clamp(iy, height); - iz = svm_image_texture_wrap_clamp(iz, depth); - } - r = svm_image_texture_read(kg, id, offset + ix + iy*width + iz*width*height); - } - else { /* INTERPOLATION_LINEAR */ - float tx = svm_image_texture_frac(x*(float)width - 0.5f, &ix); - float ty = svm_image_texture_frac(y*(float)height - 0.5f, &iy); - float tz = svm_image_texture_frac(z*(float)depth - 0.5f, &iz); - - if(extension == EXTENSION_REPEAT) { - ix = svm_image_texture_wrap_periodic(ix, width); - iy = svm_image_texture_wrap_periodic(iy, height); - iz = svm_image_texture_wrap_periodic(iz, depth); - - nix = svm_image_texture_wrap_periodic(ix+1, width); - niy = svm_image_texture_wrap_periodic(iy+1, height); - niz = svm_image_texture_wrap_periodic(iz+1, depth); - } - else { - if(extension == EXTENSION_CLIP) - if(x < 0.0f || y < 0.0f || z < 0.0f || - x > 1.0f || y > 1.0f || z > 1.0f) - { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - /* Fall through. */ - /* EXTENSION_EXTEND */ - nix = svm_image_texture_wrap_clamp(ix+1, width); - niy = svm_image_texture_wrap_clamp(iy+1, height); - niz = svm_image_texture_wrap_clamp(iz+1, depth); - - ix = svm_image_texture_wrap_clamp(ix, width); - iy = svm_image_texture_wrap_clamp(iy, height); - iz = svm_image_texture_wrap_clamp(iz, depth); - } - - r = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + iy*width + iz*width*height); - r += (1.0f - tz)*(1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width + iz*width*height); - r += (1.0f - tz)*ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width + iz*width*height); - r += (1.0f - tz)*ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width + iz*width*height); - - r += tz*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + iy*width + niz*width*height); - r += tz*(1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width + niz*width*height); - r += tz*ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width + niz*width*height); - r += tz*ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width + niz*width*height); - - } - - return r; -} diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h index 67546131746..f5855757d3f 100644 --- a/intern/cycles/kernel/kernel_jitter.h +++ b/intern/cycles/kernel/kernel_jitter.h @@ -175,15 +175,26 @@ ccl_device float cmj_sample_1D(int s, int N, int p) return (x + jx)*invN; } -ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy) +/* TODO(sergey): Do some extra tests and consider moving to util_math.h. */ +ccl_device_inline int cmj_isqrt(int value) { - kernel_assert(s < N); - #if defined(__KERNEL_CUDA__) - int m = float_to_int(__fsqrt_ru(N)); + return float_to_int(__fsqrt_ru(value)); +#elif defined(__KERNEL_GPU__) + return float_to_int(sqrtf(value)); #else - int m = float_to_int(sqrtf(N)); + /* This is a work around for fast-math on CPU which might replace sqrtf() + * with am approximated version. + */ + return float_to_int(sqrtf(value) + 1e-6f); #endif +} + +ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy) +{ + kernel_assert(s < N); + + int m = cmj_isqrt(N); int n = (N - 1)/m + 1; float invN = 1.0f/N; float invm = 1.0f/m; diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h index a2909cec1a1..efab69ee37d 100644 --- a/intern/cycles/kernel/kernel_light.h +++ b/intern/cycles/kernel/kernel_light.h @@ -102,7 +102,7 @@ ccl_device_inline float area_light_sample(float3 P, float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f); cu = clamp(cu, -1.0f, 1.0f); /* Compute xu. */ - float xu = -(cu * z0) / sqrtf(1.0f - cu * cu); + float xu = -(cu * z0) / max(sqrtf(1.0f - cu * cu), 1e-7f); xu = clamp(xu, x0, x1); /* Compute yv. */ float z0sq = z0 * z0; @@ -255,11 +255,11 @@ ccl_device_inline bool background_portal_data_fetch_and_check_side(KernelGlobals float3 *lightpos, float3 *dir) { - float4 data0 = kernel_tex_fetch(__light_data, (index + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 0); - float4 data3 = kernel_tex_fetch(__light_data, (index + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 3); + int portal = kernel_data.integrator.portal_offset + index; + const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal); - *lightpos = make_float3(data0.y, data0.z, data0.w); - *dir = make_float3(data3.y, data3.z, data3.w); + *lightpos = make_float3(klight->co[0], klight->co[1], klight->co[2]); + *dir = make_float3(klight->area.dir[0], klight->area.dir[1], klight->area.dir[2]); /* Check whether portal is on the right side. */ if(dot(*dir, P - *lightpos) > 1e-4f) @@ -291,11 +291,10 @@ ccl_device_inline float background_portal_pdf(KernelGlobals *kg, } num_possible++; - float4 data1 = kernel_tex_fetch(__light_data, (p + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 1); - float4 data2 = kernel_tex_fetch(__light_data, (p + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 2); - - float3 axisu = make_float3(data1.y, data1.z, data1.w); - float3 axisv = make_float3(data2.y, data2.z, data2.w); + int portal = kernel_data.integrator.portal_offset + p; + const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal); + float3 axisu = make_float3(klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]); + float3 axisv = make_float3(klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]); if(!ray_quad_intersect(P, direction, 1e-4f, FLT_MAX, lightpos, axisu, axisv, dir, NULL, NULL, NULL, NULL)) continue; @@ -346,10 +345,10 @@ ccl_device float3 background_portal_sample(KernelGlobals *kg, if(portal == 0) { /* p is the portal to be sampled. */ - float4 data1 = kernel_tex_fetch(__light_data, (p + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 1); - float4 data2 = kernel_tex_fetch(__light_data, (p + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 2); - float3 axisu = make_float3(data1.y, data1.z, data1.w); - float3 axisv = make_float3(data2.y, data2.z, data2.w); + int portal = kernel_data.integrator.portal_offset + p; + const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal); + float3 axisu = make_float3(klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]); + float3 axisv = make_float3(klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]); *pdf = area_light_sample(P, &lightpos, axisu, axisv, @@ -396,11 +395,13 @@ ccl_device_inline float3 background_light_sample(KernelGlobals *kg, + (1.0f - portal_sampling_pdf) * cdf_pdf); } return D; - } else { + } + else { /* Sample map, but with nonzero portal_sampling_pdf for MIS. */ randu = (randu - portal_sampling_pdf) / (1.0f - portal_sampling_pdf); } - } else { + } + else { /* We can't sample a portal. * Check if we can sample the map instead. */ @@ -477,14 +478,10 @@ ccl_device float3 sphere_light_sample(float3 P, float3 center, float radius, flo return disk_light_sample(normalize(P - center), randu, randv)*radius; } -ccl_device float spot_light_attenuation(float4 data1, float4 data2, LightSample *ls) +ccl_device float spot_light_attenuation(float3 dir, float spot_angle, float spot_smooth, LightSample *ls) { - float3 dir = make_float3(data2.y, data2.z, data2.w); float3 I = ls->Ng; - float spot_angle = data1.w; - float spot_smooth = data2.x; - float attenuation = dot(dir, I); if(attenuation <= spot_angle) { @@ -516,12 +513,10 @@ ccl_device_inline bool lamp_light_sample(KernelGlobals *kg, float3 P, LightSample *ls) { - float4 data0 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 0); - float4 data1 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 1); - - LightType type = (LightType)__float_as_int(data0.x); + const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp); + LightType type = (LightType)klight->type; ls->type = type; - ls->shader = __float_as_int(data1.x); + ls->shader = klight->shader_id; ls->object = PRIM_NONE; ls->prim = PRIM_NONE; ls->lamp = lamp; @@ -530,10 +525,10 @@ ccl_device_inline bool lamp_light_sample(KernelGlobals *kg, if(type == LIGHT_DISTANT) { /* distant light */ - float3 lightD = make_float3(data0.y, data0.z, data0.w); + float3 lightD = make_float3(klight->co[0], klight->co[1], klight->co[2]); float3 D = lightD; - float radius = data1.y; - float invarea = data1.w; + float radius = klight->distant.radius; + float invarea = klight->distant.invarea; if(radius > 0.0f) D = distant_light_sample(D, radius, randu, randv); @@ -545,7 +540,7 @@ ccl_device_inline bool lamp_light_sample(KernelGlobals *kg, float costheta = dot(lightD, D); ls->pdf = invarea/(costheta*costheta*costheta); - ls->eval_fac = ls->pdf*kernel_data.integrator.inv_pdf_lights; + ls->eval_fac = ls->pdf; } #ifdef __BACKGROUND_MIS__ else if(type == LIGHT_BACKGROUND) { @@ -557,14 +552,13 @@ ccl_device_inline bool lamp_light_sample(KernelGlobals *kg, ls->D = -D; ls->t = FLT_MAX; ls->eval_fac = 1.0f; - ls->pdf *= kernel_data.integrator.pdf_lights; } #endif else { - ls->P = make_float3(data0.y, data0.z, data0.w); + ls->P = make_float3(klight->co[0], klight->co[1], klight->co[2]); if(type == LIGHT_POINT || type == LIGHT_SPOT) { - float radius = data1.y; + float radius = klight->spot.radius; if(radius > 0.0f) /* sphere light */ @@ -573,14 +567,19 @@ ccl_device_inline bool lamp_light_sample(KernelGlobals *kg, ls->D = normalize_len(ls->P - P, &ls->t); ls->Ng = -ls->D; - float invarea = data1.z; + float invarea = klight->spot.invarea; ls->eval_fac = (0.25f*M_1_PI_F)*invarea; ls->pdf = invarea; if(type == LIGHT_SPOT) { /* spot light attenuation */ - float4 data2 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 2); - ls->eval_fac *= spot_light_attenuation(data1, data2, ls); + float3 dir = make_float3(klight->spot.dir[0], + klight->spot.dir[1], + klight->spot.dir[2]); + ls->eval_fac *= spot_light_attenuation(dir, + klight->spot.spot_angle, + klight->spot.spot_smooth, + ls); if(ls->eval_fac == 0.0f) { return false; } @@ -593,12 +592,15 @@ ccl_device_inline bool lamp_light_sample(KernelGlobals *kg, } else { /* area light */ - float4 data2 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 2); - float4 data3 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 3); - - float3 axisu = make_float3(data1.y, data1.z, data1.w); - float3 axisv = make_float3(data2.y, data2.z, data2.w); - float3 D = make_float3(data3.y, data3.z, data3.w); + float3 axisu = make_float3(klight->area.axisu[0], + klight->area.axisu[1], + klight->area.axisu[2]); + float3 axisv = make_float3(klight->area.axisv[0], + klight->area.axisv[1], + klight->area.axisv[2]); + float3 D = make_float3(klight->area.dir[0], + klight->area.dir[1], + klight->area.dir[2]); if(dot(ls->P - P, D) > 0.0f) { return false; @@ -617,24 +619,22 @@ ccl_device_inline bool lamp_light_sample(KernelGlobals *kg, ls->Ng = D; ls->D = normalize_len(ls->P - P, &ls->t); - float invarea = data2.x; + float invarea = klight->area.invarea; ls->eval_fac = 0.25f*invarea; } - - ls->eval_fac *= kernel_data.integrator.inv_pdf_lights; } + ls->pdf *= kernel_data.integrator.pdf_lights; + return (ls->pdf > 0.0f); } ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D, float t, LightSample *ls) { - float4 data0 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 0); - float4 data1 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 1); - - LightType type = (LightType)__float_as_int(data0.x); + const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp); + LightType type = (LightType)klight->type; ls->type = type; - ls->shader = __float_as_int(data1.x); + ls->shader = klight->shader_id; ls->object = PRIM_NONE; ls->prim = PRIM_NONE; ls->lamp = lamp; @@ -647,7 +647,7 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D, if(type == LIGHT_DISTANT) { /* distant light */ - float radius = data1.y; + float radius = klight->distant.radius; if(radius == 0.0f) return false; @@ -669,9 +669,9 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D, * P */ - float3 lightD = make_float3(data0.y, data0.z, data0.w); + float3 lightD = make_float3(klight->co[0], klight->co[1], klight->co[2]); float costheta = dot(-lightD, D); - float cosangle = data1.z; + float cosangle = klight->distant.cosangle; if(costheta < cosangle) return false; @@ -682,13 +682,14 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D, ls->t = FLT_MAX; /* compute pdf */ - float invarea = data1.w; + float invarea = klight->distant.invarea; ls->pdf = invarea/(costheta*costheta*costheta); ls->eval_fac = ls->pdf; } else if(type == LIGHT_POINT || type == LIGHT_SPOT) { - float3 lightP = make_float3(data0.y, data0.z, data0.w); - float radius = data1.y; + float3 lightP = make_float3(klight->co[0], klight->co[1], klight->co[2]); + + float radius = klight->spot.radius; /* sphere light */ if(radius == 0.0f) @@ -703,14 +704,19 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D, ls->Ng = -D; ls->D = D; - float invarea = data1.z; + float invarea = klight->spot.invarea; ls->eval_fac = (0.25f*M_1_PI_F)*invarea; ls->pdf = invarea; if(type == LIGHT_SPOT) { /* spot light attenuation */ - float4 data2 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 2); - ls->eval_fac *= spot_light_attenuation(data1, data2, ls); + float3 dir = make_float3(klight->spot.dir[0], + klight->spot.dir[1], + klight->spot.dir[2]); + ls->eval_fac *= spot_light_attenuation(dir, + klight->spot.spot_angle, + klight->spot.spot_smooth, + ls); if(ls->eval_fac == 0.0f) return false; @@ -725,22 +731,25 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D, } else if(type == LIGHT_AREA) { /* area light */ - float4 data2 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 2); - float4 data3 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 3); - - float invarea = data2.x; + float invarea = klight->area.invarea; if(invarea == 0.0f) return false; - float3 axisu = make_float3(data1.y, data1.z, data1.w); - float3 axisv = make_float3(data2.y, data2.z, data2.w); - float3 Ng = make_float3(data3.y, data3.z, data3.w); + float3 axisu = make_float3(klight->area.axisu[0], + klight->area.axisu[1], + klight->area.axisu[2]); + float3 axisv = make_float3(klight->area.axisv[0], + klight->area.axisv[1], + klight->area.axisv[2]); + float3 Ng = make_float3(klight->area.dir[0], + klight->area.dir[1], + klight->area.dir[2]); /* one sided */ if(dot(D, Ng) >= 0.0f) return false; - float3 light_P = make_float3(data0.y, data0.z, data0.w); + float3 light_P = make_float3(klight->co[0], klight->co[1], klight->co[2]); if(!ray_quad_intersect(P, D, 0.0f, t, light_P, axisu, axisv, Ng, @@ -755,86 +764,292 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D, ls->pdf = area_light_sample(P, &light_P, axisu, axisv, 0, 0, false); ls->eval_fac = 0.25f*invarea; } - else + else { return false; + } + + ls->pdf *= kernel_data.integrator.pdf_lights; return true; } /* Triangle Light */ -ccl_device void object_transform_light_sample(KernelGlobals *kg, LightSample *ls, int object, float time) +/* returns true if the triangle is has motion blur or an instancing transform applied */ +ccl_device_inline bool triangle_world_space_vertices(KernelGlobals *kg, int object, int prim, float time, float3 V[3]) { + bool has_motion = false; + const int object_flag = kernel_tex_fetch(__object_flag, object); + + if(object_flag & SD_OBJECT_HAS_VERTEX_MOTION && time >= 0.0f) { + motion_triangle_vertices(kg, object, prim, time, V); + has_motion = true; + } + else { + triangle_vertices(kg, prim, V); + } + #ifdef __INSTANCING__ - /* instance transform */ - if(!(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED)) { + if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { # ifdef __OBJECT_MOTION__ - Transform itfm; - Transform tfm = object_fetch_transform_motion_test(kg, object, time, &itfm); + float object_time = (time >= 0.0f) ? time : 0.5f; + Transform tfm = object_fetch_transform_motion_test(kg, object, object_time, NULL); # else Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM); # endif - - ls->P = transform_point(&tfm, ls->P); - ls->Ng = normalize(transform_direction(&tfm, ls->Ng)); + V[0] = transform_point(&tfm, V[0]); + V[1] = transform_point(&tfm, V[1]); + V[2] = transform_point(&tfm, V[2]); + has_motion = true; } #endif + return has_motion; } -ccl_device void triangle_light_sample(KernelGlobals *kg, int prim, int object, - float randu, float randv, float time, LightSample *ls) +ccl_device_inline float triangle_light_pdf_area(KernelGlobals *kg, const float3 Ng, const float3 I, float t) { - float u, v; + float pdf = kernel_data.integrator.pdf_triangles; + float cos_pi = fabsf(dot(Ng, I)); - /* compute random point in triangle */ - randu = sqrtf(randu); + if(cos_pi == 0.0f) + return 0.0f; - u = 1.0f - randu; - v = randv*randu; + return t*t*pdf/cos_pi; +} + +ccl_device_forceinline float triangle_light_pdf(KernelGlobals *kg, ShaderData *sd, float t) +{ + /* A naive heuristic to decide between costly solid angle sampling + * and simple area sampling, comparing the distance to the triangle plane + * to the length of the edges of the triangle. */ + + float3 V[3]; + bool has_motion = triangle_world_space_vertices(kg, sd->object, sd->prim, sd->time, V); + + const float3 e0 = V[1] - V[0]; + const float3 e1 = V[2] - V[0]; + const float3 e2 = V[2] - V[1]; + const float longest_edge_squared = max(len_squared(e0), max(len_squared(e1), len_squared(e2))); + const float3 N = cross(e0, e1); + const float distance_to_plane = fabsf(dot(N, sd->I * t))/dot(N, N); + + if(longest_edge_squared > distance_to_plane*distance_to_plane) { + /* sd contains the point on the light source + * calculate Px, the point that we're shading */ + const float3 Px = sd->P + sd->I * t; + const float3 v0_p = V[0] - Px; + const float3 v1_p = V[1] - Px; + const float3 v2_p = V[2] - Px; + + const float3 u01 = safe_normalize(cross(v0_p, v1_p)); + const float3 u02 = safe_normalize(cross(v0_p, v2_p)); + const float3 u12 = safe_normalize(cross(v1_p, v2_p)); + + const float alpha = fast_acosf(dot(u02, u01)); + const float beta = fast_acosf(-dot(u01, u12)); + const float gamma = fast_acosf(dot(u02, u12)); + const float solid_angle = alpha + beta + gamma - M_PI_F; + + /* pdf_triangles is calculated over triangle area, but we're not sampling over its area */ + if(UNLIKELY(solid_angle == 0.0f)) { + return 0.0f; + } + else { + float area = 1.0f; + if(has_motion) { + /* get the center frame vertices, this is what the PDF was calculated from */ + triangle_world_space_vertices(kg, sd->object, sd->prim, -1.0f, V); + area = triangle_area(V[0], V[1], V[2]); + } + else { + area = 0.5f * len(N); + } + const float pdf = area * kernel_data.integrator.pdf_triangles; + return pdf / solid_angle; + } + } + else { + float pdf = triangle_light_pdf_area(kg, sd->Ng, sd->I, t); + if(has_motion) { + const float area = 0.5f * len(N); + if(UNLIKELY(area == 0.0f)) { + return 0.0f; + } + /* scale the PDF. + * area = the area the sample was taken from + * area_pre = the are from which pdf_triangles was calculated from */ + triangle_world_space_vertices(kg, sd->object, sd->prim, -1.0f, V); + const float area_pre = triangle_area(V[0], V[1], V[2]); + pdf = pdf * area_pre / area; + } + return pdf; + } +} - /* triangle, so get position, normal, shader */ - triangle_point_normal(kg, object, prim, u, v, &ls->P, &ls->Ng, &ls->shader); +ccl_device_forceinline void triangle_light_sample(KernelGlobals *kg, int prim, int object, + float randu, float randv, float time, LightSample *ls, const float3 P) +{ + /* A naive heuristic to decide between costly solid angle sampling + * and simple area sampling, comparing the distance to the triangle plane + * to the length of the edges of the triangle. */ + + float3 V[3]; + bool has_motion = triangle_world_space_vertices(kg, object, prim, time, V); + + const float3 e0 = V[1] - V[0]; + const float3 e1 = V[2] - V[0]; + const float3 e2 = V[2] - V[1]; + const float longest_edge_squared = max(len_squared(e0), max(len_squared(e1), len_squared(e2))); + const float3 N0 = cross(e0, e1); + float Nl = 0.0f; + ls->Ng = safe_normalize_len(N0, &Nl); + float area = 0.5f * Nl; + + /* flip normal if necessary */ + const int object_flag = kernel_tex_fetch(__object_flag, object); + if(object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) { + ls->Ng = -ls->Ng; + } + ls->eval_fac = 1.0f; + ls->shader = kernel_tex_fetch(__tri_shader, prim); ls->object = object; ls->prim = prim; ls->lamp = LAMP_NONE; ls->shader |= SHADER_USE_MIS; - ls->t = 0.0f; - ls->u = u; - ls->v = v; ls->type = LIGHT_TRIANGLE; - ls->eval_fac = 1.0f; - object_transform_light_sample(kg, ls, object, time); -} + float distance_to_plane = fabsf(dot(N0, V[0] - P)/dot(N0, N0)); + + if(longest_edge_squared > distance_to_plane*distance_to_plane) { + /* see James Arvo, "Stratified Sampling of Spherical Triangles" + * http://www.graphics.cornell.edu/pubs/1995/Arv95c.pdf */ + + /* project the triangle to the unit sphere + * and calculate its edges and angles */ + const float3 v0_p = V[0] - P; + const float3 v1_p = V[1] - P; + const float3 v2_p = V[2] - P; + + const float3 u01 = safe_normalize(cross(v0_p, v1_p)); + const float3 u02 = safe_normalize(cross(v0_p, v2_p)); + const float3 u12 = safe_normalize(cross(v1_p, v2_p)); + + const float3 A = safe_normalize(v0_p); + const float3 B = safe_normalize(v1_p); + const float3 C = safe_normalize(v2_p); + + const float cos_alpha = dot(u02, u01); + const float cos_beta = -dot(u01, u12); + const float cos_gamma = dot(u02, u12); + + /* calculate dihedral angles */ + const float alpha = fast_acosf(cos_alpha); + const float beta = fast_acosf(cos_beta); + const float gamma = fast_acosf(cos_gamma); + /* the area of the unit spherical triangle = solid angle */ + const float solid_angle = alpha + beta + gamma - M_PI_F; + + /* precompute a few things + * these could be re-used to take several samples + * as they are independent of randu/randv */ + const float cos_c = dot(A, B); + const float sin_alpha = fast_sinf(alpha); + const float product = sin_alpha * cos_c; + + /* Select a random sub-area of the spherical triangle + * and calculate the third vertex C_ of that new triangle */ + const float phi = randu * solid_angle - alpha; + float s, t; + fast_sincosf(phi, &s, &t); + const float u = t - cos_alpha; + const float v = s + product; + + const float3 U = safe_normalize(C - dot(C, A) * A); + + float q = 1.0f; + const float det = ((v * s + u * t) * sin_alpha); + if(det != 0.0f) { + q = ((v * t - u * s) * cos_alpha - v) / det; + } + const float temp = max(1.0f - q*q, 0.0f); -ccl_device float triangle_light_pdf(KernelGlobals *kg, - const float3 Ng, const float3 I, float t) -{ - float pdf = kernel_data.integrator.pdf_triangles; - float cos_pi = fabsf(dot(Ng, I)); + const float3 C_ = safe_normalize(q * A + sqrtf(temp) * U); - if(cos_pi == 0.0f) - return 0.0f; - - return t*t*pdf/cos_pi; + /* Finally, select a random point along the edge of the new triangle + * That point on the spherical triangle is the sampled ray direction */ + const float z = 1.0f - randv * (1.0f - dot(C_, B)); + ls->D = z * B + safe_sqrtf(1.0f - z*z) * safe_normalize(C_ - dot(C_, B) * B); + + /* calculate intersection with the planar triangle */ + if(!ray_triangle_intersect(P, ls->D, FLT_MAX, +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + (ssef*)V, +#else + V[0], V[1], V[2], +#endif + &ls->u, &ls->v, &ls->t)) { + ls->pdf = 0.0f; + return; + } + + ls->P = P + ls->D * ls->t; + + /* pdf_triangles is calculated over triangle area, but we're sampling over solid angle */ + if(UNLIKELY(solid_angle == 0.0f)) { + ls->pdf = 0.0f; + return; + } + else { + if(has_motion) { + /* get the center frame vertices, this is what the PDF was calculated from */ + triangle_world_space_vertices(kg, object, prim, -1.0f, V); + area = triangle_area(V[0], V[1], V[2]); + } + const float pdf = area * kernel_data.integrator.pdf_triangles; + ls->pdf = pdf / solid_angle; + } + } + else { + /* compute random point in triangle */ + randu = sqrtf(randu); + + const float u = 1.0f - randu; + const float v = randv*randu; + const float t = 1.0f - u - v; + ls->P = u * V[0] + v * V[1] + t * V[2]; + /* compute incoming direction, distance and pdf */ + ls->D = normalize_len(ls->P - P, &ls->t); + ls->pdf = triangle_light_pdf_area(kg, ls->Ng, -ls->D, ls->t); + if(has_motion && area != 0.0f) { + /* scale the PDF. + * area = the area the sample was taken from + * area_pre = the are from which pdf_triangles was calculated from */ + triangle_world_space_vertices(kg, object, prim, -1.0f, V); + const float area_pre = triangle_area(V[0], V[1], V[2]); + ls->pdf = ls->pdf * area_pre / area; + } + ls->u = u; + ls->v = v; + } } /* Light Distribution */ -ccl_device int light_distribution_sample(KernelGlobals *kg, float randt) +ccl_device int light_distribution_sample(KernelGlobals *kg, float *randu) { - /* this is basically std::upper_bound as used by pbrt, to find a point light or + /* This is basically std::upper_bound as used by pbrt, to find a point light or * triangle to emit from, proportional to area. a good improvement would be to * also sample proportional to power, though it's not so well defined with - * OSL shaders. */ + * arbitrary shaders. */ int first = 0; int len = kernel_data.integrator.num_distribution + 1; + float r = *randu; while(len > 0) { int half_len = len >> 1; int middle = first + half_len; - if(randt < kernel_tex_fetch(__light_distribution, middle).x) { + if(r < kernel_tex_fetch(__light_distribution, middle).totarea) { len = half_len; } else { @@ -843,21 +1058,27 @@ ccl_device int light_distribution_sample(KernelGlobals *kg, float randt) } } - /* clamping should not be needed but float rounding errors seem to - * make this fail on rare occasions */ - return clamp(first-1, 0, kernel_data.integrator.num_distribution-1); + /* Clamping should not be needed but float rounding errors seem to + * make this fail on rare occasions. */ + int index = clamp(first-1, 0, kernel_data.integrator.num_distribution-1); + + /* Rescale to reuse random number. this helps the 2D samples within + * each area light be stratified as well. */ + float distr_min = kernel_tex_fetch(__light_distribution, index).totarea; + float distr_max = kernel_tex_fetch(__light_distribution, index+1).totarea; + *randu = (r - distr_min)/(distr_max - distr_min); + + return index; } /* Generic Light */ ccl_device bool light_select_reached_max_bounces(KernelGlobals *kg, int index, int bounce) { - float4 data4 = kernel_tex_fetch(__light_data, index*LIGHT_SIZE + 4); - return (bounce > __float_as_int(data4.x)); + return (bounce > kernel_tex_fetch(__lights, index).max_bounces); } ccl_device_noinline bool light_sample(KernelGlobals *kg, - float randt, float randu, float randv, float time, @@ -866,20 +1087,17 @@ ccl_device_noinline bool light_sample(KernelGlobals *kg, LightSample *ls) { /* sample index */ - int index = light_distribution_sample(kg, randt); + int index = light_distribution_sample(kg, &randu); /* fetch light data */ - float4 l = kernel_tex_fetch(__light_distribution, index); - int prim = __float_as_int(l.y); + const ccl_global KernelLightDistribution *kdistribution = &kernel_tex_fetch(__light_distribution, index); + int prim = kdistribution->prim; if(prim >= 0) { - int object = __float_as_int(l.w); - int shader_flag = __float_as_int(l.z); + int object = kdistribution->mesh_light.object_id; + int shader_flag = kdistribution->mesh_light.shader_flag; - triangle_light_sample(kg, prim, object, randu, randv, time, ls); - /* compute incoming direction, distance and pdf */ - ls->D = normalize_len(ls->P - P, &ls->t); - ls->pdf = triangle_light_pdf(kg, ls->Ng, -ls->D, ls->t); + triangle_light_sample(kg, prim, object, randu, randv, time, ls, P); ls->shader |= shader_flag; return (ls->pdf > 0.0f); } @@ -896,8 +1114,7 @@ ccl_device_noinline bool light_sample(KernelGlobals *kg, ccl_device int light_select_num_samples(KernelGlobals *kg, int index) { - float4 data3 = kernel_tex_fetch(__light_data, index*LIGHT_SIZE + 3); - return __float_as_int(data3.x); + return kernel_tex_fetch(__lights, index).samples; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_math.h b/intern/cycles/kernel/kernel_math.h index 9bee5603474..96391db7649 100644 --- a/intern/cycles/kernel/kernel_math.h +++ b/intern/cycles/kernel/kernel_math.h @@ -17,11 +17,12 @@ #ifndef __KERNEL_MATH_H__ #define __KERNEL_MATH_H__ -#include "util_color.h" -#include "util_math.h" -#include "util_math_fast.h" -#include "util_texture.h" -#include "util_transform.h" +#include "util/util_color.h" +#include "util/util_math.h" +#include "util/util_math_fast.h" +#include "util/util_math_intersect.h" +#include "util/util_projection.h" +#include "util/util_texture.h" +#include "util/util_transform.h" #endif /* __KERNEL_MATH_H__ */ - diff --git a/intern/cycles/kernel/kernel_montecarlo.h b/intern/cycles/kernel/kernel_montecarlo.h index af7b727c1ba..9995490505f 100644 --- a/intern/cycles/kernel/kernel_montecarlo.h +++ b/intern/cycles/kernel/kernel_montecarlo.h @@ -67,8 +67,8 @@ ccl_device_inline void sample_cos_hemisphere(const float3 N, /* sample direction uniformly distributed in hemisphere */ ccl_device_inline void sample_uniform_hemisphere(const float3 N, - float randu, float randv, - float3 *omega_in, float *pdf) + float randu, float randv, + float3 *omega_in, float *pdf) { float z = randu; float r = sqrtf(max(0.0f, 1.0f - z*z)); @@ -84,8 +84,8 @@ ccl_device_inline void sample_uniform_hemisphere(const float3 N, /* sample direction uniformly distributed in cone */ ccl_device_inline void sample_uniform_cone(const float3 N, float angle, - float randu, float randv, - float3 *omega_in, float *pdf) + float randu, float randv, + float3 *omega_in, float *pdf) { float z = cosf(angle*randu); float r = sqrtf(max(0.0f, 1.0f - z*z)); @@ -187,4 +187,3 @@ ccl_device float2 regular_polygon_sample(float corners, float rotation, float u, CCL_NAMESPACE_END #endif /* __KERNEL_MONTECARLO_CL__ */ - diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h index 7aec47e4957..a42a8e9812f 100644 --- a/intern/cycles/kernel/kernel_passes.h +++ b/intern/cycles/kernel/kernel_passes.h @@ -16,19 +16,23 @@ CCL_NAMESPACE_BEGIN -ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, int sample, float value) +#if defined(__SPLIT_KERNEL__) || defined(__KERNEL_CUDA__) +#define __ATOMIC_PASS_WRITE__ +#endif + +ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, float value) { ccl_global float *buf = buffer; -#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__) +#ifdef __ATOMIC_PASS_WRITE__ atomic_add_and_fetch_float(buf, value); #else - *buf = (sample == 0)? value: *buf + value; -#endif // __SPLIT_KERNEL__ && __WORK_STEALING__ + *buf += value; +#endif } -ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sample, float3 value) +ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, float3 value) { -#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__) +#ifdef __ATOMIC_PASS_WRITE__ ccl_global float *buf_x = buffer + 0; ccl_global float *buf_y = buffer + 1; ccl_global float *buf_z = buffer + 2; @@ -38,13 +42,13 @@ ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sa atomic_add_and_fetch_float(buf_z, value.z); #else ccl_global float3 *buf = (ccl_global float3*)buffer; - *buf = (sample == 0)? value: *buf + value; -#endif // __SPLIT_KERNEL__ && __WORK_STEALING__ + *buf += value; +#endif } -ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sample, float4 value) +ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, float4 value) { -#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__) +#ifdef __ATOMIC_PASS_WRITE__ ccl_global float *buf_x = buffer + 0; ccl_global float *buf_y = buffer + 1; ccl_global float *buf_z = buffer + 2; @@ -56,12 +60,137 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sa atomic_add_and_fetch_float(buf_w, value.w); #else ccl_global float4 *buf = (ccl_global float4*)buffer; - *buf = (sample == 0)? value: *buf + value; -#endif // __SPLIT_KERNEL__ && __WORK_STEALING__ + *buf += value; +#endif +} + +#ifdef __DENOISING_FEATURES__ +ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer, float value) +{ + kernel_write_pass_float(buffer, value); + + /* The online one-pass variance update that's used for the megakernel can't easily be implemented + * with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. */ + kernel_write_pass_float(buffer+1, value*value); +} + +# ifdef __ATOMIC_PASS_WRITE__ +# define kernel_write_pass_float3_unaligned kernel_write_pass_float3 +# else +ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, float3 value) +{ + buffer[0] += value.x; + buffer[1] += value.y; + buffer[2] += value.z; +} +# endif + +ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, float3 value) +{ + kernel_write_pass_float3_unaligned(buffer, value); + kernel_write_pass_float3_unaligned(buffer+3, value*value); +} + +ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_global float *buffer, + int sample, float path_total, float path_total_shaded) +{ + if(kernel_data.film.pass_denoising_data == 0) + return; + + buffer += (sample & 1)? DENOISING_PASS_SHADOW_B : DENOISING_PASS_SHADOW_A; + + path_total = ensure_finite(path_total); + path_total_shaded = ensure_finite(path_total_shaded); + + kernel_write_pass_float(buffer, path_total); + kernel_write_pass_float(buffer+1, path_total_shaded); + + float value = path_total_shaded / max(path_total, 1e-7f); + kernel_write_pass_float(buffer+2, value*value); +} +#endif /* __DENOISING_FEATURES__ */ + +ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg, + ShaderData *sd, + ccl_addr_space PathState *state, + PathRadiance *L) +{ +#ifdef __DENOISING_FEATURES__ + if(state->denoising_feature_weight == 0.0f) { + return; + } + + L->denoising_depth += ensure_finite(state->denoising_feature_weight * sd->ray_length); + + /* Skip implicitly transparent surfaces. */ + if(sd->flag & SD_HAS_ONLY_VOLUME) { + return; + } + + float3 normal = make_float3(0.0f, 0.0f, 0.0f); + float3 albedo = make_float3(0.0f, 0.0f, 0.0f); + float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f; + + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + + if(!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) + continue; + + /* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. */ + normal += sc->N * sc->sample_weight; + sum_weight += sc->sample_weight; + if(bsdf_get_specular_roughness_squared(sc) > sqr(0.075f)) { + albedo += sc->weight; + sum_nonspecular_weight += sc->sample_weight; + } + } + + /* Wait for next bounce if 75% or more sample weight belongs to specular-like closures. */ + if((sum_weight == 0.0f) || (sum_nonspecular_weight*4.0f > sum_weight)) { + if(sum_weight != 0.0f) { + normal /= sum_weight; + } + L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal); + L->denoising_albedo += ensure_finite3(state->denoising_feature_weight * albedo); + + state->denoising_feature_weight = 0.0f; + } +#else + (void) kg; + (void) sd; + (void) state; + (void) L; +#endif /* __DENOISING_FEATURES__ */ +} + +#ifdef __KERNEL_DEBUG__ +ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg, + ccl_global float *buffer, + PathRadiance *L) +{ + int flag = kernel_data.film.pass_flag; + if(flag & PASSMASK(BVH_TRAVERSED_NODES)) { + kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_nodes, + L->debug_data.num_bvh_traversed_nodes); + } + if(flag & PASSMASK(BVH_TRAVERSED_INSTANCES)) { + kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_instances, + L->debug_data.num_bvh_traversed_instances); + } + if(flag & PASSMASK(BVH_INTERSECTIONS)) { + kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_intersections, + L->debug_data.num_bvh_intersections); + } + if(flag & PASSMASK(RAY_BOUNCES)) { + kernel_write_pass_float(buffer + kernel_data.film.pass_ray_bounces, + L->debug_data.num_ray_bounces); + } } +#endif /* __KERNEL_DEBUG__ */ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L, - ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput) + ShaderData *sd, ccl_addr_space PathState *state, float3 throughput) { #ifdef __PASSES__ int path_flag = state->flag; @@ -70,64 +199,64 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl return; int flag = kernel_data.film.pass_flag; + int light_flag = kernel_data.film.light_pass_flag; - if(!(flag & PASS_ALL)) + if(!((flag | light_flag) & PASS_ANY)) return; if(!(path_flag & PATH_RAY_SINGLE_PASS_DONE)) { - if(!(ccl_fetch(sd, flag) & SD_TRANSPARENT) || + if(!(sd->flag & SD_TRANSPARENT) || kernel_data.film.pass_alpha_threshold == 0.0f || average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold) { - - if(sample == 0) { - if(flag & PASS_DEPTH) { - float depth = camera_distance(kg, ccl_fetch(sd, P)); - kernel_write_pass_float(buffer + kernel_data.film.pass_depth, sample, depth); + if(state->sample == 0) { + if(flag & PASSMASK(DEPTH)) { + float depth = camera_distance(kg, sd->P); + kernel_write_pass_float(buffer + kernel_data.film.pass_depth, depth); } - if(flag & PASS_OBJECT_ID) { - float id = object_pass_id(kg, ccl_fetch(sd, object)); - kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, sample, id); + if(flag & PASSMASK(OBJECT_ID)) { + float id = object_pass_id(kg, sd->object); + kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, id); } - if(flag & PASS_MATERIAL_ID) { + if(flag & PASSMASK(MATERIAL_ID)) { float id = shader_pass_id(kg, sd); - kernel_write_pass_float(buffer + kernel_data.film.pass_material_id, sample, id); + kernel_write_pass_float(buffer + kernel_data.film.pass_material_id, id); } } - if(flag & PASS_NORMAL) { - float3 normal = ccl_fetch(sd, N); - kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, sample, normal); + if(flag & PASSMASK(NORMAL)) { + float3 normal = shader_bsdf_average_normal(kg, sd); + kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, normal); } - if(flag & PASS_UV) { + if(flag & PASSMASK(UV)) { float3 uv = primitive_uv(kg, sd); - kernel_write_pass_float3(buffer + kernel_data.film.pass_uv, sample, uv); + kernel_write_pass_float3(buffer + kernel_data.film.pass_uv, uv); } - if(flag & PASS_MOTION) { + if(flag & PASSMASK(MOTION)) { float4 speed = primitive_motion_vector(kg, sd); - kernel_write_pass_float4(buffer + kernel_data.film.pass_motion, sample, speed); - kernel_write_pass_float(buffer + kernel_data.film.pass_motion_weight, sample, 1.0f); + kernel_write_pass_float4(buffer + kernel_data.film.pass_motion, speed); + kernel_write_pass_float(buffer + kernel_data.film.pass_motion_weight, 1.0f); } state->flag |= PATH_RAY_SINGLE_PASS_DONE; } } - if(flag & (PASS_DIFFUSE_INDIRECT|PASS_DIFFUSE_COLOR|PASS_DIFFUSE_DIRECT)) + if(light_flag & PASSMASK_COMPONENT(DIFFUSE)) L->color_diffuse += shader_bsdf_diffuse(kg, sd)*throughput; - if(flag & (PASS_GLOSSY_INDIRECT|PASS_GLOSSY_COLOR|PASS_GLOSSY_DIRECT)) + if(light_flag & PASSMASK_COMPONENT(GLOSSY)) L->color_glossy += shader_bsdf_glossy(kg, sd)*throughput; - if(flag & (PASS_TRANSMISSION_INDIRECT|PASS_TRANSMISSION_COLOR|PASS_TRANSMISSION_DIRECT)) + if(light_flag & PASSMASK_COMPONENT(TRANSMISSION)) L->color_transmission += shader_bsdf_transmission(kg, sd)*throughput; - if(flag & (PASS_SUBSURFACE_INDIRECT|PASS_SUBSURFACE_COLOR|PASS_SUBSURFACE_DIRECT)) + if(light_flag & PASSMASK_COMPONENT(SUBSURFACE)) L->color_subsurface += shader_bsdf_subsurface(kg, sd)*throughput; - if(flag & PASS_MIST) { + if(light_flag & PASSMASK(MIST)) { /* bring depth into 0..1 range */ float mist_start = kernel_data.film.mist_start; float mist_inv_depth = kernel_data.film.mist_inv_depth; - float depth = camera_distance(kg, ccl_fetch(sd, P)); + float depth = camera_distance(kg, sd->P); float mist = saturate((depth - mist_start)*mist_inv_depth); /* falloff */ @@ -149,53 +278,116 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl #endif } -ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L, int sample) +ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L) { #ifdef __PASSES__ - int flag = kernel_data.film.pass_flag; + int light_flag = kernel_data.film.light_pass_flag; if(!kernel_data.film.use_light_pass) return; - if(flag & PASS_DIFFUSE_INDIRECT) - kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_indirect, sample, L->indirect_diffuse); - if(flag & PASS_GLOSSY_INDIRECT) - kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_indirect, sample, L->indirect_glossy); - if(flag & PASS_TRANSMISSION_INDIRECT) - kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_indirect, sample, L->indirect_transmission); - if(flag & PASS_SUBSURFACE_INDIRECT) - kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_indirect, sample, L->indirect_subsurface); - if(flag & PASS_DIFFUSE_DIRECT) - kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_direct, sample, L->direct_diffuse); - if(flag & PASS_GLOSSY_DIRECT) - kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_direct, sample, L->direct_glossy); - if(flag & PASS_TRANSMISSION_DIRECT) - kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_direct, sample, L->direct_transmission); - if(flag & PASS_SUBSURFACE_DIRECT) - kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_direct, sample, L->direct_subsurface); - - if(flag & PASS_EMISSION) - kernel_write_pass_float3(buffer + kernel_data.film.pass_emission, sample, L->emission); - if(flag & PASS_BACKGROUND) - kernel_write_pass_float3(buffer + kernel_data.film.pass_background, sample, L->background); - if(flag & PASS_AO) - kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, sample, L->ao); - - if(flag & PASS_DIFFUSE_COLOR) - kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_color, sample, L->color_diffuse); - if(flag & PASS_GLOSSY_COLOR) - kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_color, sample, L->color_glossy); - if(flag & PASS_TRANSMISSION_COLOR) - kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_color, sample, L->color_transmission); - if(flag & PASS_SUBSURFACE_COLOR) - kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_color, sample, L->color_subsurface); - if(flag & PASS_SHADOW) { + if(light_flag & PASSMASK(DIFFUSE_INDIRECT)) + kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_indirect, L->indirect_diffuse); + if(light_flag & PASSMASK(GLOSSY_INDIRECT)) + kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_indirect, L->indirect_glossy); + if(light_flag & PASSMASK(TRANSMISSION_INDIRECT)) + kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_indirect, L->indirect_transmission); + if(light_flag & PASSMASK(SUBSURFACE_INDIRECT)) + kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_indirect, L->indirect_subsurface); + if(light_flag & PASSMASK(VOLUME_INDIRECT)) + kernel_write_pass_float3(buffer + kernel_data.film.pass_volume_indirect, L->indirect_scatter); + if(light_flag & PASSMASK(DIFFUSE_DIRECT)) + kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_direct, L->direct_diffuse); + if(light_flag & PASSMASK(GLOSSY_DIRECT)) + kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_direct, L->direct_glossy); + if(light_flag & PASSMASK(TRANSMISSION_DIRECT)) + kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_direct, L->direct_transmission); + if(light_flag & PASSMASK(SUBSURFACE_DIRECT)) + kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_direct, L->direct_subsurface); + if(light_flag & PASSMASK(VOLUME_DIRECT)) + kernel_write_pass_float3(buffer + kernel_data.film.pass_volume_direct, L->direct_scatter); + + if(light_flag & PASSMASK(EMISSION)) + kernel_write_pass_float3(buffer + kernel_data.film.pass_emission, L->emission); + if(light_flag & PASSMASK(BACKGROUND)) + kernel_write_pass_float3(buffer + kernel_data.film.pass_background, L->background); + if(light_flag & PASSMASK(AO)) + kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, L->ao); + + if(light_flag & PASSMASK(DIFFUSE_COLOR)) + kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_color, L->color_diffuse); + if(light_flag & PASSMASK(GLOSSY_COLOR)) + kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_color, L->color_glossy); + if(light_flag & PASSMASK(TRANSMISSION_COLOR)) + kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_color, L->color_transmission); + if(light_flag & PASSMASK(SUBSURFACE_COLOR)) + kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_color, L->color_subsurface); + if(light_flag & PASSMASK(SHADOW)) { float4 shadow = L->shadow; shadow.w = kernel_data.film.pass_shadow_scale; - kernel_write_pass_float4(buffer + kernel_data.film.pass_shadow, sample, shadow); + kernel_write_pass_float4(buffer + kernel_data.film.pass_shadow, shadow); } - if(flag & PASS_MIST) - kernel_write_pass_float(buffer + kernel_data.film.pass_mist, sample, 1.0f - L->mist); + if(light_flag & PASSMASK(MIST)) + kernel_write_pass_float(buffer + kernel_data.film.pass_mist, 1.0f - L->mist); +#endif +} + +ccl_device_inline void kernel_write_result(KernelGlobals *kg, + ccl_global float *buffer, + int sample, + PathRadiance *L) +{ + float alpha; + float3 L_sum = path_radiance_clamp_and_sum(kg, L, &alpha); + + kernel_write_pass_float4(buffer, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha)); + + kernel_write_light_passes(kg, buffer, L); + +#ifdef __DENOISING_FEATURES__ + if(kernel_data.film.pass_denoising_data) { +# ifdef __SHADOW_TRICKS__ + kernel_write_denoising_shadow(kg, + buffer + kernel_data.film.pass_denoising_data, + sample, + average(L->path_total), + average(L->path_total_shaded)); +# else + kernel_write_denoising_shadow(kg, + buffer + kernel_data.film.pass_denoising_data, + sample, + 0.0f, 0.0f); +# endif + if(kernel_data.film.pass_denoising_clean) { + float3 noisy, clean; + path_radiance_split_denoising(kg, L, &noisy, &clean); + kernel_write_pass_float3_variance( + buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, + noisy); + kernel_write_pass_float3_unaligned( + buffer + kernel_data.film.pass_denoising_clean, + clean); + } + else { + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, + ensure_finite3(L_sum)); + } + + kernel_write_pass_float3_variance( + buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL, + L->denoising_normal); + kernel_write_pass_float3_variance( + buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO, + L->denoising_albedo); + kernel_write_pass_float_variance( + buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH, + L->denoising_depth); + } +#endif /* __DENOISING_FEATURES__ */ + + +#ifdef __KERNEL_DEBUG__ + kernel_write_debug_passes(kg, buffer, L); #endif } diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index f90701a8260..b0f53aef2d5 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -15,57 +15,350 @@ */ #ifdef __OSL__ -# include "osl_shader.h" +# include "kernel/osl/osl_shader.h" #endif -#include "kernel_random.h" -#include "kernel_projection.h" -#include "kernel_montecarlo.h" -#include "kernel_differential.h" -#include "kernel_camera.h" +#include "kernel/kernel_random.h" +#include "kernel/kernel_projection.h" +#include "kernel/kernel_montecarlo.h" +#include "kernel/kernel_differential.h" +#include "kernel/kernel_camera.h" -#include "geom/geom.h" -#include "bvh/bvh.h" +#include "kernel/geom/geom.h" +#include "kernel/bvh/bvh.h" -#include "kernel_accumulate.h" -#include "kernel_shader.h" -#include "kernel_light.h" -#include "kernel_passes.h" +#include "kernel/kernel_accumulate.h" +#include "kernel/kernel_shader.h" +#include "kernel/kernel_light.h" +#include "kernel/kernel_passes.h" -#ifdef __SUBSURFACE__ -# include "kernel_subsurface.h" +#if defined(__VOLUME__) || defined(__SUBSURFACE__) +# include "kernel/kernel_volume.h" #endif -#ifdef __VOLUME__ -# include "kernel_volume.h" +#ifdef __SUBSURFACE__ +# include "kernel/kernel_subsurface.h" #endif -#include "kernel_path_state.h" -#include "kernel_shadow.h" -#include "kernel_emission.h" -#include "kernel_path_common.h" -#include "kernel_path_surface.h" -#include "kernel_path_volume.h" +#include "kernel/kernel_path_state.h" +#include "kernel/kernel_shadow.h" +#include "kernel/kernel_emission.h" +#include "kernel/kernel_path_common.h" +#include "kernel/kernel_path_surface.h" +#include "kernel/kernel_path_volume.h" +#include "kernel/kernel_path_subsurface.h" + +CCL_NAMESPACE_BEGIN + +ccl_device_forceinline bool kernel_path_scene_intersect( + KernelGlobals *kg, + ccl_addr_space PathState *state, + Ray *ray, + Intersection *isect, + PathRadiance *L) +{ + uint visibility = path_state_ray_visibility(kg, state); + + if(path_state_ao_bounce(kg, state)) { + visibility = PATH_RAY_SHADOW; + ray->t = kernel_data.background.ao_distance; + } + +#ifdef __HAIR__ + float difl = 0.0f, extmax = 0.0f; + uint lcg_state = 0; + + if(kernel_data.bvh.have_curves) { + if((kernel_data.cam.resolution == 1) && (state->flag & PATH_RAY_CAMERA)) { + float3 pixdiff = ray->dD.dx + ray->dD.dy; + /*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/ + difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f; + } + + extmax = kernel_data.curve.maximum_width; + lcg_state = lcg_state_init_addrspace(state, 0x51633e2d); + } + + bool hit = scene_intersect(kg, *ray, visibility, isect, &lcg_state, difl, extmax); +#else + bool hit = scene_intersect(kg, *ray, visibility, isect, NULL, 0.0f, 0.0f); +#endif /* __HAIR__ */ #ifdef __KERNEL_DEBUG__ -# include "kernel_debug.h" -#endif + if(state->flag & PATH_RAY_CAMERA) { + L->debug_data.num_bvh_traversed_nodes += isect->num_traversed_nodes; + L->debug_data.num_bvh_traversed_instances += isect->num_traversed_instances; + L->debug_data.num_bvh_intersections += isect->num_intersections; + } + L->debug_data.num_ray_bounces++; +#endif /* __KERNEL_DEBUG__ */ -CCL_NAMESPACE_BEGIN + return hit; +} + +ccl_device_forceinline void kernel_path_lamp_emission( + KernelGlobals *kg, + ccl_addr_space PathState *state, + Ray *ray, + float3 throughput, + ccl_addr_space Intersection *isect, + ShaderData *emission_sd, + PathRadiance *L) +{ +#ifdef __LAMP_MIS__ + if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) { + /* ray starting from previous non-transparent bounce */ + Ray light_ray; + + light_ray.P = ray->P - state->ray_t*ray->D; + state->ray_t += isect->t; + light_ray.D = ray->D; + light_ray.t = state->ray_t; + light_ray.time = ray->time; + light_ray.dD = ray->dD; + light_ray.dP = ray->dP; + + /* intersect with lamp */ + float3 emission; + + if(indirect_lamp_emission(kg, emission_sd, state, &light_ray, &emission)) + path_radiance_accum_emission(L, state, throughput, emission); + } +#endif /* __LAMP_MIS__ */ +} + +ccl_device_forceinline void kernel_path_background( + KernelGlobals *kg, + ccl_addr_space PathState *state, + ccl_addr_space Ray *ray, + float3 throughput, + ShaderData *sd, + PathRadiance *L) +{ + /* eval background shader if nothing hit */ + if(kernel_data.background.transparent && (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND)) { + L->transparent += average(throughput); + +#ifdef __PASSES__ + if(!(kernel_data.film.light_pass_flag & PASSMASK(BACKGROUND))) +#endif /* __PASSES__ */ + return; + } + + /* When using the ao bounces approximation, adjust background + * shader intensity with ao factor. */ + if(path_state_ao_bounce(kg, state)) { + throughput *= kernel_data.background.ao_bounces_factor; + } + +#ifdef __BACKGROUND__ + /* sample background shader */ + float3 L_background = indirect_background(kg, sd, state, ray); + path_radiance_accum_background(L, state, throughput, L_background); +#endif /* __BACKGROUND__ */ +} + +#ifndef __SPLIT_KERNEL__ + +#ifdef __VOLUME__ +ccl_device_forceinline VolumeIntegrateResult kernel_path_volume( + KernelGlobals *kg, + ShaderData *sd, + PathState *state, + Ray *ray, + float3 *throughput, + ccl_addr_space Intersection *isect, + bool hit, + ShaderData *emission_sd, + PathRadiance *L) +{ + /* Sanitize volume stack. */ + if(!hit) { + kernel_volume_clean_stack(kg, state->volume_stack); + } + + if(state->volume_stack[0].shader == SHADER_NONE) { + return VOLUME_PATH_ATTENUATED; + } + + /* volume attenuation, emission, scatter */ + Ray volume_ray = *ray; + volume_ray.t = (hit)? isect->t: FLT_MAX; + + bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack); + +# ifdef __VOLUME_DECOUPLED__ + int sampling_method = volume_stack_sampling_method(kg, state->volume_stack); + bool direct = (state->flag & PATH_RAY_CAMERA) != 0; + bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, direct, sampling_method); + + if(decoupled) { + /* cache steps along volume for repeated sampling */ + VolumeSegment volume_segment; + + shader_setup_from_volume(kg, sd, &volume_ray); + kernel_volume_decoupled_record(kg, state, + &volume_ray, sd, &volume_segment, heterogeneous); + + volume_segment.sampling_method = sampling_method; + + /* emission */ + if(volume_segment.closure_flag & SD_EMISSION) + path_radiance_accum_emission(L, state, *throughput, volume_segment.accum_emission); + + /* scattering */ + VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED; + + if(volume_segment.closure_flag & SD_SCATTER) { + int all = kernel_data.integrator.sample_all_lights_indirect; + + /* direct light sampling */ + kernel_branched_path_volume_connect_light(kg, sd, + emission_sd, *throughput, state, L, all, + &volume_ray, &volume_segment); + + /* indirect sample. if we use distance sampling and take just + * one sample for direct and indirect light, we could share + * this computation, but makes code a bit complex */ + float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL); + float rscatter = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE); + + result = kernel_volume_decoupled_scatter(kg, + state, &volume_ray, sd, throughput, + rphase, rscatter, &volume_segment, NULL, true); + } + + /* free cached steps */ + kernel_volume_decoupled_free(kg, &volume_segment); + + if(result == VOLUME_PATH_SCATTERED) { + if(kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray)) + return VOLUME_PATH_SCATTERED; + else + return VOLUME_PATH_MISSED; + } + else { + *throughput *= volume_segment.accum_transmittance; + } + } + else +# endif /* __VOLUME_DECOUPLED__ */ + { + /* integrate along volume segment with distance sampling */ + VolumeIntegrateResult result = kernel_volume_integrate( + kg, state, sd, &volume_ray, L, throughput, heterogeneous); + +# ifdef __VOLUME_SCATTER__ + if(result == VOLUME_PATH_SCATTERED) { + /* direct lighting */ + kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L); + + /* indirect light bounce */ + if(kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray)) + return VOLUME_PATH_SCATTERED; + else + return VOLUME_PATH_MISSED; + } +# endif /* __VOLUME_SCATTER__ */ + } + + return VOLUME_PATH_ATTENUATED; +} +#endif /* __VOLUME__ */ + +#endif /* __SPLIT_KERNEL__ */ + +ccl_device_forceinline bool kernel_path_shader_apply( + KernelGlobals *kg, + ShaderData *sd, + ccl_addr_space PathState *state, + ccl_addr_space Ray *ray, + float3 throughput, + ShaderData *emission_sd, + PathRadiance *L, + ccl_global float *buffer) +{ +#ifdef __SHADOW_TRICKS__ + if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) { + if(state->flag & PATH_RAY_TRANSPARENT_BACKGROUND) { + state->flag |= (PATH_RAY_SHADOW_CATCHER | + PATH_RAY_STORE_SHADOW_INFO); + + float3 bg = make_float3(0.0f, 0.0f, 0.0f); + if(!kernel_data.background.transparent) { + bg = indirect_background(kg, emission_sd, state, ray); + } + path_radiance_accum_shadowcatcher(L, throughput, bg); + } + } + else if(state->flag & PATH_RAY_SHADOW_CATCHER) { + /* Only update transparency after shadow catcher bounce. */ + L->shadow_transparency *= + average(shader_bsdf_transparency(kg, sd)); + } +#endif /* __SHADOW_TRICKS__ */ + + /* holdout */ +#ifdef __HOLDOUT__ + if(((sd->flag & SD_HOLDOUT) || + (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) && + (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND)) + { + if(kernel_data.background.transparent) { + float3 holdout_weight; + if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) { + holdout_weight = make_float3(1.0f, 1.0f, 1.0f); + } + else { + holdout_weight = shader_holdout_eval(kg, sd); + } + /* any throughput is ok, should all be identical here */ + L->transparent += average(holdout_weight*throughput); + } + + if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) { + return false; + } + } +#endif /* __HOLDOUT__ */ + + /* holdout mask objects do not write data passes */ + kernel_write_data_passes(kg, buffer, L, sd, state, throughput); + + /* blurring of bsdf after bounces, for rays that have a small likelihood + * of following this particular path (diffuse, rough glossy) */ + if(kernel_data.integrator.filter_glossy != FLT_MAX) { + float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf; + + if(blur_pdf < 1.0f) { + float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f; + shader_bsdf_blur(kg, sd, blur_roughness); + } + } + +#ifdef __EMISSION__ + /* emission */ + if(sd->flag & SD_EMISSION) { + float3 emission = indirect_primitive_emission(kg, sd, sd->ray_length, state->flag, state->ray_pdf); + path_radiance_accum_emission(L, state, throughput, emission); + } +#endif /* __EMISSION__ */ + + return true; +} ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, PathRadiance *L, - PathState *state, - RNG *rng, + ccl_addr_space PathState *state, float3 throughput, float3 ao_alpha) { /* todo: solve correlation */ float bsdf_u, bsdf_v; - path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); float ao_factor = kernel_data.background.ao_factor; float3 ao_N; @@ -75,278 +368,118 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) { + if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { Ray light_ray; float3 ao_shadow; - light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng)); + light_ray.P = ray_offset(sd->P, sd->Ng); light_ray.D = ao_D; light_ray.t = kernel_data.background.ao_distance; -#ifdef __OBJECT_MOTION__ - light_ray.time = ccl_fetch(sd, time); -#endif /* __OBJECT_MOTION__ */ - light_ray.dP = ccl_fetch(sd, dP); + light_ray.time = sd->time; + light_ray.dP = sd->dP; light_ray.dD = differential3_zero(); - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) { - path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce); + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) { + path_radiance_accum_ao(L, state, throughput, ao_alpha, ao_bsdf, ao_shadow); + } + else { + path_radiance_accum_total_ao(L, state, throughput, ao_bsdf); } } } +#ifndef __SPLIT_KERNEL__ + +#if defined(__BRANCHED_PATH__) || defined(__BAKING__) + ccl_device void kernel_path_indirect(KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, - RNG *rng, Ray *ray, float3 throughput, - int num_samples, PathState *state, PathRadiance *L) { +#ifdef __SUBSURFACE__ + SubsurfaceIndirectRays ss_indirect; + kernel_path_subsurface_init_indirect(&ss_indirect); + + for(;;) { +#endif /* __SUBSURFACE__ */ + /* path iteration */ for(;;) { - /* intersect scene */ + /* Find intersection with objects in scene. */ Intersection isect; - uint visibility = path_state_ray_visibility(kg, state); - if(state->bounce > kernel_data.integrator.ao_bounces) { - visibility = PATH_RAY_SHADOW; - ray->t = kernel_data.background.ao_distance; - } - bool hit = scene_intersect(kg, - *ray, - visibility, - &isect, - NULL, - 0.0f, 0.0f); + bool hit = kernel_path_scene_intersect(kg, state, ray, &isect, L); -#ifdef __LAMP_MIS__ - if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) { - /* ray starting from previous non-transparent bounce */ - Ray light_ray; - - light_ray.P = ray->P - state->ray_t*ray->D; - state->ray_t += isect.t; - light_ray.D = ray->D; - light_ray.t = state->ray_t; - light_ray.time = ray->time; - light_ray.dD = ray->dD; - light_ray.dP = ray->dP; - - /* intersect with lamp */ - float3 emission; - if(indirect_lamp_emission(kg, emission_sd, state, &light_ray, &emission)) { - path_radiance_accum_emission(L, - throughput, - emission, - state->bounce); - } - } -#endif /* __LAMP_MIS__ */ + /* Find intersection with lamps and compute emission for MIS. */ + kernel_path_lamp_emission(kg, state, ray, throughput, &isect, sd, L); #ifdef __VOLUME__ - /* Sanitize volume stack. */ - if(!hit) { - kernel_volume_clean_stack(kg, state->volume_stack); - } - /* volume attenuation, emission, scatter */ - if(state->volume_stack[0].shader != SHADER_NONE) { - Ray volume_ray = *ray; - volume_ray.t = (hit)? isect.t: FLT_MAX; - - bool heterogeneous = - volume_stack_is_heterogeneous(kg, - state->volume_stack); - -# ifdef __VOLUME_DECOUPLED__ - int sampling_method = - volume_stack_sampling_method(kg, - state->volume_stack); - bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, false, sampling_method); - - if(decoupled) { - /* cache steps along volume for repeated sampling */ - VolumeSegment volume_segment; - - shader_setup_from_volume(kg, - sd, - &volume_ray); - kernel_volume_decoupled_record(kg, - state, - &volume_ray, - sd, - &volume_segment, - heterogeneous); - - volume_segment.sampling_method = sampling_method; - - /* emission */ - if(volume_segment.closure_flag & SD_EMISSION) { - path_radiance_accum_emission(L, - throughput, - volume_segment.accum_emission, - state->bounce); - } - - /* scattering */ - VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED; - - if(volume_segment.closure_flag & SD_SCATTER) { - int all = kernel_data.integrator.sample_all_lights_indirect; - - /* direct light sampling */ - kernel_branched_path_volume_connect_light(kg, - rng, - sd, - emission_sd, - throughput, - state, - L, - all, - &volume_ray, - &volume_segment); - - /* indirect sample. if we use distance sampling and take just - * one sample for direct and indirect light, we could share - * this computation, but makes code a bit complex */ - float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE); - float rscatter = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE); - - result = kernel_volume_decoupled_scatter(kg, - state, - &volume_ray, - sd, - &throughput, - rphase, - rscatter, - &volume_segment, - NULL, - true); - } - - /* free cached steps */ - kernel_volume_decoupled_free(kg, &volume_segment); - - if(result == VOLUME_PATH_SCATTERED) { - if(kernel_path_volume_bounce(kg, - rng, - sd, - &throughput, - state, - L, - ray)) - { - continue; - } - else { - break; - } - } - else { - throughput *= volume_segment.accum_transmittance; - } - } - else -# endif /* __VOLUME_DECOUPLED__ */ - { - /* integrate along volume segment with distance sampling */ - VolumeIntegrateResult result = kernel_volume_integrate( - kg, state, sd, &volume_ray, L, &throughput, rng, heterogeneous); - -# ifdef __VOLUME_SCATTER__ - if(result == VOLUME_PATH_SCATTERED) { - /* direct lighting */ - kernel_path_volume_connect_light(kg, - rng, - sd, - emission_sd, - throughput, - state, - L); - - /* indirect light bounce */ - if(kernel_path_volume_bounce(kg, - rng, - sd, - &throughput, - state, - L, - ray)) - { - continue; - } - else { - break; - } - } -# endif /* __VOLUME_SCATTER__ */ - } + /* Volume integration. */ + VolumeIntegrateResult result = kernel_path_volume(kg, + sd, + state, + ray, + &throughput, + &isect, + hit, + emission_sd, + L); + + if(result == VOLUME_PATH_SCATTERED) { + continue; + } + else if(result == VOLUME_PATH_MISSED) { + break; } -#endif /* __VOLUME__ */ +#endif /* __VOLUME__*/ + /* Shade background. */ if(!hit) { -#ifdef __BACKGROUND__ - /* sample background shader */ - float3 L_background = indirect_background(kg, emission_sd, state, ray); - path_radiance_accum_background(L, - throughput, - L_background, - state->bounce); -#endif /* __BACKGROUND__ */ - + kernel_path_background(kg, state, ray, throughput, sd, L); break; } - else if(state->bounce > kernel_data.integrator.ao_bounces) { + else if(path_state_ao_bounce(kg, state)) { break; } - /* setup shading */ - shader_setup_from_ray(kg, - sd, - &isect, - ray); - float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF); - shader_eval_surface(kg, sd, rng, state, rbsdf, state->flag, SHADER_CONTEXT_INDIRECT); -#ifdef __BRANCHED_PATH__ - shader_merge_closures(sd); -#endif /* __BRANCHED_PATH__ */ - - /* blurring of bsdf after bounces, for rays that have a small likelihood - * of following this particular path (diffuse, rough glossy) */ - if(kernel_data.integrator.filter_glossy != FLT_MAX) { - float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf; - - if(blur_pdf < 1.0f) { - float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f; - shader_bsdf_blur(kg, sd, blur_roughness); - } - } + /* Setup shader data. */ + shader_setup_from_ray(kg, sd, &isect, ray); -#ifdef __EMISSION__ - /* emission */ - if(sd->flag & SD_EMISSION) { - float3 emission = indirect_primitive_emission(kg, - sd, - isect.t, - state->flag, - state->ray_pdf); - path_radiance_accum_emission(L, throughput, emission, state->bounce); + /* Skip most work for volume bounding surface. */ +#ifdef __VOLUME__ + if(!(sd->flag & SD_HAS_ONLY_VOLUME)) { +#endif + + /* Evaluate shader. */ + shader_eval_surface(kg, sd, state, state->flag); + shader_prepare_closures(sd, state); + + /* Apply shadow catcher, holdout, emission. */ + if(!kernel_path_shader_apply(kg, + sd, + state, + ray, + throughput, + emission_sd, + L, + NULL)) + { + break; } -#endif /* __EMISSION__ */ /* path termination. this is a strange place to put the termination, it's * mainly due to the mixed in MIS that we use. gives too many unneeded * shader evaluations, only need emission if we are going to terminate */ - float probability = - path_state_terminate_probability(kg, - state, - throughput*num_samples); + float probability = path_state_continuation_probability(kg, state, throughput); if(probability == 0.0f) { break; } else if(probability != 1.0f) { - float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE); + float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE); if(terminate >= probability) break; @@ -354,50 +487,39 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, throughput /= probability; } + kernel_update_denoising_features(kg, sd, state, L); + #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) { - kernel_path_ao(kg, sd, emission_sd, L, state, rng, throughput, make_float3(0.0f, 0.0f, 0.0f)); + kernel_path_ao(kg, sd, emission_sd, L, state, throughput, make_float3(0.0f, 0.0f, 0.0f)); } #endif /* __AO__ */ + #ifdef __SUBSURFACE__ /* bssrdf scatter to a different location on the same object, replacing * the closures with a diffuse BSDF */ if(sd->flag & SD_BSSRDF) { - float bssrdf_probability; - ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability); - - /* modify throughput for picking bssrdf or bsdf */ - throughput *= bssrdf_probability; - - /* do bssrdf scatter step if we picked a bssrdf closure */ - if(sc) { - uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb); - - float bssrdf_u, bssrdf_v; - path_state_rng_2D(kg, - rng, - state, - PRNG_BSDF_U, - &bssrdf_u, &bssrdf_v); - subsurface_scatter_step(kg, - sd, - state, - state->flag, - sc, - &lcg_state, - bssrdf_u, bssrdf_v, - false); + if(kernel_path_subsurface_scatter(kg, + sd, + emission_sd, + L, + state, + ray, + &throughput, + &ss_indirect)) + { + break; } } #endif /* __SUBSURFACE__ */ -#if defined(__EMISSION__) && defined(__BRANCHED_PATH__) +#if defined(__EMISSION__) if(kernel_data.integrator.use_direct_light) { - int all = kernel_data.integrator.sample_all_lights_indirect; + int all = (kernel_data.integrator.sample_all_lights_indirect) || + (state->flag & PATH_RAY_SHADOW_CATCHER); kernel_branched_path_surface_connect_light(kg, - rng, sd, emission_sd, state, @@ -406,205 +528,48 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, L, all); } -#endif /* defined(__EMISSION__) && defined(__BRANCHED_PATH__) */ +#endif /* defined(__EMISSION__) */ - if(!kernel_path_surface_bounce(kg, rng, sd, &throughput, state, L, ray)) +#ifdef __VOLUME__ + } +#endif + + if(!kernel_path_surface_bounce(kg, sd, &throughput, state, &L->state, ray)) break; } -} #ifdef __SUBSURFACE__ -# ifndef __KERNEL_CUDA__ -ccl_device -# else -ccl_device_inline -# endif -bool kernel_path_subsurface_scatter( - KernelGlobals *kg, - ShaderData *sd, - ShaderData *emission_sd, - PathRadiance *L, - PathState *state, - RNG *rng, - Ray *ray, - float3 *throughput, - SubsurfaceIndirectRays *ss_indirect) -{ - float bssrdf_probability; - ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability); - - /* modify throughput for picking bssrdf or bsdf */ - *throughput *= bssrdf_probability; - - /* do bssrdf scatter step if we picked a bssrdf closure */ - if(sc) { - /* We should never have two consecutive BSSRDF bounces, - * the second one should be converted to a diffuse BSDF to - * avoid this. + /* Trace indirect subsurface rays by restarting the loop. this uses less + * stack memory than invoking kernel_path_indirect. */ - kernel_assert(!ss_indirect->tracing); - - uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb); - - SubsurfaceIntersection ss_isect; - float bssrdf_u, bssrdf_v; - path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); - int num_hits = subsurface_scatter_multi_intersect(kg, - &ss_isect, - sd, - sc, - &lcg_state, - bssrdf_u, bssrdf_v, - false); -# ifdef __VOLUME__ - ss_indirect->need_update_volume_stack = - kernel_data.integrator.use_volumes && - ccl_fetch(sd, object_flag) & SD_OBJECT_INTERSECTS_VOLUME; -# endif /* __VOLUME__ */ - - /* compute lighting with the BSDF closure */ - for(int hit = 0; hit < num_hits; hit++) { - /* NOTE: We reuse the existing ShaderData, we assume the path - * integration loop stops when this function returns true. - */ - subsurface_scatter_multi_setup(kg, - &ss_isect, - hit, - sd, - state, - state->flag, - sc, - false); - - PathState *hit_state = &ss_indirect->state[ss_indirect->num_rays]; - Ray *hit_ray = &ss_indirect->rays[ss_indirect->num_rays]; - float3 *hit_tp = &ss_indirect->throughputs[ss_indirect->num_rays]; - PathRadiance *hit_L = &ss_indirect->L[ss_indirect->num_rays]; - - *hit_state = *state; - *hit_ray = *ray; - *hit_tp = *throughput; - - hit_state->rng_offset += PRNG_BOUNCE_NUM; - - path_radiance_init(hit_L, kernel_data.film.use_light_pass); - hit_L->direct_throughput = L->direct_throughput; - path_radiance_copy_indirect(hit_L, L); - - kernel_path_surface_connect_light(kg, rng, sd, emission_sd, *hit_tp, state, hit_L); - - if(kernel_path_surface_bounce(kg, - rng, - sd, - hit_tp, - hit_state, - hit_L, - hit_ray)) - { -# ifdef __LAMP_MIS__ - hit_state->ray_t = 0.0f; -# endif /* __LAMP_MIS__ */ - -# ifdef __VOLUME__ - if(ss_indirect->need_update_volume_stack) { - Ray volume_ray = *ray; - /* Setup ray from previous surface point to the new one. */ - volume_ray.D = normalize_len(hit_ray->P - volume_ray.P, - &volume_ray.t); - - kernel_volume_stack_update_for_subsurface( - kg, - emission_sd, - &volume_ray, - hit_state->volume_stack); - } -# endif /* __VOLUME__ */ - path_radiance_reset_indirect(L); - ss_indirect->num_rays++; - } - else { - path_radiance_accum_sample(L, hit_L, 1); - } + if(ss_indirect.num_rays) { + kernel_path_subsurface_setup_indirect(kg, + &ss_indirect, + state, + ray, + L, + &throughput); } - return true; - } - return false; -} - -ccl_device_inline void kernel_path_subsurface_init_indirect( - SubsurfaceIndirectRays *ss_indirect) -{ - ss_indirect->tracing = false; - ss_indirect->num_rays = 0; -} - -ccl_device void kernel_path_subsurface_accum_indirect( - SubsurfaceIndirectRays *ss_indirect, - PathRadiance *L) -{ - if(ss_indirect->tracing) { - path_radiance_sum_indirect(L); - path_radiance_accum_sample(&ss_indirect->direct_L, L, 1); - if(ss_indirect->num_rays == 0) { - *L = ss_indirect->direct_L; + else { + break; } } +#endif /* __SUBSURFACE__ */ } -ccl_device void kernel_path_subsurface_setup_indirect( - KernelGlobals *kg, - SubsurfaceIndirectRays *ss_indirect, - PathState *state, - Ray *ray, - PathRadiance *L, - float3 *throughput) -{ - if(!ss_indirect->tracing) { - ss_indirect->direct_L = *L; - } - ss_indirect->tracing = true; - - /* Setup state, ray and throughput for indirect SSS rays. */ - ss_indirect->num_rays--; - - Ray *indirect_ray = &ss_indirect->rays[ss_indirect->num_rays]; - PathRadiance *indirect_L = &ss_indirect->L[ss_indirect->num_rays]; - - *state = ss_indirect->state[ss_indirect->num_rays]; - *ray = *indirect_ray; - *L = *indirect_L; - *throughput = ss_indirect->throughputs[ss_indirect->num_rays]; - - state->rng_offset += ss_indirect->num_rays * PRNG_BOUNCE_NUM; -} - -#endif /* __SUBSURFACE__ */ +#endif /* defined(__BRANCHED_PATH__) || defined(__BAKING__) */ -ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, - RNG *rng, - int sample, - Ray ray, - ccl_global float *buffer) +ccl_device_forceinline void kernel_path_integrate( + KernelGlobals *kg, + PathState *state, + float3 throughput, + Ray *ray, + PathRadiance *L, + ccl_global float *buffer, + ShaderData *emission_sd) { - /* initialize */ - PathRadiance L; - float3 throughput = make_float3(1.0f, 1.0f, 1.0f); - float L_transparent = 0.0f; - - path_radiance_init(&L, kernel_data.film.use_light_pass); - - /* shader data memory used for both volumes and surfaces, saves stack space */ + /* Shader data memory used for both volumes and surfaces, saves stack space. */ ShaderData sd; - /* shader data used by emission, shadows, volume stacks */ - ShaderData emission_sd; - - PathState state; - path_state_init(kg, &emission_sd, &state, rng, sample, &ray); - -#ifdef __KERNEL_DEBUG__ - DebugData debug_data; - debug_data_init(&debug_data); -#endif /* __KERNEL_DEBUG__ */ #ifdef __SUBSURFACE__ SubsurfaceIndirectRays ss_indirect; @@ -615,248 +580,89 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, /* path iteration */ for(;;) { - /* intersect scene */ + /* Find intersection with objects in scene. */ Intersection isect; - uint visibility = path_state_ray_visibility(kg, &state); - -#ifdef __HAIR__ - float difl = 0.0f, extmax = 0.0f; - uint lcg_state = 0; - - if(kernel_data.bvh.have_curves) { - if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) { - float3 pixdiff = ray.dD.dx + ray.dD.dy; - /*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/ - difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f; - } + bool hit = kernel_path_scene_intersect(kg, state, ray, &isect, L); - extmax = kernel_data.curve.maximum_width; - lcg_state = lcg_state_init(rng, &state, 0x51633e2d); - } - - if(state.bounce > kernel_data.integrator.ao_bounces) { - visibility = PATH_RAY_SHADOW; - ray.t = kernel_data.background.ao_distance; - } - - bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax); -#else - bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f); -#endif /* __HAIR__ */ - -#ifdef __KERNEL_DEBUG__ - if(state.flag & PATH_RAY_CAMERA) { - debug_data.num_bvh_traversed_nodes += isect.num_traversed_nodes; - debug_data.num_bvh_traversed_instances += isect.num_traversed_instances; - debug_data.num_bvh_intersections += isect.num_intersections; - } - debug_data.num_ray_bounces++; -#endif /* __KERNEL_DEBUG__ */ - -#ifdef __LAMP_MIS__ - if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) { - /* ray starting from previous non-transparent bounce */ - Ray light_ray; - - light_ray.P = ray.P - state.ray_t*ray.D; - state.ray_t += isect.t; - light_ray.D = ray.D; - light_ray.t = state.ray_t; - light_ray.time = ray.time; - light_ray.dD = ray.dD; - light_ray.dP = ray.dP; - - /* intersect with lamp */ - float3 emission; - - if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission)) - path_radiance_accum_emission(&L, throughput, emission, state.bounce); - } -#endif /* __LAMP_MIS__ */ + /* Find intersection with lamps and compute emission for MIS. */ + kernel_path_lamp_emission(kg, state, ray, throughput, &isect, &sd, L); #ifdef __VOLUME__ - /* Sanitize volume stack. */ - if(!hit) { - kernel_volume_clean_stack(kg, state.volume_stack); - } - /* volume attenuation, emission, scatter */ - if(state.volume_stack[0].shader != SHADER_NONE) { - Ray volume_ray = ray; - volume_ray.t = (hit)? isect.t: FLT_MAX; - - bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack); - -# ifdef __VOLUME_DECOUPLED__ - int sampling_method = volume_stack_sampling_method(kg, state.volume_stack); - bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, true, sampling_method); - - if(decoupled) { - /* cache steps along volume for repeated sampling */ - VolumeSegment volume_segment; - - shader_setup_from_volume(kg, &sd, &volume_ray); - kernel_volume_decoupled_record(kg, &state, - &volume_ray, &sd, &volume_segment, heterogeneous); - - volume_segment.sampling_method = sampling_method; - - /* emission */ - if(volume_segment.closure_flag & SD_EMISSION) - path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce); - - /* scattering */ - VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED; - - if(volume_segment.closure_flag & SD_SCATTER) { - int all = false; - - /* direct light sampling */ - kernel_branched_path_volume_connect_light(kg, rng, &sd, - &emission_sd, throughput, &state, &L, all, - &volume_ray, &volume_segment); - - /* indirect sample. if we use distance sampling and take just - * one sample for direct and indirect light, we could share - * this computation, but makes code a bit complex */ - float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE); - float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE); - - result = kernel_volume_decoupled_scatter(kg, - &state, &volume_ray, &sd, &throughput, - rphase, rscatter, &volume_segment, NULL, true); - } - - /* free cached steps */ - kernel_volume_decoupled_free(kg, &volume_segment); - - if(result == VOLUME_PATH_SCATTERED) { - if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray)) - continue; - else - break; - } - else { - throughput *= volume_segment.accum_transmittance; - } - } - else -# endif /* __VOLUME_DECOUPLED__ */ - { - /* integrate along volume segment with distance sampling */ - VolumeIntegrateResult result = kernel_volume_integrate( - kg, &state, &sd, &volume_ray, &L, &throughput, rng, heterogeneous); - -# ifdef __VOLUME_SCATTER__ - if(result == VOLUME_PATH_SCATTERED) { - /* direct lighting */ - kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L); - - /* indirect light bounce */ - if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray)) - continue; - else - break; - } -# endif /* __VOLUME_SCATTER__ */ - } + /* Volume integration. */ + VolumeIntegrateResult result = kernel_path_volume(kg, + &sd, + state, + ray, + &throughput, + &isect, + hit, + emission_sd, + L); + + if(result == VOLUME_PATH_SCATTERED) { + continue; + } + else if(result == VOLUME_PATH_MISSED) { + break; } -#endif /* __VOLUME__ */ +#endif /* __VOLUME__*/ + /* Shade background. */ if(!hit) { - /* eval background shader if nothing hit */ - if(kernel_data.background.transparent && (state.flag & PATH_RAY_CAMERA)) { - L_transparent += average(throughput); - -#ifdef __PASSES__ - if(!(kernel_data.film.pass_flag & PASS_BACKGROUND)) -#endif /* __PASSES__ */ - break; - } - -#ifdef __BACKGROUND__ - /* sample background shader */ - float3 L_background = indirect_background(kg, &emission_sd, &state, &ray); - path_radiance_accum_background(&L, throughput, L_background, state.bounce); -#endif /* __BACKGROUND__ */ - + kernel_path_background(kg, state, ray, throughput, &sd, L); break; } - else if(state.bounce > kernel_data.integrator.ao_bounces) { + else if(path_state_ao_bounce(kg, state)) { break; } - /* setup shading */ - shader_setup_from_ray(kg, &sd, &isect, &ray); - float rbsdf = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_BSDF); - shader_eval_surface(kg, &sd, rng, &state, rbsdf, state.flag, SHADER_CONTEXT_MAIN); - - /* holdout */ -#ifdef __HOLDOUT__ - if(((sd.flag & SD_HOLDOUT) || - (sd.object_flag & SD_OBJECT_HOLDOUT_MASK)) && - (state.flag & PATH_RAY_CAMERA)) - { - if(kernel_data.background.transparent) { - float3 holdout_weight; - if(sd.object_flag & SD_OBJECT_HOLDOUT_MASK) { - holdout_weight = make_float3(1.0f, 1.0f, 1.0f); - } - else { - holdout_weight = shader_holdout_eval(kg, &sd); - } - /* any throughput is ok, should all be identical here */ - L_transparent += average(holdout_weight*throughput); - } + /* Setup shader data. */ + shader_setup_from_ray(kg, &sd, &isect, ray); - if(sd.object_flag & SD_OBJECT_HOLDOUT_MASK) { - break; - } - } -#endif /* __HOLDOUT__ */ - - /* holdout mask objects do not write data passes */ - kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput); - - /* blurring of bsdf after bounces, for rays that have a small likelihood - * of following this particular path (diffuse, rough glossy) */ - if(kernel_data.integrator.filter_glossy != FLT_MAX) { - float blur_pdf = kernel_data.integrator.filter_glossy*state.min_ray_pdf; - - if(blur_pdf < 1.0f) { - float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f; - shader_bsdf_blur(kg, &sd, blur_roughness); - } - } + /* Skip most work for volume bounding surface. */ +#ifdef __VOLUME__ + if(!(sd.flag & SD_HAS_ONLY_VOLUME)) { +#endif -#ifdef __EMISSION__ - /* emission */ - if(sd.flag & SD_EMISSION) { - /* todo: is isect.t wrong here for transparent surfaces? */ - float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf); - path_radiance_accum_emission(&L, throughput, emission, state.bounce); + /* Evaluate shader. */ + shader_eval_surface(kg, &sd, state, state->flag); + shader_prepare_closures(&sd, state); + + /* Apply shadow catcher, holdout, emission. */ + if(!kernel_path_shader_apply(kg, + &sd, + state, + ray, + throughput, + emission_sd, + L, + buffer)) + { + break; } -#endif /* __EMISSION__ */ /* path termination. this is a strange place to put the termination, it's * mainly due to the mixed in MIS that we use. gives too many unneeded * shader evaluations, only need emission if we are going to terminate */ - float probability = path_state_terminate_probability(kg, &state, throughput); + float probability = path_state_continuation_probability(kg, state, throughput); if(probability == 0.0f) { break; } else if(probability != 1.0f) { - float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE); + float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE); if(terminate >= probability) break; throughput /= probability; } + kernel_update_denoising_features(kg, &sd, state, L); + #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { - kernel_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd)); + kernel_path_ao(kg, &sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, &sd)); } #endif /* __AO__ */ @@ -866,11 +672,10 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, if(sd.flag & SD_BSSRDF) { if(kernel_path_subsurface_scatter(kg, &sd, - &emission_sd, - &L, - &state, - rng, - &ray, + emission_sd, + L, + state, + ray, &throughput, &ss_indirect)) { @@ -880,25 +685,27 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, #endif /* __SUBSURFACE__ */ /* direct lighting */ - kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L); + kernel_path_surface_connect_light(kg, &sd, emission_sd, throughput, state, L); + +#ifdef __VOLUME__ + } +#endif /* compute direct lighting and next bounce */ - if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, &L, &ray)) + if(!kernel_path_surface_bounce(kg, &sd, &throughput, state, &L->state, ray)) break; } #ifdef __SUBSURFACE__ - kernel_path_subsurface_accum_indirect(&ss_indirect, &L); - /* Trace indirect subsurface rays by restarting the loop. this uses less * stack memory than invoking kernel_path_indirect. */ if(ss_indirect.num_rays) { kernel_path_subsurface_setup_indirect(kg, &ss_indirect, - &state, - &ray, - &L, + state, + ray, + L, &throughput); } else { @@ -906,48 +713,53 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, } } #endif /* __SUBSURFACE__ */ - - float3 L_sum = path_radiance_clamp_and_sum(kg, &L); - - kernel_write_light_passes(kg, buffer, &L, sample); - -#ifdef __KERNEL_DEBUG__ - kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample); -#endif /* __KERNEL_DEBUG__ */ - - return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent); } ccl_device void kernel_path_trace(KernelGlobals *kg, - ccl_global float *buffer, ccl_global uint *rng_state, + ccl_global float *buffer, int sample, int x, int y, int offset, int stride) { /* buffer offset */ int index = offset + x + y*stride; int pass_stride = kernel_data.film.pass_stride; - rng_state += index; buffer += index*pass_stride; - /* initialize random numbers and ray */ - RNG rng; + /* Initialize random numbers and sample ray. */ + uint rng_hash; Ray ray; - kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray); + kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &ray); - /* integrate */ - float4 L; + if(ray.t == 0.0f) { + return; + } - if(ray.t != 0.0f) - L = kernel_path_integrate(kg, &rng, sample, ray, buffer); - else - L = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + /* Initialize state. */ + float3 throughput = make_float3(1.0f, 1.0f, 1.0f); - /* accumulate result in output buffer */ - kernel_write_pass_float4(buffer, sample, L); + PathRadiance L; + path_radiance_init(&L, kernel_data.film.use_light_pass); + + ShaderDataTinyStorage emission_sd_storage; + ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage); - path_rng_end(kg, rng_state, rng); + PathState state; + path_state_init(kg, emission_sd, &state, rng_hash, sample, &ray); + + /* Integrate. */ + kernel_path_integrate(kg, + &state, + throughput, + &ray, + &L, + buffer, + emission_sd); + + kernel_write_result(kg, buffer, sample, &L); } +#endif /* __SPLIT_KERNEL__ */ + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h index ff2b828795d..66f67c3e2c4 100644 --- a/intern/cycles/kernel/kernel_path_branched.h +++ b/intern/cycles/kernel/kernel_path_branched.h @@ -22,8 +22,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, PathRadiance *L, - PathState *state, - RNG *rng, + ccl_addr_space PathState *state, float3 throughput) { int num_samples = kernel_data.integrator.ao_samples; @@ -35,46 +34,225 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg, for(int j = 0; j < num_samples; j++) { float bsdf_u, bsdf_v; - path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + path_branched_rng_2D(kg, state->rng_hash, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); float3 ao_D; float ao_pdf; sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) { + if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { Ray light_ray; float3 ao_shadow; - light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng)); + light_ray.P = ray_offset(sd->P, sd->Ng); light_ray.D = ao_D; light_ray.t = kernel_data.background.ao_distance; -#ifdef __OBJECT_MOTION__ - light_ray.time = ccl_fetch(sd, time); -#endif /* __OBJECT_MOTION__ */ - light_ray.dP = ccl_fetch(sd, dP); + light_ray.time = sd->time; + light_ray.dP = sd->dP; light_ray.dD = differential3_zero(); - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) - path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce); + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) { + path_radiance_accum_ao(L, state, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow); + } + else { + path_radiance_accum_total_ao(L, state, throughput*num_samples_inv, ao_bsdf); + } } } } +#ifndef __SPLIT_KERNEL__ + +#ifdef __VOLUME__ +ccl_device_forceinline void kernel_branched_path_volume( + KernelGlobals *kg, + ShaderData *sd, + PathState *state, + Ray *ray, + float3 *throughput, + ccl_addr_space Intersection *isect, + bool hit, + ShaderData *indirect_sd, + ShaderData *emission_sd, + PathRadiance *L) +{ + /* Sanitize volume stack. */ + if(!hit) { + kernel_volume_clean_stack(kg, state->volume_stack); + } + + if(state->volume_stack[0].shader == SHADER_NONE) { + return; + } + + /* volume attenuation, emission, scatter */ + Ray volume_ray = *ray; + volume_ray.t = (hit)? isect->t: FLT_MAX; + + bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack); + +# ifdef __VOLUME_DECOUPLED__ + /* decoupled ray marching only supported on CPU */ + if(kernel_data.integrator.volume_decoupled) { + /* cache steps along volume for repeated sampling */ + VolumeSegment volume_segment; + + shader_setup_from_volume(kg, sd, &volume_ray); + kernel_volume_decoupled_record(kg, state, + &volume_ray, sd, &volume_segment, heterogeneous); + + /* direct light sampling */ + if(volume_segment.closure_flag & SD_SCATTER) { + volume_segment.sampling_method = volume_stack_sampling_method(kg, state->volume_stack); + + int all = kernel_data.integrator.sample_all_lights_direct; + + kernel_branched_path_volume_connect_light(kg, sd, + emission_sd, *throughput, state, L, all, + &volume_ray, &volume_segment); + + /* indirect light sampling */ + int num_samples = kernel_data.integrator.volume_samples; + float num_samples_inv = 1.0f/num_samples; + + for(int j = 0; j < num_samples; j++) { + PathState ps = *state; + Ray pray = *ray; + float3 tp = *throughput; + + /* branch RNG state */ + path_state_branch(&ps, j, num_samples); + + /* scatter sample. if we use distance sampling and take just one + * sample for direct and indirect light, we could share this + * computation, but makes code a bit complex */ + float rphase = path_state_rng_1D(kg, &ps, PRNG_PHASE_CHANNEL); + float rscatter = path_state_rng_1D(kg, &ps, PRNG_SCATTER_DISTANCE); + + VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, + &ps, &pray, sd, &tp, rphase, rscatter, &volume_segment, NULL, false); + + if(result == VOLUME_PATH_SCATTERED && + kernel_path_volume_bounce(kg, + sd, + &tp, + &ps, + &L->state, + &pray)) + { + kernel_path_indirect(kg, + indirect_sd, + emission_sd, + &pray, + tp*num_samples_inv, + &ps, + L); + + /* for render passes, sum and reset indirect light pass variables + * for the next samples */ + path_radiance_sum_indirect(L); + path_radiance_reset_indirect(L); + } + } + } + + /* emission and transmittance */ + if(volume_segment.closure_flag & SD_EMISSION) + path_radiance_accum_emission(L, state, *throughput, volume_segment.accum_emission); + *throughput *= volume_segment.accum_transmittance; + + /* free cached steps */ + kernel_volume_decoupled_free(kg, &volume_segment); + } + else +# endif /* __VOLUME_DECOUPLED__ */ + { + /* GPU: no decoupled ray marching, scatter probalistically */ + int num_samples = kernel_data.integrator.volume_samples; + float num_samples_inv = 1.0f/num_samples; + + /* todo: we should cache the shader evaluations from stepping + * through the volume, for now we redo them multiple times */ + + for(int j = 0; j < num_samples; j++) { + PathState ps = *state; + Ray pray = *ray; + float3 tp = (*throughput) * num_samples_inv; + + /* branch RNG state */ + path_state_branch(&ps, j, num_samples); + + VolumeIntegrateResult result = kernel_volume_integrate( + kg, &ps, sd, &volume_ray, L, &tp, heterogeneous); + +# ifdef __VOLUME_SCATTER__ + if(result == VOLUME_PATH_SCATTERED) { + /* todo: support equiangular, MIS and all light sampling. + * alternatively get decoupled ray marching working on the GPU */ + kernel_path_volume_connect_light(kg, sd, emission_sd, tp, state, L); + + if(kernel_path_volume_bounce(kg, + sd, + &tp, + &ps, + &L->state, + &pray)) + { + kernel_path_indirect(kg, + indirect_sd, + emission_sd, + &pray, + tp, + &ps, + L); + + /* for render passes, sum and reset indirect light pass variables + * for the next samples */ + path_radiance_sum_indirect(L); + path_radiance_reset_indirect(L); + } + } +# endif /* __VOLUME_SCATTER__ */ + } + + /* todo: avoid this calculation using decoupled ray marching */ + kernel_volume_shadow(kg, emission_sd, state, &volume_ray, throughput); + } +} +#endif /* __VOLUME__ */ /* bounce off surface and integrate indirect light */ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg, - RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd, + ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd, float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L) { - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - const ShaderClosure *sc = &ccl_fetch(sd, closure)[i]; + float sum_sample_weight = 0.0f; +#ifdef __DENOISING_FEATURES__ + if(state->denoising_feature_weight > 0.0f) { + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + /* transparency is not handled here, but in outer loop */ + if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { + continue; + } + + sum_sample_weight += sc->sample_weight; + } + } + else { + sum_sample_weight = 1.0f; + } +#endif /* __DENOISING_FEATURES__ */ + + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; - if(!CLOSURE_IS_BSDF(sc->type)) - continue; /* transparency is not handled here, but in outer loop */ - if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) + if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { continue; + } int num_samples; @@ -90,34 +268,38 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba num_samples = ceil_to_int(num_samples_adjust*num_samples); float num_samples_inv = num_samples_adjust/num_samples; - RNG bsdf_rng = cmj_hash(*rng, i); for(int j = 0; j < num_samples; j++) { PathState ps = *state; float3 tp = throughput; Ray bsdf_ray; +#ifdef __SHADOW_TRICKS__ + float shadow_transparency = L->shadow_transparency; +#endif + + ps.rng_hash = cmj_hash(state->rng_hash, i); if(!kernel_branched_path_surface_bounce(kg, - &bsdf_rng, sd, sc, j, num_samples, &tp, &ps, - L, - &bsdf_ray)) + &L->state, + &bsdf_ray, + sum_sample_weight)) { continue; } + ps.rng_hash = state->rng_hash; + kernel_path_indirect(kg, indirect_sd, emission_sd, - rng, &bsdf_ray, tp*num_samples_inv, - num_samples, &ps, L); @@ -125,6 +307,10 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba * for the next samples */ path_radiance_sum_indirect(L); path_radiance_reset_indirect(L); + +#ifdef __SHADOW_TRICKS__ + L->shadow_transparency = shadow_transparency; +#endif } } } @@ -136,40 +322,47 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, ShaderData *emission_sd, PathRadiance *L, PathState *state, - RNG *rng, Ray *ray, float3 throughput) { - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = &ccl_fetch(sd, closure)[i]; + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(!CLOSURE_IS_BSSRDF(sc->type)) continue; /* set up random number generator */ - uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb); - int num_samples = kernel_data.integrator.subsurface_samples; + uint lcg_state = lcg_state_init(state, 0x68bc21eb); + int num_samples = kernel_data.integrator.subsurface_samples * 3; float num_samples_inv = 1.0f/num_samples; - RNG bssrdf_rng = cmj_hash(*rng, i); + uint bssrdf_rng_hash = cmj_hash(state->rng_hash, i); /* do subsurface scatter step with copy of shader data, this will * replace the BSSRDF with a diffuse BSDF closure */ for(int j = 0; j < num_samples; j++) { - SubsurfaceIntersection ss_isect; + PathState hit_state = *state; + path_state_branch(&hit_state, j, num_samples); + hit_state.rng_hash = bssrdf_rng_hash; + + LocalIntersection ss_isect; float bssrdf_u, bssrdf_v; - path_branched_rng_2D(kg, &bssrdf_rng, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); + path_state_rng_2D(kg, &hit_state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); int num_hits = subsurface_scatter_multi_intersect(kg, &ss_isect, sd, + &hit_state, sc, &lcg_state, bssrdf_u, bssrdf_v, true); + + hit_state.rng_offset += PRNG_BOUNCE_NUM; + #ifdef __VOLUME__ Ray volume_ray = *ray; bool need_update_volume_stack = kernel_data.integrator.use_volumes && - ccl_fetch(sd, object_flag) & SD_OBJECT_INTERSECTS_VOLUME; + sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME; #endif /* __VOLUME__ */ /* compute lighting with the BSDF closure */ @@ -179,14 +372,8 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, &ss_isect, hit, &bssrdf_sd, - state, - state->flag, - sc, - true); - - PathState hit_state = *state; - - path_state_branch(&hit_state, j, num_samples); + &hit_state, + sc); #ifdef __VOLUME__ if(need_update_volume_stack) { @@ -195,6 +382,10 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t); + for(int k = 0; k < VOLUME_STACK_SIZE; k++) { + hit_state.volume_stack[k] = state->volume_stack[k]; + } + kernel_volume_stack_update_for_subsurface( kg, emission_sd, @@ -206,10 +397,10 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, #ifdef __EMISSION__ /* direct light */ if(kernel_data.integrator.use_direct_light) { - int all = kernel_data.integrator.sample_all_lights_direct; + int all = (kernel_data.integrator.sample_all_lights_direct) || + (hit_state.flag & PATH_RAY_SHADOW_CATCHER); kernel_branched_path_surface_connect_light( kg, - rng, &bssrdf_sd, emission_sd, &hit_state, @@ -223,7 +414,6 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, /* indirect light */ kernel_branched_path_surface_indirect_light( kg, - rng, &bssrdf_sd, indirect_sd, emission_sd, @@ -237,284 +427,93 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, } #endif /* __SUBSURFACE__ */ -ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer) +ccl_device void kernel_branched_path_integrate(KernelGlobals *kg, + uint rng_hash, + int sample, + Ray ray, + ccl_global float *buffer, + PathRadiance *L) { /* initialize */ - PathRadiance L; float3 throughput = make_float3(1.0f, 1.0f, 1.0f); - float L_transparent = 0.0f; - path_radiance_init(&L, kernel_data.film.use_light_pass); + path_radiance_init(L, kernel_data.film.use_light_pass); /* shader data memory used for both volumes and surfaces, saves stack space */ ShaderData sd; /* shader data used by emission, shadows, volume stacks, indirect path */ - ShaderData emission_sd, indirect_sd; + ShaderDataTinyStorage emission_sd_storage; + ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage); + ShaderData indirect_sd; PathState state; - path_state_init(kg, &emission_sd, &state, rng, sample, &ray); - -#ifdef __KERNEL_DEBUG__ - DebugData debug_data; - debug_data_init(&debug_data); -#endif /* __KERNEL_DEBUG__ */ + path_state_init(kg, emission_sd, &state, rng_hash, sample, &ray); /* Main Loop * Here we only handle transparency intersections from the camera ray. * Indirect bounces are handled in kernel_branched_path_surface_indirect_light(). */ for(;;) { - /* intersect scene */ + /* Find intersection with objects in scene. */ Intersection isect; - uint visibility = path_state_ray_visibility(kg, &state); - -#ifdef __HAIR__ - float difl = 0.0f, extmax = 0.0f; - uint lcg_state = 0; - - if(kernel_data.bvh.have_curves) { - if(kernel_data.cam.resolution == 1) { - float3 pixdiff = ray.dD.dx + ray.dD.dy; - /*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/ - difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f; - } - - extmax = kernel_data.curve.maximum_width; - lcg_state = lcg_state_init(rng, &state, 0x51633e2d); - } - - bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax); -#else - bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f); -#endif /* __HAIR__ */ - -#ifdef __KERNEL_DEBUG__ - debug_data.num_bvh_traversed_nodes += isect.num_traversed_nodes; - debug_data.num_bvh_traversed_instances += isect.num_traversed_instances; - debug_data.num_bvh_intersections += isect.num_intersections; - debug_data.num_ray_bounces++; -#endif /* __KERNEL_DEBUG__ */ + bool hit = kernel_path_scene_intersect(kg, &state, &ray, &isect, L); #ifdef __VOLUME__ - /* Sanitize volume stack. */ - if(!hit) { - kernel_volume_clean_stack(kg, state.volume_stack); - } - /* volume attenuation, emission, scatter */ - if(state.volume_stack[0].shader != SHADER_NONE) { - Ray volume_ray = ray; - volume_ray.t = (hit)? isect.t: FLT_MAX; - - bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack); - -#ifdef __VOLUME_DECOUPLED__ - /* decoupled ray marching only supported on CPU */ - - /* cache steps along volume for repeated sampling */ - VolumeSegment volume_segment; - - shader_setup_from_volume(kg, &sd, &volume_ray); - kernel_volume_decoupled_record(kg, &state, - &volume_ray, &sd, &volume_segment, heterogeneous); - - /* direct light sampling */ - if(volume_segment.closure_flag & SD_SCATTER) { - volume_segment.sampling_method = volume_stack_sampling_method(kg, state.volume_stack); - - int all = kernel_data.integrator.sample_all_lights_direct; - - kernel_branched_path_volume_connect_light(kg, rng, &sd, - &emission_sd, throughput, &state, &L, all, - &volume_ray, &volume_segment); - - /* indirect light sampling */ - int num_samples = kernel_data.integrator.volume_samples; - float num_samples_inv = 1.0f/num_samples; - - for(int j = 0; j < num_samples; j++) { - /* workaround to fix correlation bug in T38710, can find better solution - * in random number generator later, for now this is done here to not impact - * performance of rendering without volumes */ - RNG tmp_rng = cmj_hash(*rng, state.rng_offset); - - PathState ps = state; - Ray pray = ray; - float3 tp = throughput; - - /* branch RNG state */ - path_state_branch(&ps, j, num_samples); - - /* scatter sample. if we use distance sampling and take just one - * sample for direct and indirect light, we could share this - * computation, but makes code a bit complex */ - float rphase = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_PHASE); - float rscatter = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_SCATTER_DISTANCE); - - VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, - &ps, &pray, &sd, &tp, rphase, rscatter, &volume_segment, NULL, false); - - (void)result; - kernel_assert(result == VOLUME_PATH_SCATTERED); - - if(kernel_path_volume_bounce(kg, - rng, - &sd, - &tp, - &ps, - &L, - &pray)) - { - kernel_path_indirect(kg, - &indirect_sd, - &emission_sd, - rng, - &pray, - tp*num_samples_inv, - num_samples, - &ps, - &L); - - /* for render passes, sum and reset indirect light pass variables - * for the next samples */ - path_radiance_sum_indirect(&L); - path_radiance_reset_indirect(&L); - } - } - } - - /* emission and transmittance */ - if(volume_segment.closure_flag & SD_EMISSION) - path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce); - throughput *= volume_segment.accum_transmittance; - - /* free cached steps */ - kernel_volume_decoupled_free(kg, &volume_segment); -#else - /* GPU: no decoupled ray marching, scatter probalistically */ - int num_samples = kernel_data.integrator.volume_samples; - float num_samples_inv = 1.0f/num_samples; - - /* todo: we should cache the shader evaluations from stepping - * through the volume, for now we redo them multiple times */ - - for(int j = 0; j < num_samples; j++) { - PathState ps = state; - Ray pray = ray; - float3 tp = throughput * num_samples_inv; - - /* branch RNG state */ - path_state_branch(&ps, j, num_samples); - - VolumeIntegrateResult result = kernel_volume_integrate( - kg, &ps, &sd, &volume_ray, &L, &tp, rng, heterogeneous); - -#ifdef __VOLUME_SCATTER__ - if(result == VOLUME_PATH_SCATTERED) { - /* todo: support equiangular, MIS and all light sampling. - * alternatively get decoupled ray marching working on the GPU */ - kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, &L); - - if(kernel_path_volume_bounce(kg, - rng, - &sd, - &tp, - &ps, - &L, - &pray)) - { - kernel_path_indirect(kg, - &indirect_sd, - &emission_sd, - rng, - &pray, - tp, - num_samples, - &ps, - &L); - - /* for render passes, sum and reset indirect light pass variables - * for the next samples */ - path_radiance_sum_indirect(&L); - path_radiance_reset_indirect(&L); - } - } -#endif /* __VOLUME_SCATTER__ */ - } - - /* todo: avoid this calculation using decoupled ray marching */ - kernel_volume_shadow(kg, &emission_sd, &state, &volume_ray, &throughput); -#endif /* __VOLUME_DECOUPLED__ */ - } + /* Volume integration. */ + kernel_branched_path_volume(kg, + &sd, + &state, + &ray, + &throughput, + &isect, + hit, + &indirect_sd, + emission_sd, + L); #endif /* __VOLUME__ */ + /* Shade background. */ if(!hit) { - /* eval background shader if nothing hit */ - if(kernel_data.background.transparent) { - L_transparent += average(throughput); - -#ifdef __PASSES__ - if(!(kernel_data.film.pass_flag & PASS_BACKGROUND)) -#endif /* __PASSES__ */ - break; - } - -#ifdef __BACKGROUND__ - /* sample background shader */ - float3 L_background = indirect_background(kg, &emission_sd, &state, &ray); - path_radiance_accum_background(&L, throughput, L_background, state.bounce); -#endif /* __BACKGROUND__ */ - + kernel_path_background(kg, &state, &ray, throughput, &sd, L); break; } - /* setup shading */ + /* Setup and evaluate shader. */ shader_setup_from_ray(kg, &sd, &isect, &ray); - shader_eval_surface(kg, &sd, rng, &state, 0.0f, state.flag, SHADER_CONTEXT_MAIN); - shader_merge_closures(&sd); - /* holdout */ -#ifdef __HOLDOUT__ - if((sd.flag & SD_HOLDOUT) || (sd.object_flag & SD_OBJECT_HOLDOUT_MASK)) { - if(kernel_data.background.transparent) { - float3 holdout_weight; - if(sd.object_flag & SD_OBJECT_HOLDOUT_MASK) { - holdout_weight = make_float3(1.0f, 1.0f, 1.0f); - } - else { - holdout_weight = shader_holdout_eval(kg, &sd); - } - /* any throughput is ok, should all be identical here */ - L_transparent += average(holdout_weight*throughput); - } - if(sd.object_flag & SD_OBJECT_HOLDOUT_MASK) { - break; - } - } -#endif /* __HOLDOUT__ */ + /* Skip most work for volume bounding surface. */ +#ifdef __VOLUME__ + if(!(sd.flag & SD_HAS_ONLY_VOLUME)) { +#endif - /* holdout mask objects do not write data passes */ - kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput); + shader_eval_surface(kg, &sd, &state, state.flag); + shader_merge_closures(&sd); -#ifdef __EMISSION__ - /* emission */ - if(sd.flag & SD_EMISSION) { - float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf); - path_radiance_accum_emission(&L, throughput, emission, state.bounce); + /* Apply shadow catcher, holdout, emission. */ + if(!kernel_path_shader_apply(kg, + &sd, + &state, + &ray, + throughput, + emission_sd, + L, + buffer)) + { + break; } -#endif /* __EMISSION__ */ /* transparency termination */ if(state.flag & PATH_RAY_TRANSPARENT) { /* path termination. this is a strange place to put the termination, it's * mainly due to the mixed in MIS that we use. gives too many unneeded * shader evaluations, only need emission if we are going to terminate */ - float probability = path_state_terminate_probability(kg, &state, throughput); + float probability = path_state_continuation_probability(kg, &state, throughput); if(probability == 0.0f) { break; } else if(probability != 1.0f) { - float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE); + float terminate = path_state_rng_1D(kg, &state, PRNG_TERMINATE); if(terminate >= probability) break; @@ -523,52 +522,60 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in } } + kernel_update_denoising_features(kg, &sd, &state, L); + #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { - kernel_branched_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput); + kernel_branched_path_ao(kg, &sd, emission_sd, L, &state, throughput); } #endif /* __AO__ */ #ifdef __SUBSURFACE__ /* bssrdf scatter to a different location on the same object */ if(sd.flag & SD_BSSRDF) { - kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd, - &L, &state, rng, &ray, throughput); + kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, emission_sd, + L, &state, &ray, throughput); } #endif /* __SUBSURFACE__ */ - if(!(sd.flag & SD_HAS_ONLY_VOLUME)) { - PathState hit_state = state; + PathState hit_state = state; #ifdef __EMISSION__ - /* direct light */ - if(kernel_data.integrator.use_direct_light) { - int all = kernel_data.integrator.sample_all_lights_direct; - kernel_branched_path_surface_connect_light(kg, rng, - &sd, &emission_sd, &hit_state, throughput, 1.0f, &L, all); - } + /* direct light */ + if(kernel_data.integrator.use_direct_light) { + int all = (kernel_data.integrator.sample_all_lights_direct) || + (state.flag & PATH_RAY_SHADOW_CATCHER); + kernel_branched_path_surface_connect_light(kg, + &sd, emission_sd, &hit_state, throughput, 1.0f, L, all); + } #endif /* __EMISSION__ */ - /* indirect light */ - kernel_branched_path_surface_indirect_light(kg, rng, - &sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, &L); + /* indirect light */ + kernel_branched_path_surface_indirect_light(kg, + &sd, &indirect_sd, emission_sd, throughput, 1.0f, &hit_state, L); - /* continue in case of transparency */ - throughput *= shader_bsdf_transparency(kg, &sd); + /* continue in case of transparency */ + throughput *= shader_bsdf_transparency(kg, &sd); - if(is_zero(throughput)) - break; - } + if(is_zero(throughput)) + break; /* Update Path State */ - state.flag |= PATH_RAY_TRANSPARENT; - state.transparent_bounce++; + path_state_next(kg, &state, LABEL_TRANSPARENT); + +#ifdef __VOLUME__ + } + else { + if(!path_state_volume_next(kg, &state)) { + break; + } + } +#endif ray.P = ray_offset(sd.P, -sd.Ng); ray.t -= sd.ray_length; /* clipping works through transparent */ - #ifdef __RAY_DIFFERENTIALS__ ray.dP = sd.dP; ray.dD.dx = -sd.dI.dx; @@ -580,50 +587,35 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack); #endif /* __VOLUME__ */ } - - float3 L_sum = path_radiance_clamp_and_sum(kg, &L); - - kernel_write_light_passes(kg, buffer, &L, sample); - -#ifdef __KERNEL_DEBUG__ - kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample); -#endif /* __KERNEL_DEBUG__ */ - - return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent); } ccl_device void kernel_branched_path_trace(KernelGlobals *kg, - ccl_global float *buffer, ccl_global uint *rng_state, + ccl_global float *buffer, int sample, int x, int y, int offset, int stride) { /* buffer offset */ int index = offset + x + y*stride; int pass_stride = kernel_data.film.pass_stride; - rng_state += index; buffer += index*pass_stride; /* initialize random numbers and ray */ - RNG rng; + uint rng_hash; Ray ray; - kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray); + kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &ray); /* integrate */ - float4 L; - - if(ray.t != 0.0f) - L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer); - else - L = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - - /* accumulate result in output buffer */ - kernel_write_pass_float4(buffer, sample, L); + PathRadiance L; - path_rng_end(kg, rng_state, rng); + if(ray.t != 0.0f) { + kernel_branched_path_integrate(kg, rng_hash, sample, ray, buffer, &L); + kernel_write_result(kg, buffer, sample, &L); + } } +#endif /* __SPLIT_KERNEL__ */ + #endif /* __BRANCHED_PATH__ */ CCL_NAMESPACE_END - diff --git a/intern/cycles/kernel/kernel_path_common.h b/intern/cycles/kernel/kernel_path_common.h index 7b903556bf9..d83fd474cde 100644 --- a/intern/cycles/kernel/kernel_path_common.h +++ b/intern/cycles/kernel/kernel_path_common.h @@ -14,15 +14,14 @@ * limitations under the License. */ -#include "util_hash.h" +#include "util/util_hash.h" CCL_NAMESPACE_BEGIN ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg, - ccl_global uint *rng_state, int sample, int x, int y, - ccl_addr_space RNG *rng, + uint *rng_hash, ccl_addr_space Ray *ray) { float filter_u; @@ -30,24 +29,20 @@ ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg, int num_samples = kernel_data.integrator.aa_samples; - if(sample == kernel_data.integrator.start_sample) { - *rng_state = hash_int_2d(x, y); - } - - path_rng_init(kg, rng_state, sample, num_samples, rng, x, y, &filter_u, &filter_v); + path_rng_init(kg, sample, num_samples, rng_hash, x, y, &filter_u, &filter_v); /* sample camera ray */ float lens_u = 0.0f, lens_v = 0.0f; if(kernel_data.cam.aperturesize > 0.0f) - path_rng_2D(kg, rng, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v); + path_rng_2D(kg, *rng_hash, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v); float time = 0.0f; #ifdef __CAMERA_MOTION__ if(kernel_data.cam.shuttertime != -1.0f) - time = path_rng_1D(kg, rng, sample, num_samples, PRNG_TIME); + time = path_rng_1D(kg, *rng_hash, sample, num_samples, PRNG_TIME); #endif camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray); diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h index 661dc52fb31..8a358e51f94 100644 --- a/intern/cycles/kernel/kernel_path_state.h +++ b/intern/cycles/kernel/kernel_path_state.h @@ -19,15 +19,17 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void path_state_init(KernelGlobals *kg, ShaderData *stack_sd, ccl_addr_space PathState *state, - ccl_addr_space RNG *rng, + uint rng_hash, int sample, ccl_addr_space Ray *ray) { - state->flag = PATH_RAY_CAMERA|PATH_RAY_MIS_SKIP; + state->flag = PATH_RAY_CAMERA|PATH_RAY_MIS_SKIP|PATH_RAY_TRANSPARENT_BACKGROUND; + state->rng_hash = rng_hash; state->rng_offset = PRNG_BASE_NUM; state->sample = sample; state->num_samples = kernel_data.integrator.aa_samples; + state->branch_factor = 1.0f; state->bounce = 0; state->diffuse_bounce = 0; @@ -35,6 +37,16 @@ ccl_device_inline void path_state_init(KernelGlobals *kg, state->transmission_bounce = 0; state->transparent_bounce = 0; +#ifdef __DENOISING_FEATURES__ + if(kernel_data.film.pass_denoising_data) { + state->flag |= PATH_RAY_STORE_SHADOW_INFO; + state->denoising_feature_weight = 1.0f; + } + else { + state->denoising_feature_weight = 0.0f; + } +#endif /* __DENOISING_FEATURES__ */ + state->min_ray_pdf = FLT_MAX; state->ray_pdf = 0.0f; #ifdef __LAMP_MIS__ @@ -43,12 +55,11 @@ ccl_device_inline void path_state_init(KernelGlobals *kg, #ifdef __VOLUME__ state->volume_bounce = 0; + state->volume_bounds_bounce = 0; if(kernel_data.integrator.use_volumes) { /* Initialize volume stack with volume we are inside of. */ kernel_volume_stack_init(kg, stack_sd, state, ray, state->volume_stack); - /* Seed RNG for cases where we can't use stratified samples .*/ - state->rng_congruential = lcg_init(*rng + sample*0x51633e2d); } else { state->volume_stack[0].shader = SHADER_NONE; @@ -63,25 +74,36 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta if(label & LABEL_TRANSPARENT) { state->flag |= PATH_RAY_TRANSPARENT; state->transparent_bounce++; - - /* don't increase random number generator offset here, to avoid some - * unwanted patterns, see path_state_rng_1D_for_decision */ + if(state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce) { + state->flag |= PATH_RAY_TERMINATE_IMMEDIATE; + } if(!kernel_data.integrator.transparent_shadows) state->flag |= PATH_RAY_MIS_SKIP; + /* random number generator next bounce */ + state->rng_offset += PRNG_BOUNCE_NUM; + return; } state->bounce++; + if(state->bounce >= kernel_data.integrator.max_bounce) { + state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; + } + + state->flag &= ~(PATH_RAY_ALL_VISIBILITY|PATH_RAY_MIS_SKIP); #ifdef __VOLUME__ if(label & LABEL_VOLUME_SCATTER) { /* volume scatter */ state->flag |= PATH_RAY_VOLUME_SCATTER; - state->flag &= ~(PATH_RAY_REFLECT|PATH_RAY_TRANSMIT|PATH_RAY_CAMERA|PATH_RAY_TRANSPARENT|PATH_RAY_DIFFUSE|PATH_RAY_GLOSSY|PATH_RAY_SINGULAR|PATH_RAY_MIS_SKIP); + state->flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND; state->volume_bounce++; + if(state->volume_bounce >= kernel_data.integrator.max_volume_bounce) { + state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; + } } else #endif @@ -89,44 +111,79 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta /* surface reflection/transmission */ if(label & LABEL_REFLECT) { state->flag |= PATH_RAY_REFLECT; - state->flag &= ~(PATH_RAY_TRANSMIT|PATH_RAY_VOLUME_SCATTER|PATH_RAY_CAMERA|PATH_RAY_TRANSPARENT); + state->flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND; - if(label & LABEL_DIFFUSE) + if(label & LABEL_DIFFUSE) { state->diffuse_bounce++; - else + if(state->diffuse_bounce >= kernel_data.integrator.max_diffuse_bounce) { + state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; + } + } + else { state->glossy_bounce++; + if(state->glossy_bounce >= kernel_data.integrator.max_glossy_bounce) { + state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; + } + } } else { kernel_assert(label & LABEL_TRANSMIT); state->flag |= PATH_RAY_TRANSMIT; - state->flag &= ~(PATH_RAY_REFLECT|PATH_RAY_VOLUME_SCATTER|PATH_RAY_CAMERA|PATH_RAY_TRANSPARENT); + + if(!(label & LABEL_TRANSMIT_TRANSPARENT)) { + state->flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND; + } state->transmission_bounce++; + if(state->transmission_bounce >= kernel_data.integrator.max_transmission_bounce) { + state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT; + } } /* diffuse/glossy/singular */ if(label & LABEL_DIFFUSE) { state->flag |= PATH_RAY_DIFFUSE|PATH_RAY_DIFFUSE_ANCESTOR; - state->flag &= ~(PATH_RAY_GLOSSY|PATH_RAY_SINGULAR|PATH_RAY_MIS_SKIP); } else if(label & LABEL_GLOSSY) { state->flag |= PATH_RAY_GLOSSY; - state->flag &= ~(PATH_RAY_DIFFUSE|PATH_RAY_SINGULAR|PATH_RAY_MIS_SKIP); } else { kernel_assert(label & LABEL_SINGULAR); - state->flag |= PATH_RAY_GLOSSY|PATH_RAY_SINGULAR|PATH_RAY_MIS_SKIP; - state->flag &= ~PATH_RAY_DIFFUSE; } } /* random number generator next bounce */ state->rng_offset += PRNG_BOUNCE_NUM; + +#ifdef __DENOISING_FEATURES__ + if((state->denoising_feature_weight == 0.0f) && !(state->flag & PATH_RAY_SHADOW_CATCHER)) { + state->flag &= ~PATH_RAY_STORE_SHADOW_INFO; + } +#endif } -ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *state) +#ifdef __VOLUME__ +ccl_device_inline bool path_state_volume_next(KernelGlobals *kg, ccl_addr_space PathState *state) +{ + /* For volume bounding meshes we pass through without counting transparent + * bounces, only sanity check in case self intersection gets us stuck. */ + state->volume_bounds_bounce++; + if (state->volume_bounds_bounce > VOLUME_BOUNDS_MAX) { + return false; + } + + /* Random number generator next bounce. */ + if(state->volume_bounds_bounce > 1) { + state->rng_offset += PRNG_BOUNCE_NUM; + } + + return true; +} +#endif + +ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, ccl_addr_space PathState *state) { uint flag = state->flag & PATH_RAY_ALL_VISIBILITY; @@ -140,34 +197,42 @@ ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *s return flag; } -ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, ccl_addr_space PathState *state, const float3 throughput) +ccl_device_inline float path_state_continuation_probability(KernelGlobals *kg, + ccl_addr_space PathState *state, + const float3 throughput) { - if(state->flag & PATH_RAY_TRANSPARENT) { - /* transparent rays treated separately */ - if(state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce) - return 0.0f; - else if(state->transparent_bounce <= kernel_data.integrator.transparent_min_bounce) + if(state->flag & PATH_RAY_TERMINATE_IMMEDIATE) { + /* Ray is to be terminated immediately. */ + return 0.0f; + } + else if(state->flag & PATH_RAY_TRANSPARENT) { + /* Do at least one bounce without RR. */ + if(state->transparent_bounce <= 1) { + return 1.0f; + } +#ifdef __SHADOW_TRICKS__ + /* Exception for shadow catcher not working correctly with RR. */ + else if((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->transparent_bounce <= 8)) { return 1.0f; + } +#endif } else { - /* other rays */ - if((state->bounce >= kernel_data.integrator.max_bounce) || - (state->diffuse_bounce >= kernel_data.integrator.max_diffuse_bounce) || - (state->glossy_bounce >= kernel_data.integrator.max_glossy_bounce) || -#ifdef __VOLUME__ - (state->volume_bounce >= kernel_data.integrator.max_volume_bounce) || -#endif - (state->transmission_bounce >= kernel_data.integrator.max_transmission_bounce)) - { - return 0.0f; + /* Do at least one bounce without RR. */ + if(state->bounce <= 1) { + return 1.0f; } - else if(state->bounce <= kernel_data.integrator.min_bounce) { +#ifdef __SHADOW_TRICKS__ + /* Exception for shadow catcher not working correctly with RR. */ + else if((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->bounce <= 3)) { return 1.0f; } +#endif } - /* probalistic termination */ - return average(throughput); /* todo: try using max here */ + /* Probabilistic termination: use sqrt() to roughly match typical view + * transform and do path termination a bit later on average. */ + return min(sqrtf(max3(fabs(throughput)) * state->branch_factor), 1.0f); } /* TODO(DingTo): Find more meaningful name for this */ @@ -180,5 +245,28 @@ ccl_device_inline void path_state_modify_bounce(ccl_addr_space PathState *state, state->bounce -= 1; } +ccl_device_inline bool path_state_ao_bounce(KernelGlobals *kg, ccl_addr_space PathState *state) +{ + if(state->bounce <= kernel_data.integrator.ao_bounces) { + return false; + } + + int bounce = state->bounce - state->transmission_bounce - (state->glossy_bounce > 0); + return (bounce > kernel_data.integrator.ao_bounces); +} + +ccl_device_inline void path_state_branch(ccl_addr_space PathState *state, + int branch, + int num_branches) +{ + if(num_branches > 1) { + /* Path is splitting into a branch, adjust so that each branch + * still gets a unique sample from the same sequence. */ + state->sample = state->sample*num_branches + branch; + state->num_samples = state->num_samples*num_branches; + state->branch_factor *= num_branches; + } +} + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path_subsurface.h b/intern/cycles/kernel/kernel_path_subsurface.h new file mode 100644 index 00000000000..71aea9e3b27 --- /dev/null +++ b/intern/cycles/kernel/kernel_path_subsurface.h @@ -0,0 +1,155 @@ +/* + * Copyright 2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +#ifdef __SUBSURFACE__ +# ifndef __KERNEL_CUDA__ +ccl_device +# else +ccl_device_inline +# endif +bool kernel_path_subsurface_scatter( + KernelGlobals *kg, + ShaderData *sd, + ShaderData *emission_sd, + PathRadiance *L, + ccl_addr_space PathState *state, + ccl_addr_space Ray *ray, + ccl_addr_space float3 *throughput, + ccl_addr_space SubsurfaceIndirectRays *ss_indirect) +{ + float bssrdf_u, bssrdf_v; + path_state_rng_2D(kg, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); + + const ShaderClosure *sc = shader_bssrdf_pick(sd, throughput, &bssrdf_u); + + /* do bssrdf scatter step if we picked a bssrdf closure */ + if(sc) { + /* We should never have two consecutive BSSRDF bounces, + * the second one should be converted to a diffuse BSDF to + * avoid this. + */ + kernel_assert(!(state->flag & PATH_RAY_DIFFUSE_ANCESTOR)); + + uint lcg_state = lcg_state_init_addrspace(state, 0x68bc21eb); + + LocalIntersection ss_isect; + int num_hits = subsurface_scatter_multi_intersect(kg, + &ss_isect, + sd, + state, + sc, + &lcg_state, + bssrdf_u, bssrdf_v, + false); +# ifdef __VOLUME__ + bool need_update_volume_stack = + kernel_data.integrator.use_volumes && + sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME; +# endif /* __VOLUME__ */ + + /* compute lighting with the BSDF closure */ + for(int hit = 0; hit < num_hits; hit++) { + /* NOTE: We reuse the existing ShaderData, we assume the path + * integration loop stops when this function returns true. + */ + subsurface_scatter_multi_setup(kg, + &ss_isect, + hit, + sd, + state, + sc); + + kernel_path_surface_connect_light(kg, sd, emission_sd, *throughput, state, L); + + ccl_addr_space PathState *hit_state = &ss_indirect->state[ss_indirect->num_rays]; + ccl_addr_space Ray *hit_ray = &ss_indirect->rays[ss_indirect->num_rays]; + ccl_addr_space float3 *hit_tp = &ss_indirect->throughputs[ss_indirect->num_rays]; + PathRadianceState *hit_L_state = &ss_indirect->L_state[ss_indirect->num_rays]; + + *hit_state = *state; + *hit_ray = *ray; + *hit_tp = *throughput; + *hit_L_state = L->state; + + hit_state->rng_offset += PRNG_BOUNCE_NUM; + + if(kernel_path_surface_bounce(kg, + sd, + hit_tp, + hit_state, + hit_L_state, + hit_ray)) + { +# ifdef __LAMP_MIS__ + hit_state->ray_t = 0.0f; +# endif /* __LAMP_MIS__ */ + +# ifdef __VOLUME__ + if(need_update_volume_stack) { + Ray volume_ray = *ray; + /* Setup ray from previous surface point to the new one. */ + volume_ray.D = normalize_len(hit_ray->P - volume_ray.P, + &volume_ray.t); + + kernel_volume_stack_update_for_subsurface( + kg, + emission_sd, + &volume_ray, + hit_state->volume_stack); + } +# endif /* __VOLUME__ */ + ss_indirect->num_rays++; + } + } + return true; + } + return false; +} + +ccl_device_inline void kernel_path_subsurface_init_indirect( + ccl_addr_space SubsurfaceIndirectRays *ss_indirect) +{ + ss_indirect->num_rays = 0; +} + +ccl_device void kernel_path_subsurface_setup_indirect( + KernelGlobals *kg, + ccl_addr_space SubsurfaceIndirectRays *ss_indirect, + ccl_addr_space PathState *state, + ccl_addr_space Ray *ray, + PathRadiance *L, + ccl_addr_space float3 *throughput) +{ + /* Setup state, ray and throughput for indirect SSS rays. */ + ss_indirect->num_rays--; + + path_radiance_sum_indirect(L); + path_radiance_reset_indirect(L); + + *state = ss_indirect->state[ss_indirect->num_rays]; + *ray = ss_indirect->rays[ss_indirect->num_rays]; + L->state = ss_indirect->L_state[ss_indirect->num_rays]; + *throughput = ss_indirect->throughputs[ss_indirect->num_rays]; + + state->rng_offset += ss_indirect->num_rays * PRNG_BOUNCE_NUM; +} + +#endif /* __SUBSURFACE__ */ + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h index fea503d06e5..27be90d5059 100644 --- a/intern/cycles/kernel/kernel_path_surface.h +++ b/intern/cycles/kernel/kernel_path_surface.h @@ -16,16 +16,21 @@ CCL_NAMESPACE_BEGIN -#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) - +#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) || defined(__SHADOW_TRICKS__) || defined(__BAKING__) /* branched path tracing: connect path directly to position on one or more lights and add it to L */ -ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RNG *rng, - ShaderData *sd, ShaderData *emission_sd, PathState *state, float3 throughput, - float num_samples_adjust, PathRadiance *L, int sample_all_lights) +ccl_device_noinline void kernel_branched_path_surface_connect_light( + KernelGlobals *kg, + ShaderData *sd, + ShaderData *emission_sd, + ccl_addr_space PathState *state, + float3 throughput, + float num_samples_adjust, + PathRadiance *L, + int sample_all_lights) { #ifdef __EMISSION__ /* sample illumination from lights to find path contribution */ - if(!(ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL)) + if(!(sd->flag & SD_BSDF_HAS_EVAL)) return; Ray light_ray; @@ -33,7 +38,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal bool is_lamp; # ifdef __OBJECT_MOTION__ - light_ray.time = ccl_fetch(sd, time); + light_ray.time = sd->time; # endif if(sample_all_lights) { @@ -44,15 +49,15 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i)); float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights); - RNG lamp_rng = cmj_hash(*rng, i); + uint lamp_rng_hash = cmj_hash(state->rng_hash, i); for(int j = 0; j < num_samples; j++) { float light_u, light_v; - path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); - float terminate = path_branched_rng_light_termination(kg, &lamp_rng, state, j, num_samples); + path_branched_rng_2D(kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); + float terminate = path_branched_rng_light_termination(kg, lamp_rng_hash, state, j, num_samples); LightSample ls; - if(lamp_light_sample(kg, i, light_u, light_v, ccl_fetch(sd, P), &ls)) { + if(lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls)) { /* The sampling probability returned by lamp_light_sample assumes that all lights were sampled. * However, this code only samples lamps, so if the scene also had mesh lights, the real probability is twice as high. */ if(kernel_data.integrator.pdf_triangles != 0.0f) @@ -62,9 +67,12 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); + } + else { + path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light); } } } @@ -77,17 +85,16 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal float num_samples_inv = num_samples_adjust/num_samples; for(int j = 0; j < num_samples; j++) { - float light_t = path_branched_rng_1D(kg, rng, state, j, num_samples, PRNG_LIGHT); float light_u, light_v; - path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); - float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples); + path_branched_rng_2D(kg, state->rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); + float terminate = path_branched_rng_light_termination(kg, state->rng_hash, state, j, num_samples); /* only sample triangle lights */ if(kernel_data.integrator.num_all_lights) - light_t = 0.5f*light_t; + light_u = 0.5f*light_u; LightSample ls; - if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) { + if(light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { /* Same as above, probability needs to be corrected since the sampling was forced to select a mesh light. */ if(kernel_data.integrator.num_all_lights) ls.pdf *= 2.0f; @@ -96,9 +103,12 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); + } + else { + path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light); } } } @@ -107,21 +117,23 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal } else { /* sample one light at random */ - float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); float light_u, light_v; - path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); - float terminate = path_state_rng_light_termination(kg, rng, state); + path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v); + float terminate = path_state_rng_light_termination(kg, state); LightSample ls; - if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) { + if(light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { /* sample random light */ if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, is_lamp); + } + else { + path_radiance_accum_total_light(L, state, throughput*num_samples_adjust, &L_light); } } } @@ -130,9 +142,17 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal } /* branched path tracing: bounce off or through surface to with new direction stored in ray */ -ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng, - ShaderData *sd, const ShaderClosure *sc, int sample, int num_samples, - float3 *throughput, PathState *state, PathRadiance *L, Ray *ray) +ccl_device bool kernel_branched_path_surface_bounce( + KernelGlobals *kg, + ShaderData *sd, + const ShaderClosure *sc, + int sample, + int num_samples, + ccl_addr_space float3 *throughput, + ccl_addr_space PathState *state, + PathRadianceState *L_state, + ccl_addr_space Ray *ray, + float sum_sample_weight) { /* sample BSDF */ float bsdf_pdf; @@ -140,7 +160,7 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng, float3 bsdf_omega_in; differential3 bsdf_domega_in; float bsdf_u, bsdf_v; - path_branched_rng_2D(kg, rng, state, sample, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + path_branched_rng_2D(kg, state->rng_hash, state, sample, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); int label; label = shader_bsdf_sample_closure(kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval, @@ -150,21 +170,25 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng, return false; /* modify throughput */ - path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label); + path_radiance_bsdf_bounce(kg, L_state, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label); + +#ifdef __DENOISING_FEATURES__ + state->denoising_feature_weight *= sc->sample_weight / (sum_sample_weight * num_samples); +#endif /* modify path state */ path_state_next(kg, state, label); /* setup ray */ - ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng)); + ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng); ray->D = normalize(bsdf_omega_in); ray->t = FLT_MAX; #ifdef __RAY_DIFFERENTIALS__ - ray->dP = ccl_fetch(sd, dP); + ray->dP = sd->dP; ray->dD = bsdf_domega_in; #endif #ifdef __OBJECT_MOTION__ - ray->time = ccl_fetch(sd, time); + ray->time = sd->time; #endif #ifdef __VOLUME__ @@ -188,64 +212,77 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng, #endif -#ifndef __SPLIT_KERNEL__ /* path tracing: connect path directly to position on a light and add it to L */ -ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_addr_space RNG *rng, +ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ShaderData *sd, ShaderData *emission_sd, float3 throughput, ccl_addr_space PathState *state, PathRadiance *L) { #ifdef __EMISSION__ - if(!(kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))) + if(!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL))) return; +#ifdef __SHADOW_TRICKS__ + if(state->flag & PATH_RAY_SHADOW_CATCHER) { + kernel_branched_path_surface_connect_light(kg, + sd, + emission_sd, + state, + throughput, + 1.0f, + L, + 1); + return; + } +#endif + /* sample illumination from lights to find path contribution */ - float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); float light_u, light_v; - path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); + path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v); Ray light_ray; BsdfEval L_light; bool is_lamp; #ifdef __OBJECT_MOTION__ - light_ray.time = ccl_fetch(sd, time); + light_ray.time = sd->time; #endif LightSample ls; - if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) { - float terminate = path_state_rng_light_termination(kg, rng, state); + if(light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { + float terminate = path_state_rng_light_termination(kg, state); if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp); + } + else { + path_radiance_accum_total_light(L, state, throughput, &L_light); } } } #endif } -#endif /* path tracing: bounce off or through surface to with new direction stored in ray */ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, - ccl_addr_space RNG *rng, ShaderData *sd, ccl_addr_space float3 *throughput, ccl_addr_space PathState *state, - PathRadiance *L, + PathRadianceState *L_state, ccl_addr_space Ray *ray) { /* no BSDF? we can stop here */ - if(ccl_fetch(sd, flag) & SD_BSDF) { + if(sd->flag & SD_BSDF) { /* sample BSDF */ float bsdf_pdf; BsdfEval bsdf_eval; float3 bsdf_omega_in; differential3 bsdf_domega_in; float bsdf_u, bsdf_v; - path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); int label; label = shader_bsdf_sample(kg, sd, bsdf_u, bsdf_v, &bsdf_eval, @@ -255,7 +292,7 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, return false; /* modify throughput */ - path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label); + path_radiance_bsdf_bounce(kg, L_state, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label); /* set labels */ if(!(label & LABEL_TRANSPARENT)) { @@ -270,16 +307,16 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, path_state_next(kg, state, label); /* setup ray */ - ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng)); + ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng); ray->D = normalize(bsdf_omega_in); if(state->bounce == 0) - ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */ + ray->t -= sd->ray_length; /* clipping works through transparent */ else ray->t = FLT_MAX; #ifdef __RAY_DIFFERENTIALS__ - ray->dP = ccl_fetch(sd, dP); + ray->dP = sd->dP; ray->dD = bsdf_domega_in; #endif @@ -291,21 +328,20 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, return true; } #ifdef __VOLUME__ - else if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME) { - /* no surface shader but have a volume shader? act transparent */ - - /* update path state, count as transparent */ - path_state_next(kg, state, LABEL_TRANSPARENT); + else if(sd->flag & SD_HAS_ONLY_VOLUME) { + if(!path_state_volume_next(kg, state)) { + return false; + } if(state->bounce == 0) - ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */ + ray->t -= sd->ray_length; /* clipping works through transparent */ else ray->t = FLT_MAX; /* setup ray position, direction stays unchanged */ - ray->P = ray_offset(ccl_fetch(sd, P), -ccl_fetch(sd, Ng)); + ray->P = ray_offset(sd->P, -sd->Ng); #ifdef __RAY_DIFFERENTIALS__ - ray->dP = ccl_fetch(sd, dP); + ray->dP = sd->dP; #endif /* enter/exit volume */ diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h index 3d3b7385d8b..6275d0d6562 100644 --- a/intern/cycles/kernel/kernel_path_volume.h +++ b/intern/cycles/kernel/kernel_path_volume.h @@ -20,11 +20,10 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void kernel_path_volume_connect_light( KernelGlobals *kg, - RNG *rng, ShaderData *sd, ShaderData *emission_sd, float3 throughput, - PathState *state, + ccl_addr_space PathState *state, PathRadiance *L) { #ifdef __EMISSION__ @@ -32,9 +31,8 @@ ccl_device_inline void kernel_path_volume_connect_light( return; /* sample illumination from lights to find path contribution */ - float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); float light_u, light_v; - path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); + path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v); Ray light_ray; BsdfEval L_light; @@ -42,24 +40,22 @@ ccl_device_inline void kernel_path_volume_connect_light( bool is_lamp; /* connect to light from given point where shader has been evaluated */ -# ifdef __OBJECT_MOTION__ light_ray.time = sd->time; -# endif - if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) + if(light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { - float terminate = path_state_rng_light_termination(kg, rng, state); + float terminate = path_state_rng_light_termination(kg, state); if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp); } } } -#endif +#endif /* __EMISSION__ */ } #ifdef __KERNEL_GPU__ @@ -67,8 +63,13 @@ ccl_device_noinline #else ccl_device #endif -bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng, - ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray) +bool kernel_path_volume_bounce( + KernelGlobals *kg, + ShaderData *sd, + ccl_addr_space float3 *throughput, + ccl_addr_space PathState *state, + PathRadianceState *L_state, + ccl_addr_space Ray *ray) { /* sample phase function */ float phase_pdf; @@ -76,7 +77,7 @@ bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng, float3 phase_omega_in; differential3 phase_domega_in; float phase_u, phase_v; - path_state_rng_2D(kg, rng, state, PRNG_PHASE_U, &phase_u, &phase_v); + path_state_rng_2D(kg, state, PRNG_BSDF_U, &phase_u, &phase_v); int label; label = shader_volume_phase_sample(kg, sd, phase_u, phase_v, &phase_eval, @@ -86,7 +87,7 @@ bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng, return false; /* modify throughput */ - path_radiance_bsdf_bounce(L, throughput, &phase_eval, phase_pdf, state->bounce, label); + path_radiance_bsdf_bounce(kg, L_state, throughput, &phase_eval, phase_pdf, state->bounce, label); /* set labels */ state->ray_pdf = phase_pdf; @@ -98,6 +99,23 @@ bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng, /* update path state */ path_state_next(kg, state, label); + /* Russian roulette termination of volume ray scattering. */ + float probability = path_state_continuation_probability(kg, state, *throughput); + + if(probability == 0.0f) { + return false; + } + else if(probability != 1.0f) { + /* Use dimension from the previous bounce, has not been used yet. */ + float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE - PRNG_BOUNCE_NUM); + + if(terminate >= probability) { + return false; + } + + *throughput /= probability; + } + /* setup ray */ ray->P = sd->P; ray->D = phase_omega_in; @@ -111,9 +129,17 @@ bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng, return true; } -ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG *rng, - ShaderData *sd, ShaderData *emission_sd, float3 throughput, PathState *state, PathRadiance *L, - bool sample_all_lights, Ray *ray, const VolumeSegment *segment) +#ifndef __SPLIT_KERNEL__ +ccl_device void kernel_branched_path_volume_connect_light( + KernelGlobals *kg, + ShaderData *sd, + ShaderData *emission_sd, + float3 throughput, + ccl_addr_space PathState *state, + PathRadiance *L, + bool sample_all_lights, + Ray *ray, + const VolumeSegment *segment) { #ifdef __EMISSION__ if(!kernel_data.integrator.use_direct_light) @@ -123,9 +149,7 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG BsdfEval L_light; bool is_lamp; -# ifdef __OBJECT_MOTION__ light_ray.time = sd->time; -# endif if(sample_all_lights) { /* lamp sampling */ @@ -135,12 +159,12 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG int num_samples = light_select_num_samples(kg, i); float num_samples_inv = 1.0f/(num_samples*kernel_data.integrator.num_all_lights); - RNG lamp_rng = cmj_hash(*rng, i); + uint lamp_rng_hash = cmj_hash(state->rng_hash, i); for(int j = 0; j < num_samples; j++) { /* sample random position on given light */ float light_u, light_v; - path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); + path_branched_rng_2D(kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); LightSample ls; lamp_light_sample(kg, i, light_u, light_v, ray->P, &ls); @@ -148,28 +172,26 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG float3 tp = throughput; /* sample position on volume segment */ - float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE); - float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE); + float rphase = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_PHASE_CHANNEL); + float rscatter = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE); VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false); - (void)result; - kernel_assert(result == VOLUME_PATH_SCATTERED); - /* todo: split up light_sample so we don't have to call it again with new position */ - if(lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls)) { + if(result == VOLUME_PATH_SCATTERED && + lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls)) { if(kernel_data.integrator.pdf_triangles != 0.0f) ls.pdf *= 2.0f; - float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples); + float terminate = path_branched_rng_light_termination(kg, state->rng_hash, state, j, num_samples); if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } } } @@ -183,42 +205,39 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG for(int j = 0; j < num_samples; j++) { /* sample random position on random triangle */ - float light_t = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_LIGHT); float light_u, light_v; - path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); + path_branched_rng_2D(kg, state->rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); /* only sample triangle lights */ if(kernel_data.integrator.num_all_lights) - light_t = 0.5f*light_t; + light_u = 0.5f*light_u; LightSample ls; - light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, state->bounce, &ls); + light_sample(kg, light_u, light_v, sd->time, ray->P, state->bounce, &ls); float3 tp = throughput; /* sample position on volume segment */ - float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE); - float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE); + float rphase = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_PHASE_CHANNEL); + float rscatter = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE); VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false); - (void)result; - kernel_assert(result == VOLUME_PATH_SCATTERED); - /* todo: split up light_sample so we don't have to call it again with new position */ - if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { + if(result == VOLUME_PATH_SCATTERED && + light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { if(kernel_data.integrator.num_all_lights) ls.pdf *= 2.0f; - float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples); + float terminate = path_branched_rng_light_termination(kg, state->rng_hash, state, j, num_samples); if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } } } @@ -227,44 +246,42 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG } else { /* sample random position on random light */ - float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); float light_u, light_v; - path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); + path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v); LightSample ls; - light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, state->bounce, &ls); + light_sample(kg, light_u, light_v, sd->time, ray->P, state->bounce, &ls); float3 tp = throughput; /* sample position on volume segment */ - float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE); - float rscatter = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE); + float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL); + float rscatter = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE); VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false); - (void)result; - kernel_assert(result == VOLUME_PATH_SCATTERED); - /* todo: split up light_sample so we don't have to call it again with new position */ - if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { + if(result == VOLUME_PATH_SCATTERED && + light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { /* sample random light */ - float terminate = path_state_rng_light_termination(kg, rng, state); + float terminate = path_state_rng_light_termination(kg, state); if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* trace shadow ray */ float3 shadow; - if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { + if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, tp, &L_light, shadow, 1.0f, state->bounce, is_lamp); + path_radiance_accum_light(L, state, tp, &L_light, shadow, 1.0f, is_lamp); } } } } -#endif +#endif /* __EMISSION__ */ } +#endif /* __SPLIT_KERNEL__ */ -#endif +#endif /* __VOLUME_SCATTER__ */ CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h index 9a2b0884a7e..4540d733af4 100644 --- a/intern/cycles/kernel/kernel_projection.h +++ b/intern/cycles/kernel/kernel_projection.h @@ -57,6 +57,9 @@ ccl_device float3 spherical_to_direction(float theta, float phi) ccl_device float2 direction_to_equirectangular_range(float3 dir, float4 range) { + if(is_zero(dir)) + return make_float2(0.0f, 0.0f); + float u = (atan2f(dir.y, dir.x) - range.y) / range.x; float v = (acosf(dir.z / len(dir)) - range.w) / range.z; @@ -192,49 +195,49 @@ ccl_device float2 direction_to_mirrorball(float3 dir) return make_float2(u, v); } -ccl_device_inline float3 panorama_to_direction(KernelGlobals *kg, float u, float v) +ccl_device_inline float3 panorama_to_direction(ccl_constant KernelCamera *cam, float u, float v) { - switch(kernel_data.cam.panorama_type) { + switch(cam->panorama_type) { case PANORAMA_EQUIRECTANGULAR: - return equirectangular_range_to_direction(u, v, kernel_data.cam.equirectangular_range); + return equirectangular_range_to_direction(u, v, cam->equirectangular_range); case PANORAMA_MIRRORBALL: return mirrorball_to_direction(u, v); case PANORAMA_FISHEYE_EQUIDISTANT: - return fisheye_to_direction(u, v, kernel_data.cam.fisheye_fov); + return fisheye_to_direction(u, v, cam->fisheye_fov); case PANORAMA_FISHEYE_EQUISOLID: default: - return fisheye_equisolid_to_direction(u, v, kernel_data.cam.fisheye_lens, - kernel_data.cam.fisheye_fov, kernel_data.cam.sensorwidth, kernel_data.cam.sensorheight); + return fisheye_equisolid_to_direction(u, v, cam->fisheye_lens, + cam->fisheye_fov, cam->sensorwidth, cam->sensorheight); } } -ccl_device_inline float2 direction_to_panorama(KernelGlobals *kg, float3 dir) +ccl_device_inline float2 direction_to_panorama(ccl_constant KernelCamera *cam, float3 dir) { - switch(kernel_data.cam.panorama_type) { + switch(cam->panorama_type) { case PANORAMA_EQUIRECTANGULAR: - return direction_to_equirectangular_range(dir, kernel_data.cam.equirectangular_range); + return direction_to_equirectangular_range(dir, cam->equirectangular_range); case PANORAMA_MIRRORBALL: return direction_to_mirrorball(dir); case PANORAMA_FISHEYE_EQUIDISTANT: - return direction_to_fisheye(dir, kernel_data.cam.fisheye_fov); + return direction_to_fisheye(dir, cam->fisheye_fov); case PANORAMA_FISHEYE_EQUISOLID: default: - return direction_to_fisheye_equisolid(dir, kernel_data.cam.fisheye_lens, - kernel_data.cam.sensorwidth, kernel_data.cam.sensorheight); + return direction_to_fisheye_equisolid(dir, cam->fisheye_lens, + cam->sensorwidth, cam->sensorheight); } } -ccl_device_inline void spherical_stereo_transform(KernelGlobals *kg, float3 *P, float3 *D) +ccl_device_inline void spherical_stereo_transform(ccl_constant KernelCamera *cam, float3 *P, float3 *D) { - float interocular_offset = kernel_data.cam.interocular_offset; + float interocular_offset = cam->interocular_offset; /* Interocular offset of zero means either non stereo, or stereo without * spherical stereo. */ kernel_assert(interocular_offset != 0.0f); - if(kernel_data.cam.pole_merge_angle_to > 0.0f) { - const float pole_merge_angle_from = kernel_data.cam.pole_merge_angle_from, - pole_merge_angle_to = kernel_data.cam.pole_merge_angle_to; + if(cam->pole_merge_angle_to > 0.0f) { + const float pole_merge_angle_from = cam->pole_merge_angle_from, + pole_merge_angle_to = cam->pole_merge_angle_to; float altitude = fabsf(safe_asinf((*D).z)); if(altitude > pole_merge_angle_to) { interocular_offset = 0.0f; @@ -254,7 +257,7 @@ ccl_device_inline void spherical_stereo_transform(KernelGlobals *kg, float3 *P, /* Convergence distance is FLT_MAX in the case of parallel convergence mode, * no need to modify direction in this case either. */ - const float convergence_distance = kernel_data.cam.convergence_distance; + const float convergence_distance = cam->convergence_distance; if(convergence_distance != FLT_MAX) { diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h index cf5614b8a86..e32d4bbbc1b 100644 --- a/intern/cycles/kernel/kernel_queues.h +++ b/intern/cycles/kernel/kernel_queues.h @@ -17,12 +17,15 @@ #ifndef __KERNEL_QUEUE_H__ #define __KERNEL_QUEUE_H__ +CCL_NAMESPACE_BEGIN + /* * Queue utility functions for split kernel */ - +#ifdef __KERNEL_OPENCL__ #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable +#endif /* * Enqueue ray index into the queue @@ -35,7 +38,8 @@ ccl_device void enqueue_ray_index( ccl_global int *queue_index) /* Array of size num_queues; Used for atomic increment. */ { /* This thread's queue index. */ - int my_queue_index = atomic_inc(&queue_index[queue_number]) + (queue_number * queue_size); + int my_queue_index = atomic_fetch_and_inc_uint32((ccl_global uint*)&queue_index[queue_number]) + + (queue_number * queue_size); queues[my_queue_index] = ray_index; } @@ -47,6 +51,7 @@ ccl_device void enqueue_ray_index( * is no more ray to allocate to other threads. */ ccl_device int get_ray_index( + KernelGlobals *kg, int thread_index, /* Global thread index. */ int queue_number, /* Queue to operate on. */ ccl_global int *queues, /* Buffer of all queues. */ @@ -68,24 +73,25 @@ ccl_device void enqueue_ray_index_local( int queue_number, /* Queue in which to enqueue ray index. */ char enqueue_flag, /* True for threads whose ray index has to be enqueued. */ int queuesize, /* queue size. */ - ccl_local unsigned int *local_queue_atomics, /* To to local queue atomics. */ + ccl_local_param unsigned int *local_queue_atomics, /* To to local queue atomics. */ ccl_global int *Queue_data, /* Queues. */ ccl_global int *Queue_index) /* To do global queue atomics. */ { - int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0); + int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0); /* Get local queue id .*/ unsigned int lqidx; if(enqueue_flag) { - lqidx = atomic_inc(local_queue_atomics); + lqidx = atomic_fetch_and_inc_uint32(local_queue_atomics); } - barrier(CLK_LOCAL_MEM_FENCE); + ccl_barrier(CCL_LOCAL_MEM_FENCE); /* Get global queue offset. */ if(lidx == 0) { - *local_queue_atomics = atomic_add(&Queue_index[queue_number], *local_queue_atomics); + *local_queue_atomics = atomic_fetch_and_add_uint32((ccl_global uint*)&Queue_index[queue_number], + *local_queue_atomics); } - barrier(CLK_LOCAL_MEM_FENCE); + ccl_barrier(CCL_LOCAL_MEM_FENCE); /* Get global queue index and enqueue ray. */ if(enqueue_flag) { @@ -96,19 +102,19 @@ ccl_device void enqueue_ray_index_local( ccl_device unsigned int get_local_queue_index( int queue_number, /* Queue in which to enqueue the ray; -1 if no queue */ - ccl_local unsigned int *local_queue_atomics) + ccl_local_param unsigned int *local_queue_atomics) { - int my_lqidx = atomic_inc(&local_queue_atomics[queue_number]); + int my_lqidx = atomic_fetch_and_inc_uint32(&local_queue_atomics[queue_number]); return my_lqidx; } ccl_device unsigned int get_global_per_queue_offset( int queue_number, - ccl_local unsigned int *local_queue_atomics, + ccl_local_param unsigned int *local_queue_atomics, ccl_global int* global_queue_atomics) { - unsigned int queue_offset = atomic_add(&global_queue_atomics[queue_number], - local_queue_atomics[queue_number]); + unsigned int queue_offset = atomic_fetch_and_add_uint32((ccl_global uint*)&global_queue_atomics[queue_number], + local_queue_atomics[queue_number]); return queue_offset; } @@ -116,10 +122,27 @@ ccl_device unsigned int get_global_queue_index( int queue_number, int queuesize, unsigned int lqidx, - ccl_local unsigned int * global_per_queue_offset) + ccl_local_param unsigned int * global_per_queue_offset) { int my_gqidx = queuesize * queue_number + lqidx + global_per_queue_offset[queue_number]; return my_gqidx; } +ccl_device int dequeue_ray_index( + int queue_number, + ccl_global int *queues, + int queue_size, + ccl_global int *queue_index) +{ + int index = atomic_fetch_and_dec_uint32((ccl_global uint*)&queue_index[queue_number])-1; + + if(index < 0) { + return QUEUE_EMPTY_SLOT; + } + + return queues[index + queue_number * queue_size]; +} + +CCL_NAMESPACE_END + #endif // __KERNEL_QUEUE_H__ diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h index e773753396f..93152e9ff1c 100644 --- a/intern/cycles/kernel/kernel_random.h +++ b/intern/cycles/kernel/kernel_random.h @@ -14,107 +14,68 @@ * limitations under the License. */ -#include "kernel_jitter.h" +#include "kernel/kernel_jitter.h" +#include "util/util_hash.h" CCL_NAMESPACE_BEGIN -#ifdef __SOBOL__ - -/* skip initial numbers that are not as well distributed, especially the - * first sequence is just 0 everywhere, which can be problematic for e.g. - * path termination */ -#define SOBOL_SKIP 64 +/* Pseudo random numbers, uncomment this for debugging correlations. Only run + * this single threaded on a CPU for repeatable results. */ +//#define __DEBUG_CORRELATION__ -/* High Dimensional Sobol */ - -/* van der corput radical inverse */ -ccl_device uint van_der_corput(uint bits) -{ - bits = (bits << 16) | (bits >> 16); - bits = ((bits & 0x00ff00ff) << 8) | ((bits & 0xff00ff00) >> 8); - bits = ((bits & 0x0f0f0f0f) << 4) | ((bits & 0xf0f0f0f0) >> 4); - bits = ((bits & 0x33333333) << 2) | ((bits & 0xcccccccc) >> 2); - bits = ((bits & 0x55555555) << 1) | ((bits & 0xaaaaaaaa) >> 1); - return bits; -} - -/* sobol radical inverse */ -ccl_device uint sobol(uint i) -{ - uint r = 0; - - for(uint v = 1U << 31; i; i >>= 1, v ^= v >> 1) - if(i & 1) - r ^= v; - - return r; -} -/* inverse of sobol radical inverse */ -ccl_device uint sobol_inverse(uint i) -{ - const uint msb = 1U << 31; - uint r = 0; +/* High Dimensional Sobol. + * + * Multidimensional sobol with generator matrices. Dimension 0 and 1 are equal + * to classic Van der Corput and Sobol sequences. */ - for(uint v = 1; i; i <<= 1, v ^= v << 1) - if(i & msb) - r ^= v; +#ifdef __SOBOL__ - return r; -} +/* Skip initial numbers that for some dimensions have clear patterns that + * don't cover the entire sample space. Ideally we would have a better + * progressive pattern that doesn't suffer from this problem, because even + * with this offset some dimensions are quite poor. + */ +#define SOBOL_SKIP 64 -/* multidimensional sobol with generator matrices - * dimension 0 and 1 are equal to van_der_corput() and sobol() respectively */ ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension) { uint result = 0; - uint i = index; - - for(uint j = 0; i; i >>= 1, j++) - if(i & 1) + uint i = index + SOBOL_SKIP; + for(uint j = 0; i; i >>= 1, j++) { + if(i & 1) { result ^= kernel_tex_fetch(__sobol_directions, 32*dimension + j); - + } + } return result; } -/* lookup index and x/y coordinate, assumes m is a power of two */ -ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, const uint ey, uint *x, uint *y) -{ - /* shift is constant per frame */ - const uint shift = frame << (m << 1); - const uint sobol_shift = sobol(shift); - /* van der Corput is its own inverse */ - const uint lower = van_der_corput(ex << (32 - m)); - /* need to compensate for ey difference and shift */ - const uint sobol_lower = sobol(lower); - const uint mask = ~-(1 << m) << (32 - m); /* only m upper bits */ - const uint delta = ((ey << (32 - m)) ^ sobol_lower ^ sobol_shift) & mask; - /* only use m upper bits for the index (m is a power of two) */ - const uint sobol_result = delta | (delta >> m); - const uint upper = sobol_inverse(sobol_result); - const uint index = shift | upper | lower; - *x = van_der_corput(index); - *y = sobol_shift ^ sobol_result ^ sobol_lower; - return index; -} +#endif /* __SOBOL__ */ -ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension) + +ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, + uint rng_hash, + int sample, int num_samples, + int dimension) { +#ifdef __DEBUG_CORRELATION__ + return (float)drand48(); +#endif + #ifdef __CMJ__ - if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) { - /* correlated multi-jittered */ - int p = *rng + dimension; +# ifdef __SOBOL__ + if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) +# endif + { + /* Correlated multi-jitter. */ + int p = rng_hash + dimension; return cmj_sample_1D(sample, num_samples, p); } #endif -#ifdef __SOBOL_FULL_SCREEN__ - uint result = sobol_dimension(kg, *rng, dimension); - float r = (float)result * (1.0f/(float)0xFFFFFFFF); - return r; -#else - /* compute sobol sequence value using direction vectors */ - uint result = sobol_dimension(kg, sample + SOBOL_SKIP, dimension); +#ifdef __SOBOL__ + /* Sobol sequence value using direction vectors. */ + uint result = sobol_dimension(kg, sample, dimension); float r = (float)result * (1.0f/(float)0xFFFFFFFF); /* Cranly-Patterson rotation using rng seed */ @@ -123,111 +84,67 @@ ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG * /* Hash rng with dimension to solve correlation issues. * See T38710, T50116. */ - RNG tmp_rng = cmj_hash_simple(dimension, *rng); + uint tmp_rng = cmj_hash_simple(dimension, rng_hash); shift = tmp_rng * (1.0f/(float)0xFFFFFFFF); return r + shift - floorf(r + shift); #endif } -ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy) +ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, + uint rng_hash, + int sample, int num_samples, + int dimension, + float *fx, float *fy) { +#ifdef __DEBUG_CORRELATION__ + *fx = (float)drand48(); + *fy = (float)drand48(); + return; +#endif + #ifdef __CMJ__ - if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) { - /* correlated multi-jittered */ - int p = *rng + dimension; +# ifdef __SOBOL__ + if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) +# endif + { + /* Correlated multi-jitter. */ + int p = rng_hash + dimension; cmj_sample_2D(sample, num_samples, p, fx, fy); + return; } - else #endif - { - /* sobol */ - *fx = path_rng_1D(kg, rng, sample, num_samples, dimension); - *fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1); - } -} - -ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, ccl_addr_space RNG *rng, int x, int y, float *fx, float *fy) -{ -#ifdef __SOBOL_FULL_SCREEN__ - uint px, py; - uint bits = 16; /* limits us to 65536x65536 and 65536 samples */ - uint size = 1 << bits; - uint frame = sample; - - *rng = sobol_lookup(bits, frame, x, y, &px, &py); - - *rng ^= kernel_data.integrator.seed; - - if(sample == 0) { - *fx = 0.5f; - *fy = 0.5f; - } - else { - *fx = size * (float)px * (1.0f/(float)0xFFFFFFFF) - x; - *fy = size * (float)py * (1.0f/(float)0xFFFFFFFF) - y; - } -#else - *rng = *rng_state; - *rng ^= kernel_data.integrator.seed; - - if(sample == 0) { - *fx = 0.5f; - *fy = 0.5f; - } - else { - path_rng_2D(kg, rng, sample, num_samples, PRNG_FILTER_U, fx, fy); - } +#ifdef __SOBOL__ + /* Sobol. */ + *fx = path_rng_1D(kg, rng_hash, sample, num_samples, dimension); + *fy = path_rng_1D(kg, rng_hash, sample, num_samples, dimension + 1); #endif } -ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG rng) -{ - /* nothing to do */ -} - -#else - -/* Linear Congruential Generator */ - -ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension) -{ - /* implicit mod 2^32 */ - rng = (1103515245*(rng) + 12345); - return (float)rng * (1.0f/(float)0xFFFFFFFF); -} - -ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension, float *fx, float *fy) -{ - *fx = path_rng_1D(kg, rng, sample, num_samples, dimension); - *fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1); -} - -ccl_device void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy) +ccl_device_inline void path_rng_init(KernelGlobals *kg, + int sample, int num_samples, + uint *rng_hash, + int x, int y, + float *fx, float *fy) { /* load state */ - *rng = *rng_state; + *rng_hash = hash_int_2d(x, y); + *rng_hash ^= kernel_data.integrator.seed; - *rng ^= kernel_data.integrator.seed; +#ifdef __DEBUG_CORRELATION__ + srand48(*rng_hash + sample); +#endif if(sample == 0) { *fx = 0.5f; *fy = 0.5f; } else { - path_rng_2D(kg, rng, sample, num_samples, PRNG_FILTER_U, fx, fy); + path_rng_2D(kg, *rng_hash, sample, num_samples, PRNG_FILTER_U, fx, fy); } } -ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG rng) -{ - /* store state for next sample */ - *rng_state = rng; -} - -#endif - /* Linear Congruential Generator */ ccl_device uint lcg_step_uint(uint *rng) @@ -257,90 +174,123 @@ ccl_device uint lcg_init(uint seed) * dimension to avoid using the same sequence twice. * * For branches in the path we must be careful not to reuse the same number - * in a sequence and offset accordingly. */ - -ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension) -{ - return path_rng_1D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension); -} + * in a sequence and offset accordingly. + */ -ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension) +ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, + const ccl_addr_space PathState *state, + int dimension) { - /* the rng_offset is not increased for transparent bounces. if we do then - * fully transparent objects can become subtly visible by the different - * sampling patterns used where the transparent object is. - * - * however for some random numbers that will determine if we next bounce - * is transparent we do need to increase the offset to avoid always making - * the same decision */ - int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM; - return path_rng_1D(kg, rng, state->sample, state->num_samples, rng_offset + dimension); + return path_rng_1D(kg, + state->rng_hash, + state->sample, state->num_samples, + state->rng_offset + dimension); } -ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy) +ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, + const ccl_addr_space PathState *state, + int dimension, + float *fx, float *fy) { - path_rng_2D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension, fx, fy); + path_rng_2D(kg, + state->rng_hash, + state->sample, state->num_samples, + state->rng_offset + dimension, + fx, fy); } -ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension) +ccl_device_inline float path_state_rng_1D_hash(KernelGlobals *kg, + const ccl_addr_space PathState *state, + uint hash) { - return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension); + /* Use a hash instead of dimension, this is not great but avoids adding + * more dimensions to each bounce which reduces quality of dimensions we + * are already using. */ + return path_rng_1D(kg, + cmj_hash_simple(state->rng_hash, hash), + state->sample, state->num_samples, + state->rng_offset); } -ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension) +ccl_device_inline float path_branched_rng_1D( + KernelGlobals *kg, + uint rng_hash, + const ccl_addr_space PathState *state, + int branch, + int num_branches, + int dimension) { - int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM; - return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, rng_offset + dimension); + return path_rng_1D(kg, + rng_hash, + state->sample * num_branches + branch, + state->num_samples * num_branches, + state->rng_offset + dimension); } -ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy) +ccl_device_inline void path_branched_rng_2D( + KernelGlobals *kg, + uint rng_hash, + const ccl_addr_space PathState *state, + int branch, + int num_branches, + int dimension, + float *fx, float *fy) { - path_rng_2D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension, fx, fy); + path_rng_2D(kg, + rng_hash, + state->sample * num_branches + branch, + state->num_samples * num_branches, + state->rng_offset + dimension, + fx, fy); } -/* Utitility functions to get light termination value, since it might not be needed in many cases. */ -ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state) +/* Utitility functions to get light termination value, + * since it might not be needed in many cases. + */ +ccl_device_inline float path_state_rng_light_termination( + KernelGlobals *kg, + const ccl_addr_space PathState *state) { if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) { - return path_state_rng_1D_for_decision(kg, rng, state, PRNG_LIGHT_TERMINATE); + return path_state_rng_1D(kg, state, PRNG_LIGHT_TERMINATE); } return 0.0f; } -ccl_device_inline float path_branched_rng_light_termination(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches) +ccl_device_inline float path_branched_rng_light_termination( + KernelGlobals *kg, + uint rng_hash, + const ccl_addr_space PathState *state, + int branch, + int num_branches) { if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) { - return path_branched_rng_1D_for_decision(kg, rng, state, branch, num_branches, PRNG_LIGHT_TERMINATE); + return path_branched_rng_1D(kg, + rng_hash, + state, + branch, + num_branches, + PRNG_LIGHT_TERMINATE); } return 0.0f; } -ccl_device_inline void path_state_branch(PathState *state, int branch, int num_branches) -{ - /* path is splitting into a branch, adjust so that each branch - * still gets a unique sample from the same sequence */ - state->rng_offset += PRNG_BOUNCE_NUM; - state->sample = state->sample*num_branches + branch; - state->num_samples = state->num_samples*num_branches; -} - -ccl_device_inline uint lcg_state_init(RNG *rng, const PathState *state, uint scramble) +ccl_device_inline uint lcg_state_init(PathState *state, + uint scramble) { - return lcg_init(*rng + state->rng_offset + state->sample*scramble); + return lcg_init(state->rng_hash + state->rng_offset + state->sample*scramble); } -/* TODO(sergey): For until we can use generic address space from OpenCL 2.0. */ - -ccl_device_inline uint lcg_state_init_addrspace(ccl_addr_space RNG *rng, - const ccl_addr_space PathState *state, +ccl_device_inline uint lcg_state_init_addrspace(ccl_addr_space PathState *state, uint scramble) { - return lcg_init(*rng + state->rng_offset + state->sample*scramble); + return lcg_init(state->rng_hash + state->rng_offset + state->sample*scramble); } + ccl_device float lcg_step_float_addrspace(ccl_addr_space uint *rng) { - /* implicit mod 2^32 */ + /* Implicit mod 2^32 */ *rng = (1103515245*(*rng) + 12345); return (float)*rng * (1.0f/(float)0xFFFFFFFF); } diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h index d0826e5e879..937a50cba8b 100644 --- a/intern/cycles/kernel/kernel_shader.h +++ b/intern/cycles/kernel/kernel_shader.h @@ -24,12 +24,12 @@ * */ -#include "closure/alloc.h" -#include "closure/bsdf_util.h" -#include "closure/bsdf.h" -#include "closure/emissive.h" +#include "kernel/closure/alloc.h" +#include "kernel/closure/bsdf_util.h" +#include "kernel/closure/bsdf.h" +#include "kernel/closure/emissive.h" -#include "svm/svm.h" +#include "kernel/svm/svm.h" CCL_NAMESPACE_BEGIN @@ -38,13 +38,13 @@ CCL_NAMESPACE_BEGIN #ifdef __OBJECT_MOTION__ ccl_device void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time) { - if(ccl_fetch(sd, object_flag) & SD_OBJECT_MOTION) { - ccl_fetch(sd, ob_tfm) = object_fetch_transform_motion(kg, ccl_fetch(sd, object), time); - ccl_fetch(sd, ob_itfm) = transform_quick_inverse(ccl_fetch(sd, ob_tfm)); + if(sd->object_flag & SD_OBJECT_MOTION) { + sd->ob_tfm = object_fetch_transform_motion(kg, sd->object, time); + sd->ob_itfm = transform_quick_inverse(sd->ob_tfm); } else { - ccl_fetch(sd, ob_tfm) = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); - ccl_fetch(sd, ob_itfm) = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM); + sd->ob_tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); + sd->ob_itfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); } } #endif @@ -55,55 +55,56 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg, const Ray *ray) { #ifdef __INSTANCING__ - ccl_fetch(sd, object) = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object; + sd->object = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object; #endif + sd->lamp = LAMP_NONE; - ccl_fetch(sd, type) = isect->type; - ccl_fetch(sd, flag) = 0; - ccl_fetch(sd, object_flag) = kernel_tex_fetch(__object_flag, - ccl_fetch(sd, object)); + sd->type = isect->type; + sd->flag = 0; + sd->object_flag = kernel_tex_fetch(__object_flag, + sd->object); /* matrices and time */ #ifdef __OBJECT_MOTION__ shader_setup_object_transforms(kg, sd, ray->time); - ccl_fetch(sd, time) = ray->time; #endif + sd->time = ray->time; - ccl_fetch(sd, prim) = kernel_tex_fetch(__prim_index, isect->prim); - ccl_fetch(sd, ray_length) = isect->t; + sd->prim = kernel_tex_fetch(__prim_index, isect->prim); + sd->ray_length = isect->t; #ifdef __UV__ - ccl_fetch(sd, u) = isect->u; - ccl_fetch(sd, v) = isect->v; + sd->u = isect->u; + sd->v = isect->v; #endif #ifdef __HAIR__ - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + if(sd->type & PRIMITIVE_ALL_CURVE) { /* curve */ - float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); + float4 curvedata = kernel_tex_fetch(__curves, sd->prim); - ccl_fetch(sd, shader) = __float_as_int(curvedata.z); - ccl_fetch(sd, P) = bvh_curve_refine(kg, sd, isect, ray); + sd->shader = __float_as_int(curvedata.z); + sd->P = curve_refine(kg, sd, isect, ray); } else #endif - if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) { + if(sd->type & PRIMITIVE_TRIANGLE) { /* static triangle */ float3 Ng = triangle_normal(kg, sd); - ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim)); + sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); /* vectors */ - ccl_fetch(sd, P) = triangle_refine(kg, sd, isect, ray); - ccl_fetch(sd, Ng) = Ng; - ccl_fetch(sd, N) = Ng; - + sd->P = triangle_refine(kg, sd, isect, ray); + sd->Ng = Ng; + sd->N = Ng; + /* smooth normal */ - if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) - ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v)); + if(sd->shader & SHADER_SMOOTH_NORMAL) + sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); #ifdef __DPDU__ /* dPdu/dPdv */ - triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv)); + triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv); #endif } else { @@ -111,40 +112,40 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg, motion_triangle_shader_setup(kg, sd, isect, ray, false); } - ccl_fetch(sd, I) = -ray->D; + sd->I = -ray->D; - ccl_fetch(sd, flag) |= kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE); + sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags; #ifdef __INSTANCING__ if(isect->object != OBJECT_NONE) { /* instance transform */ - object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N)); - object_normal_transform_auto(kg, sd, &ccl_fetch(sd, Ng)); + object_normal_transform_auto(kg, sd, &sd->N); + object_normal_transform_auto(kg, sd, &sd->Ng); # ifdef __DPDU__ - object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu)); - object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv)); + object_dir_transform_auto(kg, sd, &sd->dPdu); + object_dir_transform_auto(kg, sd, &sd->dPdv); # endif } #endif /* backfacing test */ - bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f); + bool backfacing = (dot(sd->Ng, sd->I) < 0.0f); if(backfacing) { - ccl_fetch(sd, flag) |= SD_BACKFACING; - ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng); - ccl_fetch(sd, N) = -ccl_fetch(sd, N); + sd->flag |= SD_BACKFACING; + sd->Ng = -sd->Ng; + sd->N = -sd->N; #ifdef __DPDU__ - ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu); - ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv); + sd->dPdu = -sd->dPdu; + sd->dPdv = -sd->dPdv; #endif } #ifdef __RAY_DIFFERENTIALS__ /* differentials */ - differential_transfer(&ccl_fetch(sd, dP), ray->dP, ray->D, ray->dD, ccl_fetch(sd, Ng), isect->t); - differential_incoming(&ccl_fetch(sd, dI), ray->dD); - differential_dudv(&ccl_fetch(sd, du), &ccl_fetch(sd, dv), ccl_fetch(sd, dPdu), ccl_fetch(sd, dPdv), ccl_fetch(sd, dP), ccl_fetch(sd, Ng)); + differential_transfer(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, isect->t); + differential_incoming(&sd->dI, ray->dD); + differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng); #endif } @@ -181,12 +182,12 @@ void shader_setup_from_subsurface( sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); /* static triangle */ - sd->P = triangle_refine_subsurface(kg, sd, isect, ray); + sd->P = triangle_refine_local(kg, sd, isect, ray); sd->Ng = Ng; sd->N = Ng; if(sd->shader & SHADER_SMOOTH_NORMAL) - sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v); + sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); # ifdef __DPDU__ /* dPdu/dPdv */ @@ -198,16 +199,16 @@ void shader_setup_from_subsurface( motion_triangle_shader_setup(kg, sd, isect, ray, true); } - sd->flag |= kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE); + sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags; # ifdef __INSTANCING__ if(isect->object != OBJECT_NONE) { /* instance transform */ - object_normal_transform(kg, sd, &sd->N); - object_normal_transform(kg, sd, &sd->Ng); + object_normal_transform_auto(kg, sd, &sd->N); + object_normal_transform_auto(kg, sd, &sd->Ng); # ifdef __DPDU__ - object_dir_transform(kg, sd, &sd->dPdu); - object_dir_transform(kg, sd, &sd->dPdv); + object_dir_transform_auto(kg, sd, &sd->dPdu); + object_dir_transform_auto(kg, sd, &sd->dPdv); # endif } # endif @@ -249,106 +250,108 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg, int lamp) { /* vectors */ - ccl_fetch(sd, P) = P; - ccl_fetch(sd, N) = Ng; - ccl_fetch(sd, Ng) = Ng; - ccl_fetch(sd, I) = I; - ccl_fetch(sd, shader) = shader; + sd->P = P; + sd->N = Ng; + sd->Ng = Ng; + sd->I = I; + sd->shader = shader; if(prim != PRIM_NONE) - ccl_fetch(sd, type) = PRIMITIVE_TRIANGLE; + sd->type = PRIMITIVE_TRIANGLE; else if(lamp != LAMP_NONE) - ccl_fetch(sd, type) = PRIMITIVE_LAMP; + sd->type = PRIMITIVE_LAMP; else - ccl_fetch(sd, type) = PRIMITIVE_NONE; + sd->type = PRIMITIVE_NONE; /* primitive */ #ifdef __INSTANCING__ - ccl_fetch(sd, object) = object; + sd->object = object; #endif + sd->lamp = LAMP_NONE; /* currently no access to bvh prim index for strand sd->prim*/ - ccl_fetch(sd, prim) = prim; + sd->prim = prim; #ifdef __UV__ - ccl_fetch(sd, u) = u; - ccl_fetch(sd, v) = v; + sd->u = u; + sd->v = v; #endif - ccl_fetch(sd, ray_length) = t; + sd->time = time; + sd->ray_length = t; - ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE); - ccl_fetch(sd, object_flag) = 0; - if(ccl_fetch(sd, object) != OBJECT_NONE) { - ccl_fetch(sd, object_flag) |= kernel_tex_fetch(__object_flag, - ccl_fetch(sd, object)); + sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags; + sd->object_flag = 0; + if(sd->object != OBJECT_NONE) { + sd->object_flag |= kernel_tex_fetch(__object_flag, + sd->object); #ifdef __OBJECT_MOTION__ shader_setup_object_transforms(kg, sd, time); - ccl_fetch(sd, time) = time; } else if(lamp != LAMP_NONE) { - ccl_fetch(sd, ob_tfm) = lamp_fetch_transform(kg, lamp, false); - ccl_fetch(sd, ob_itfm) = lamp_fetch_transform(kg, lamp, true); + sd->ob_tfm = lamp_fetch_transform(kg, lamp, false); + sd->ob_itfm = lamp_fetch_transform(kg, lamp, true); + sd->lamp = lamp; #endif } /* transform into world space */ if(object_space) { - object_position_transform_auto(kg, sd, &ccl_fetch(sd, P)); - object_normal_transform_auto(kg, sd, &ccl_fetch(sd, Ng)); - ccl_fetch(sd, N) = ccl_fetch(sd, Ng); - object_dir_transform_auto(kg, sd, &ccl_fetch(sd, I)); + object_position_transform_auto(kg, sd, &sd->P); + object_normal_transform_auto(kg, sd, &sd->Ng); + sd->N = sd->Ng; + object_dir_transform_auto(kg, sd, &sd->I); } - if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) { + if(sd->type & PRIMITIVE_TRIANGLE) { /* smooth normal */ - if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) { - ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v)); + if(sd->shader & SHADER_SMOOTH_NORMAL) { + sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); #ifdef __INSTANCING__ - if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) { - object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N)); + if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { + object_normal_transform_auto(kg, sd, &sd->N); } #endif } /* dPdu/dPdv */ #ifdef __DPDU__ - triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv)); + triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv); # ifdef __INSTANCING__ - if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) { - object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu)); - object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv)); + if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { + object_dir_transform_auto(kg, sd, &sd->dPdu); + object_dir_transform_auto(kg, sd, &sd->dPdv); } # endif #endif } else { #ifdef __DPDU__ - ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f); - ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f); + sd->dPdu = make_float3(0.0f, 0.0f, 0.0f); + sd->dPdv = make_float3(0.0f, 0.0f, 0.0f); #endif } /* backfacing test */ - if(ccl_fetch(sd, prim) != PRIM_NONE) { - bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f); + if(sd->prim != PRIM_NONE) { + bool backfacing = (dot(sd->Ng, sd->I) < 0.0f); if(backfacing) { - ccl_fetch(sd, flag) |= SD_BACKFACING; - ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng); - ccl_fetch(sd, N) = -ccl_fetch(sd, N); + sd->flag |= SD_BACKFACING; + sd->Ng = -sd->Ng; + sd->N = -sd->N; #ifdef __DPDU__ - ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu); - ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv); + sd->dPdu = -sd->dPdu; + sd->dPdv = -sd->dPdv; #endif } } #ifdef __RAY_DIFFERENTIALS__ /* no ray differentials here yet */ - ccl_fetch(sd, dP) = differential3_zero(); - ccl_fetch(sd, dI) = differential3_zero(); - ccl_fetch(sd, du) = differential_zero(); - ccl_fetch(sd, dv) = differential_zero(); + sd->dP = differential3_zero(); + sd->dI = differential3_zero(); + sd->du = differential_zero(); + sd->dv = differential_zero(); #endif } @@ -378,39 +381,38 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd, ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderData *sd, const Ray *ray) { /* vectors */ - ccl_fetch(sd, P) = ray->D; - ccl_fetch(sd, N) = -ray->D; - ccl_fetch(sd, Ng) = -ray->D; - ccl_fetch(sd, I) = -ray->D; - ccl_fetch(sd, shader) = kernel_data.background.surface_shader; - ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE); - ccl_fetch(sd, object_flag) = 0; -#ifdef __OBJECT_MOTION__ - ccl_fetch(sd, time) = ray->time; -#endif - ccl_fetch(sd, ray_length) = 0.0f; + sd->P = ray->D; + sd->N = -ray->D; + sd->Ng = -ray->D; + sd->I = -ray->D; + sd->shader = kernel_data.background.surface_shader; + sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags; + sd->object_flag = 0; + sd->time = ray->time; + sd->ray_length = 0.0f; #ifdef __INSTANCING__ - ccl_fetch(sd, object) = PRIM_NONE; + sd->object = PRIM_NONE; #endif - ccl_fetch(sd, prim) = PRIM_NONE; + sd->lamp = LAMP_NONE; + sd->prim = PRIM_NONE; #ifdef __UV__ - ccl_fetch(sd, u) = 0.0f; - ccl_fetch(sd, v) = 0.0f; + sd->u = 0.0f; + sd->v = 0.0f; #endif #ifdef __DPDU__ /* dPdu/dPdv */ - ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f); - ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f); + sd->dPdu = make_float3(0.0f, 0.0f, 0.0f); + sd->dPdv = make_float3(0.0f, 0.0f, 0.0f); #endif #ifdef __RAY_DIFFERENTIALS__ /* differentials */ - ccl_fetch(sd, dP) = ray->dD; - differential_incoming(&ccl_fetch(sd, dI), ccl_fetch(sd, dP)); - ccl_fetch(sd, du) = differential_zero(); - ccl_fetch(sd, dv) = differential_zero(); + sd->dP = ray->dD; + differential_incoming(&sd->dI, sd->dP); + sd->du = differential_zero(); + sd->dv = differential_zero(); #endif } @@ -421,47 +423,46 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *s { /* vectors */ sd->P = ray->P; - sd->N = -ray->D; + sd->N = -ray->D; sd->Ng = -ray->D; sd->I = -ray->D; sd->shader = SHADER_NONE; sd->flag = 0; sd->object_flag = 0; -#ifdef __OBJECT_MOTION__ sd->time = ray->time; -#endif sd->ray_length = 0.0f; /* todo: can we set this to some useful value? */ -#ifdef __INSTANCING__ +# ifdef __INSTANCING__ sd->object = PRIM_NONE; /* todo: fill this for texture coordinates */ -#endif +# endif + sd->lamp = LAMP_NONE; sd->prim = PRIM_NONE; sd->type = PRIMITIVE_NONE; -#ifdef __UV__ +# ifdef __UV__ sd->u = 0.0f; sd->v = 0.0f; -#endif +# endif -#ifdef __DPDU__ +# ifdef __DPDU__ /* dPdu/dPdv */ sd->dPdu = make_float3(0.0f, 0.0f, 0.0f); sd->dPdv = make_float3(0.0f, 0.0f, 0.0f); -#endif +# endif -#ifdef __RAY_DIFFERENTIALS__ +# ifdef __RAY_DIFFERENTIALS__ /* differentials */ sd->dP = ray->dD; differential_incoming(&sd->dI, sd->dP); sd->du = differential_zero(); sd->dv = differential_zero(); -#endif +# endif /* for NDC coordinates */ sd->ray_P = ray->P; sd->ray_dP = ray->dP; } -#endif +#endif /* __VOLUME__ */ /* Merging */ @@ -496,27 +497,52 @@ ccl_device_inline void shader_merge_closures(ShaderData *sd) } } } -#endif +#endif /* __BRANCHED_PATH__ || __VOLUME__ */ + +/* Defensive sampling. */ + +ccl_device_inline void shader_prepare_closures(ShaderData *sd, + ccl_addr_space PathState *state) +{ + /* We can likely also do defensive sampling at deeper bounces, particularly + * for cases like a perfect mirror but possibly also others. This will need + * a good heuristic. */ + if(state->bounce + state->transparent_bounce == 0 && sd->num_closure > 1) { + float sum = 0.0f; + + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) { + sum += sc->sample_weight; + } + } + + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) { + sc->sample_weight = max(sc->sample_weight, 0.125f * sum); + } + } + } +} + /* BSDF */ ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, ShaderData *sd, const float3 omega_in, float *pdf, - int skip_bsdf, BsdfEval *result_eval, float sum_pdf, float sum_sample_weight) + const ShaderClosure *skip_sc, BsdfEval *result_eval, float sum_pdf, float sum_sample_weight) { /* this is the veach one-sample model with balance heuristic, some pdf * factors drop out when using balance heuristic weighting */ - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - if(i == skip_bsdf) - continue; - - const ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; - if(CLOSURE_IS_BSDF(sc->type)) { + if(sc != skip_sc && CLOSURE_IS_BSDF(sc->type)) { float bsdf_pdf = 0.0f; float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf); if(bsdf_pdf != 0.0f) { - bsdf_eval_accum(result_eval, sc->type, eval*sc->weight); + bsdf_eval_accum(result_eval, sc->type, eval*sc->weight, 1.0f); sum_pdf += bsdf_pdf*sc->sample_weight; } @@ -535,8 +561,8 @@ ccl_device_inline void _shader_bsdf_multi_eval_branched(KernelGlobals *kg, float light_pdf, bool use_mis) { - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - const ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF(sc->type)) { float bsdf_pdf = 0.0f; float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf); @@ -544,12 +570,13 @@ ccl_device_inline void _shader_bsdf_multi_eval_branched(KernelGlobals *kg, float mis_weight = use_mis? power_heuristic(light_pdf, bsdf_pdf): 1.0f; bsdf_eval_accum(result_eval, sc->type, - eval * sc->weight * mis_weight); + eval * sc->weight, + mis_weight); } } } } -#endif +#endif /* __BRANCHED_PATH__ */ #ifndef __KERNEL_CUDA__ @@ -573,56 +600,134 @@ void shader_bsdf_eval(KernelGlobals *kg, #endif { float pdf; - _shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, -1, eval, 0.0f, 0.0f); + _shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, NULL, eval, 0.0f, 0.0f); if(use_mis) { float weight = power_heuristic(light_pdf, pdf); - bsdf_eval_mul(eval, weight); + bsdf_eval_mis(eval, weight); } } } -ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg, - ShaderData *sd, - float randu, float randv, - BsdfEval *bsdf_eval, - float3 *omega_in, - differential3 *domega_in, - float *pdf) +ccl_device_inline const ShaderClosure *shader_bsdf_pick(ShaderData *sd, + float *randu) { + /* Note the sampling here must match shader_bssrdf_pick, + * since we reuse the same random number. */ int sampled = 0; - if(ccl_fetch(sd, num_closure) > 1) { - /* pick a BSDF closure based on sample weights */ + if(sd->num_closure > 1) { + /* Pick a BSDF or based on sample weights. */ float sum = 0.0f; - for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) { - const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled); - - if(CLOSURE_IS_BSDF(sc->type)) + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) { sum += sc->sample_weight; + } } - float r = ccl_fetch(sd, randb_closure)*sum; - sum = 0.0f; + float r = (*randu)*sum; + float partial_sum = 0.0f; - for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) { - const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled); - - if(CLOSURE_IS_BSDF(sc->type)) { - sum += sc->sample_weight; + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; - if(r <= sum) + if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) { + float next_sum = partial_sum + sc->sample_weight; + + if(r < next_sum) { + sampled = i; + + /* Rescale to reuse for direction sample, to better + * preserve stratifaction. */ + *randu = (r - partial_sum) / sc->sample_weight; break; + } + + partial_sum = next_sum; } } + } - if(sampled == ccl_fetch(sd, num_closure)) { - *pdf = 0.0f; - return LABEL_NONE; + const ShaderClosure *sc = &sd->closure[sampled]; + return CLOSURE_IS_BSDF(sc->type)? sc: NULL; +} + +ccl_device_inline const ShaderClosure *shader_bssrdf_pick(ShaderData *sd, + ccl_addr_space float3 *throughput, + float *randu) +{ + /* Note the sampling here must match shader_bsdf_pick, + * since we reuse the same random number. */ + int sampled = 0; + + if(sd->num_closure > 1) { + /* Pick a BSDF or BSSRDF or based on sample weights. */ + float sum_bsdf = 0.0f; + float sum_bssrdf = 0.0f; + + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + if(CLOSURE_IS_BSDF(sc->type)) { + sum_bsdf += sc->sample_weight; + } + else if(CLOSURE_IS_BSSRDF(sc->type)) { + sum_bssrdf += sc->sample_weight; + } + } + + float r = (*randu)*(sum_bsdf + sum_bssrdf); + float partial_sum = 0.0f; + + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) { + float next_sum = partial_sum + sc->sample_weight; + + if(r < next_sum) { + if(CLOSURE_IS_BSDF(sc->type)) { + *throughput *= (sum_bsdf + sum_bssrdf) / sum_bsdf; + return NULL; + } + else { + *throughput *= (sum_bsdf + sum_bssrdf) / sum_bssrdf; + sampled = i; + + /* Rescale to reuse for direction sample, to better + * preserve stratifaction. */ + *randu = (r - partial_sum) / sc->sample_weight; + break; + } + } + + partial_sum = next_sum; + } } } - const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled); + const ShaderClosure *sc = &sd->closure[sampled]; + return CLOSURE_IS_BSSRDF(sc->type)? sc: NULL; +} + +ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg, + ShaderData *sd, + float randu, float randv, + BsdfEval *bsdf_eval, + float3 *omega_in, + differential3 *domega_in, + float *pdf) +{ + const ShaderClosure *sc = shader_bsdf_pick(sd, &randu); + if(sc == NULL) { + *pdf = 0.0f; + return LABEL_NONE; + } + + /* BSSRDF should already have been handled elsewhere. */ + kernel_assert(CLOSURE_IS_BSDF(sc->type)); int label; float3 eval; @@ -633,9 +738,9 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg, if(*pdf != 0.0f) { bsdf_eval_init(bsdf_eval, sc->type, eval*sc->weight, kernel_data.film.use_light_pass); - if(ccl_fetch(sd, num_closure) > 1) { + if(sd->num_closure > 1) { float sweight = sc->sample_weight; - _shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sampled, bsdf_eval, *pdf*sweight, sweight); + _shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sc, bsdf_eval, *pdf*sweight, sweight); } } @@ -658,31 +763,63 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, ShaderData *sd, return label; } +ccl_device float shader_bsdf_average_roughness(ShaderData *sd) +{ + float roughness = 0.0f; + float sum_weight = 0.0f; + + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + + if(CLOSURE_IS_BSDF(sc->type)) { + /* sqrt once to undo the squaring from multiplying roughness on the + * two axes, and once for the squared roughness convention. */ + float weight = fabsf(average(sc->weight)); + roughness += weight * sqrtf(safe_sqrtf(bsdf_get_roughness_squared(sc))); + sum_weight += weight; + } + } + + return (sum_weight > 0.0f) ? roughness / sum_weight : 0.0f; +} + ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughness) { - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF(sc->type)) bsdf_blur(kg, sc, roughness); } } -ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd) +ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, const ShaderData *sd) { - if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME) + if(sd->flag & SD_HAS_ONLY_VOLUME) { return make_float3(1.0f, 1.0f, 1.0f); + } + else if(sd->flag & SD_TRANSPARENT) { + return sd->closure_transparent_extinction; + } + else { + return make_float3(0.0f, 0.0f, 0.0f); + } +} - float3 eval = make_float3(0.0f, 0.0f, 0.0f); +ccl_device void shader_bsdf_disable_transparency(KernelGlobals *kg, ShaderData *sd) +{ + if(sd->flag & SD_TRANSPARENT) { + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) { + sc->sample_weight = 0.0f; + sc->weight = make_float3(0.0f, 0.0f, 0.0f); + } + } - if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) // todo: make this work for osl - eval += sc->weight; + sd->flag &= ~SD_TRANSPARENT; } - - return eval; } ccl_device float3 shader_bsdf_alpha(KernelGlobals *kg, ShaderData *sd) @@ -691,7 +828,7 @@ ccl_device float3 shader_bsdf_alpha(KernelGlobals *kg, ShaderData *sd) alpha = max(alpha, make_float3(0.0f, 0.0f, 0.0f)); alpha = min(alpha, make_float3(1.0f, 1.0f, 1.0f)); - + return alpha; } @@ -699,8 +836,8 @@ ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) eval += sc->weight; @@ -713,8 +850,8 @@ ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF_GLOSSY(sc->type)) eval += sc->weight; @@ -727,8 +864,8 @@ ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF_TRANSMISSION(sc->type)) eval += sc->weight; @@ -741,8 +878,8 @@ ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSSRDF(sc->type) || CLOSURE_IS_BSDF_BSSRDF(sc->type)) eval += sc->weight; @@ -751,31 +888,39 @@ ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd) return eval; } +ccl_device float3 shader_bsdf_average_normal(KernelGlobals *kg, ShaderData *sd) +{ + float3 N = make_float3(0.0f, 0.0f, 0.0f); + + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) + N += sc->N*fabsf(average(sc->weight)); + } + + return (is_zero(N))? sd->N : normalize(N); +} + ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_factor, float3 *N_) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); float3 N = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) { const DiffuseBsdf *bsdf = (const DiffuseBsdf*)sc; eval += sc->weight*ao_factor; - N += bsdf->N*average(sc->weight); + N += bsdf->N*fabsf(average(sc->weight)); } else if(CLOSURE_IS_AMBIENT_OCCLUSION(sc->type)) { eval += sc->weight; - N += ccl_fetch(sd, N)*average(sc->weight); + N += sd->N*fabsf(average(sc->weight)); } } - if(is_zero(N)) - N = ccl_fetch(sd, N); - else - N = normalize(N); - - *N_ = N; + *N_ = (is_zero(N))? sd->N : normalize(N); return eval; } @@ -786,8 +931,8 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b float3 N = make_float3(0.0f, 0.0f, 0.0f); float texture_blur = 0.0f, weight_sum = 0.0f; - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSSRDF(sc->type)) { const Bssrdf *bssrdf = (const Bssrdf*)sc; @@ -801,35 +946,25 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b } if(N_) - *N_ = (is_zero(N))? ccl_fetch(sd, N): normalize(N); + *N_ = (is_zero(N))? sd->N: normalize(N); if(texture_blur_) - *texture_blur_ = texture_blur/weight_sum; - + *texture_blur_ = safe_divide(texture_blur, weight_sum); + return eval; } -#endif +#endif /* __SUBSURFACE__ */ /* Emission */ -ccl_device float3 emissive_eval(KernelGlobals *kg, ShaderData *sd, ShaderClosure *sc) -{ - return emissive_simple_eval(ccl_fetch(sd, Ng), ccl_fetch(sd, I)); -} - ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd) { - float3 eval; - eval = make_float3(0.0f, 0.0f, 0.0f); - - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); - - if(CLOSURE_IS_EMISSION(sc->type)) - eval += emissive_eval(kg, sd, sc)*sc->weight; + if(sd->flag & SD_EMISSION) { + return emissive_simple_eval(sd->Ng, sd->I) * sd->closure_emission_background; + } + else { + return make_float3(0.0f, 0.0f, 0.0f); } - - return eval; } /* Holdout */ @@ -838,8 +973,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd) { float3 weight = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_HOLDOUT(sc->type)) weight += sc->weight; @@ -850,16 +985,26 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd) /* Surface Evaluation */ -ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_space RNG *rng, - ccl_addr_space PathState *state, float randb, int path_flag, ShaderContext ctx) +ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, + ccl_addr_space PathState *state, int path_flag) { - ccl_fetch(sd, num_closure) = 0; - ccl_fetch(sd, num_closure_extra) = 0; - ccl_fetch(sd, randb_closure) = randb; + /* If path is being terminated, we are tracing a shadow ray or evaluating + * emission, then we don't need to store closures. The emission and shadow + * shader data also do not have a closure array to save GPU memory. */ + int max_closures; + if(path_flag & (PATH_RAY_TERMINATE|PATH_RAY_SHADOW|PATH_RAY_EMISSION)) { + max_closures = 0; + } + else { + max_closures = kernel_data.integrator.max_closures; + } + + sd->num_closure = 0; + sd->num_closure_left = max_closures; #ifdef __OSL__ if(kg->osl) - OSLShader::eval_surface(kg, sd, state, path_flag, ctx); + OSLShader::eval_surface(kg, sd, state, path_flag); else #endif { @@ -869,49 +1014,46 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_ DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), make_float3(0.8f, 0.8f, 0.8f)); - bsdf->N = ccl_fetch(sd, N); - ccl_fetch(sd, flag) |= bsdf_diffuse_setup(bsdf); + if (bsdf != NULL) { + bsdf->N = sd->N; + sd->flag |= bsdf_diffuse_setup(bsdf); + } #endif } - if(rng && (ccl_fetch(sd, flag) & SD_BSDF_NEEDS_LCG)) { - ccl_fetch(sd, lcg_state) = lcg_state_init_addrspace(rng, state, 0xb4bc3953); + if(sd->flag & SD_BSDF_NEEDS_LCG) { + sd->lcg_state = lcg_state_init_addrspace(state, 0xb4bc3953); } } /* Background Evaluation */ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, - ccl_addr_space PathState *state, int path_flag, ShaderContext ctx) + ccl_addr_space PathState *state, int path_flag) { - ccl_fetch(sd, num_closure) = 0; - ccl_fetch(sd, num_closure_extra) = 0; - ccl_fetch(sd, randb_closure) = 0.0f; + sd->num_closure = 0; + sd->num_closure_left = 0; #ifdef __SVM__ -#ifdef __OSL__ +# ifdef __OSL__ if(kg->osl) { - OSLShader::eval_background(kg, sd, state, path_flag, ctx); + OSLShader::eval_background(kg, sd, state, path_flag); } else -#endif +# endif /* __OSL__ */ { svm_eval_nodes(kg, sd, state, SHADER_TYPE_SURFACE, path_flag); } - float3 eval = make_float3(0.0f, 0.0f, 0.0f); - - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - const ShaderClosure *sc = ccl_fetch_array(sd, closure, i); - - if(CLOSURE_IS_BACKGROUND(sc->type)) - eval += sc->weight; + if(sd->flag & SD_EMISSION) { + return sd->closure_emission_background; } - - return eval; -#else + else { + return make_float3(0.0f, 0.0f, 0.0f); + } +#else /* __SVM__ */ return make_float3(0.8f, 0.8f, 0.8f); -#endif +#endif /* __SVM__ */ } /* Volume */ @@ -932,7 +1074,7 @@ ccl_device_inline void _shader_volume_phase_multi_eval(const ShaderData *sd, con float3 eval = volume_phase_eval(sd, sc, omega_in, &phase_pdf); if(phase_pdf != 0.0f) { - bsdf_eval_accum(result_eval, sc->type, eval); + bsdf_eval_accum(result_eval, sc->type, eval, 1.0f); sum_pdf += phase_pdf*sc->sample_weight; } @@ -963,22 +1105,27 @@ ccl_device int shader_volume_phase_sample(KernelGlobals *kg, const ShaderData *s for(sampled = 0; sampled < sd->num_closure; sampled++) { const ShaderClosure *sc = &sd->closure[sampled]; - + if(CLOSURE_IS_PHASE(sc->type)) sum += sc->sample_weight; } - float r = sd->randb_closure*sum; - sum = 0.0f; + float r = randu*sum; + float partial_sum = 0.0f; for(sampled = 0; sampled < sd->num_closure; sampled++) { const ShaderClosure *sc = &sd->closure[sampled]; - + if(CLOSURE_IS_PHASE(sc->type)) { - sum += sc->sample_weight; + float next_sum = partial_sum + sc->sample_weight; - if(r <= sum) + if(r <= next_sum) { + /* Rescale to reuse for BSDF direction sample. */ + randu = (r - partial_sum) / sc->sample_weight; break; + } + + partial_sum = next_sum; } } @@ -1024,15 +1171,25 @@ ccl_device int shader_phase_sample_closure(KernelGlobals *kg, const ShaderData * ccl_device_inline void shader_eval_volume(KernelGlobals *kg, ShaderData *sd, - PathState *state, - VolumeStack *stack, - int path_flag, - ShaderContext ctx) + ccl_addr_space PathState *state, + ccl_addr_space VolumeStack *stack, + int path_flag) { + /* If path is being terminated, we are tracing a shadow ray or evaluating + * emission, then we don't need to store closures. The emission and shadow + * shader data also do not have a closure array to save GPU memory. */ + int max_closures; + if(path_flag & (PATH_RAY_TERMINATE|PATH_RAY_SHADOW|PATH_RAY_EMISSION)) { + max_closures = 0; + } + else { + max_closures = kernel_data.integrator.max_closures; + } + /* reset closures once at the start, we will be accumulating the closures * for all volumes in the stack into a single array of closures */ sd->num_closure = 0; - sd->num_closure_extra = 0; + sd->num_closure_left = max_closures; sd->flag = 0; sd->object_flag = 0; @@ -1040,10 +1197,11 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg, /* setup shaderdata from stack. it's mostly setup already in * shader_setup_from_volume, this switching should be quick */ sd->object = stack[i].object; + sd->lamp = LAMP_NONE; sd->shader = stack[i].shader; sd->flag &= ~SD_SHADER_FLAGS; - sd->flag |= kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE); + sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags; sd->object_flag &= ~SD_OBJECT_FLAGS; if(sd->object != OBJECT_NONE) { @@ -1060,7 +1218,7 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg, #ifdef __SVM__ # ifdef __OSL__ if(kg->osl) { - OSLShader::eval_volume(kg, sd, state, path_flag, ctx); + OSLShader::eval_volume(kg, sd, state, path_flag); } else # endif @@ -1075,21 +1233,20 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg, } } -#endif +#endif /* __VOLUME__ */ /* Displacement Evaluation */ -ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, ShaderContext ctx) +ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state) { - ccl_fetch(sd, num_closure) = 0; - ccl_fetch(sd, num_closure_extra) = 0; - ccl_fetch(sd, randb_closure) = 0.0f; + sd->num_closure = 0; + sd->num_closure_left = 0; /* this will modify sd->P */ #ifdef __SVM__ # ifdef __OSL__ if(kg->osl) - OSLShader::eval_displacement(kg, sd, ctx); + OSLShader::eval_displacement(kg, sd, state); else # endif { @@ -1117,11 +1274,10 @@ ccl_device bool shader_transparent_shadow(KernelGlobals *kg, Intersection *isect shader = __float_as_int(str.z); } #endif - int flag = kernel_tex_fetch(__shader_flag, (shader & SHADER_MASK)*SHADER_SIZE); + int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags; return (flag & SD_HAS_TRANSPARENT_SHADOW) != 0; } -#endif +#endif /* __TRANSPARENT_SHADOWS__ */ CCL_NAMESPACE_END - diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h index 06a77a208cb..8a0da6c3b13 100644 --- a/intern/cycles/kernel/kernel_shadow.h +++ b/intern/cycles/kernel/kernel_shadow.h @@ -16,6 +16,42 @@ CCL_NAMESPACE_BEGIN +#ifdef __VOLUME__ +typedef struct VolumeState { +# ifdef __SPLIT_KERNEL__ +# else + PathState ps; +# endif +} VolumeState; + +/* Get PathState ready for use for volume stack evaluation. */ +# ifdef __SPLIT_KERNEL__ +ccl_addr_space +# endif +ccl_device_inline PathState *shadow_blocked_volume_path_state( + KernelGlobals *kg, + VolumeState *volume_state, + ccl_addr_space PathState *state, + ShaderData *sd, + Ray *ray) +{ +# ifdef __SPLIT_KERNEL__ + ccl_addr_space PathState *ps = + &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)]; +# else + PathState *ps = &volume_state->ps; +# endif + *ps = *state; + /* We are checking for shadow on the "other" side of the surface, so need + * to discard volume we are currently at. + */ + if(dot(sd->Ng, ray->D) < 0.0f) { + kernel_volume_stack_enter_exit(kg, sd, ps->volume_stack); + } + return ps; +} +#endif /* __VOLUME__ */ + /* Attenuate throughput accordingly to the given intersection event. * Returns true if the throughput is zero and traversal can be aborted. */ @@ -24,7 +60,7 @@ ccl_device_forceinline bool shadow_handle_transparent_isect( ShaderData *shadow_sd, ccl_addr_space PathState *state, # ifdef __VOLUME__ - struct PathState *volume_state, + ccl_addr_space struct PathState *volume_state, # endif Intersection *isect, Ray *ray, @@ -45,15 +81,12 @@ ccl_device_forceinline bool shadow_handle_transparent_isect( /* Setup shader data at surface. */ shader_setup_from_ray(kg, shadow_sd, isect, ray); /* Attenuation from transparent surface. */ - if(!(ccl_fetch(shadow_sd, flag) & SD_HAS_ONLY_VOLUME)) { + if(!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) { path_state_modify_bounce(state, true); shader_eval_surface(kg, shadow_sd, - NULL, state, - 0.0f, - PATH_RAY_SHADOW, - SHADER_CONTEXT_SHADOW); + PATH_RAY_SHADOW); path_state_modify_bounce(state, false); *throughput *= shader_bsdf_transparency(kg, shadow_sd); } @@ -72,13 +105,14 @@ ccl_device_forceinline bool shadow_handle_transparent_isect( ccl_device bool shadow_blocked_opaque(KernelGlobals *kg, ShaderData *shadow_sd, ccl_addr_space PathState *state, + const uint visibility, Ray *ray, Intersection *isect, float3 *shadow) { const bool blocked = scene_intersect(kg, *ray, - PATH_RAY_SHADOW_OPAQUE, + visibility & PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f); @@ -126,8 +160,10 @@ ccl_device bool shadow_blocked_opaque(KernelGlobals *kg, * Note that hits array should be as big as max_hits+1. */ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg, + ShaderData *sd, ShaderData *shadow_sd, ccl_addr_space PathState *state, + const uint visibility, Ray *ray, Intersection *hits, uint max_hits, @@ -140,8 +176,12 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg, const bool blocked = scene_intersect_shadow_all(kg, ray, hits, + visibility, max_hits, &num_hits); +# ifdef __VOLUME__ + VolumeState volume_state; +# endif /* If no opaque surface found but we did find transparent hits, * shade them. */ @@ -152,7 +192,14 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg, int bounce = state->transparent_bounce; Intersection *isect = hits; # ifdef __VOLUME__ - PathState ps = *state; +# ifdef __SPLIT_KERNEL__ + ccl_addr_space +# endif + PathState *ps = shadow_blocked_volume_path_state(kg, + &volume_state, + state, + sd, + ray); # endif sort_intersections(hits, num_hits); for(int hit = 0; hit < num_hits; hit++, isect++) { @@ -171,7 +218,7 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg, shadow_sd, state, #ifdef __VOLUME__ - &ps, + ps, #endif isect, ray, @@ -180,7 +227,7 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg, return true; } /* Move ray forward. */ - ray->P = ccl_fetch(shadow_sd, P); + ray->P = shadow_sd->P; if(ray->t != FLT_MAX) { ray->D = normalize_len(Pend - ray->P, &ray->t); } @@ -188,8 +235,8 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg, } # ifdef __VOLUME__ /* Attenuation for last line segment towards light. */ - if(ps.volume_stack[0].shader != SHADER_NONE) { - kernel_volume_shadow(kg, shadow_sd, &ps, ray, &throughput); + if(ps->volume_stack[0].shader != SHADER_NONE) { + kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput); } # endif *shadow = throughput; @@ -197,8 +244,16 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg, } # ifdef __VOLUME__ if(!blocked && state->volume_stack[0].shader != SHADER_NONE) { - /* Apply attenuation from current volume shader/ */ - kernel_volume_shadow(kg, shadow_sd, state, ray, shadow); + /* Apply attenuation from current volume shader. */ +# ifdef __SPLIT_KERNEL__ + ccl_addr_space +# endif + PathState *ps = shadow_blocked_volume_path_state(kg, + &volume_state, + state, + sd, + ray); + kernel_volume_shadow(kg, shadow_sd, ps, ray, shadow); } # endif return blocked; @@ -208,13 +263,18 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg, * loop to help readability of the actual logic. */ ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg, + ShaderData *sd, ShaderData *shadow_sd, ccl_addr_space PathState *state, + const uint visibility, Ray *ray, uint max_hits, float3 *shadow) { -# ifdef __KERNEL_CUDA__ +# ifdef __SPLIT_KERNEL__ + Intersection hits_[SHADOW_STACK_MAX_HITS]; + Intersection *hits = &hits_[0]; +# elif defined(__KERNEL_CUDA__) Intersection *hits = kg->hits_stack; # else Intersection hits_stack[SHADOW_STACK_MAX_HITS]; @@ -239,8 +299,10 @@ ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg, # endif /* __KERNEL_GPU__ */ /* Invoke actual traversal. */ return shadow_blocked_transparent_all_loop(kg, + sd, shadow_sd, state, + visibility, ray, hits, max_hits, @@ -248,7 +310,7 @@ ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg, } # endif /* __SHADOW_RECORD_ALL__ */ -# ifdef __KERNEL_GPU__ +# if defined(__KERNEL_GPU__) || !defined(__SHADOW_RECORD_ALL__) /* Shadow function to compute how much light is blocked, * * Here we raytrace from one transparent surface to the next step by step. @@ -263,20 +325,32 @@ ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg, */ ccl_device bool shadow_blocked_transparent_stepped_loop( KernelGlobals *kg, + ShaderData *sd, ShaderData *shadow_sd, ccl_addr_space PathState *state, + const uint visibility, Ray *ray, Intersection *isect, const bool blocked, const bool is_transparent_isect, float3 *shadow) { +# ifdef __VOLUME__ + VolumeState volume_state; +# endif if(blocked && is_transparent_isect) { float3 throughput = make_float3(1.0f, 1.0f, 1.0f); float3 Pend = ray->P + ray->D*ray->t; int bounce = state->transparent_bounce; # ifdef __VOLUME__ - PathState ps = *state; +# ifdef __SPLIT_KERNEL__ + ccl_addr_space +# endif + PathState *ps = shadow_blocked_volume_path_state(kg, + &volume_state, + state, + sd, + ray); # endif for(;;) { if(bounce >= kernel_data.integrator.transparent_max_bounce) { @@ -284,7 +358,7 @@ ccl_device bool shadow_blocked_transparent_stepped_loop( } if(!scene_intersect(kg, *ray, - PATH_RAY_SHADOW_TRANSPARENT, + visibility & PATH_RAY_SHADOW_TRANSPARENT, isect, NULL, 0.0f, 0.0f)) @@ -299,7 +373,7 @@ ccl_device bool shadow_blocked_transparent_stepped_loop( shadow_sd, state, #ifdef __VOLUME__ - &ps, + ps, #endif isect, ray, @@ -308,7 +382,7 @@ ccl_device bool shadow_blocked_transparent_stepped_loop( return true; } /* Move ray forward. */ - ray->P = ray_offset(ccl_fetch(shadow_sd, P), -ccl_fetch(shadow_sd, Ng)); + ray->P = ray_offset(shadow_sd->P, -shadow_sd->Ng); if(ray->t != FLT_MAX) { ray->D = normalize_len(Pend - ray->P, &ray->t); } @@ -316,8 +390,8 @@ ccl_device bool shadow_blocked_transparent_stepped_loop( } # ifdef __VOLUME__ /* Attenuation for last line segment towards light. */ - if(ps.volume_stack[0].shader != SHADER_NONE) { - kernel_volume_shadow(kg, shadow_sd, &ps, ray, &throughput); + if(ps->volume_stack[0].shader != SHADER_NONE) { + kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput); } # endif *shadow *= throughput; @@ -326,7 +400,15 @@ ccl_device bool shadow_blocked_transparent_stepped_loop( # ifdef __VOLUME__ if(!blocked && state->volume_stack[0].shader != SHADER_NONE) { /* Apply attenuation from current volume shader. */ - kernel_volume_shadow(kg, shadow_sd, state, ray, shadow); +# ifdef __SPLIT_KERNEL__ + ccl_addr_space +# endif + PathState *ps = shadow_blocked_volume_path_state(kg, + &volume_state, + state, + sd, + ray); + kernel_volume_shadow(kg, shadow_sd, ps, ray, shadow); } # endif return blocked; @@ -334,24 +416,28 @@ ccl_device bool shadow_blocked_transparent_stepped_loop( ccl_device bool shadow_blocked_transparent_stepped( KernelGlobals *kg, + ShaderData *sd, ShaderData *shadow_sd, ccl_addr_space PathState *state, + const uint visibility, Ray *ray, Intersection *isect, float3 *shadow) { - const bool blocked = scene_intersect(kg, - *ray, - PATH_RAY_SHADOW_OPAQUE, - isect, - NULL, - 0.0f, 0.0f); - const bool is_transparent_isect = blocked - ? shader_transparent_shadow(kg, isect) - : false; + bool blocked = scene_intersect(kg, + *ray, + visibility & PATH_RAY_SHADOW_OPAQUE, + isect, + NULL, + 0.0f, 0.0f); + bool is_transparent_isect = blocked + ? shader_transparent_shadow(kg, isect) + : false; return shadow_blocked_transparent_stepped_loop(kg, + sd, shadow_sd, state, + visibility, ray, isect, blocked, @@ -359,32 +445,30 @@ ccl_device bool shadow_blocked_transparent_stepped( shadow); } -# endif /* __KERNEL_GPU__ */ +# endif /* __KERNEL_GPU__ || !__SHADOW_RECORD_ALL__ */ #endif /* __TRANSPARENT_SHADOWS__ */ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, + ShaderData *sd, ShaderData *shadow_sd, ccl_addr_space PathState *state, - ccl_addr_space Ray *ray_input, + Ray *ray_input, float3 *shadow) { - /* Special trickery for split kernel: some data is coming from the - * global memory. - */ -#ifdef __SPLIT_KERNEL__ - Ray private_ray = *ray_input; - Ray *ray = &private_ray; - Intersection *isect = &kg->isect_shadow[SD_THREAD]; -#else /* __SPLIT_KERNEL__ */ Ray *ray = ray_input; - Intersection isect_object; - Intersection *isect = &isect_object; -#endif /* __SPLIT_KERNEL__ */ + Intersection isect; /* Some common early checks. */ *shadow = make_float3(1.0f, 1.0f, 1.0f); if(ray->t == 0.0f) { return false; } +#ifdef __SHADOW_TRICKS__ + const uint visibility = (state->flag & PATH_RAY_SHADOW_CATCHER) + ? PATH_RAY_SHADOW_NON_CATCHER + : PATH_RAY_SHADOW; +#else + const uint visibility = PATH_RAY_SHADOW; +#endif /* Do actual shadow shading. */ /* First of all, we check if integrator requires transparent shadows. * if not, we use simplest and fastest ever way to calculate occlusion. @@ -396,8 +480,9 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, return shadow_blocked_opaque(kg, shadow_sd, state, + visibility, ray, - isect, + &isect, shadow); } #ifdef __TRANSPARENT_SHADOWS__ @@ -422,39 +507,45 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, */ const bool blocked = scene_intersect(kg, *ray, - PATH_RAY_SHADOW_OPAQUE, - isect, + visibility & PATH_RAY_SHADOW_OPAQUE, + &isect, NULL, 0.0f, 0.0f); const bool is_transparent_isect = blocked - ? shader_transparent_shadow(kg, isect) + ? shader_transparent_shadow(kg, &isect) : false; if(!blocked || !is_transparent_isect || max_hits + 1 >= SHADOW_STACK_MAX_HITS) { return shadow_blocked_transparent_stepped_loop(kg, + sd, shadow_sd, state, + visibility, ray, - isect, + &isect, blocked, is_transparent_isect, shadow); } # endif /* __KERNEL_GPU__ */ return shadow_blocked_transparent_all(kg, + sd, shadow_sd, state, + visibility, ray, max_hits, shadow); # else /* __SHADOW_RECORD_ALL__ */ /* Fallback to a slowest version which works on all devices. */ return shadow_blocked_transparent_stepped(kg, + sd, shadow_sd, state, + visibility, ray, - isect, + &isect, shadow); # endif /* __SHADOW_RECORD_ALL__ */ #endif /* __TRANSPARENT_SHADOWS__ */ diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h index 52c05b85aee..e8553d84547 100644 --- a/intern/cycles/kernel/kernel_subsurface.h +++ b/intern/cycles/kernel/kernel_subsurface.h @@ -20,134 +20,82 @@ CCL_NAMESPACE_BEGIN * * BSSRDF Importance Sampling, SIGGRAPH 2013 * http://library.imageworks.com/pdfs/imageworks-library-BSSRDF-sampling.pdf - * - */ - -/* TODO: - * - test using power heuristic for combing bssrdfs - * - try to reduce one sample model variance */ -#define BSSRDF_MULTI_EVAL - -ccl_device ShaderClosure *subsurface_scatter_pick_closure(KernelGlobals *kg, ShaderData *sd, float *probability) -{ - /* sum sample weights of bssrdf and bsdf */ - float bsdf_sum = 0.0f; - float bssrdf_sum = 0.0f; - - for(int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; - - if(CLOSURE_IS_BSDF(sc->type)) - bsdf_sum += sc->sample_weight; - else if(CLOSURE_IS_BSSRDF(sc->type)) - bssrdf_sum += sc->sample_weight; - } - - /* use bsdf or bssrdf? */ - float r = sd->randb_closure*(bsdf_sum + bssrdf_sum); - - if(r < bsdf_sum) { - /* use bsdf, and adjust randb so we can reuse it for picking a bsdf */ - sd->randb_closure = r/bsdf_sum; - *probability = (bsdf_sum > 0.0f)? (bsdf_sum + bssrdf_sum)/bsdf_sum: 1.0f; - return NULL; - } - - /* use bssrdf */ - r -= bsdf_sum; - - float sum = 0.0f; - - for(int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; - - if(CLOSURE_IS_BSSRDF(sc->type)) { - sum += sc->sample_weight; - - if(r <= sum) { - sd->randb_closure = (r - (sum - sc->sample_weight))/sc->sample_weight; - -#ifdef BSSRDF_MULTI_EVAL - *probability = (bssrdf_sum > 0.0f)? (bsdf_sum + bssrdf_sum)/bssrdf_sum: 1.0f; -#else - *probability = (bssrdf_sum > 0.0f)? (bsdf_sum + bssrdf_sum)/sc->sample_weight: 1.0f; -#endif - return sc; - } - } - } - - /* should never happen */ - sd->randb_closure = 0.0f; - *probability = 1.0f; - return NULL; -} - ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd, - ShaderClosure *sc, + const ShaderClosure *sc, float disk_r, float r, bool all) { -#ifdef BSSRDF_MULTI_EVAL /* this is the veach one-sample model with balance heuristic, some pdf * factors drop out when using balance heuristic weighting */ float3 eval_sum = make_float3(0.0f, 0.0f, 0.0f); float pdf_sum = 0.0f; - float sample_weight_sum = 0.0f; - int num_bssrdf = 0; + float sample_weight_inv = 0.0f; - for(int i = 0; i < sd->num_closure; i++) { - sc = &sd->closure[i]; - - if(CLOSURE_IS_BSSRDF(sc->type)) { - float sample_weight = (all)? 1.0f: sc->sample_weight; - sample_weight_sum += sample_weight; + if(!all) { + float sample_weight_sum = 0.0f; + + for(int i = 0; i < sd->num_closure; i++) { + sc = &sd->closure[i]; + + if(CLOSURE_IS_DISK_BSSRDF(sc->type)) { + sample_weight_sum += sc->sample_weight; + } } - } - float sample_weight_inv = 1.0f/sample_weight_sum; + sample_weight_inv = 1.0f/sample_weight_sum; + } for(int i = 0; i < sd->num_closure; i++) { sc = &sd->closure[i]; - if(CLOSURE_IS_BSSRDF(sc->type)) { + if(CLOSURE_IS_DISK_BSSRDF(sc->type)) { /* in case of branched path integrate we sample all bssrdf's once, * for path trace we pick one, so adjust pdf for that */ float sample_weight = (all)? 1.0f: sc->sample_weight * sample_weight_inv; /* compute pdf */ - float pdf = bssrdf_pdf(sc, r); - float disk_pdf = bssrdf_pdf(sc, disk_r); + float3 eval = bssrdf_eval(sc, r); + float pdf = bssrdf_pdf(sc, disk_r); - /* TODO power heuristic is not working correct here */ - eval_sum += sc->weight*pdf; //*sample_weight*disk_pdf; - pdf_sum += sample_weight*disk_pdf; //*sample_weight*disk_pdf; - - num_bssrdf++; + eval_sum += sc->weight * eval; + pdf_sum += sample_weight * pdf; } } return (pdf_sum > 0.0f)? eval_sum / pdf_sum : make_float3(0.0f, 0.0f, 0.0f); -#else - float pdf = bssrdf_pdf(pick_sc, r); - float disk_pdf = bssrdf_pdf(pick_sc, disk_r); - - return pick_sc->weight * pdf / disk_pdf; -#endif } /* replace closures with a single diffuse bsdf closure after scatter step */ -ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, float3 weight, bool hit, float3 N) +ccl_device void subsurface_scatter_setup_diffuse_bsdf(KernelGlobals *kg, ShaderData *sd, const ShaderClosure *sc, float3 weight, float3 N) { sd->flag &= ~SD_CLOSURE_FLAGS; - sd->randb_closure = 0.0f; sd->num_closure = 0; - sd->num_closure_extra = 0; + sd->num_closure_left = kernel_data.integrator.max_closures; + + Bssrdf *bssrdf = (Bssrdf *)sc; +#ifdef __PRINCIPLED__ + if(bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID || + bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID) + { + PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), weight); + + if(bsdf) { + bsdf->N = N; + bsdf->roughness = bssrdf->roughness; + sd->flag |= bsdf_principled_diffuse_setup(bsdf); - if(hit) { + /* replace CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID with this special ID so render passes + * can recognize it as not being a regular Disney principled diffuse closure */ + bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID; + } + } + else if(CLOSURE_IS_BSDF_BSSRDF(bssrdf->type) || + CLOSURE_IS_BSSRDF(bssrdf->type)) +#endif /* __PRINCIPLED__ */ + { DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight); if(bsdf) { @@ -185,8 +133,7 @@ ccl_device float3 subsurface_color_pow(float3 color, float exponent) ccl_device void subsurface_color_bump_blur(KernelGlobals *kg, ShaderData *sd, - PathState *state, - int state_flag, + ccl_addr_space PathState *state, float3 *eval, float3 *N) { @@ -199,7 +146,7 @@ ccl_device void subsurface_color_bump_blur(KernelGlobals *kg, if(bump || texture_blur > 0.0f) { /* average color and normal at incoming point */ - shader_eval_surface(kg, sd, NULL, state, 0.0f, state_flag, SHADER_CONTEXT_SSS); + shader_eval_surface(kg, sd, state, state->flag); float3 in_color = shader_bssrdf_sum(sd, (bump)? N: NULL, NULL); /* we simply divide out the average color and multiply with the average @@ -218,11 +165,11 @@ ccl_device void subsurface_color_bump_blur(KernelGlobals *kg, /* Subsurface scattering step, from a point on the surface to other * nearby points on the same object. */ -ccl_device_inline int subsurface_scatter_multi_intersect( +ccl_device_inline int subsurface_scatter_disk( KernelGlobals *kg, - SubsurfaceIntersection *ss_isect, + LocalIntersection *ss_isect, ShaderData *sd, - ShaderClosure *sc, + const ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, @@ -235,26 +182,20 @@ ccl_device_inline int subsurface_scatter_multi_intersect( disk_N = sd->Ng; make_orthonormals(disk_N, &disk_T, &disk_B); - /* reusing variable for picking the closure gives a bit nicer stratification - * for path tracer, for branched we do all closures so it doesn't help */ - float axisu = (all)? disk_u: sd->randb_closure; - - if(axisu < 0.5f) { + if(disk_v < 0.5f) { pick_pdf_N = 0.5f; pick_pdf_T = 0.25f; pick_pdf_B = 0.25f; - if(all) - disk_u *= 2.0f; + disk_v *= 2.0f; } - else if(axisu < 0.75f) { + else if(disk_v < 0.75f) { float3 tmp = disk_N; disk_N = disk_T; disk_T = tmp; pick_pdf_N = 0.25f; pick_pdf_T = 0.5f; pick_pdf_B = 0.25f; - if(all) - disk_u = (disk_u - 0.5f)*4.0f; + disk_v = (disk_v - 0.5f)*4.0f; } else { float3 tmp = disk_N; @@ -263,21 +204,24 @@ ccl_device_inline int subsurface_scatter_multi_intersect( pick_pdf_N = 0.25f; pick_pdf_T = 0.25f; pick_pdf_B = 0.5f; - if(all) - disk_u = (disk_u - 0.75f)*4.0f; + disk_v = (disk_v - 0.75f)*4.0f; } /* sample point on disk */ - float phi = M_2PI_F * disk_u; - float disk_r = disk_v; - float disk_height; + float phi = M_2PI_F * disk_v; + float disk_height, disk_r; - bssrdf_sample(sc, disk_r, &disk_r, &disk_height); + bssrdf_sample(sc, disk_u, &disk_r, &disk_height); float3 disk_P = (disk_r*cosf(phi)) * disk_T + (disk_r*sinf(phi)) * disk_B; /* create ray */ +#ifdef __SPLIT_KERNEL__ + Ray ray_object = ss_isect->ray; + Ray *ray = &ray_object; +#else Ray *ray = &ss_isect->ray; +#endif ray->P = sd->P + disk_N*disk_height + disk_P; ray->D = -disk_N; ray->t = 2.0f*disk_height; @@ -287,37 +231,37 @@ ccl_device_inline int subsurface_scatter_multi_intersect( /* intersect with the same object. if multiple intersections are found it * will use at most BSSRDF_MAX_HITS hits, a random subset of all hits */ - scene_intersect_subsurface(kg, - ray, - ss_isect, - sd->object, - lcg_state, - BSSRDF_MAX_HITS); + scene_intersect_local(kg, + *ray, + ss_isect, + sd->object, + lcg_state, + BSSRDF_MAX_HITS); int num_eval_hits = min(ss_isect->num_hits, BSSRDF_MAX_HITS); for(int hit = 0; hit < num_eval_hits; hit++) { /* Quickly retrieve P and Ng without setting up ShaderData. */ float3 hit_P; - if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) { - hit_P = triangle_refine_subsurface(kg, - sd, - &ss_isect->hits[hit], - ray); + if(sd->type & PRIMITIVE_TRIANGLE) { + hit_P = triangle_refine_local(kg, + sd, + &ss_isect->hits[hit], + ray); } #ifdef __OBJECT_MOTION__ - else if(ccl_fetch(sd, type) & PRIMITIVE_MOTION_TRIANGLE) { + else if(sd->type & PRIMITIVE_MOTION_TRIANGLE) { float3 verts[3]; motion_triangle_vertices( kg, - ccl_fetch(sd, object), + sd->object, kernel_tex_fetch(__prim_index, ss_isect->hits[hit].prim), - ccl_fetch(sd, time), + sd->time, verts); - hit_P = motion_triangle_refine_subsurface(kg, - sd, - &ss_isect->hits[hit], - ray, - verts); + hit_P = motion_triangle_refine_local(kg, + sd, + &ss_isect->hits[hit], + ray, + verts); } #endif /* __OBJECT_MOTION__ */ else { @@ -330,140 +274,262 @@ ccl_device_inline int subsurface_scatter_multi_intersect( object_normal_transform(kg, sd, &hit_Ng); } - /* probability densities for local frame axes */ + /* Probability densities for local frame axes. */ float pdf_N = pick_pdf_N * fabsf(dot(disk_N, hit_Ng)); float pdf_T = pick_pdf_T * fabsf(dot(disk_T, hit_Ng)); float pdf_B = pick_pdf_B * fabsf(dot(disk_B, hit_Ng)); - /* multiple importance sample between 3 axes, power heuristic - * found to be slightly better than balance heuristic */ - float mis_weight = power_heuristic_3(pdf_N, pdf_T, pdf_B); + /* Multiple importance sample between 3 axes, power heuristic + * found to be slightly better than balance heuristic. pdf_N + * in the MIS weight and denominator cancelled out. */ + float w = pdf_N / (sqr(pdf_N) + sqr(pdf_T) + sqr(pdf_B)); + if(ss_isect->num_hits > BSSRDF_MAX_HITS) { + w *= ss_isect->num_hits/(float)BSSRDF_MAX_HITS; + } - /* real distance to sampled point */ + /* Real distance to sampled point. */ float r = len(hit_P - sd->P); - /* evaluate */ - float w = mis_weight / pdf_N; - if(ss_isect->num_hits > BSSRDF_MAX_HITS) - w *= ss_isect->num_hits/(float)BSSRDF_MAX_HITS; + /* Evaluate profiles. */ float3 eval = subsurface_scatter_eval(sd, sc, disk_r, r, all) * w; ss_isect->weight[hit] = eval; } +#ifdef __SPLIT_KERNEL__ + ss_isect->ray = *ray; +#endif + return num_eval_hits; } ccl_device_noinline void subsurface_scatter_multi_setup( KernelGlobals *kg, - SubsurfaceIntersection* ss_isect, + LocalIntersection* ss_isect, int hit, ShaderData *sd, - PathState *state, - int state_flag, - ShaderClosure *sc, - bool all) + ccl_addr_space PathState *state, + const ShaderClosure *sc) { +#ifdef __SPLIT_KERNEL__ + Ray ray_object = ss_isect->ray; + Ray *ray = &ray_object; +#else + Ray *ray = &ss_isect->ray; +#endif + + /* Workaround for AMD GPU OpenCL compiler. Most probably cache bypass issue. */ +#if defined(__SPLIT_KERNEL__) && defined(__KERNEL_OPENCL_AMD__) && defined(__KERNEL_GPU__) + kernel_split_params.dummy_sd_flag = sd->flag; +#endif + /* Setup new shading point. */ - shader_setup_from_subsurface(kg, sd, &ss_isect->hits[hit], &ss_isect->ray); + shader_setup_from_subsurface(kg, sd, &ss_isect->hits[hit], ray); /* Optionally blur colors and bump mapping. */ float3 weight = ss_isect->weight[hit]; float3 N = sd->N; - subsurface_color_bump_blur(kg, sd, state, state_flag, &weight, &N); + subsurface_color_bump_blur(kg, sd, state, &weight, &N); /* Setup diffuse BSDF. */ - subsurface_scatter_setup_diffuse_bsdf(sd, weight, true, N); + subsurface_scatter_setup_diffuse_bsdf(kg, sd, sc, weight, N); } -/* subsurface scattering step, from a point on the surface to another nearby point on the same object */ -ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathState *state, - int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all) +/* Random walk subsurface scattering. + * + * "Practical and Controllable Subsurface Scattering for Production Path + * Tracing". Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */ + +ccl_device void subsurface_random_walk_remap( + const float A, + const float d, + float *sigma_t, + float *sigma_s) { - float3 eval = make_float3(0.0f, 0.0f, 0.0f); + /* Compute attenuation and scattering coefficients from albedo. */ + const float a = 1.0f - expf(A * (-5.09406f + A * (2.61188f - A * 4.31805f))); + const float s = 1.9f - A + 3.5f * sqr(A - 0.8f); - /* pick random axis in local frame and point on disk */ - float3 disk_N, disk_T, disk_B; - float pick_pdf_N, pick_pdf_T, pick_pdf_B; + *sigma_t = 1.0f / fmaxf(d * s, 1e-16f); + *sigma_s = *sigma_t * a; +} - disk_N = sd->Ng; - make_orthonormals(disk_N, &disk_T, &disk_B); +ccl_device void subsurface_random_walk_coefficients( + const ShaderClosure *sc, + float3 *sigma_t, + float3 *sigma_s, + float3 *weight) +{ + const Bssrdf *bssrdf = (const Bssrdf*)sc; + const float3 A = bssrdf->albedo; + const float3 d = bssrdf->radius; + float sigma_t_x, sigma_t_y, sigma_t_z; + float sigma_s_x, sigma_s_y, sigma_s_z; - if(sd->randb_closure < 0.5f) { - pick_pdf_N = 0.5f; - pick_pdf_T = 0.25f; - pick_pdf_B = 0.25f; - } - else if(sd->randb_closure < 0.75f) { - float3 tmp = disk_N; - disk_N = disk_T; - disk_T = tmp; - pick_pdf_N = 0.25f; - pick_pdf_T = 0.5f; - pick_pdf_B = 0.25f; - } - else { - float3 tmp = disk_N; - disk_N = disk_B; - disk_B = tmp; - pick_pdf_N = 0.25f; - pick_pdf_T = 0.25f; - pick_pdf_B = 0.5f; + subsurface_random_walk_remap(A.x, d.x, &sigma_t_x, &sigma_s_x); + subsurface_random_walk_remap(A.y, d.y, &sigma_t_y, &sigma_s_y); + subsurface_random_walk_remap(A.z, d.z, &sigma_t_z, &sigma_s_z); + + *sigma_t = make_float3(sigma_t_x, sigma_t_y, sigma_t_z); + *sigma_s = make_float3(sigma_s_x, sigma_s_y, sigma_s_z); + + /* Closure mixing and Fresnel weights separate from albedo. */ + *weight = safe_divide_color(bssrdf->weight, A); +} + +ccl_device_noinline bool subsurface_random_walk( + KernelGlobals *kg, + LocalIntersection *ss_isect, + ShaderData *sd, + ccl_addr_space PathState *state, + const ShaderClosure *sc, + const float bssrdf_u, + const float bssrdf_v) +{ + /* Sample diffuse surface scatter into the object. */ + float3 D; + float pdf; + sample_cos_hemisphere(-sd->N, bssrdf_u, bssrdf_v, &D, &pdf); + if(dot(-sd->Ng, D) <= 0.0f) { + return 0; } - /* sample point on disk */ - float phi = M_2PI_F * disk_u; - float disk_r = disk_v; - float disk_height; + /* Convert subsurface to volume coefficients. */ + float3 sigma_t, sigma_s; + float3 throughput = make_float3(1.0f, 1.0f, 1.0f); + subsurface_random_walk_coefficients(sc, &sigma_t, &sigma_s, &throughput); + + /* Setup ray. */ +#ifdef __SPLIT_KERNEL__ + Ray ray_object = ss_isect->ray; + Ray *ray = &ray_object; +#else + Ray *ray = &ss_isect->ray; +#endif + ray->P = ray_offset(sd->P, -sd->Ng); + ray->D = D; + ray->t = FLT_MAX; + ray->time = sd->time; - bssrdf_sample(sc, disk_r, &disk_r, &disk_height); + /* Modify state for RNGs, decorrelated from other paths. */ + uint prev_rng_offset = state->rng_offset; + uint prev_rng_hash = state->rng_hash; + state->rng_hash = cmj_hash(state->rng_hash + state->rng_offset, 0xdeadbeef); - float3 disk_P = (disk_r*cosf(phi)) * disk_T + (disk_r*sinf(phi)) * disk_B; + /* Random walk until we hit the surface again. */ + bool hit = false; - /* create ray */ - Ray ray; - ray.P = sd->P + disk_N*disk_height + disk_P; - ray.D = -disk_N; - ray.t = 2.0f*disk_height; - ray.dP = sd->dP; - ray.dD = differential3_zero(); - ray.time = sd->time; - - /* intersect with the same object. if multiple intersections are - * found it will randomly pick one of them */ - SubsurfaceIntersection ss_isect; - scene_intersect_subsurface(kg, &ray, &ss_isect, sd->object, lcg_state, 1); - - /* evaluate bssrdf */ - if(ss_isect.num_hits > 0) { - float3 origP = sd->P; - - /* setup new shading point */ - shader_setup_from_subsurface(kg, sd, &ss_isect.hits[0], &ray); - - /* probability densities for local frame axes */ - float pdf_N = pick_pdf_N * fabsf(dot(disk_N, sd->Ng)); - float pdf_T = pick_pdf_T * fabsf(dot(disk_T, sd->Ng)); - float pdf_B = pick_pdf_B * fabsf(dot(disk_B, sd->Ng)); - - /* multiple importance sample between 3 axes, power heuristic - * found to be slightly better than balance heuristic */ - float mis_weight = power_heuristic_3(pdf_N, pdf_T, pdf_B); - - /* real distance to sampled point */ - float r = len(sd->P - origP); - - /* evaluate */ - float w = (mis_weight * ss_isect.num_hits) / pdf_N; - eval = subsurface_scatter_eval(sd, sc, disk_r, r, all) * w; + for(int bounce = 0; bounce < BSSRDF_MAX_BOUNCES; bounce++) { + /* Advance random number offset. */ + state->rng_offset += PRNG_BOUNCE_NUM; + + if(bounce > 0) { + /* Sample scattering direction. */ + const float anisotropy = 0.0f; + float scatter_u, scatter_v; + path_state_rng_2D(kg, state, PRNG_BSDF_U, &scatter_u, &scatter_v); + ray->D = henyey_greenstrein_sample(ray->D, anisotropy, scatter_u, scatter_v, NULL); + } + + /* Sample color channel, use MIS with balance heuristic. */ + float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL); + float3 albedo = safe_divide_color(sigma_s, sigma_t); + float3 channel_pdf; + int channel = kernel_volume_sample_channel(albedo, throughput, rphase, &channel_pdf); + + /* Distance sampling. */ + float rdist = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE); + float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel); + float t = -logf(1.0f - rdist)/sample_sigma_t; + + ray->t = t; + scene_intersect_local(kg, *ray, ss_isect, sd->object, NULL, 1); + hit = (ss_isect->num_hits > 0); + + if(hit) { + /* Compute world space distance to surface hit. */ + float3 D = ray->D; + object_inverse_dir_transform(kg, sd, &D); + D = normalize(D) * ss_isect->hits[0].t; + object_dir_transform(kg, sd, &D); + t = len(D); + } + + /* Advance to new scatter location. */ + ray->P += t * ray->D; + + /* Update throughput. */ + float3 transmittance = volume_color_transmittance(sigma_t, t); + float pdf = dot(channel_pdf, (hit)? transmittance: sigma_t * transmittance); + throughput *= ((hit)? transmittance: sigma_s * transmittance) / pdf; + + if(hit) { + /* If we hit the surface, we are done. */ + break; + } + + /* Russian roulette. */ + float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE); + float probability = min(max3(fabs(throughput)), 1.0f); + if(terminate >= probability) { + break; + } + throughput /= probability; } - /* optionally blur colors and bump mapping */ - float3 N = sd->N; - subsurface_color_bump_blur(kg, sd, state, state_flag, &eval, &N); + kernel_assert(isfinite_safe(throughput.x) && + isfinite_safe(throughput.y) && + isfinite_safe(throughput.z)); - /* setup diffuse bsdf */ - subsurface_scatter_setup_diffuse_bsdf(sd, eval, (ss_isect.num_hits > 0), N); + state->rng_offset = prev_rng_offset; + state->rng_hash = prev_rng_hash; + + /* Return number of hits in ss_isect. */ + if(!hit) { + return 0; + } + + /* TODO: gain back performance lost from merging with disk BSSRDF. We + * only need to return on hit so this indirect ray push/pop overhead + * is not actually needed, but it does keep the code simpler. */ + ss_isect->weight[0] = throughput; +#ifdef __SPLIT_KERNEL__ + ss_isect->ray = *ray; +#endif + + return 1; +} + +ccl_device_inline int subsurface_scatter_multi_intersect( + KernelGlobals *kg, + LocalIntersection *ss_isect, + ShaderData *sd, + ccl_addr_space PathState *state, + const ShaderClosure *sc, + uint *lcg_state, + float bssrdf_u, + float bssrdf_v, + bool all) +{ + if(CLOSURE_IS_DISK_BSSRDF(sc->type)) { + return subsurface_scatter_disk(kg, + ss_isect, + sd, + sc, + lcg_state, + bssrdf_u, + bssrdf_v, + all); + } + else { + return subsurface_random_walk(kg, + ss_isect, + sd, + state, + sc, + bssrdf_u, + bssrdf_v); + } } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h index cb1a3f40dee..9047b93a0b2 100644 --- a/intern/cycles/kernel/kernel_textures.h +++ b/intern/cycles/kernel/kernel_textures.h @@ -15,184 +15,71 @@ */ #ifndef KERNEL_TEX -# define KERNEL_TEX(type, ttype, name) -#endif - -#ifndef KERNEL_IMAGE_TEX -# define KERNEL_IMAGE_TEX(type, ttype, name) +# define KERNEL_TEX(type, name) #endif /* bvh */ -KERNEL_TEX(float4, texture_float4, __bvh_nodes) -KERNEL_TEX(float4, texture_float4, __bvh_leaf_nodes) -KERNEL_TEX(float4, texture_float4, __prim_tri_verts) -KERNEL_TEX(uint, texture_uint, __prim_tri_index) -KERNEL_TEX(uint, texture_uint, __prim_type) -KERNEL_TEX(uint, texture_uint, __prim_visibility) -KERNEL_TEX(uint, texture_uint, __prim_index) -KERNEL_TEX(uint, texture_uint, __prim_object) -KERNEL_TEX(uint, texture_uint, __object_node) -KERNEL_TEX(float2, texture_float2, __prim_time) +KERNEL_TEX(float4, __bvh_nodes) +KERNEL_TEX(float4, __bvh_leaf_nodes) +KERNEL_TEX(float4, __prim_tri_verts) +KERNEL_TEX(uint, __prim_tri_index) +KERNEL_TEX(uint, __prim_type) +KERNEL_TEX(uint, __prim_visibility) +KERNEL_TEX(uint, __prim_index) +KERNEL_TEX(uint, __prim_object) +KERNEL_TEX(uint, __object_node) +KERNEL_TEX(float2, __prim_time) /* objects */ -KERNEL_TEX(float4, texture_float4, __objects) -KERNEL_TEX(float4, texture_float4, __objects_vector) +KERNEL_TEX(KernelObject, __objects) +KERNEL_TEX(Transform, __object_motion_pass) +KERNEL_TEX(DecomposedTransform, __object_motion) +KERNEL_TEX(uint, __object_flag) + +/* cameras */ +KERNEL_TEX(DecomposedTransform, __camera_motion) /* triangles */ -KERNEL_TEX(uint, texture_uint, __tri_shader) -KERNEL_TEX(float4, texture_float4, __tri_vnormal) -KERNEL_TEX(uint4, texture_uint4, __tri_vindex) -KERNEL_TEX(uint, texture_uint, __tri_patch) -KERNEL_TEX(float2, texture_float2, __tri_patch_uv) +KERNEL_TEX(uint, __tri_shader) +KERNEL_TEX(float4, __tri_vnormal) +KERNEL_TEX(uint4, __tri_vindex) +KERNEL_TEX(uint, __tri_patch) +KERNEL_TEX(float2, __tri_patch_uv) /* curves */ -KERNEL_TEX(float4, texture_float4, __curves) -KERNEL_TEX(float4, texture_float4, __curve_keys) +KERNEL_TEX(float4, __curves) +KERNEL_TEX(float4, __curve_keys) /* patches */ -KERNEL_TEX(uint, texture_uint, __patches) +KERNEL_TEX(uint, __patches) /* attributes */ -KERNEL_TEX(uint4, texture_uint4, __attributes_map) -KERNEL_TEX(float, texture_float, __attributes_float) -KERNEL_TEX(float4, texture_float4, __attributes_float3) -KERNEL_TEX(uchar4, texture_uchar4, __attributes_uchar4) +KERNEL_TEX(uint4, __attributes_map) +KERNEL_TEX(float, __attributes_float) +KERNEL_TEX(float4, __attributes_float3) +KERNEL_TEX(uchar4, __attributes_uchar4) /* lights */ -KERNEL_TEX(float4, texture_float4, __light_distribution) -KERNEL_TEX(float4, texture_float4, __light_data) -KERNEL_TEX(float2, texture_float2, __light_background_marginal_cdf) -KERNEL_TEX(float2, texture_float2, __light_background_conditional_cdf) +KERNEL_TEX(KernelLightDistribution, __light_distribution) +KERNEL_TEX(KernelLight, __lights) +KERNEL_TEX(float2, __light_background_marginal_cdf) +KERNEL_TEX(float2, __light_background_conditional_cdf) /* particles */ -KERNEL_TEX(float4, texture_float4, __particles) +KERNEL_TEX(KernelParticle, __particles) /* shaders */ -KERNEL_TEX(uint4, texture_uint4, __svm_nodes) -KERNEL_TEX(uint, texture_uint, __shader_flag) -KERNEL_TEX(uint, texture_uint, __object_flag) +KERNEL_TEX(uint4, __svm_nodes) +KERNEL_TEX(KernelShader, __shaders) /* lookup tables */ -KERNEL_TEX(float, texture_float, __lookup_table) +KERNEL_TEX(float, __lookup_table) /* sobol */ -KERNEL_TEX(uint, texture_uint, __sobol_directions) - -#ifdef __KERNEL_CUDA__ -# if __CUDA_ARCH__ < 300 -/* full-float image */ -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_000) -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_001) -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_002) -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_003) -KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_004) - -KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_000) -KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_001) -KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_002) -KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_003) -KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_004) - -/* image */ -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_005) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_006) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_007) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_008) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_009) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_010) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_011) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_012) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_013) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_014) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_015) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_016) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_017) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_018) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_019) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_020) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_021) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_022) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_023) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_024) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_025) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_026) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_027) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_028) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_029) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_030) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_031) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_032) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_033) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_034) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_035) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_036) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_037) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_038) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_039) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_040) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_041) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_042) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_043) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_044) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_045) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_046) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_047) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_048) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_049) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_050) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_051) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_052) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_053) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_054) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_055) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_056) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_057) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_058) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_059) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_060) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_061) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_062) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_063) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_064) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_065) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_066) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_067) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_068) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_069) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_070) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_071) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_072) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_073) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_074) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_075) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_076) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_077) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_078) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_079) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_080) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_081) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_082) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_083) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_084) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_085) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_086) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_087) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_088) - -# else -/* bindless textures */ -KERNEL_TEX(uint, texture_uint, __bindless_mapping) -# endif -#endif +KERNEL_TEX(uint, __sobol_directions) -/* packed image (opencl) */ -KERNEL_TEX(uchar4, texture_uchar4, __tex_image_byte4_packed) -KERNEL_TEX(float4, texture_float4, __tex_image_float4_packed) -KERNEL_TEX(uchar, texture_uchar, __tex_image_byte_packed) -KERNEL_TEX(float, texture_float, __tex_image_float_packed) -KERNEL_TEX(uint4, texture_uint4, __tex_image_packed_info) +/* image textures */ +KERNEL_TEX(TextureInfo, __texture_info) #undef KERNEL_TEX -#undef KERNEL_IMAGE_TEX - diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 8250eaf6073..72fbf7be557 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -17,9 +17,9 @@ #ifndef __KERNEL_TYPES_H__ #define __KERNEL_TYPES_H__ -#include "kernel_math.h" -#include "svm/svm_types.h" -#include "util_static_assert.h" +#include "kernel/kernel_math.h" +#include "kernel/svm/svm_types.h" +#include "util/util_static_assert.h" #ifndef __KERNEL_GPU__ # define __KERNEL_CPU__ @@ -34,18 +34,18 @@ CCL_NAMESPACE_BEGIN -/* constants */ -#define OBJECT_SIZE 12 -#define OBJECT_VECTOR_SIZE 6 -#define LIGHT_SIZE 11 -#define FILTER_TABLE_SIZE 1024 -#define RAMP_TABLE_SIZE 256 -#define SHUTTER_TABLE_SIZE 256 -#define PARTICLE_SIZE 5 -#define SHADER_SIZE 5 +/* Constants */ +#define OBJECT_MOTION_PASS_SIZE 2 +#define FILTER_TABLE_SIZE 1024 +#define RAMP_TABLE_SIZE 256 +#define SHUTTER_TABLE_SIZE 256 #define BSSRDF_MIN_RADIUS 1e-8f #define BSSRDF_MAX_HITS 4 +#define BSSRDF_MAX_BOUNCES 256 +#define LOCAL_MAX_HITS 4 + +#define VOLUME_BOUNDS_MAX 1024 #define BECKMANN_TABLE_SIZE 256 @@ -56,7 +56,28 @@ CCL_NAMESPACE_BEGIN #define VOLUME_STACK_SIZE 16 -/* device capabilities */ +/* Split kernel constants */ +#define WORK_POOL_SIZE_GPU 64 +#define WORK_POOL_SIZE_CPU 1 +#ifdef __KERNEL_GPU__ +# define WORK_POOL_SIZE WORK_POOL_SIZE_GPU +#else +# define WORK_POOL_SIZE WORK_POOL_SIZE_CPU +#endif + + +#define SHADER_SORT_BLOCK_SIZE 2048 + +#ifdef __KERNEL_OPENCL__ +# define SHADER_SORT_LOCAL_SIZE 64 +#elif defined(__KERNEL_CUDA__) +# define SHADER_SORT_LOCAL_SIZE 32 +#else +# define SHADER_SORT_LOCAL_SIZE 1 +#endif + + +/* Device capabilities */ #ifdef __KERNEL_CPU__ # ifdef __KERNEL_SSE2__ # define __QBVH__ @@ -67,24 +88,28 @@ CCL_NAMESPACE_BEGIN # ifdef WITH_OSL # define __OSL__ # endif +# define __PRINCIPLED__ # define __SUBSURFACE__ # define __CMJ__ # define __VOLUME__ -# define __VOLUME_DECOUPLED__ # define __VOLUME_SCATTER__ # define __SHADOW_RECORD_ALL__ +# define __VOLUME_DECOUPLED__ # define __VOLUME_RECORD_ALL__ #endif /* __KERNEL_CPU__ */ #ifdef __KERNEL_CUDA__ # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ -# define __BRANCHED_PATH__ # define __VOLUME__ # define __VOLUME_SCATTER__ # define __SUBSURFACE__ -# define __CMJ__ +# define __PRINCIPLED__ # define __SHADOW_RECORD_ALL__ +# define __CMJ__ +# ifndef __SPLIT_KERNEL__ +# define __BRANCHED_PATH__ +# endif #endif /* __KERNEL_CUDA__ */ #ifdef __KERNEL_OPENCL__ @@ -94,41 +119,50 @@ CCL_NAMESPACE_BEGIN # ifdef __KERNEL_OPENCL_NVIDIA__ # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ -# ifdef __KERNEL_EXPERIMENTAL__ -# define __CMJ__ -# endif +# define __SUBSURFACE__ +# define __PRINCIPLED__ +# define __VOLUME__ +# define __VOLUME_SCATTER__ +# define __SHADOW_RECORD_ALL__ +# define __CMJ__ +# define __BRANCHED_PATH__ # endif /* __KERNEL_OPENCL_NVIDIA__ */ # ifdef __KERNEL_OPENCL_APPLE__ # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ +# define __PRINCIPLED__ +# define __CMJ__ /* TODO(sergey): Currently experimental section is ignored here, * this is because megakernel in device_opencl does not support * custom cflags depending on the scene features. */ -# ifdef __KERNEL_EXPERIMENTAL__ -# define __CMJ__ -# endif -# endif /* __KERNEL_OPENCL_NVIDIA__ */ +# endif /* __KERNEL_OPENCL_APPLE__ */ # ifdef __KERNEL_OPENCL_AMD__ # define __CL_USE_NATIVE__ # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ +# define __SUBSURFACE__ +# define __PRINCIPLED__ +# define __VOLUME__ +# define __VOLUME_SCATTER__ +# define __SHADOW_RECORD_ALL__ +# define __CMJ__ +# define __BRANCHED_PATH__ # endif /* __KERNEL_OPENCL_AMD__ */ # ifdef __KERNEL_OPENCL_INTEL_CPU__ # define __CL_USE_NATIVE__ # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ -# ifdef __KERNEL_EXPERIMENTAL__ -# define __CMJ__ -# endif +# define __PRINCIPLED__ +# define __CMJ__ # endif /* __KERNEL_OPENCL_INTEL_CPU__ */ #endif /* __KERNEL_OPENCL__ */ -/* kernel features */ +/* Kernel features */ #define __SOBOL__ #define __INSTANCING__ #define __DPDU__ @@ -141,6 +175,9 @@ CCL_NAMESPACE_BEGIN #define __INTERSECTION_REFINE__ #define __CLAMP_SAMPLE__ #define __PATCH_EVAL__ +#define __SHADOW_TRICKS__ +#define __DENOISING_FEATURES__ +#define __SHADER_RAYTRACE__ #ifdef __KERNEL_SHADING__ # define __SVM__ @@ -163,10 +200,6 @@ CCL_NAMESPACE_BEGIN # define __BAKING__ #endif -#ifdef WITH_CYCLES_DEBUG -# define __KERNEL_DEBUG__ -#endif - /* Scene-based selective features compilation. */ #ifdef __NO_CAMERA_MOTION__ # undef __CAMERA_MOTION__ @@ -196,10 +229,27 @@ CCL_NAMESPACE_BEGIN #ifdef __NO_TRANSPARENT__ # undef __TRANSPARENT_SHADOWS__ #endif +#ifdef __NO_SHADOW_TRICKS__ +# undef __SHADOW_TRICKS__ +#endif +#ifdef __NO_PRINCIPLED__ +# undef __PRINCIPLED__ +#endif +#ifdef __NO_DENOISING__ +# undef __DENOISING_FEATURES__ +#endif +#ifdef __NO_SHADER_RAYTRACE__ +# undef __SHADER_RAYTRACE__ +#endif -/* Random Numbers */ +/* Features that enable others */ +#ifdef WITH_CYCLES_DEBUG +# define __KERNEL_DEBUG__ +#endif -typedef uint RNG; +#if defined(__SUBSURFACE__) || defined(__SHADER_RAYTRACE__) +# define __BVH_LOCAL__ +#endif /* Shader Evaluation */ @@ -213,6 +263,7 @@ typedef enum ShaderEvalType { /* data passes */ SHADER_EVAL_NORMAL, SHADER_EVAL_UV, + SHADER_EVAL_ROUGHNESS, SHADER_EVAL_DIFFUSE_COLOR, SHADER_EVAL_GLOSSY_COLOR, SHADER_EVAL_TRANSMISSION_COLOR, @@ -240,31 +291,24 @@ enum PathTraceDimension { PRNG_FILTER_V = 1, PRNG_LENS_U = 2, PRNG_LENS_V = 3, -#ifdef __CAMERA_MOTION__ PRNG_TIME = 4, PRNG_UNUSED_0 = 5, PRNG_UNUSED_1 = 6, /* for some reason (6, 7) is a bad sobol pattern */ PRNG_UNUSED_2 = 7, /* with a low number of samples (< 64) */ -#endif - PRNG_BASE_NUM = 8, + PRNG_BASE_NUM = 10, PRNG_BSDF_U = 0, PRNG_BSDF_V = 1, - PRNG_BSDF = 2, - PRNG_LIGHT = 3, - PRNG_LIGHT_U = 4, - PRNG_LIGHT_V = 5, - PRNG_LIGHT_TERMINATE = 6, - PRNG_TERMINATE = 7, - -#ifdef __VOLUME__ - PRNG_PHASE_U = 8, - PRNG_PHASE_V = 9, - PRNG_PHASE = 10, - PRNG_SCATTER_DISTANCE = 11, -#endif - - PRNG_BOUNCE_NUM = 12, + PRNG_LIGHT_U = 2, + PRNG_LIGHT_V = 3, + PRNG_LIGHT_TERMINATE = 4, + PRNG_TERMINATE = 5, + PRNG_PHASE_CHANNEL = 6, + PRNG_SCATTER_DISTANCE = 7, + PRNG_BOUNCE_NUM = 8, + + PRNG_BEVEL_U = 6, /* reuse volume dimension, correlation won't harm */ + PRNG_BEVEL_V = 7, }; enum SamplingPattern { @@ -277,29 +321,56 @@ enum SamplingPattern { /* these flags values correspond to raytypes in osl.cpp, so keep them in sync! */ enum PathRayFlag { - PATH_RAY_CAMERA = 1, - PATH_RAY_REFLECT = 2, - PATH_RAY_TRANSMIT = 4, - PATH_RAY_DIFFUSE = 8, - PATH_RAY_GLOSSY = 16, - PATH_RAY_SINGULAR = 32, - PATH_RAY_TRANSPARENT = 64, - - PATH_RAY_SHADOW_OPAQUE = 128, - PATH_RAY_SHADOW_TRANSPARENT = 256, - PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT), - - PATH_RAY_CURVE = 512, /* visibility flag to define curve segments */ - PATH_RAY_VOLUME_SCATTER = 1024, /* volume scattering */ + PATH_RAY_CAMERA = (1 << 0), + PATH_RAY_REFLECT = (1 << 1), + PATH_RAY_TRANSMIT = (1 << 2), + PATH_RAY_DIFFUSE = (1 << 3), + PATH_RAY_GLOSSY = (1 << 4), + PATH_RAY_SINGULAR = (1 << 5), + PATH_RAY_TRANSPARENT = (1 << 6), + + PATH_RAY_SHADOW_OPAQUE_NON_CATCHER = (1 << 7), + PATH_RAY_SHADOW_OPAQUE_CATCHER = (1 << 8), + PATH_RAY_SHADOW_OPAQUE = (PATH_RAY_SHADOW_OPAQUE_NON_CATCHER|PATH_RAY_SHADOW_OPAQUE_CATCHER), + PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER = (1 << 9), + PATH_RAY_SHADOW_TRANSPARENT_CATCHER = (1 << 10), + PATH_RAY_SHADOW_TRANSPARENT = (PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER|PATH_RAY_SHADOW_TRANSPARENT_CATCHER), + PATH_RAY_SHADOW_NON_CATCHER = (PATH_RAY_SHADOW_OPAQUE_NON_CATCHER|PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER), + PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT), + + PATH_RAY_CURVE = (1 << 11), /* visibility flag to define curve segments */ + PATH_RAY_VOLUME_SCATTER = (1 << 12), /* volume scattering */ /* Special flag to tag unaligned BVH nodes. */ - PATH_RAY_NODE_UNALIGNED = 2048, - - PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024|2048), - - PATH_RAY_MIS_SKIP = 4096, - PATH_RAY_DIFFUSE_ANCESTOR = 8192, - PATH_RAY_SINGLE_PASS_DONE = 16384, + PATH_RAY_NODE_UNALIGNED = (1 << 13), + + PATH_RAY_ALL_VISIBILITY = ((1 << 14)-1), + + /* Don't apply multiple importance sampling weights to emission from + * lamp or surface hits, because they were not direct light sampled. */ + PATH_RAY_MIS_SKIP = (1 << 14), + /* Diffuse bounce earlier in the path, skip SSS to improve performance + * and avoid branching twice with disk sampling SSS. */ + PATH_RAY_DIFFUSE_ANCESTOR = (1 << 15), + /* Single pass has been written. */ + PATH_RAY_SINGLE_PASS_DONE = (1 << 16), + /* Ray is behind a shadow catcher .*/ + PATH_RAY_SHADOW_CATCHER = (1 << 17), + /* Store shadow data for shadow catcher or denoising. */ + PATH_RAY_STORE_SHADOW_INFO = (1 << 18), + /* Zero background alpha, for camera or transparent glass rays. */ + PATH_RAY_TRANSPARENT_BACKGROUND = (1 << 19), + /* Terminate ray immediately at next bounce. */ + PATH_RAY_TERMINATE_IMMEDIATE = (1 << 20), + /* Ray is to be terminated, but continue with transparent bounces and + * emission as long as we encounter them. This is required to make the + * MIS between direct and indirect light rays match, as shadow rays go + * through transparent surfaces to reach emisison too. */ + PATH_RAY_TERMINATE_AFTER_TRANSPARENT = (1 << 21), + /* Ray is to be terminated. */ + PATH_RAY_TERMINATE = (PATH_RAY_TERMINATE_IMMEDIATE|PATH_RAY_TERMINATE_AFTER_TRANSPARENT), + /* Path and shader is being evaluated for direct lighting emission. */ + PATH_RAY_EMISSION = (1 << 22) }; /* Closure Label */ @@ -313,49 +384,82 @@ typedef enum ClosureLabel { LABEL_SINGULAR = 16, LABEL_TRANSPARENT = 32, LABEL_VOLUME_SCATTER = 64, + LABEL_TRANSMIT_TRANSPARENT = 128, } ClosureLabel; /* Render Passes */ +#define PASS_NAME_JOIN(a, b) a ## _ ## b +#define PASSMASK(pass) (1 << ((PASS_NAME_JOIN(PASS, pass)) % 32)) + +#define PASSMASK_COMPONENT(comp) (PASSMASK(PASS_NAME_JOIN(comp, DIRECT)) | \ + PASSMASK(PASS_NAME_JOIN(comp, INDIRECT)) | \ + PASSMASK(PASS_NAME_JOIN(comp, COLOR))) + typedef enum PassType { PASS_NONE = 0, - PASS_COMBINED = (1 << 0), - PASS_DEPTH = (1 << 1), - PASS_NORMAL = (1 << 2), - PASS_UV = (1 << 3), - PASS_OBJECT_ID = (1 << 4), - PASS_MATERIAL_ID = (1 << 5), - PASS_DIFFUSE_COLOR = (1 << 6), - PASS_GLOSSY_COLOR = (1 << 7), - PASS_TRANSMISSION_COLOR = (1 << 8), - PASS_DIFFUSE_INDIRECT = (1 << 9), - PASS_GLOSSY_INDIRECT = (1 << 10), - PASS_TRANSMISSION_INDIRECT = (1 << 11), - PASS_DIFFUSE_DIRECT = (1 << 12), - PASS_GLOSSY_DIRECT = (1 << 13), - PASS_TRANSMISSION_DIRECT = (1 << 14), - PASS_EMISSION = (1 << 15), - PASS_BACKGROUND = (1 << 16), - PASS_AO = (1 << 17), - PASS_SHADOW = (1 << 18), - PASS_MOTION = (1 << 19), - PASS_MOTION_WEIGHT = (1 << 20), - PASS_MIST = (1 << 21), - PASS_SUBSURFACE_DIRECT = (1 << 22), - PASS_SUBSURFACE_INDIRECT = (1 << 23), - PASS_SUBSURFACE_COLOR = (1 << 24), - PASS_LIGHT = (1 << 25), /* no real pass, used to force use_light_pass */ + + /* Main passes */ + PASS_COMBINED = 1, + PASS_DEPTH, + PASS_NORMAL, + PASS_UV, + PASS_OBJECT_ID, + PASS_MATERIAL_ID, + PASS_MOTION, + PASS_MOTION_WEIGHT, #ifdef __KERNEL_DEBUG__ - PASS_BVH_TRAVERSED_NODES = (1 << 26), - PASS_BVH_TRAVERSED_INSTANCES = (1 << 27), - PASS_BVH_INTERSECTIONS = (1 << 28), - PASS_RAY_BOUNCES = (1 << 29), + PASS_BVH_TRAVERSED_NODES, + PASS_BVH_TRAVERSED_INSTANCES, + PASS_BVH_INTERSECTIONS, + PASS_RAY_BOUNCES, #endif + PASS_RENDER_TIME, + PASS_CATEGORY_MAIN_END = 31, + + PASS_MIST = 32, + PASS_EMISSION, + PASS_BACKGROUND, + PASS_AO, + PASS_SHADOW, + PASS_LIGHT, /* no real pass, used to force use_light_pass */ + PASS_DIFFUSE_DIRECT, + PASS_DIFFUSE_INDIRECT, + PASS_DIFFUSE_COLOR, + PASS_GLOSSY_DIRECT, + PASS_GLOSSY_INDIRECT, + PASS_GLOSSY_COLOR, + PASS_TRANSMISSION_DIRECT, + PASS_TRANSMISSION_INDIRECT, + PASS_TRANSMISSION_COLOR, + PASS_SUBSURFACE_DIRECT, + PASS_SUBSURFACE_INDIRECT, + PASS_SUBSURFACE_COLOR, + PASS_VOLUME_DIRECT, + PASS_VOLUME_INDIRECT, + /* No Scatter color since it's tricky to define what it would even mean. */ + PASS_CATEGORY_LIGHT_END = 63, } PassType; -#define PASS_ALL (~0) - -typedef enum BakePassFilter { +#define PASS_ANY (~0) + +typedef enum DenoisingPassOffsets { + DENOISING_PASS_NORMAL = 0, + DENOISING_PASS_NORMAL_VAR = 3, + DENOISING_PASS_ALBEDO = 6, + DENOISING_PASS_ALBEDO_VAR = 9, + DENOISING_PASS_DEPTH = 12, + DENOISING_PASS_DEPTH_VAR = 13, + DENOISING_PASS_SHADOW_A = 14, + DENOISING_PASS_SHADOW_B = 17, + DENOISING_PASS_COLOR = 20, + DENOISING_PASS_COLOR_VAR = 23, + + DENOISING_PASS_SIZE_BASE = 26, + DENOISING_PASS_SIZE_CLEAN = 3, +} DenoisingPassOffsets; + +typedef enum eBakePassFilter { BAKE_FILTER_NONE = 0, BAKE_FILTER_DIRECT = (1 << 0), BAKE_FILTER_INDIRECT = (1 << 1), @@ -366,7 +470,7 @@ typedef enum BakePassFilter { BAKE_FILTER_SUBSURFACE = (1 << 6), BAKE_FILTER_EMISSION = (1 << 7), BAKE_FILTER_AO = (1 << 8), -} BakePassFilter; +} eBakePassFilter; typedef enum BakePassFilterCombos { BAKE_FILTER_COMBINED = ( @@ -388,25 +492,60 @@ typedef enum BakePassFilterCombos { BAKE_FILTER_SUBSURFACE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_SUBSURFACE), } BakePassFilterCombos; +typedef enum DenoiseFlag { + DENOISING_CLEAN_DIFFUSE_DIR = (1 << 0), + DENOISING_CLEAN_DIFFUSE_IND = (1 << 1), + DENOISING_CLEAN_GLOSSY_DIR = (1 << 2), + DENOISING_CLEAN_GLOSSY_IND = (1 << 3), + DENOISING_CLEAN_TRANSMISSION_DIR = (1 << 4), + DENOISING_CLEAN_TRANSMISSION_IND = (1 << 5), + DENOISING_CLEAN_SUBSURFACE_DIR = (1 << 6), + DENOISING_CLEAN_SUBSURFACE_IND = (1 << 7), + DENOISING_CLEAN_ALL_PASSES = (1 << 8)-1, +} DenoiseFlag; + +#ifdef __KERNEL_DEBUG__ +/* NOTE: This is a runtime-only struct, alignment is not + * really important here. + */ +typedef struct DebugData { + int num_bvh_traversed_nodes; + int num_bvh_traversed_instances; + int num_bvh_intersections; + int num_ray_bounces; +} DebugData; +#endif + +typedef ccl_addr_space struct PathRadianceState { +#ifdef __PASSES__ + float3 diffuse; + float3 glossy; + float3 transmission; + float3 subsurface; + float3 scatter; + + float3 direct; +#endif +} PathRadianceState; + typedef ccl_addr_space struct PathRadiance { #ifdef __PASSES__ int use_light_pass; #endif + float transparent; float3 emission; #ifdef __PASSES__ float3 background; float3 ao; float3 indirect; - float3 direct_throughput; float3 direct_emission; float3 color_diffuse; float3 color_glossy; float3 color_transmission; float3 color_subsurface; - float3 color_scatter; float3 direct_diffuse; float3 direct_glossy; @@ -420,15 +559,46 @@ typedef ccl_addr_space struct PathRadiance { float3 indirect_subsurface; float3 indirect_scatter; - float3 path_diffuse; - float3 path_glossy; - float3 path_transmission; - float3 path_subsurface; - float3 path_scatter; - float4 shadow; float mist; #endif + + struct PathRadianceState state; + +#ifdef __SHADOW_TRICKS__ + /* Total light reachable across the path, ignoring shadow blocked queries. */ + float3 path_total; + /* Total light reachable across the path with shadow blocked queries + * applied here. + * + * Dividing this figure by path_total will give estimate of shadow pass. + */ + float3 path_total_shaded; + + /* Color of the background on which shadow is alpha-overed. */ + float3 shadow_background_color; + + /* Path radiance sum and throughput at the moment when ray hits shadow + * catcher object. + */ + float shadow_throughput; + + /* Accumulated transparency along the path after shadow catcher bounce. */ + float shadow_transparency; + + /* Indicate if any shadow catcher data is set. */ + int has_shadow_catcher; +#endif + +#ifdef __DENOISING_FEATURES__ + float3 denoising_normal; + float3 denoising_albedo; + float denoising_depth; +#endif /* __DENOISING_FEATURES__ */ + +#ifdef __KERNEL_DEBUG__ + DebugData debug_data; +#endif /* __KERNEL_DEBUG__ */ } PathRadiance; typedef struct BsdfEval { @@ -444,6 +614,9 @@ typedef struct BsdfEval { float3 subsurface; float3 scatter; #endif +#ifdef __SHADOW_TRICKS__ + float3 sum_no_mis; +#endif } BsdfEval; /* Shader Flag */ @@ -537,7 +710,7 @@ typedef struct Ray { /* Intersection */ -typedef ccl_addr_space struct Intersection { +typedef struct Intersection { float t, u, v; int prim; int object; @@ -617,12 +790,14 @@ typedef enum AttributeStandard { ATTR_STD_MOTION_VERTEX_NORMAL, ATTR_STD_PARTICLE, ATTR_STD_CURVE_INTERCEPT, + ATTR_STD_CURVE_RANDOM, ATTR_STD_PTEX_FACE_ID, ATTR_STD_PTEX_UV, ATTR_STD_VOLUME_DENSITY, ATTR_STD_VOLUME_COLOR, ATTR_STD_VOLUME_FLAME, ATTR_STD_VOLUME_HEAT, + ATTR_STD_VOLUME_TEMPERATURE, ATTR_STD_VOLUME_VELOCITY, ATTR_STD_POINTINESS, ATTR_STD_NUM, @@ -645,10 +820,14 @@ typedef struct AttributeDescriptor { /* Closure data */ #ifdef __MULTI_CLOSURE__ -# ifndef __MAX_CLOSURE__ -# define MAX_CLOSURE 64 +# ifdef __SPLIT_KERNEL__ +# define MAX_CLOSURE 1 # else -# define MAX_CLOSURE __MAX_CLOSURE__ +# ifndef __MAX_CLOSURE__ +# define MAX_CLOSURE 64 +# else +# define MAX_CLOSURE __MAX_CLOSURE__ +# endif # endif #else # define MAX_CLOSURE 1 @@ -668,28 +847,15 @@ typedef struct AttributeDescriptor { #define SHADER_CLOSURE_BASE \ float3 weight; \ ClosureType type; \ - float sample_weight \ + float sample_weight; \ + float3 N typedef ccl_addr_space struct ccl_align(16) ShaderClosure { SHADER_CLOSURE_BASE; - float data[14]; /* pad to 80 bytes */ + float data[10]; /* pad to 80 bytes */ } ShaderClosure; -/* Shader Context - * - * For OSL we recycle a fixed number of contexts for speed */ - -typedef enum ShaderContext { - SHADER_CONTEXT_MAIN = 0, - SHADER_CONTEXT_INDIRECT = 1, - SHADER_CONTEXT_EMISSION = 2, - SHADER_CONTEXT_SHADOW = 3, - SHADER_CONTEXT_SSS = 4, - SHADER_CONTEXT_VOLUME = 5, - SHADER_CONTEXT_NUM = 6 -} ShaderContext; - /* Shader Data * * Main shader state at a point on the surface or in a volume. All coordinates @@ -701,7 +867,7 @@ enum ShaderDataFlag { /* Set when ray hits backside of surface. */ SD_BACKFACING = (1 << 0), - /* Shader has emissive closure. */ + /* Shader has non-zero emission. */ SD_EMISSION = (1 << 1), /* Shader has BSDF closure. */ SD_BSDF = (1 << 2), @@ -711,8 +877,8 @@ enum ShaderDataFlag { SD_BSSRDF = (1 << 4), /* Shader has holdout closure. */ SD_HOLDOUT = (1 << 5), - /* Shader has volume absorption closure. */ - SD_ABSORPTION = (1 << 6), + /* Shader has non-zero volume extinction. */ + SD_EXTINCTION = (1 << 6), /* Shader has have volume phase (scatter) closure. */ SD_SCATTER = (1 << 7), /* Shader has AO closure. */ @@ -727,7 +893,7 @@ enum ShaderDataFlag { SD_BSDF_HAS_EVAL | SD_BSSRDF | SD_HOLDOUT | - SD_ABSORPTION | + SD_EXTINCTION | SD_SCATTER | SD_AO | SD_BSDF_NEEDS_LCG), @@ -752,25 +918,28 @@ enum ShaderDataFlag { SD_VOLUME_MIS = (1 << 23), /* Use cubic interpolation for voxels. */ SD_VOLUME_CUBIC = (1 << 24), - /* Has data connected to the displacement input. */ + /* Has data connected to the displacement input or uses bump map. */ SD_HAS_BUMP = (1 << 25), /* Has true displacement. */ SD_HAS_DISPLACEMENT = (1 << 26), - /* Has constant emission (value stored in __shader_flag) */ + /* Has constant emission (value stored in __shaders) */ SD_HAS_CONSTANT_EMISSION = (1 << 27), + /* Needs to access attributes */ + SD_NEED_ATTRIBUTES = (1 << 28), SD_SHADER_FLAGS = (SD_USE_MIS | SD_HAS_TRANSPARENT_SHADOW | SD_HAS_VOLUME | SD_HAS_ONLY_VOLUME | - SD_HETEROGENEOUS_VOLUME| + SD_HETEROGENEOUS_VOLUME | SD_HAS_BSSRDF_BUMP | SD_VOLUME_EQUIANGULAR | SD_VOLUME_MIS | SD_VOLUME_CUBIC | SD_HAS_BUMP | SD_HAS_DISPLACEMENT | - SD_HAS_CONSTANT_EMISSION) + SD_HAS_CONSTANT_EMISSION | + SD_NEED_ATTRIBUTES) }; /* Object flags. */ @@ -789,115 +958,113 @@ enum ShaderDataObjectFlag { SD_OBJECT_INTERSECTS_VOLUME = (1 << 5), /* Has position for motion vertices. */ SD_OBJECT_HAS_VERTEX_MOTION = (1 << 6), + /* object is used to catch shadows */ + SD_OBJECT_SHADOW_CATCHER = (1 << 7), + /* object has volume attributes */ + SD_OBJECT_HAS_VOLUME_ATTRIBUTES = (1 << 8), SD_OBJECT_FLAGS = (SD_OBJECT_HOLDOUT_MASK | SD_OBJECT_MOTION | SD_OBJECT_TRANSFORM_APPLIED | SD_OBJECT_NEGATIVE_SCALE_APPLIED | SD_OBJECT_HAS_VOLUME | - SD_OBJECT_INTERSECTS_VOLUME) + SD_OBJECT_INTERSECTS_VOLUME | + SD_OBJECT_SHADOW_CATCHER | + SD_OBJECT_HAS_VOLUME_ATTRIBUTES) }; -#ifdef __SPLIT_KERNEL__ -# define SD_THREAD (get_global_id(1) * get_global_size(0) + get_global_id(0)) -# if !defined(__SPLIT_KERNEL_SOA__) - /* ShaderData is stored as an Array-of-Structures */ -# define ccl_soa_member(type, name) type soa_##name -# define ccl_fetch(s, t) (s[SD_THREAD].soa_##t) -# define ccl_fetch_array(s, t, index) (&s[SD_THREAD].soa_##t[index]) -# else - /* ShaderData is stored as an Structure-of-Arrays */ -# define SD_GLOBAL_SIZE (get_global_size(0) * get_global_size(1)) -# define SD_FIELD_SIZE(t) sizeof(((struct ShaderData*)0)->t) -# define SD_OFFSETOF(t) ((char*)(&((struct ShaderData*)0)->t) - (char*)0) -# define ccl_soa_member(type, name) type soa_##name -# define ccl_fetch(s, t) (((ShaderData*)((ccl_addr_space char*)s + SD_GLOBAL_SIZE * SD_OFFSETOF(soa_##t) + SD_FIELD_SIZE(soa_##t) * SD_THREAD - SD_OFFSETOF(soa_##t)))->soa_##t) -# define ccl_fetch_array(s, t, index) (&ccl_fetch(s, t)[index]) -# endif -#else -# define ccl_soa_member(type, name) type name -# define ccl_fetch(s, t) (s->t) -# define ccl_fetch_array(s, t, index) (&s->t[index]) -#endif - typedef ccl_addr_space struct ShaderData { /* position */ - ccl_soa_member(float3, P); + float3 P; /* smooth normal for shading */ - ccl_soa_member(float3, N); + float3 N; /* true geometric normal */ - ccl_soa_member(float3, Ng); + float3 Ng; /* view/incoming direction */ - ccl_soa_member(float3, I); + float3 I; /* shader id */ - ccl_soa_member(int, shader); + int shader; /* booleans describing shader, see ShaderDataFlag */ - ccl_soa_member(int, flag); + int flag; /* booleans describing object of the shader, see ShaderDataObjectFlag */ - ccl_soa_member(int, object_flag); + int object_flag; /* primitive id if there is one, ~0 otherwise */ - ccl_soa_member(int, prim); + int prim; /* combined type and curve segment for hair */ - ccl_soa_member(int, type); + int type; /* parametric coordinates * - barycentric weights for triangles */ - ccl_soa_member(float, u); - ccl_soa_member(float, v); + float u; + float v; /* object id if there is one, ~0 otherwise */ - ccl_soa_member(int, object); + int object; + /* lamp id if there is one, ~0 otherwise */ + int lamp; /* motion blur sample time */ - ccl_soa_member(float, time); + float time; /* length of the ray being shaded */ - ccl_soa_member(float, ray_length); + float ray_length; #ifdef __RAY_DIFFERENTIALS__ /* differential of P. these are orthogonal to Ng, not N */ - ccl_soa_member(differential3, dP); + differential3 dP; /* differential of I */ - ccl_soa_member(differential3, dI); + differential3 dI; /* differential of u, v */ - ccl_soa_member(differential, du); - ccl_soa_member(differential, dv); + differential du; + differential dv; #endif #ifdef __DPDU__ /* differential of P w.r.t. parametric coordinates. note that dPdu is * not readily suitable as a tangent for shading on triangles. */ - ccl_soa_member(float3, dPdu); - ccl_soa_member(float3, dPdv); + float3 dPdu; + float3 dPdv; #endif #ifdef __OBJECT_MOTION__ /* object <-> world space transformations, cached to avoid * re-interpolating them constantly for shading */ - ccl_soa_member(Transform, ob_tfm); - ccl_soa_member(Transform, ob_itfm); + Transform ob_tfm; + Transform ob_itfm; #endif - /* Closure data, we store a fixed array of closures */ - ccl_soa_member(struct ShaderClosure, closure[MAX_CLOSURE]); - ccl_soa_member(int, num_closure); - ccl_soa_member(int, num_closure_extra); - ccl_soa_member(float, randb_closure); - ccl_soa_member(float3, svm_closure_weight); - - /* LCG state for closures that require additional random numbers. */ - ccl_soa_member(uint, lcg_state); - /* ray start position, only set for backgrounds */ - ccl_soa_member(float3, ray_P); - ccl_soa_member(differential3, ray_dP); + float3 ray_P; + differential3 ray_dP; #ifdef __OSL__ struct KernelGlobals *osl_globals; struct PathState *osl_path_state; #endif + + /* LCG state for closures that require additional random numbers. */ + uint lcg_state; + + /* Closure data, we store a fixed array of closures */ + int num_closure; + int num_closure_left; + float randb_closure; + float3 svm_closure_weight; + + /* Closure weights summed directly, so we can evaluate + * emission and shadow transparency with MAX_CLOSURE 0. */ + float3 closure_emission_background; + float3 closure_transparent_extinction; + + /* At the end so we can adjust size in ShaderDataTinyStorage. */ + struct ShaderClosure closure[MAX_CLOSURE]; } ShaderData; +typedef ccl_addr_space struct ShaderDataTinyStorage { + char pad[sizeof(ShaderData) - sizeof(ShaderClosure) * MAX_CLOSURE]; +} ShaderDataTinyStorage; +#define AS_SHADER_DATA(shader_data_tiny_storage) ((ShaderData*)shader_data_tiny_storage) + /* Path State */ #ifdef __VOLUME__ @@ -912,9 +1079,11 @@ typedef struct PathState { int flag; /* random number generator state */ - int rng_offset; /* dimension offset */ - int sample; /* path sample number */ - int num_samples; /* total number of times this path will be sampled */ + uint rng_hash; /* per pixel hash */ + int rng_offset; /* dimension offset */ + int sample; /* path sample number */ + int num_samples; /* total number of times this path will be sampled */ + float branch_factor; /* number of branches in indirect paths */ /* bounce counting */ int bounce; @@ -923,6 +1092,10 @@ typedef struct PathState { int transmission_bounce; int transparent_bounce; +#ifdef __DENOISING_FEATURES__ + float denoising_feature_weight; +#endif /* __DENOISING_FEATURES__ */ + /* multiple importance sampling */ float min_ray_pdf; /* smallest bounce pdf over entire path up to now */ float ray_pdf; /* last bounce pdf */ @@ -933,37 +1106,34 @@ typedef struct PathState { /* volume rendering */ #ifdef __VOLUME__ int volume_bounce; - RNG rng_congruential; + int volume_bounds_bounce; VolumeStack volume_stack[VOLUME_STACK_SIZE]; #endif } PathState; -/* Subsurface */ - -/* Struct to gather multiple SSS hits. */ -struct SubsurfaceIntersection -{ +/* Struct to gather multiple nearby intersections. */ +typedef struct LocalIntersection { Ray ray; - float3 weight[BSSRDF_MAX_HITS]; + float3 weight[LOCAL_MAX_HITS]; int num_hits; - struct Intersection hits[BSSRDF_MAX_HITS]; - float3 Ng[BSSRDF_MAX_HITS]; -}; + struct Intersection hits[LOCAL_MAX_HITS]; + float3 Ng[LOCAL_MAX_HITS]; +} LocalIntersection; + +/* Subsurface */ /* Struct to gather SSS indirect rays and delay tracing them. */ -struct SubsurfaceIndirectRays -{ - bool need_update_volume_stack; - bool tracing; +typedef struct SubsurfaceIndirectRays { PathState state[BSSRDF_MAX_HITS]; - struct PathRadiance direct_L; int num_rays; + struct Ray rays[BSSRDF_MAX_HITS]; float3 throughputs[BSSRDF_MAX_HITS]; - struct PathRadiance L[BSSRDF_MAX_HITS]; -}; + struct PathRadianceState L_state[BSSRDF_MAX_HITS]; +} SubsurfaceIndirectRays; +static_assert(BSSRDF_MAX_HITS <= LOCAL_MAX_HITS, "BSSRDF hits too high."); /* Constant Kernel Data * @@ -989,7 +1159,7 @@ typedef struct KernelCamera { /* matrices */ Transform cameratoworld; - Transform rastertocamera; + ProjectionTransform rastertocamera; /* differentials */ float4 dx; @@ -1003,7 +1173,7 @@ typedef struct KernelCamera { /* motion blur */ float shuttertime; - int have_motion, have_perspective_motion; + int num_motion_steps, have_perspective_motion; /* clipping */ float nearclip; @@ -1023,22 +1193,22 @@ typedef struct KernelCamera { int is_inside_volume; /* more matrices */ - Transform screentoworld; - Transform rastertoworld; - /* work around cuda sm 2.0 crash, this seems to - * cross some limit in combination with motion - * Transform ndctoworld; */ - Transform worldtoscreen; - Transform worldtoraster; - Transform worldtondc; + ProjectionTransform screentoworld; + ProjectionTransform rastertoworld; + ProjectionTransform ndctoworld; + ProjectionTransform worldtoscreen; + ProjectionTransform worldtoraster; + ProjectionTransform worldtondc; Transform worldtocamera; - MotionTransform motion; + /* Stores changes in the projeciton matrix. Use for camera zoom motion + * blur and motion pass output for perspective camera. */ + ProjectionTransform perspective_pre; + ProjectionTransform perspective_post; - /* Denotes changes in the projective matrix, namely in rastertocamera. - * Used for camera zoom motion blur, - */ - PerspectiveMotionTransform perspective_motion; + /* Transforms for motion pass. */ + Transform motion_pass_pre; + Transform motion_pass_post; int shutter_table_offset; @@ -1053,6 +1223,7 @@ static_assert_align(KernelCamera, 16); typedef struct KernelFilm { float exposure; int pass_flag; + int light_pass_flag; int pass_stride; int use_light_pass; @@ -1075,11 +1246,13 @@ typedef struct KernelFilm { int pass_glossy_indirect; int pass_transmission_indirect; int pass_subsurface_indirect; + int pass_volume_indirect; int pass_diffuse_direct; int pass_glossy_direct; int pass_transmission_direct; int pass_subsurface_direct; + int pass_volume_direct; int pass_emission; int pass_background; @@ -1089,13 +1262,18 @@ typedef struct KernelFilm { int pass_shadow; float pass_shadow_scale; int filter_table_offset; - int pass_pad2; int pass_mist; float mist_start; float mist_inv_depth; float mist_falloff; + int pass_denoising_data; + int pass_denoising_clean; + int denoising_flags; + + int pad1, pad2, pad3; + #ifdef __KERNEL_DEBUG__ int pass_bvh_traversed_nodes; int pass_bvh_traversed_instances; @@ -1110,12 +1288,13 @@ typedef struct KernelBackground { int surface_shader; int volume_shader; int transparent; - int pad; + float transparent_roughness_squared_threshold; /* ambient occlusion */ float ao_factor; float ao_distance; - float ao_pad1, ao_pad2; + float ao_bounces_factor; + float ao_pad; } KernelBackground; static_assert_align(KernelBackground, 16); @@ -1127,8 +1306,8 @@ typedef struct KernelIntegrator { int num_all_lights; float pdf_triangles; float pdf_lights; - float inv_pdf_lights; int pdf_background_res; + float light_inv_rr_threshold; /* light portals */ float portal_pdf; @@ -1136,7 +1315,6 @@ typedef struct KernelIntegrator { int portal_offset; /* bounces */ - int min_bounce; int max_bounce; int max_diffuse_bounce; @@ -1147,7 +1325,6 @@ typedef struct KernelIntegrator { int ao_bounces; /* transparent */ - int transparent_min_bounce; int transparent_max_bounce; int transparent_shadows; @@ -1165,6 +1342,7 @@ typedef struct KernelIntegrator { /* branched path */ int branched; + int volume_decoupled; int diffuse_samples; int glossy_samples; int transmission_samples; @@ -1187,23 +1365,31 @@ typedef struct KernelIntegrator { float volume_step_size; int volume_samples; - float light_inv_rr_threshold; - int start_sample; - int pad1, pad2, pad3; + + int max_closures; } KernelIntegrator; static_assert_align(KernelIntegrator, 16); +typedef enum KernelBVHLayout { + BVH_LAYOUT_NONE = 0, + + BVH_LAYOUT_BVH2 = (1 << 0), + BVH_LAYOUT_BVH4 = (1 << 1), + + BVH_LAYOUT_DEFAULT = BVH_LAYOUT_BVH4, + BVH_LAYOUT_ALL = (unsigned int)(-1), +} KernelBVHLayout; + typedef struct KernelBVH { /* root node */ int root; - int attributes_map_stride; int have_motion; int have_curves; int have_instancing; - int use_qbvh; + int bvh_layout; int use_bvh_steps; - int pad1; + int pad1, pad2; } KernelBVH; static_assert_align(KernelBVH, 16); @@ -1244,17 +1430,113 @@ typedef struct KernelData { } KernelData; static_assert_align(KernelData, 16); -#ifdef __KERNEL_DEBUG__ -/* NOTE: This is a runtime-only struct, alignment is not - * really important here. - */ -typedef ccl_addr_space struct DebugData { - int num_bvh_traversed_nodes; - int num_bvh_traversed_instances; - int num_bvh_intersections; - int num_ray_bounces; -} DebugData; -#endif +/* Kernel data structures. */ + +typedef struct KernelObject { + Transform tfm; + Transform itfm; + + float surface_area; + float pass_id; + float random_number; + int particle_index; + + float dupli_generated[3]; + float dupli_uv[2]; + + int numkeys; + int numsteps; + int numverts; + + uint patch_map_offset; + uint attribute_map_offset; + uint motion_offset; + uint pad; +} KernelObject; +static_assert_align(KernelObject, 16); + +typedef struct KernelSpotLight { + float radius; + float invarea; + float spot_angle; + float spot_smooth; + float dir[3]; + float pad; +} KernelSpotLight; + +/* PointLight is SpotLight with only radius and invarea being used. */ + +typedef struct KernelAreaLight { + float axisu[3]; + float invarea; + float axisv[3]; + float pad1; + float dir[3]; + float pad2; +} KernelAreaLight; + +typedef struct KernelDistantLight { + float radius; + float cosangle; + float invarea; + float pad; +} KernelDistantLight; + +typedef struct KernelLight { + int type; + float co[3]; + int shader_id; + int samples; + float max_bounces; + float random; + Transform tfm; + Transform itfm; + union { + KernelSpotLight spot; + KernelAreaLight area; + KernelDistantLight distant; + }; +} KernelLight; +static_assert_align(KernelLight, 16); + +typedef struct KernelLightDistribution { + float totarea; + int prim; + union { + struct { + int shader_flag; + int object_id; + } mesh_light; + struct { + float pad; + float size; + } lamp; + }; +} KernelLightDistribution; +static_assert_align(KernelLightDistribution, 16); + +typedef struct KernelParticle { + int index; + float age; + float lifetime; + float size; + float4 rotation; + /* Only xyz are used of the following. float4 instead of float3 are used + * to ensure consistent padding/alignment across devices. */ + float4 location; + float4 velocity; + float4 angular_velocity; +} KernelParticle; +static_assert_align(KernelParticle, 16); + +typedef struct KernelShader { + float constant_emission[3]; + float pad1; + int flags; + int pass_id; + int pad2, pad3; +} KernelShader; +static_assert_align(KernelShader, 16); /* Declarations required for split kernel */ @@ -1268,7 +1550,6 @@ typedef ccl_addr_space struct DebugData { * Queue 3 - Shadow ray cast kernel - AO * Queeu 4 - Shadow ray cast kernel - direct lighting */ -#define NUM_QUEUES 4 /* Queue names */ enum QueueNumber { @@ -1281,45 +1562,77 @@ enum QueueNumber { * 3. Rays to be regenerated * are enqueued here. */ - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS = 1, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, /* All rays for which a shadow ray should be cast to determine radiance * contribution for AO are enqueued here. */ - QUEUE_SHADOW_RAY_CAST_AO_RAYS = 2, + QUEUE_SHADOW_RAY_CAST_AO_RAYS, /* All rays for which a shadow ray should be cast to determine radiance * contributing for direct lighting are enqueued here. */ - QUEUE_SHADOW_RAY_CAST_DL_RAYS = 3, + QUEUE_SHADOW_RAY_CAST_DL_RAYS, + + /* Rays sorted according to shader->id */ + QUEUE_SHADER_SORTED_RAYS, + +#ifdef __BRANCHED_PATH__ + /* All rays moving to next iteration of the indirect loop for light */ + QUEUE_LIGHT_INDIRECT_ITER, + /* Queue of all inactive rays. These are candidates for sharing work of indirect loops */ + QUEUE_INACTIVE_RAYS, +# ifdef __VOLUME__ + /* All rays moving to next iteration of the indirect loop for volumes */ + QUEUE_VOLUME_INDIRECT_ITER, +# endif +# ifdef __SUBSURFACE__ + /* All rays moving to next iteration of the indirect loop for subsurface */ + QUEUE_SUBSURFACE_INDIRECT_ITER, +# endif +#endif /* __BRANCHED_PATH__ */ + + NUM_QUEUES }; -/* We use RAY_STATE_MASK to get ray_state (enums 0 to 5) */ -#define RAY_STATE_MASK 0x007 -#define RAY_FLAG_MASK 0x0F8 +/* We use RAY_STATE_MASK to get ray_state */ +#define RAY_STATE_MASK 0x0F +#define RAY_FLAG_MASK 0xF0 enum RayState { + RAY_INVALID = 0, /* Denotes ray is actively involved in path-iteration. */ - RAY_ACTIVE = 0, + RAY_ACTIVE, /* Denotes ray has completed processing all samples and is inactive. */ - RAY_INACTIVE = 1, - /* Denoted ray has exited path-iteration and needs to update output buffer. */ - RAY_UPDATE_BUFFER = 2, + RAY_INACTIVE, + /* Denotes ray has exited path-iteration and needs to update output buffer. */ + RAY_UPDATE_BUFFER, + /* Denotes ray needs to skip most surface shader work. */ + RAY_HAS_ONLY_VOLUME, /* Donotes ray has hit background */ - RAY_HIT_BACKGROUND = 3, + RAY_HIT_BACKGROUND, /* Denotes ray has to be regenerated */ - RAY_TO_REGENERATE = 4, + RAY_TO_REGENERATE, /* Denotes ray has been regenerated */ - RAY_REGENERATED = 5, - /* Denotes ray should skip direct lighting */ - RAY_SKIP_DL = 6, - /* Flag's ray has to execute shadow blocked function in AO part */ - RAY_SHADOW_RAY_CAST_AO = 16, - /* Flag's ray has to execute shadow blocked function in direct lighting part. */ - RAY_SHADOW_RAY_CAST_DL = 32, + RAY_REGENERATED, + /* Denotes ray is moving to next iteration of the branched indirect loop */ + RAY_LIGHT_INDIRECT_NEXT_ITER, + RAY_VOLUME_INDIRECT_NEXT_ITER, + RAY_SUBSURFACE_INDIRECT_NEXT_ITER, + + /* Ray flags */ + + /* Flags to denote that the ray is currently evaluating the branched indirect loop */ + RAY_BRANCHED_LIGHT_INDIRECT = (1 << 4), + RAY_BRANCHED_VOLUME_INDIRECT = (1 << 5), + RAY_BRANCHED_SUBSURFACE_INDIRECT = (1 << 6), + RAY_BRANCHED_INDIRECT = (RAY_BRANCHED_LIGHT_INDIRECT | RAY_BRANCHED_VOLUME_INDIRECT | RAY_BRANCHED_SUBSURFACE_INDIRECT), + + /* Ray is evaluating an iteration of an indirect loop for another thread */ + RAY_BRANCHED_INDIRECT_SHARED = (1 << 7), }; #define ASSIGN_RAY_STATE(ray_state, ray_index, state) (ray_state[ray_index] = ((ray_state[ray_index] & RAY_FLAG_MASK) | state)) -#define IS_STATE(ray_state, ray_index, state) ((ray_state[ray_index] & RAY_STATE_MASK) == state) +#define IS_STATE(ray_state, ray_index, state) ((ray_index) != QUEUE_EMPTY_SLOT && ((ray_state)[(ray_index)] & RAY_STATE_MASK) == (state)) #define ADD_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] | flag)) #define REMOVE_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] & (~flag))) #define IS_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] & flag) @@ -1334,6 +1647,20 @@ enum RayState { #define PATCH_MAP_NODE_IS_LEAF (1u << 31) #define PATCH_MAP_NODE_INDEX_MASK (~(PATCH_MAP_NODE_IS_SET | PATCH_MAP_NODE_IS_LEAF)) +/* Work Tiles */ + +typedef struct WorkTile { + uint x, y, w, h; + + uint start_sample; + uint num_samples; + + uint offset; + uint stride; + + ccl_global float *buffer; +} WorkTile; + CCL_NAMESPACE_END #endif /* __KERNEL_TYPES_H__ */ diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h index c7cb29b5af2..86378289b02 100644 --- a/intern/cycles/kernel/kernel_volume.h +++ b/intern/cycles/kernel/kernel_volume.h @@ -30,78 +30,65 @@ typedef enum VolumeIntegrateResult { * sigma_t = sigma_a + sigma_s */ typedef struct VolumeShaderCoefficients { - float3 sigma_a; + float3 sigma_t; float3 sigma_s; float3 emission; } VolumeShaderCoefficients; +#ifdef __VOLUME__ + /* evaluate shader to get extinction coefficient at P */ ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg, ShaderData *sd, - PathState *state, + ccl_addr_space PathState *state, float3 P, float3 *extinction) { sd->P = P; - shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW); + shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW); - if(!(sd->flag & (SD_ABSORPTION|SD_SCATTER))) + if(sd->flag & SD_EXTINCTION) { + *extinction = sd->closure_transparent_extinction; + return true; + } + else { return false; - - float3 sigma_t = make_float3(0.0f, 0.0f, 0.0f); - - for(int i = 0; i < sd->num_closure; i++) { - const ShaderClosure *sc = &sd->closure[i]; - - if(CLOSURE_IS_VOLUME(sc->type)) - sigma_t += sc->weight; } - - *extinction = sigma_t; - return true; } /* evaluate shader to get absorption, scattering and emission at P */ ccl_device_inline bool volume_shader_sample(KernelGlobals *kg, ShaderData *sd, - PathState *state, + ccl_addr_space PathState *state, float3 P, VolumeShaderCoefficients *coeff) { sd->P = P; - shader_eval_volume(kg, sd, state, state->volume_stack, state->flag, SHADER_CONTEXT_VOLUME); + shader_eval_volume(kg, sd, state, state->volume_stack, state->flag); - if(!(sd->flag & (SD_ABSORPTION|SD_SCATTER|SD_EMISSION))) + if(!(sd->flag & (SD_EXTINCTION|SD_SCATTER|SD_EMISSION))) return false; - coeff->sigma_a = make_float3(0.0f, 0.0f, 0.0f); coeff->sigma_s = make_float3(0.0f, 0.0f, 0.0f); - coeff->emission = make_float3(0.0f, 0.0f, 0.0f); - - for(int i = 0; i < sd->num_closure; i++) { - const ShaderClosure *sc = &sd->closure[i]; - - if(sc->type == CLOSURE_VOLUME_ABSORPTION_ID) - coeff->sigma_a += sc->weight; - else if(sc->type == CLOSURE_EMISSION_ID) - coeff->emission += sc->weight; - else if(CLOSURE_IS_VOLUME(sc->type)) - coeff->sigma_s += sc->weight; - } + coeff->sigma_t = (sd->flag & SD_EXTINCTION)? sd->closure_transparent_extinction: + make_float3(0.0f, 0.0f, 0.0f); + coeff->emission = (sd->flag & SD_EMISSION)? sd->closure_emission_background: + make_float3(0.0f, 0.0f, 0.0f); - /* when at the max number of bounces, treat scattering as absorption */ if(sd->flag & SD_SCATTER) { - if(state->volume_bounce >= kernel_data.integrator.max_volume_bounce) { - coeff->sigma_a += coeff->sigma_s; - coeff->sigma_s = make_float3(0.0f, 0.0f, 0.0f); - sd->flag &= ~SD_SCATTER; - sd->flag |= SD_ABSORPTION; + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + if(CLOSURE_IS_VOLUME(sc->type)) + coeff->sigma_s += sc->weight; } } return true; } +#endif /* __VOLUME__ */ + ccl_device float3 volume_color_transmittance(float3 sigma, float t) { return make_float3(expf(-sigma.x * t), expf(-sigma.y * t), expf(-sigma.z * t)); @@ -112,13 +99,28 @@ ccl_device float kernel_volume_channel_get(float3 value, int channel) return (channel == 0)? value.x: ((channel == 1)? value.y: value.z); } -ccl_device bool volume_stack_is_heterogeneous(KernelGlobals *kg, VolumeStack *stack) +#ifdef __VOLUME__ + +ccl_device bool volume_stack_is_heterogeneous(KernelGlobals *kg, ccl_addr_space VolumeStack *stack) { for(int i = 0; stack[i].shader != SHADER_NONE; i++) { - int shader_flag = kernel_tex_fetch(__shader_flag, (stack[i].shader & SHADER_MASK)*SHADER_SIZE); + int shader_flag = kernel_tex_fetch(__shaders, (stack[i].shader & SHADER_MASK)).flags; - if(shader_flag & SD_HETEROGENEOUS_VOLUME) + if(shader_flag & SD_HETEROGENEOUS_VOLUME) { return true; + } + else if(shader_flag & SD_NEED_ATTRIBUTES) { + /* We want to render world or objects without any volume grids + * as homogenous, but can only verify this at runtime since other + * heterogenous volume objects may be using the same shader. */ + int object = stack[i].object; + if(object != OBJECT_NONE) { + int object_flag = kernel_tex_fetch(__object_flag, object); + if(object_flag & SD_OBJECT_HAS_VOLUME_ATTRIBUTES) { + return true; + } + } + } } return false; @@ -132,7 +134,7 @@ ccl_device int volume_stack_sampling_method(KernelGlobals *kg, VolumeStack *stac int method = -1; for(int i = 0; stack[i].shader != SHADER_NONE; i++) { - int shader_flag = kernel_tex_fetch(__shader_flag, (stack[i].shader & SHADER_MASK)*SHADER_SIZE); + int shader_flag = kernel_tex_fetch(__shaders, (stack[i].shader & SHADER_MASK)).flags; if(shader_flag & SD_VOLUME_MIS) { return SD_VOLUME_MIS; @@ -154,6 +156,24 @@ ccl_device int volume_stack_sampling_method(KernelGlobals *kg, VolumeStack *stac return method; } +ccl_device_inline void kernel_volume_step_init(KernelGlobals *kg, + ccl_addr_space PathState *state, + float t, + float *step_size, + float *step_offset) +{ + const int max_steps = kernel_data.integrator.volume_max_steps; + float step = min(kernel_data.integrator.volume_step_size, t); + + /* compute exact steps in advance for malloc */ + if(t > max_steps * step) { + step = t / (float)max_steps; + } + + *step_size = step; + *step_offset = path_state_rng_1D_hash(kg, state, 0x1e31d8a4) * step; +} + /* Volume Shadows * * These functions are used to attenuate shadow rays to lights. Both absorption @@ -161,7 +181,11 @@ ccl_device int volume_stack_sampling_method(KernelGlobals *kg, VolumeStack *stac /* homogeneous volume: assume shader evaluation at the starts gives * the extinction coefficient for the entire line segment */ -ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, float3 *throughput) +ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg, + ccl_addr_space PathState *state, + Ray *ray, + ShaderData *sd, + float3 *throughput) { float3 sigma_t; @@ -171,15 +195,19 @@ ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg, PathState *s /* heterogeneous volume: integrate stepping through the volume until we * reach the end, get absorbed entirely, or run out of iterations */ -ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, float3 *throughput) +ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, + ccl_addr_space PathState *state, + Ray *ray, + ShaderData *sd, + float3 *throughput) { float3 tp = *throughput; const float tp_eps = 1e-6f; /* todo: this is likely not the right value */ /* prepare for stepping */ int max_steps = kernel_data.integrator.volume_max_steps; - float step = kernel_data.integrator.volume_step_size; - float random_jitter_offset = lcg_step_float(&state->rng_congruential) * step; + float step_offset, step_size; + kernel_volume_step_init(kg, state, ray->t, &step_size, &step_offset); /* compute extinction at the start */ float t = 0.0f; @@ -188,14 +216,15 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState for(int i = 0; i < max_steps; i++) { /* advance to new position */ - float new_t = min(ray->t, (i+1) * step); - float dt = new_t - t; + float new_t = min(ray->t, (i+1) * step_size); - /* use random position inside this segment to sample shader */ - if(new_t == ray->t) - random_jitter_offset = lcg_step_float(&state->rng_congruential) * dt; + /* use random position inside this segment to sample shader, adjust + * for last step that is shorter than other steps. */ + if(new_t == ray->t) { + step_offset *= (new_t - t) / step_size; + } - float3 new_P = ray->P + ray->D * (t + random_jitter_offset); + float3 new_P = ray->P + ray->D * (t + step_offset); float3 sigma_t; /* compute attenuation over segment */ @@ -227,7 +256,11 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState /* get the volume attenuation over line segment defined by ray, with the * assumption that there are no surfaces blocking light between the endpoints */ -ccl_device_noinline void kernel_volume_shadow(KernelGlobals *kg, ShaderData *shadow_sd, PathState *state, Ray *ray, float3 *throughput) +ccl_device_noinline void kernel_volume_shadow(KernelGlobals *kg, + ShaderData *shadow_sd, + ccl_addr_space PathState *state, + Ray *ray, + float3 *throughput) { shader_setup_from_volume(kg, shadow_sd, ray); @@ -237,6 +270,8 @@ ccl_device_noinline void kernel_volume_shadow(KernelGlobals *kg, ShaderData *sha kernel_volume_shadow_homogeneous(kg, state, ray, shadow_sd, throughput); } +#endif /* __VOLUME__ */ + /* Equi-angular sampling as in: * "Importance Sampling Techniques for Path Tracing in Participating Media" */ @@ -324,8 +359,8 @@ ccl_device float3 kernel_volume_emission_integrate(VolumeShaderCoefficients *coe * todo: we should use an epsilon to avoid precision issues near zero sigma_t */ float3 emission = coeff->emission; - if(closure_flag & SD_ABSORPTION) { - float3 sigma_t = coeff->sigma_a + coeff->sigma_s; + if(closure_flag & SD_EXTINCTION) { + float3 sigma_t = coeff->sigma_t; emission.x *= (sigma_t.x > 0.0f)? (1.0f - transmittance.x)/sigma_t.x: t; emission.y *= (sigma_t.y > 0.0f)? (1.0f - transmittance.y)/sigma_t.y: t; @@ -339,11 +374,46 @@ ccl_device float3 kernel_volume_emission_integrate(VolumeShaderCoefficients *coe /* Volume Path */ +ccl_device int kernel_volume_sample_channel(float3 albedo, float3 throughput, float rand, float3 *pdf) +{ + /* Sample color channel proportional to throughput and single scattering + * albedo, to significantly reduce noise with many bounce, following: + * + * "Practical and Controllable Subsurface Scattering for Production Path + * Tracing". Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */ + float3 weights = fabs(throughput * albedo); + float sum_weights = weights.x + weights.y + weights.z; + + if(sum_weights > 0.0f) { + *pdf = weights/sum_weights; + } + else { + *pdf = make_float3(1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f); + } + + if(rand < pdf->x) { + return 0; + } + else if(rand < pdf->x + pdf->y) { + return 1; + } + else { + return 2; + } +} + +#ifdef __VOLUME__ + /* homogeneous volume: assume shader evaluation at the start gives * the volume shading coefficient for the entire line segment */ -ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGlobals *kg, - PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput, - RNG *rng, bool probalistic_scatter) +ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous( + KernelGlobals *kg, + ccl_addr_space PathState *state, + Ray *ray, + ShaderData *sd, + PathRadiance *L, + ccl_addr_space float3 *throughput, + bool probalistic_scatter) { VolumeShaderCoefficients coeff; @@ -357,21 +427,18 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba #ifdef __VOLUME_SCATTER__ /* randomly scatter, and if we do t is shortened */ if(closure_flag & SD_SCATTER) { - /* extinction coefficient */ - float3 sigma_t = coeff.sigma_a + coeff.sigma_s; - - /* pick random color channel, we use the Veach one-sample - * model with balance heuristic for the channels */ - float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE); - int channel = (int)(rphase*3.0f); - sd->randb_closure = rphase*3.0f - channel; + /* Sample channel, use MIS with balance heuristic. */ + float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL); + float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t); + float3 channel_pdf; + int channel = kernel_volume_sample_channel(albedo, *throughput, rphase, &channel_pdf); /* decide if we will hit or miss */ bool scatter = true; - float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE); + float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE); if(probalistic_scatter) { - float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel); + float sample_sigma_t = kernel_volume_channel_get(coeff.sigma_t, channel); float sample_transmittance = expf(-sample_sigma_t * t); if(1.0f - xi >= sample_transmittance) { @@ -392,40 +459,39 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba float sample_t; /* distance sampling */ - sample_t = kernel_volume_distance_sample(ray->t, sigma_t, channel, xi, &transmittance, &pdf); + sample_t = kernel_volume_distance_sample(ray->t, coeff.sigma_t, channel, xi, &transmittance, &pdf); /* modify pdf for hit/miss decision */ if(probalistic_scatter) - pdf *= make_float3(1.0f, 1.0f, 1.0f) - volume_color_transmittance(sigma_t, t); + pdf *= make_float3(1.0f, 1.0f, 1.0f) - volume_color_transmittance(coeff.sigma_t, t); - new_tp = *throughput * coeff.sigma_s * transmittance / average(pdf); + new_tp = *throughput * coeff.sigma_s * transmittance / dot(channel_pdf, pdf); t = sample_t; } else { /* no scattering */ - float3 transmittance = volume_color_transmittance(sigma_t, t); - float pdf = average(transmittance); + float3 transmittance = volume_color_transmittance(coeff.sigma_t, t); + float pdf = dot(channel_pdf, transmittance); new_tp = *throughput * transmittance / pdf; } } else #endif - if(closure_flag & SD_ABSORPTION) { + if(closure_flag & SD_EXTINCTION) { /* absorption only, no sampling needed */ - float3 transmittance = volume_color_transmittance(coeff.sigma_a, t); + float3 transmittance = volume_color_transmittance(coeff.sigma_t, t); new_tp = *throughput * transmittance; } /* integrate emission attenuated by extinction */ if(L && (closure_flag & SD_EMISSION)) { - float3 sigma_t = coeff.sigma_a + coeff.sigma_s; - float3 transmittance = volume_color_transmittance(sigma_t, ray->t); + float3 transmittance = volume_color_transmittance(coeff.sigma_t, ray->t); float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, ray->t); - path_radiance_accum_emission(L, *throughput, emission, state->bounce); + path_radiance_accum_emission(L, state, *throughput, emission); } /* modify throughput */ - if(closure_flag & (SD_ABSORPTION|SD_SCATTER)) { + if(closure_flag & SD_EXTINCTION) { *throughput = new_tp; /* prepare to scatter to new direction */ @@ -444,16 +510,21 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba * volume until we reach the end, get absorbed entirely, or run out of * iterations. this does probabilistically scatter or get transmitted through * for path tracing where we don't want to branch. */ -ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg, - PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput, RNG *rng) +ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance( + KernelGlobals *kg, + ccl_addr_space PathState *state, + Ray *ray, + ShaderData *sd, + PathRadiance *L, + ccl_addr_space float3 *throughput) { float3 tp = *throughput; const float tp_eps = 1e-6f; /* todo: this is likely not the right value */ /* prepare for stepping */ int max_steps = kernel_data.integrator.volume_max_steps; - float step_size = kernel_data.integrator.volume_step_size; - float random_jitter_offset = lcg_step_float(&state->rng_congruential) * step_size; + float step_offset, step_size; + kernel_volume_step_init(kg, state, ray->t, &step_size, &step_offset); /* compute coefficients at the start */ float t = 0.0f; @@ -461,10 +532,8 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance( /* pick random color channel, we use the Veach one-sample * model with balance heuristic for the channels */ - float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE); - float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE); - int channel = (int)(rphase*3.0f); - sd->randb_closure = rphase*3.0f - channel; + float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE); + float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL); bool has_scatter = false; for(int i = 0; i < max_steps; i++) { @@ -472,11 +541,13 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance( float new_t = min(ray->t, (i+1) * step_size); float dt = new_t - t; - /* use random position inside this segment to sample shader */ - if(new_t == ray->t) - random_jitter_offset = lcg_step_float(&state->rng_congruential) * dt; + /* use random position inside this segment to sample shader, + * for last shorter step we remap it to fit within the segment. */ + if(new_t == ray->t) { + step_offset *= (new_t - t) / step_size; + } - float3 new_P = ray->P + ray->D * (t + random_jitter_offset); + float3 new_P = ray->P + ray->D * (t + step_offset); VolumeShaderCoefficients coeff; /* compute segment */ @@ -488,35 +559,37 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance( /* distance sampling */ #ifdef __VOLUME_SCATTER__ - if((closure_flag & SD_SCATTER) || (has_scatter && (closure_flag & SD_ABSORPTION))) { + if((closure_flag & SD_SCATTER) || (has_scatter && (closure_flag & SD_EXTINCTION))) { has_scatter = true; - float3 sigma_t = coeff.sigma_a + coeff.sigma_s; - float3 sigma_s = coeff.sigma_s; + /* Sample channel, use MIS with balance heuristic. */ + float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t); + float3 channel_pdf; + int channel = kernel_volume_sample_channel(albedo, tp, rphase, &channel_pdf); /* compute transmittance over full step */ - transmittance = volume_color_transmittance(sigma_t, dt); + transmittance = volume_color_transmittance(coeff.sigma_t, dt); /* decide if we will scatter or continue */ float sample_transmittance = kernel_volume_channel_get(transmittance, channel); if(1.0f - xi >= sample_transmittance) { /* compute sampling distance */ - float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel); + float sample_sigma_t = kernel_volume_channel_get(coeff.sigma_t, channel); float new_dt = -logf(1.0f - xi)/sample_sigma_t; new_t = t + new_dt; /* transmittance and pdf */ - float3 new_transmittance = volume_color_transmittance(sigma_t, new_dt); - float3 pdf = sigma_t * new_transmittance; + float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt); + float3 pdf = coeff.sigma_t * new_transmittance; /* throughput */ - new_tp = tp * sigma_s * new_transmittance / average(pdf); + new_tp = tp * coeff.sigma_s * new_transmittance / dot(channel_pdf, pdf); scatter = true; } else { /* throughput */ - float pdf = average(transmittance); + float pdf = dot(channel_pdf, transmittance); new_tp = tp * transmittance / pdf; /* remap xi so we can reuse it and keep thing stratified */ @@ -525,22 +598,20 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance( } else #endif - if(closure_flag & SD_ABSORPTION) { + if(closure_flag & SD_EXTINCTION) { /* absorption only, no sampling needed */ - float3 sigma_a = coeff.sigma_a; - - transmittance = volume_color_transmittance(sigma_a, dt); + transmittance = volume_color_transmittance(coeff.sigma_t, dt); new_tp = tp * transmittance; } /* integrate emission attenuated by absorption */ if(L && (closure_flag & SD_EMISSION)) { float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, dt); - path_radiance_accum_emission(L, tp, emission, state->bounce); + path_radiance_accum_emission(L, state, tp, emission); } /* modify throughput */ - if(closure_flag & (SD_ABSORPTION|SD_SCATTER)) { + if(closure_flag & SD_EXTINCTION) { tp = new_tp; /* stop if nearly all light blocked */ @@ -579,17 +650,24 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance( * ray, with the assumption that there are no surfaces blocking light * between the endpoints. distance sampling is used to decide if we will * scatter or not. */ -ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals *kg, - PathState *state, ShaderData *sd, Ray *ray, PathRadiance *L, float3 *throughput, RNG *rng, bool heterogeneous) +ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate( + KernelGlobals *kg, + ccl_addr_space PathState *state, + ShaderData *sd, + Ray *ray, + PathRadiance *L, + ccl_addr_space float3 *throughput, + bool heterogeneous) { shader_setup_from_volume(kg, sd, ray); if(heterogeneous) - return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput, rng); + return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput); else - return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, rng, true); + return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, true); } +#ifndef __SPLIT_KERNEL__ /* Decoupled Volume Sampling * * VolumeSegment is list of coefficients and transmittance stored at all steps @@ -618,6 +696,7 @@ typedef struct VolumeSegment { float3 accum_emission; /* accumulated emission at end of segment */ float3 accum_transmittance; /* accumulated transmittance at end of segment */ + float3 accum_albedo; /* accumulated average albedo over segment */ int sampling_method; /* volume sampling method */ } VolumeSegment; @@ -628,6 +707,7 @@ typedef struct VolumeSegment { * but the entire segment is needed to do always scattering, rather than probabilistically * hitting or missing the volume. if we don't know the transmittance at the end of the * volume we can't generate stratified distance samples up to that transmittance */ +#ifdef __VOLUME_DECOUPLED__ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, VolumeSegment *segment, bool heterogeneous) { @@ -635,19 +715,12 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta /* prepare for volume stepping */ int max_steps; - float step_size, random_jitter_offset; + float step_size, step_offset; if(heterogeneous) { - const int global_max_steps = kernel_data.integrator.volume_max_steps; - step_size = kernel_data.integrator.volume_step_size; - /* compute exact steps in advance for malloc */ - if(ray->t > global_max_steps*step_size) { - max_steps = global_max_steps; - step_size = ray->t / (float)max_steps; - } - else { - max_steps = max((int)ceilf(ray->t/step_size), 1); - } + max_steps = kernel_data.integrator.volume_max_steps; + kernel_volume_step_init(kg, state, ray->t, &step_size, &step_offset); + #ifdef __KERNEL_CPU__ /* NOTE: For the branched path tracing it's possible to have direct * and indirect light integration both having volume segments allocated. @@ -664,25 +737,25 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta sizeof(*kg->decoupled_volume_steps)); if(kg->decoupled_volume_steps[index] == NULL) { kg->decoupled_volume_steps[index] = - (VolumeStep*)malloc(sizeof(VolumeStep)*global_max_steps); + (VolumeStep*)malloc(sizeof(VolumeStep)*max_steps); } segment->steps = kg->decoupled_volume_steps[index]; ++kg->decoupled_volume_steps_index; #else segment->steps = (VolumeStep*)malloc(sizeof(VolumeStep)*max_steps); #endif - random_jitter_offset = lcg_step_float(&state->rng_congruential) * step_size; } else { max_steps = 1; step_size = ray->t; - random_jitter_offset = 0.0f; + step_offset = 0.0f; segment->steps = &segment->stack_step; } /* init accumulation variables */ float3 accum_emission = make_float3(0.0f, 0.0f, 0.0f); float3 accum_transmittance = make_float3(1.0f, 1.0f, 1.0f); + float3 accum_albedo = make_float3(0.0f, 0.0f, 0.0f); float3 cdf_distance = make_float3(0.0f, 0.0f, 0.0f); float t = 0.0f; @@ -697,17 +770,24 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta float new_t = min(ray->t, (i+1) * step_size); float dt = new_t - t; - /* use random position inside this segment to sample shader */ - if(heterogeneous && new_t == ray->t) - random_jitter_offset = lcg_step_float(&state->rng_congruential) * dt; + /* use random position inside this segment to sample shader, + * for last shorter step we remap it to fit within the segment. */ + if(new_t == ray->t) { + step_offset *= (new_t - t) / step_size; + } - float3 new_P = ray->P + ray->D * (t + random_jitter_offset); + float3 new_P = ray->P + ray->D * (t + step_offset); VolumeShaderCoefficients coeff; /* compute segment */ if(volume_shader_sample(kg, sd, state, new_P, &coeff)) { int closure_flag = sd->flag; - float3 sigma_t = coeff.sigma_a + coeff.sigma_s; + float3 sigma_t = coeff.sigma_t; + + /* compute average albedo for channel sampling */ + if(closure_flag & SD_SCATTER) { + accum_albedo += dt * safe_divide_color(coeff.sigma_s, sigma_t); + } /* compute accumulated transmittance */ float3 transmittance = volume_color_transmittance(sigma_t, dt); @@ -753,7 +833,7 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta step->accum_transmittance = accum_transmittance; step->cdf_distance = cdf_distance; step->t = new_t; - step->shade_t = t + random_jitter_offset; + step->shade_t = t + step_offset; /* stop if at the end of the volume */ t = new_t; @@ -768,6 +848,7 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta /* store total emission and transmittance */ segment->accum_emission = accum_emission; segment->accum_transmittance = accum_transmittance; + segment->accum_albedo = accum_albedo; /* normalize cumulative density function for distance sampling */ VolumeStep *last_step = segment->steps + segment->numsteps - 1; @@ -797,6 +878,7 @@ ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *s #endif } } +#endif /* __VOLUME_DECOUPLED__ */ /* scattering for homogeneous and heterogeneous volumes, using decoupled ray * marching. @@ -809,10 +891,13 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter( { kernel_assert(segment->closure_flag & SD_SCATTER); - /* pick random color channel, we use the Veach one-sample - * model with balance heuristic for the channels */ - int channel = (int)(rphase*3.0f); - sd->randb_closure = rphase*3.0f - channel; + /* Sample color channel, use MIS with balance heuristic. */ + float3 channel_pdf; + int channel = kernel_volume_sample_channel(segment->accum_albedo, + *throughput, + rphase, + &channel_pdf); + float xi = rscatter; /* probabilistic scattering decision based on transmittance */ @@ -899,7 +984,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter( if(probalistic_scatter) distance_pdf *= make_float3(1.0f, 1.0f, 1.0f) - segment->accum_transmittance; - pdf = average(distance_pdf * step_pdf_distance); + pdf = dot(channel_pdf, distance_pdf * step_pdf_distance); /* multiple importance sampling */ if(use_mis) { @@ -962,12 +1047,12 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter( /* multiple importance sampling */ if(use_mis) { float3 distance_pdf3 = kernel_volume_distance_pdf(step_t, step->sigma_t, step_sample_t); - float distance_pdf = average(distance_pdf3 * step_pdf_distance); + float distance_pdf = dot(channel_pdf, distance_pdf3 * step_pdf_distance); mis_weight = 2.0f*power_heuristic(pdf, distance_pdf); } } - if(sample_t < 1e-6f) { - return VOLUME_PATH_SCATTERED; + if(sample_t < 0.0f || pdf == 0.0f) { + return VOLUME_PATH_MISSED; } /* compute transmittance up to this step */ @@ -990,6 +1075,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter( return VOLUME_PATH_SCATTERED; } +#endif /* __SPLIT_KERNEL */ /* decide if we need to use decoupled or not */ ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneous, bool direct, int sampling_method) @@ -997,6 +1083,9 @@ ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneou /* decoupled ray marching for heterogeneous volumes not supported on the GPU, * which also means equiangular and multiple importance sampling is not * support for that case */ + if(!kernel_data.integrator.volume_decoupled) + return false; + #ifdef __KERNEL_GPU__ if(heterogeneous) return false; @@ -1021,9 +1110,9 @@ ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneou ccl_device void kernel_volume_stack_init(KernelGlobals *kg, ShaderData *stack_sd, - const PathState *state, - const Ray *ray, - VolumeStack *stack) + ccl_addr_space const PathState *state, + ccl_addr_space const Ray *ray, + ccl_addr_space VolumeStack *stack) { /* NULL ray happens in the baker, does it need proper initialization of * camera in volume? @@ -1166,7 +1255,7 @@ ccl_device void kernel_volume_stack_init(KernelGlobals *kg, } } -ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd, VolumeStack *stack) +ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd, ccl_addr_space VolumeStack *stack) { /* todo: we should have some way for objects to indicate if they want the * world shader to work inside them. excluding it by default is problematic @@ -1215,7 +1304,7 @@ ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd ccl_device void kernel_volume_stack_update_for_subsurface(KernelGlobals *kg, ShaderData *stack_sd, Ray *ray, - VolumeStack *stack) + ccl_addr_space VolumeStack *stack) { kernel_assert(kernel_data.integrator.use_volumes); @@ -1277,7 +1366,7 @@ ccl_device void kernel_volume_stack_update_for_subsurface(KernelGlobals *kg, * the world's one after the last bounce to avoid render artifacts. */ ccl_device_inline void kernel_volume_clean_stack(KernelGlobals *kg, - VolumeStack *volume_stack) + ccl_addr_space VolumeStack *volume_stack) { if(kernel_data.background.volume_shader != SHADER_NONE) { /* Keep the world's volume in stack. */ @@ -1288,4 +1377,6 @@ ccl_device_inline void kernel_volume_clean_stack(KernelGlobals *kg, } } +#endif /* __VOLUME__ */ + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h index 7d559b1aa31..0c2d9379b63 100644 --- a/intern/cycles/kernel/kernel_work_stealing.h +++ b/intern/cycles/kernel/kernel_work_stealing.h @@ -17,177 +17,66 @@ #ifndef __KERNEL_WORK_STEALING_H__ #define __KERNEL_WORK_STEALING_H__ +CCL_NAMESPACE_BEGIN + /* * Utility functions for work stealing */ -#ifdef __WORK_STEALING__ - #ifdef __KERNEL_OPENCL__ # pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #endif -uint get_group_id_with_ray_index(uint ray_index, - uint tile_dim_x, - uint tile_dim_y, - uint parallel_samples, - int dim) +#ifdef __SPLIT_KERNEL__ +/* Returns true if there is work */ +ccl_device bool get_next_work(KernelGlobals *kg, + ccl_global uint *work_pools, + uint total_work_size, + uint ray_index, + ccl_private uint *global_work_index) { - if(dim == 0) { - uint x_span = ray_index % (tile_dim_x * parallel_samples); - return x_span / get_local_size(0); + /* With a small amount of work there may be more threads than work due to + * rounding up of global size, stop such threads immediately. */ + if(ray_index >= total_work_size) { + return false; } - else /*if(dim == 1)*/ { - kernel_assert(dim == 1); - uint y_span = ray_index / (tile_dim_x * parallel_samples); - return y_span / get_local_size(1); - } -} - -uint get_total_work(uint tile_dim_x, - uint tile_dim_y, - uint grp_idx, - uint grp_idy, - uint num_samples) -{ - uint threads_within_tile_border_x = - (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0) - : get_local_size(0); - uint threads_within_tile_border_y = - (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1) - : get_local_size(1); - - threads_within_tile_border_x = - (threads_within_tile_border_x == 0) ? get_local_size(0) - : threads_within_tile_border_x; - threads_within_tile_border_y = - (threads_within_tile_border_y == 0) ? get_local_size(1) - : threads_within_tile_border_y; - - return threads_within_tile_border_x * - threads_within_tile_border_y * - num_samples; -} -/* Returns 0 in case there is no next work available */ -/* Returns 1 in case work assigned is valid */ -int get_next_work(ccl_global uint *work_pool, - ccl_private uint *my_work, - uint tile_dim_x, - uint tile_dim_y, - uint num_samples, - uint parallel_samples, - uint ray_index) -{ - uint grp_idx = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 0); - uint grp_idy = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 1); - uint total_work = get_total_work(tile_dim_x, - tile_dim_y, - grp_idx, - grp_idy, - num_samples); - uint group_index = grp_idy * get_num_groups(0) + grp_idx; - *my_work = atomic_inc(&work_pool[group_index]); - return (*my_work < total_work) ? 1 : 0; -} + /* Increase atomic work index counter in pool. */ + uint pool = ray_index / WORK_POOL_SIZE; + uint work_index = atomic_fetch_and_inc_uint32(&work_pools[pool]); -/* This function assumes that the passed my_work is valid. */ -/* Decode sample number w.r.t. assigned my_work. */ -uint get_my_sample(uint my_work, - uint tile_dim_x, - uint tile_dim_y, - uint parallel_samples, - uint ray_index) -{ - uint grp_idx = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 0); - uint grp_idy = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 1); - uint threads_within_tile_border_x = - (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0) - : get_local_size(0); - uint threads_within_tile_border_y = - (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1) - : get_local_size(1); + /* Map per-pool work index to a global work index. */ + uint global_size = ccl_global_size(0) * ccl_global_size(1); + kernel_assert(global_size % WORK_POOL_SIZE == 0); + kernel_assert(ray_index < global_size); - threads_within_tile_border_x = - (threads_within_tile_border_x == 0) ? get_local_size(0) - : threads_within_tile_border_x; - threads_within_tile_border_y = - (threads_within_tile_border_y == 0) ? get_local_size(1) - : threads_within_tile_border_y; + *global_work_index = (work_index / WORK_POOL_SIZE) * global_size + + (pool * WORK_POOL_SIZE) + + (work_index % WORK_POOL_SIZE); - return my_work / - (threads_within_tile_border_x * threads_within_tile_border_y); + /* Test if all work for this pool is done. */ + return (*global_work_index < total_work_size); } +#endif -/* Decode pixel and tile position w.r.t. assigned my_work. */ -void get_pixel_tile_position(ccl_private uint *pixel_x, - ccl_private uint *pixel_y, - ccl_private uint *tile_x, - ccl_private uint *tile_y, - uint my_work, - uint tile_dim_x, - uint tile_dim_y, - uint tile_offset_x, - uint tile_offset_y, - uint parallel_samples, - uint ray_index) +/* Map global work index to tile, pixel X/Y and sample. */ +ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile, + uint global_work_index, + ccl_private uint *x, + ccl_private uint *y, + ccl_private uint *sample) { - uint grp_idx = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 0); - uint grp_idy = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 1); - uint threads_within_tile_border_x = - (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0) - : get_local_size(0); - uint threads_within_tile_border_y = - (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1) - : get_local_size(1); - - threads_within_tile_border_x = - (threads_within_tile_border_x == 0) ? get_local_size(0) - : threads_within_tile_border_x; - threads_within_tile_border_y = - (threads_within_tile_border_y == 0) ? get_local_size(1) - : threads_within_tile_border_y; - - uint total_associated_pixels = - threads_within_tile_border_x * threads_within_tile_border_y; - uint work_group_pixel_index = my_work % total_associated_pixels; - uint work_group_pixel_x = - work_group_pixel_index % threads_within_tile_border_x; - uint work_group_pixel_y = - work_group_pixel_index / threads_within_tile_border_x; - - *pixel_x = - tile_offset_x + (grp_idx * get_local_size(0)) + work_group_pixel_x; - *pixel_y = - tile_offset_y + (grp_idy * get_local_size(1)) + work_group_pixel_y; - *tile_x = *pixel_x - tile_offset_x; - *tile_y = *pixel_y - tile_offset_y; + uint tile_pixels = tile->w * tile->h; + uint sample_offset = global_work_index / tile_pixels; + uint pixel_offset = global_work_index - sample_offset * tile_pixels; + uint y_offset = pixel_offset / tile->w; + uint x_offset = pixel_offset - y_offset * tile->w; + + *x = tile->x + x_offset; + *y = tile->y + y_offset; + *sample = tile->start_sample + sample_offset; } -#endif /* __WORK_STEALING__ */ +CCL_NAMESPACE_END #endif /* __KERNEL_WORK_STEALING_H__ */ diff --git a/intern/cycles/kernel/kernels/cpu/filter.cpp b/intern/cycles/kernel/kernels/cpu/filter.cpp new file mode 100644 index 00000000000..2ff1a392dc3 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter.cpp @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CPU kernel entry points */ + +/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this + * one with SSE2 intrinsics. + */ +#if defined(__x86_64__) || defined(_M_X64) +# define __KERNEL_SSE2__ +#endif + +/* When building kernel for native machine detect kernel features from the flags + * set by compiler. + */ +#ifdef WITH_KERNEL_NATIVE +# ifdef __SSE2__ +# ifndef __KERNEL_SSE2__ +# define __KERNEL_SSE2__ +# endif +# endif +# ifdef __SSE3__ +# define __KERNEL_SSE3__ +# endif +# ifdef __SSSE3__ +# define __KERNEL_SSSE3__ +# endif +# ifdef __SSE4_1__ +# define __KERNEL_SSE41__ +# endif +# ifdef __AVX__ +# define __KERNEL_SSE__ +# define __KERNEL_AVX__ +# endif +# ifdef __AVX2__ +# define __KERNEL_SSE__ +# define __KERNEL_AVX2__ +# endif +#endif + +/* quiet unused define warnings */ +#if defined(__KERNEL_SSE2__) + /* do nothing */ +#endif + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp new file mode 100644 index 00000000000..4a9e6047ecf --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp new file mode 100644 index 00000000000..c22ec576254 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp @@ -0,0 +1,40 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# define __KERNEL_AVX2__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h new file mode 100644 index 00000000000..4231aba88d7 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h @@ -0,0 +1,133 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Templated common declaration part of all CPU kernels. */ + +void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, + TilesInfo *tiles, + int x, + int y, + float *unfilteredA, + float *unfilteredB, + float *sampleV, + float *sampleVV, + float *bufferV, + int* prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset); + +void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, + TilesInfo *tiles, + int m_offset, + int v_offset, + int x, + int y, + float *mean, + float *variance, + int* prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset); + +void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y, + ccl_global float *image, + ccl_global float *variance, + ccl_global float *depth, + ccl_global float *output, + int *rect, + int pass_stride); + +void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y, + float *mean, + float *variance, + float *a, + float *b, + int* prefilter_rect, + int r); + +void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, + int x, + int y, + int storage_ofs, + float *transform, + int *rank, + int* rect, + int pass_stride, + int radius, + float pca_threshold); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, + int dy, + float *weight_image, + float *variance, + float *difference_image, + int* rect, + int stride, + int channel_offset, + float a, + float k_2); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image, + float *out_image, + int* rect, + int stride, + int f); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image, + float *out_image, + int* rect, + int stride, + int f); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, + int dy, + float *difference_image, + float *image, + float *out_image, + float *accum_image, + int* rect, + int stride, + int f); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, + int dy, + float *difference_image, + float *buffer, + float *transform, + int *rank, + float *XtWX, + float3 *XtWY, + int *rect, + int *filter_window, + int stride, + int f, + int pass_stride); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image, + float *accum_image, + int* rect, + int stride); + +void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x, + int y, + int storage_ofs, + float *buffer, + int *rank, + float *XtWX, + float3 *XtWY, + int *buffer_params, + int sample); + +#undef KERNEL_ARCH diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h new file mode 100644 index 00000000000..504622ecfd9 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h @@ -0,0 +1,264 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Templated common implementation part of all CPU kernels. + * + * The idea is that particular .cpp files sets needed optimization flags and + * simply includes this file without worry of copying actual implementation over. + */ + +#include "kernel/kernel_compat_cpu.h" + +#include "kernel/filter/filter_kernel.h" + +#ifdef KERNEL_STUB +# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!")) +#endif + +CCL_NAMESPACE_BEGIN + + +/* Denoise filter */ + +void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, + TilesInfo *tiles, + int x, + int y, + float *unfilteredA, + float *unfilteredB, + float *sampleVariance, + float *sampleVarianceV, + float *bufferVariance, + int* prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow); +#else + kernel_filter_divide_shadow(sample, tiles, + x, y, + unfilteredA, + unfilteredB, + sampleVariance, + sampleVarianceV, + bufferVariance, + load_int4(prefilter_rect), + buffer_pass_stride, + buffer_denoising_offset); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, + TilesInfo *tiles, + int m_offset, + int v_offset, + int x, + int y, + float *mean, float *variance, + int* prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_get_feature); +#else + kernel_filter_get_feature(sample, tiles, + m_offset, v_offset, + x, y, + mean, variance, + load_int4(prefilter_rect), + buffer_pass_stride, + buffer_denoising_offset); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y, + ccl_global float *image, + ccl_global float *variance, + ccl_global float *depth, + ccl_global float *output, + int *rect, + int pass_stride) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_detect_outliers); +#else + kernel_filter_detect_outliers(x, y, image, variance, depth, output, load_int4(rect), pass_stride); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y, + float *mean, + float *variance, + float *a, + float *b, + int* prefilter_rect, + int r) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_combine_halves); +#else + kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, + int x, + int y, + int storage_ofs, + float *transform, + int *rank, + int* prefilter_rect, + int pass_stride, + int radius, + float pca_threshold) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_construct_transform); +#else + rank += storage_ofs; + transform += storage_ofs*TRANSFORM_SIZE; + kernel_filter_construct_transform(buffer, + x, y, + load_int4(prefilter_rect), + pass_stride, + transform, + rank, + radius, + pca_threshold); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, + int dy, + float *weight_image, + float *variance, + float *difference_image, + int *rect, + int stride, + int channel_offset, + float a, + float k_2) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference); +#else + kernel_filter_nlm_calc_difference(dx, dy, weight_image, variance, difference_image, load_int4(rect), stride, channel_offset, a, k_2); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image, + float *out_image, + int *rect, + int stride, + int f) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur); +#else + kernel_filter_nlm_blur(difference_image, out_image, load_int4(rect), stride, f); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image, + float *out_image, + int *rect, + int stride, + int f) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight); +#else + kernel_filter_nlm_calc_weight(difference_image, out_image, load_int4(rect), stride, f); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, + int dy, + float *difference_image, + float *image, + float *out_image, + float *accum_image, + int *rect, + int stride, + int f) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output); +#else + kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), stride, f); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, + int dy, + float *difference_image, + float *buffer, + float *transform, + int *rank, + float *XtWX, + float3 *XtWY, + int *rect, + int *filter_window, + int stride, + int f, + int pass_stride) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian); +#else + kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_window), stride, f, pass_stride); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image, + float *accum_image, + int *rect, + int stride) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize); +#else + kernel_filter_nlm_normalize(out_image, accum_image, load_int4(rect), stride); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x, + int y, + int storage_ofs, + float *buffer, + int *rank, + float *XtWX, + float3 *XtWY, + int *buffer_params, + int sample) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_finalize); +#else + XtWX += storage_ofs*XTWX_SIZE; + XtWY += storage_ofs*XTWY_SIZE; + rank += storage_ofs; + kernel_filter_finalize(x, y, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample); +#endif +} + +#undef KERNEL_STUB +#undef STUB_ASSERT +#undef KERNEL_ARCH + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp new file mode 100644 index 00000000000..f7c9935f1d0 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp @@ -0,0 +1,34 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp new file mode 100644 index 00000000000..070b95a3505 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp @@ -0,0 +1,36 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp new file mode 100644 index 00000000000..254025be4e2 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp @@ -0,0 +1,38 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel.cpp b/intern/cycles/kernel/kernels/cpu/kernel.cpp index 72dbbd9a416..de487f6123f 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel.cpp @@ -56,9 +56,9 @@ /* do nothing */ #endif -#include "kernel.h" +#include "kernel/kernel.h" #define KERNEL_ARCH cpu -#include "kernel_cpu_impl.h" +#include "kernel/kernels/cpu/kernel_cpu_impl.h" CCL_NAMESPACE_BEGIN @@ -74,122 +74,21 @@ void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t s void kernel_tex_copy(KernelGlobals *kg, const char *name, - device_ptr mem, - size_t width, - size_t height, - size_t depth, - InterpolationType interpolation, - ExtensionType extension) + void *mem, + size_t size) { if(0) { } -#define KERNEL_TEX(type, ttype, tname) \ +#define KERNEL_TEX(type, tname) \ else if(strcmp(name, #tname) == 0) { \ kg->tname.data = (type*)mem; \ - kg->tname.width = width; \ + kg->tname.width = size; \ } -#define KERNEL_IMAGE_TEX(type, ttype, tname) -#include "kernel_textures.h" - - else if(strstr(name, "__tex_image_float4")) { - texture_image_float4 *tex = NULL; - int id = atoi(name + strlen("__tex_image_float4_")); - int array_index = id; - - if(array_index >= 0 && array_index < TEX_NUM_FLOAT4_CPU) { - tex = &kg->texture_float4_images[array_index]; - } - - if(tex) { - tex->data = (float4*)mem; - tex->dimensions_set(width, height, depth); - tex->interpolation = interpolation; - tex->extension = extension; - } - } - else if(strstr(name, "__tex_image_float")) { - texture_image_float *tex = NULL; - int id = atoi(name + strlen("__tex_image_float_")); - int array_index = id - TEX_START_FLOAT_CPU; - - if(array_index >= 0 && array_index < TEX_NUM_FLOAT_CPU) { - tex = &kg->texture_float_images[array_index]; - } - - if(tex) { - tex->data = (float*)mem; - tex->dimensions_set(width, height, depth); - tex->interpolation = interpolation; - tex->extension = extension; - } - } - else if(strstr(name, "__tex_image_byte4")) { - texture_image_uchar4 *tex = NULL; - int id = atoi(name + strlen("__tex_image_byte4_")); - int array_index = id - TEX_START_BYTE4_CPU; - - if(array_index >= 0 && array_index < TEX_NUM_BYTE4_CPU) { - tex = &kg->texture_byte4_images[array_index]; - } - - if(tex) { - tex->data = (uchar4*)mem; - tex->dimensions_set(width, height, depth); - tex->interpolation = interpolation; - tex->extension = extension; - } - } - else if(strstr(name, "__tex_image_byte")) { - texture_image_uchar *tex = NULL; - int id = atoi(name + strlen("__tex_image_byte_")); - int array_index = id - TEX_START_BYTE_CPU; - - if(array_index >= 0 && array_index < TEX_NUM_BYTE_CPU) { - tex = &kg->texture_byte_images[array_index]; - } - - if(tex) { - tex->data = (uchar*)mem; - tex->dimensions_set(width, height, depth); - tex->interpolation = interpolation; - tex->extension = extension; - } - } - else if(strstr(name, "__tex_image_half4")) { - texture_image_half4 *tex = NULL; - int id = atoi(name + strlen("__tex_image_half4_")); - int array_index = id - TEX_START_HALF4_CPU; - - if(array_index >= 0 && array_index < TEX_NUM_HALF4_CPU) { - tex = &kg->texture_half4_images[array_index]; - } - - if(tex) { - tex->data = (half4*)mem; - tex->dimensions_set(width, height, depth); - tex->interpolation = interpolation; - tex->extension = extension; - } - } - else if(strstr(name, "__tex_image_half")) { - texture_image_half *tex = NULL; - int id = atoi(name + strlen("__tex_image_half_")); - int array_index = id - TEX_START_HALF_CPU; - - if(array_index >= 0 && array_index < TEX_NUM_HALF_CPU) { - tex = &kg->texture_half_images[array_index]; - } - - if(tex) { - tex->data = (half*)mem; - tex->dimensions_set(width, height, depth); - tex->interpolation = interpolation; - tex->extension = extension; - } - } - else +#include "kernel/kernel_textures.h" + else { assert(0); + } } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp index 1350d9e5c2e..a645fb4d8dd 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp @@ -17,21 +17,23 @@ /* Optimized CPU kernel entry points. This file is compiled with AVX * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ - -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -#endif -#include "util_optimization.h" +#include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX -# include "kernel.h" -# define KERNEL_ARCH cpu_avx -# include "kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp index 1a416e771ee..6bbb87727b9 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp @@ -18,21 +18,23 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -# define __KERNEL_AVX2__ -#endif - -#include "util_optimization.h" +#include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 -# include "kernel.h" -# define KERNEL_ARCH cpu_avx2 -# include "kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# define __KERNEL_AVX2__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h index 1a07c705f1c..6bdb8546a24 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h @@ -18,7 +18,6 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg, float *buffer, - unsigned int *rng_state, int sample, int x, int y, int offset, @@ -42,11 +41,50 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, uint4 *input, float4 *output, - float *output_luma, int type, int filter, int i, int offset, int sample); +/* Split kernels */ + +void KERNEL_FUNCTION_FULL_NAME(data_init)( + KernelGlobals *kg, + ccl_constant KernelData *data, + ccl_global void *split_data_buffer, + int num_elements, + ccl_global char *ray_state, + int start_sample, + int end_sample, + int sx, int sy, int sw, int sh, int offset, int stride, + ccl_global int *Queue_index, + int queuesize, + ccl_global char *use_queues_flag, + ccl_global unsigned int *work_pool_wgs, + unsigned int num_samples, + ccl_global float *buffer); + +#define DECLARE_SPLIT_KERNEL_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData *data); + +DECLARE_SPLIT_KERNEL_FUNCTION(path_init) +DECLARE_SPLIT_KERNEL_FUNCTION(scene_intersect) +DECLARE_SPLIT_KERNEL_FUNCTION(lamp_emission) +DECLARE_SPLIT_KERNEL_FUNCTION(do_volume) +DECLARE_SPLIT_KERNEL_FUNCTION(queue_enqueue) +DECLARE_SPLIT_KERNEL_FUNCTION(indirect_background) +DECLARE_SPLIT_KERNEL_FUNCTION(shader_setup) +DECLARE_SPLIT_KERNEL_FUNCTION(shader_sort) +DECLARE_SPLIT_KERNEL_FUNCTION(shader_eval) +DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao) +DECLARE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) +DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting) +DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) +DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) +DECLARE_SPLIT_KERNEL_FUNCTION(enqueue_inactive) +DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup) +DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) +DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update) + #undef KERNEL_ARCH diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h index af68907a5c2..56c38d8101c 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h @@ -17,62 +17,503 @@ #ifndef __KERNEL_CPU_IMAGE_H__ #define __KERNEL_CPU_IMAGE_H__ -#ifdef __KERNEL_CPU__ - CCL_NAMESPACE_BEGIN -ccl_device float4 kernel_tex_image_interp_impl(KernelGlobals *kg, int tex, float x, float y) -{ - if(tex >= TEX_START_HALF_CPU) - return kg->texture_half_images[tex - TEX_START_HALF_CPU].interp(x, y); - else if(tex >= TEX_START_BYTE_CPU) - return kg->texture_byte_images[tex - TEX_START_BYTE_CPU].interp(x, y); - else if(tex >= TEX_START_FLOAT_CPU) - return kg->texture_float_images[tex - TEX_START_FLOAT_CPU].interp(x, y); - else if(tex >= TEX_START_HALF4_CPU) - return kg->texture_half4_images[tex - TEX_START_HALF4_CPU].interp(x, y); - else if(tex >= TEX_START_BYTE4_CPU) - return kg->texture_byte4_images[tex - TEX_START_BYTE4_CPU].interp(x, y); - else - return kg->texture_float4_images[tex].interp(x, y); -} +template<typename T> struct TextureInterpolator { +#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \ + { \ + u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \ + u[1] = (( 0.5f * t - 1.0f) * t ) * t + (2.0f/3.0f); \ + u[2] = (( -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \ + u[3] = (1.0f / 6.0f) * t * t * t; \ + } (void)0 + + static ccl_always_inline float4 read(float4 r) + { + return r; + } + + static ccl_always_inline float4 read(uchar4 r) + { + float f = 1.0f/255.0f; + return make_float4(r.x*f, r.y*f, r.z*f, r.w*f); + } + + static ccl_always_inline float4 read(uchar r) + { + float f = r*(1.0f/255.0f); + return make_float4(f, f, f, 1.0f); + } + + static ccl_always_inline float4 read(float r) + { + /* TODO(dingto): Optimize this, so interpolation + * happens on float instead of float4 */ + return make_float4(r, r, r, 1.0f); + } + + static ccl_always_inline float4 read(half4 r) + { + return half4_to_float4(r); + } + + static ccl_always_inline float4 read(half r) + { + float f = half_to_float(r); + return make_float4(f, f, f, 1.0f); + } + + static ccl_always_inline float4 read(const T *data, + int x, int y, + int width, int height) + { + if(x < 0 || y < 0 || x >= width || y >= height) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + return read(data[y * width + x]); + } + + static ccl_always_inline int wrap_periodic(int x, int width) + { + x %= width; + if(x < 0) + x += width; + return x; + } + + static ccl_always_inline int wrap_clamp(int x, int width) + { + return clamp(x, 0, width-1); + } + + static ccl_always_inline float frac(float x, int *ix) + { + int i = float_to_int(x) - ((x < 0.0f)? 1: 0); + *ix = i; + return x - (float)i; + } + + /* ******** 2D interpolation ******** */ + + static ccl_always_inline float4 interp_closest(const TextureInfo& info, + float x, float y) + { + const T *data = (const T*)info.data; + const int width = info.width; + const int height = info.height; + int ix, iy; + frac(x*(float)width, &ix); + frac(y*(float)height, &iy); + switch(info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + break; + case EXTENSION_CLIP: + if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + ATTR_FALLTHROUGH; + case EXTENSION_EXTEND: + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + return read(data[ix + iy*width]); + } + + static ccl_always_inline float4 interp_linear(const TextureInfo& info, + float x, float y) + { + const T *data = (const T*)info.data; + const int width = info.width; + const int height = info.height; + int ix, iy, nix, niy; + const float tx = frac(x*(float)width - 0.5f, &ix); + const float ty = frac(y*(float)height - 0.5f, &iy); + switch(info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + nix = wrap_periodic(ix+1, width); + niy = wrap_periodic(iy+1, height); + break; + case EXTENSION_CLIP: + nix = ix + 1; + niy = iy + 1; + break; + case EXTENSION_EXTEND: + nix = wrap_clamp(ix+1, width); + niy = wrap_clamp(iy+1, height); + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + return (1.0f - ty) * (1.0f - tx) * read(data, ix, iy, width, height) + + (1.0f - ty) * tx * read(data, nix, iy, width, height) + + ty * (1.0f - tx) * read(data, ix, niy, width, height) + + ty * tx * read(data, nix, niy, width, height); + } + + static ccl_always_inline float4 interp_cubic(const TextureInfo& info, + float x, float y) + { + const T *data = (const T*)info.data; + const int width = info.width; + const int height = info.height; + int ix, iy, nix, niy; + const float tx = frac(x*(float)width - 0.5f, &ix); + const float ty = frac(y*(float)height - 0.5f, &iy); + int pix, piy, nnix, nniy; + switch(info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + pix = wrap_periodic(ix-1, width); + piy = wrap_periodic(iy-1, height); + nix = wrap_periodic(ix+1, width); + niy = wrap_periodic(iy+1, height); + nnix = wrap_periodic(ix+2, width); + nniy = wrap_periodic(iy+2, height); + break; + case EXTENSION_CLIP: + pix = ix - 1; + piy = iy - 1; + nix = ix + 1; + niy = iy + 1; + nnix = ix + 2; + nniy = iy + 2; + break; + case EXTENSION_EXTEND: + pix = wrap_clamp(ix-1, width); + piy = wrap_clamp(iy-1, height); + nix = wrap_clamp(ix+1, width); + niy = wrap_clamp(iy+1, height); + nnix = wrap_clamp(ix+2, width); + nniy = wrap_clamp(iy+2, height); + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + const int xc[4] = {pix, ix, nix, nnix}; + const int yc[4] = {piy, iy, niy, nniy}; + float u[4], v[4]; + /* Some helper macro to keep code reasonable size, + * let compiler to inline all the matrix multiplications. + */ +#define DATA(x, y) (read(data, xc[x], yc[y], width, height)) +#define TERM(col) \ + (v[col] * (u[0] * DATA(0, col) + \ + u[1] * DATA(1, col) + \ + u[2] * DATA(2, col) + \ + u[3] * DATA(3, col))) + + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); + + /* Actual interpolation. */ + return TERM(0) + TERM(1) + TERM(2) + TERM(3); +#undef TERM +#undef DATA + } + + static ccl_always_inline float4 interp(const TextureInfo& info, + float x, float y) + { + if(UNLIKELY(!info.data)) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + switch(info.interpolation) { + case INTERPOLATION_CLOSEST: + return interp_closest(info, x, y); + case INTERPOLATION_LINEAR: + return interp_linear(info, x, y); + default: + return interp_cubic(info, x, y); + } + } + + /* ******** 3D interpolation ******** */ + + static ccl_always_inline float4 interp_3d_closest(const TextureInfo& info, + float x, float y, float z) + { + int width = info.width; + int height = info.height; + int depth = info.depth; + int ix, iy, iz; + + frac(x*(float)width, &ix); + frac(y*(float)height, &iy); + frac(z*(float)depth, &iz); + + switch(info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + iz = wrap_periodic(iz, depth); + break; + case EXTENSION_CLIP: + if(x < 0.0f || y < 0.0f || z < 0.0f || + x > 1.0f || y > 1.0f || z > 1.0f) + { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + ATTR_FALLTHROUGH; + case EXTENSION_EXTEND: + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + iz = wrap_clamp(iz, depth); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + + const T *data = (const T*)info.data; + return read(data[ix + iy*width + iz*width*height]); + } + + static ccl_always_inline float4 interp_3d_linear(const TextureInfo& info, + float x, float y, float z) + { + int width = info.width; + int height = info.height; + int depth = info.depth; + int ix, iy, iz; + int nix, niy, niz; -ccl_device float4 kernel_tex_image_interp_3d_impl(KernelGlobals *kg, int tex, float x, float y, float z) + float tx = frac(x*(float)width - 0.5f, &ix); + float ty = frac(y*(float)height - 0.5f, &iy); + float tz = frac(z*(float)depth - 0.5f, &iz); + + switch(info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + iz = wrap_periodic(iz, depth); + + nix = wrap_periodic(ix+1, width); + niy = wrap_periodic(iy+1, height); + niz = wrap_periodic(iz+1, depth); + break; + case EXTENSION_CLIP: + if(x < 0.0f || y < 0.0f || z < 0.0f || + x > 1.0f || y > 1.0f || z > 1.0f) + { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + ATTR_FALLTHROUGH; + case EXTENSION_EXTEND: + nix = wrap_clamp(ix+1, width); + niy = wrap_clamp(iy+1, height); + niz = wrap_clamp(iz+1, depth); + + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + iz = wrap_clamp(iz, depth); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + + const T *data = (const T*)info.data; + float4 r; + + r = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + iz*width*height]); + r += (1.0f - tz)*(1.0f - ty)*tx*read(data[nix + iy*width + iz*width*height]); + r += (1.0f - tz)*ty*(1.0f - tx)*read(data[ix + niy*width + iz*width*height]); + r += (1.0f - tz)*ty*tx*read(data[nix + niy*width + iz*width*height]); + + r += tz*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + niz*width*height]); + r += tz*(1.0f - ty)*tx*read(data[nix + iy*width + niz*width*height]); + r += tz*ty*(1.0f - tx)*read(data[ix + niy*width + niz*width*height]); + r += tz*ty*tx*read(data[nix + niy*width + niz*width*height]); + + return r; + } + + /* TODO(sergey): For some unspeakable reason both GCC-6 and Clang-3.9 are + * causing stack overflow issue in this function unless it is inlined. + * + * Only happens for AVX2 kernel and global __KERNEL_SSE__ vectorization + * enabled. + */ +#ifdef __GNUC__ + static ccl_always_inline +#else + static ccl_never_inline +#endif + float4 interp_3d_tricubic(const TextureInfo& info, float x, float y, float z) + { + int width = info.width; + int height = info.height; + int depth = info.depth; + int ix, iy, iz; + int nix, niy, niz; + /* Tricubic b-spline interpolation. */ + const float tx = frac(x*(float)width - 0.5f, &ix); + const float ty = frac(y*(float)height - 0.5f, &iy); + const float tz = frac(z*(float)depth - 0.5f, &iz); + int pix, piy, piz, nnix, nniy, nniz; + + switch(info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + iz = wrap_periodic(iz, depth); + + pix = wrap_periodic(ix-1, width); + piy = wrap_periodic(iy-1, height); + piz = wrap_periodic(iz-1, depth); + + nix = wrap_periodic(ix+1, width); + niy = wrap_periodic(iy+1, height); + niz = wrap_periodic(iz+1, depth); + + nnix = wrap_periodic(ix+2, width); + nniy = wrap_periodic(iy+2, height); + nniz = wrap_periodic(iz+2, depth); + break; + case EXTENSION_CLIP: + if(x < 0.0f || y < 0.0f || z < 0.0f || + x > 1.0f || y > 1.0f || z > 1.0f) + { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + ATTR_FALLTHROUGH; + case EXTENSION_EXTEND: + pix = wrap_clamp(ix-1, width); + piy = wrap_clamp(iy-1, height); + piz = wrap_clamp(iz-1, depth); + + nix = wrap_clamp(ix+1, width); + niy = wrap_clamp(iy+1, height); + niz = wrap_clamp(iz+1, depth); + + nnix = wrap_clamp(ix+2, width); + nniy = wrap_clamp(iy+2, height); + nniz = wrap_clamp(iz+2, depth); + + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + iz = wrap_clamp(iz, depth); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + + const int xc[4] = {pix, ix, nix, nnix}; + const int yc[4] = {width * piy, + width * iy, + width * niy, + width * nniy}; + const int zc[4] = {width * height * piz, + width * height * iz, + width * height * niz, + width * height * nniz}; + float u[4], v[4], w[4]; + + /* Some helper macro to keep code reasonable size, + * let compiler to inline all the matrix multiplications. + */ +#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]])) +#define COL_TERM(col, row) \ + (v[col] * (u[0] * DATA(0, col, row) + \ + u[1] * DATA(1, col, row) + \ + u[2] * DATA(2, col, row) + \ + u[3] * DATA(3, col, row))) +#define ROW_TERM(row) \ + (w[row] * (COL_TERM(0, row) + \ + COL_TERM(1, row) + \ + COL_TERM(2, row) + \ + COL_TERM(3, row))) + + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); + SET_CUBIC_SPLINE_WEIGHTS(w, tz); + + /* Actual interpolation. */ + const T *data = (const T*)info.data; + return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3); + +#undef COL_TERM +#undef ROW_TERM +#undef DATA + } + + static ccl_always_inline float4 interp_3d(const TextureInfo& info, + float x, float y, float z, + InterpolationType interp) + { + if(UNLIKELY(!info.data)) + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + + switch((interp == INTERPOLATION_NONE)? info.interpolation: interp) { + case INTERPOLATION_CLOSEST: + return interp_3d_closest(info, x, y, z); + case INTERPOLATION_LINEAR: + return interp_3d_linear(info, x, y, z); + default: + return interp_3d_tricubic(info, x, y, z); + } + } +#undef SET_CUBIC_SPLINE_WEIGHTS +}; + +ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y) { - if(tex >= TEX_START_HALF_CPU) - return kg->texture_half_images[tex - TEX_START_HALF_CPU].interp_3d(x, y, z); - else if(tex >= TEX_START_BYTE_CPU) - return kg->texture_byte_images[tex - TEX_START_BYTE_CPU].interp_3d(x, y, z); - else if(tex >= TEX_START_FLOAT_CPU) - return kg->texture_float_images[tex - TEX_START_FLOAT_CPU].interp_3d(x, y, z); - else if(tex >= TEX_START_HALF4_CPU) - return kg->texture_half4_images[tex - TEX_START_HALF4_CPU].interp_3d(x, y, z); - else if(tex >= TEX_START_BYTE4_CPU) - return kg->texture_byte4_images[tex - TEX_START_BYTE4_CPU].interp_3d(x, y, z); - else - return kg->texture_float4_images[tex].interp_3d(x, y, z); + const TextureInfo& info = kernel_tex_fetch(__texture_info, id); + switch(kernel_tex_type(id)) { + case IMAGE_DATA_TYPE_HALF: + return TextureInterpolator<half>::interp(info, x, y); + case IMAGE_DATA_TYPE_BYTE: + return TextureInterpolator<uchar>::interp(info, x, y); + case IMAGE_DATA_TYPE_FLOAT: + return TextureInterpolator<float>::interp(info, x, y); + case IMAGE_DATA_TYPE_HALF4: + return TextureInterpolator<half4>::interp(info, x, y); + case IMAGE_DATA_TYPE_BYTE4: + return TextureInterpolator<uchar4>::interp(info, x, y); + case IMAGE_DATA_TYPE_FLOAT4: + default: + return TextureInterpolator<float4>::interp(info, x, y); + } } -ccl_device float4 kernel_tex_image_interp_3d_ex_impl(KernelGlobals *kg, int tex, float x, float y, float z, int interpolation) +ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z, InterpolationType interp) { - if(tex >= TEX_START_HALF_CPU) - return kg->texture_half_images[tex - TEX_START_HALF_CPU].interp_3d_ex(x, y, z, interpolation); - else if(tex >= TEX_START_BYTE_CPU) - return kg->texture_byte_images[tex - TEX_START_BYTE_CPU].interp_3d_ex(x, y, z, interpolation); - else if(tex >= TEX_START_FLOAT_CPU) - return kg->texture_float_images[tex - TEX_START_FLOAT_CPU].interp_3d_ex(x, y, z, interpolation); - else if(tex >= TEX_START_HALF4_CPU) - return kg->texture_half4_images[tex - TEX_START_HALF4_CPU].interp_3d_ex(x, y, z, interpolation); - else if(tex >= TEX_START_BYTE4_CPU) - return kg->texture_byte4_images[tex - TEX_START_BYTE4_CPU].interp_3d_ex(x, y, z, interpolation); - else - return kg->texture_float4_images[tex].interp_3d_ex(x, y, z, interpolation); + const TextureInfo& info = kernel_tex_fetch(__texture_info, id); + + switch(kernel_tex_type(id)) { + case IMAGE_DATA_TYPE_HALF: + return TextureInterpolator<half>::interp_3d(info, x, y, z, interp); + case IMAGE_DATA_TYPE_BYTE: + return TextureInterpolator<uchar>::interp_3d(info, x, y, z, interp); + case IMAGE_DATA_TYPE_FLOAT: + return TextureInterpolator<float>::interp_3d(info, x, y, z, interp); + case IMAGE_DATA_TYPE_HALF4: + return TextureInterpolator<half4>::interp_3d(info, x, y, z, interp); + case IMAGE_DATA_TYPE_BYTE4: + return TextureInterpolator<uchar4>::interp_3d(info, x, y, z, interp); + case IMAGE_DATA_TYPE_FLOAT4: + default: + return TextureInterpolator<float4>::interp_3d(info, x, y, z, interp); + } } CCL_NAMESPACE_END -#endif // __KERNEL_CPU__ - - #endif // __KERNEL_CPU_IMAGE_H__ diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h index ec82d4b4c22..ccca023a15f 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h @@ -20,43 +20,83 @@ * simply includes this file without worry of copying actual implementation over. */ -#include "kernel_compat_cpu.h" -#include "kernel_math.h" -#include "kernel_types.h" -#include "kernel_globals.h" -#include "kernel_cpu_image.h" -#include "kernel_film.h" -#include "kernel_path.h" -#include "kernel_path_branched.h" -#include "kernel_bake.h" +#include "kernel/kernel_compat_cpu.h" + +#ifndef KERNEL_STUB +# ifndef __SPLIT_KERNEL__ +# include "kernel/kernel_math.h" +# include "kernel/kernel_types.h" + +# include "kernel/split/kernel_split_data.h" +# include "kernel/kernel_globals.h" + +# include "kernel/kernels/cpu/kernel_cpu_image.h" +# include "kernel/kernel_film.h" +# include "kernel/kernel_path.h" +# include "kernel/kernel_path_branched.h" +# include "kernel/kernel_bake.h" +# else +# include "kernel/split/kernel_split_common.h" + +# include "kernel/split/kernel_data_init.h" +# include "kernel/split/kernel_path_init.h" +# include "kernel/split/kernel_scene_intersect.h" +# include "kernel/split/kernel_lamp_emission.h" +# include "kernel/split/kernel_do_volume.h" +# include "kernel/split/kernel_queue_enqueue.h" +# include "kernel/split/kernel_indirect_background.h" +# include "kernel/split/kernel_shader_setup.h" +# include "kernel/split/kernel_shader_sort.h" +# include "kernel/split/kernel_shader_eval.h" +# include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" +# include "kernel/split/kernel_subsurface_scatter.h" +# include "kernel/split/kernel_direct_lighting.h" +# include "kernel/split/kernel_shadow_blocked_ao.h" +# include "kernel/split/kernel_shadow_blocked_dl.h" +# include "kernel/split/kernel_enqueue_inactive.h" +# include "kernel/split/kernel_next_iteration_setup.h" +# include "kernel/split/kernel_indirect_subsurface.h" +# include "kernel/split/kernel_buffer_update.h" +# endif /* __SPLIT_KERNEL__ */ +#else +# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!")) + +# ifdef __SPLIT_KERNEL__ +# include "kernel/split/kernel_data_init.h" +# endif /* __SPLIT_KERNEL__ */ +#endif /* KERNEL_STUB */ CCL_NAMESPACE_BEGIN +#ifndef __SPLIT_KERNEL__ + /* Path Tracing */ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg, float *buffer, - unsigned int *rng_state, int sample, int x, int y, int offset, int stride) { -#ifdef __BRANCHED_PATH__ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, path_trace); +#else +# ifdef __BRANCHED_PATH__ if(kernel_data.integrator.branched) { kernel_branched_path_trace(kg, buffer, - rng_state, sample, x, y, offset, stride); } else -#endif +# endif { - kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); + kernel_path_trace(kg, buffer, sample, x, y, offset, stride); } +#endif /* KERNEL_STUB */ } /* Film */ @@ -69,6 +109,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg, int offset, int stride) { +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, convert_to_byte); +#else kernel_film_convert_to_byte(kg, rgba, buffer, @@ -76,6 +119,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg, x, y, offset, stride); +#endif /* KERNEL_STUB */ } void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, @@ -86,6 +130,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, int offset, int stride) { +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, convert_to_half_float); +#else kernel_film_convert_to_half_float(kg, rgba, buffer, @@ -93,6 +140,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, x, y, offset, stride); +#endif /* KERNEL_STUB */ } /* Shader Evaluate */ @@ -100,16 +148,17 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, uint4 *input, float4 *output, - float *output_luma, int type, int filter, int i, int offset, int sample) { +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, shader); +#else if(type >= SHADER_EVAL_BAKE) { - kernel_assert(output_luma == NULL); -#ifdef __BAKING__ +# ifdef __BAKING__ kernel_bake_evaluate(kg, input, output, @@ -118,17 +167,70 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, i, offset, sample); -#endif +# endif + } + else if(type == SHADER_EVAL_DISPLACE) { + kernel_displace_evaluate(kg, input, output, i); } else { - kernel_shader_evaluate(kg, - input, - output, - output_luma, - (ShaderEvalType)type, - i, - sample); + kernel_background_evaluate(kg, input, output, i); } +#endif /* KERNEL_STUB */ } +#else /* __SPLIT_KERNEL__ */ + +/* Split Kernel Path Tracing */ + +#ifdef KERNEL_STUB +# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ + { \ + STUB_ASSERT(KERNEL_ARCH, name); \ + } + +# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ + { \ + STUB_ASSERT(KERNEL_ARCH, name); \ + } +#else +# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ + { \ + kernel_##name(kg); \ + } + +# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ + { \ + ccl_local type locals; \ + kernel_##name(kg, &locals); \ + } +#endif /* KERNEL_STUB */ + +DEFINE_SPLIT_KERNEL_FUNCTION(path_init) +DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect) +DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission) +DEFINE_SPLIT_KERNEL_FUNCTION(do_volume) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals) +DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals) +DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals) +DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint) +DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) +DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint) +DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint) +#endif /* __SPLIT_KERNEL__ */ + +#undef KERNEL_STUB +#undef STUB_ASSERT +#undef KERNEL_ARCH + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp new file mode 100644 index 00000000000..ca750e5a00d --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp @@ -0,0 +1,63 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CPU kernel entry points */ + +/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this + * one with SSE2 intrinsics. + */ +#if defined(__x86_64__) || defined(_M_X64) +# define __KERNEL_SSE2__ +#endif + +#define __SPLIT_KERNEL__ + +/* When building kernel for native machine detect kernel features from the flags + * set by compiler. + */ +#ifdef WITH_KERNEL_NATIVE +# ifdef __SSE2__ +# ifndef __KERNEL_SSE2__ +# define __KERNEL_SSE2__ +# endif +# endif +# ifdef __SSE3__ +# define __KERNEL_SSE3__ +# endif +# ifdef __SSSE3__ +# define __KERNEL_SSSE3__ +# endif +# ifdef __SSE4_1__ +# define __KERNEL_SSE41__ +# endif +# ifdef __AVX__ +# define __KERNEL_AVX__ +# endif +# ifdef __AVX2__ +# define __KERNEL_SSE__ +# define __KERNEL_AVX2__ +# endif +#endif + +/* quiet unused define warnings */ +#if defined(__KERNEL_SSE2__) + /* do nothing */ +#endif + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu +#include "kernel/kernels/cpu/kernel_cpu_impl.h" + diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp new file mode 100644 index 00000000000..6ba3425a343 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp @@ -0,0 +1,41 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#define __SPLIT_KERNEL__ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp new file mode 100644 index 00000000000..76b2d77ebb8 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp @@ -0,0 +1,42 @@ +/* + * Copyright 2011-2014 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#define __SPLIT_KERNEL__ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# define __KERNEL_AVX2__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp new file mode 100644 index 00000000000..b468b6f44c8 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp @@ -0,0 +1,36 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#define __SPLIT_KERNEL__ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp new file mode 100644 index 00000000000..3e5792d0b17 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp @@ -0,0 +1,38 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#define __SPLIT_KERNEL__ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp new file mode 100644 index 00000000000..3629f21cd29 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#define __SPLIT_KERNEL__ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp index a5f2d6e7294..57530c88710 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp @@ -18,15 +18,17 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -#endif - -#include "util_optimization.h" +#include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 -# include "kernel.h" -# define KERNEL_ARCH cpu_sse2 -# include "kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp index 86f9ce991f8..c607753bc4b 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp @@ -18,17 +18,19 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -#endif - -#include "util_optimization.h" +#include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 -# include "kernel.h" -# define KERNEL_ARCH cpu_sse3 -# include "kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp index c174406047d..a278554731c 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp @@ -18,18 +18,20 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -#endif - -#include "util_optimization.h" +#include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 -# include "kernel.h" -# define KERNEL_ARCH cpu_sse41 -# include "kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu new file mode 100644 index 00000000000..035f0484488 --- /dev/null +++ b/intern/cycles/kernel/kernels/cuda/filter.cu @@ -0,0 +1,301 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CUDA kernel entry points */ + +#ifdef __CUDA_ARCH__ + +#include "kernel_config.h" + +#include "kernel/kernel_compat_cuda.h" + +#include "kernel/filter/filter_kernel.h" + +/* kernels */ + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_divide_shadow(int sample, + TilesInfo *tiles, + float *unfilteredA, + float *unfilteredB, + float *sampleVariance, + float *sampleVarianceV, + float *bufferVariance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset) +{ + int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; + int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_divide_shadow(sample, + tiles, + x, y, + unfilteredA, + unfilteredB, + sampleVariance, + sampleVarianceV, + bufferVariance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_get_feature(int sample, + TilesInfo *tiles, + int m_offset, + int v_offset, + float *mean, + float *variance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset) +{ + int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; + int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_get_feature(sample, + tiles, + m_offset, v_offset, + x, y, + mean, variance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_detect_outliers(float *image, + float *variance, + float *depth, + float *output, + int4 prefilter_rect, + int pass_stride) +{ + int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; + int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r) +{ + int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; + int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_construct_transform(float const* __restrict__ buffer, + float *transform, int *rank, + int4 filter_area, int4 rect, + int radius, float pca_threshold, + int pass_stride) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x; + int y = blockDim.y*blockIdx.y + threadIdx.y; + if(x < filter_area.z && y < filter_area.w) { + int *l_rank = rank + y*filter_area.z + x; + float *l_transform = transform + y*filter_area.z + x; + kernel_filter_construct_transform(buffer, + x + filter_area.x, y + filter_area.y, + rect, pass_stride, + l_transform, l_rank, + radius, pca_threshold, + filter_area.z*filter_area.w, + threadIdx.y*blockDim.x + threadIdx.x); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image, + const float *ccl_restrict variance_image, + float *difference_image, + int w, + int h, + int stride, + int shift_stride, + int r, + int channel_offset, + float a, + float k_2) +{ + int4 co, rect; + int ofs; + if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w, + weight_image, + variance_image, + difference_image + ofs, + rect, stride, + channel_offset, a, k_2); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image, + float *out_image, + int w, + int h, + int stride, + int shift_stride, + int r, + int f) +{ + int4 co, rect; + int ofs; + if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + kernel_filter_nlm_blur(co.x, co.y, + difference_image + ofs, + out_image + ofs, + rect, stride, f); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image, + float *out_image, + int w, + int h, + int stride, + int shift_stride, + int r, + int f) +{ + int4 co, rect; + int ofs; + if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + kernel_filter_nlm_calc_weight(co.x, co.y, + difference_image + ofs, + out_image + ofs, + rect, stride, f); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image, + const float *ccl_restrict image, + float *out_image, + float *accum_image, + int w, + int h, + int stride, + int shift_stride, + int r, + int f) +{ + int4 co, rect; + int ofs; + if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w, + difference_image + ofs, + image, + out_image, + accum_image, + rect, stride, f); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_normalize(float *out_image, + const float *ccl_restrict accum_image, + int w, + int h, + int stride) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x; + int y = blockDim.y*blockIdx.y + threadIdx.y; + if(x < w && y < h) { + kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_image, + const float *ccl_restrict buffer, + float const* __restrict__ transform, + int *rank, + float *XtWX, + float3 *XtWY, + int4 filter_window, + int w, + int h, + int stride, + int shift_stride, + int r, + int f, + int pass_stride) +{ + int4 co, rect; + int ofs; + if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) { + kernel_filter_nlm_construct_gramian(co.x, co.y, + co.z, co.w, + difference_image + ofs, + buffer, + transform, rank, + XtWX, XtWY, + rect, filter_window, + stride, f, + pass_stride, + threadIdx.y*blockDim.x + threadIdx.x); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_finalize(float *buffer, + int *rank, + float *XtWX, + float3 *XtWY, + int4 filter_area, + int4 buffer_params, + int sample) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x; + int y = blockDim.y*blockIdx.y + threadIdx.y; + if(x < filter_area.z && y < filter_area.w) { + int storage_ofs = y*filter_area.z+x; + rank += storage_ofs; + XtWX += storage_ofs; + XtWY += storage_ofs; + kernel_filter_finalize(x, y, buffer, rank, + filter_area.z*filter_area.w, + XtWX, XtWY, + buffer_params, sample); + } +} + +#endif + diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu index 090ab2c50c2..3c93e00ccf1 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel.cu @@ -16,137 +16,52 @@ /* CUDA kernel entry points */ -#include "../../kernel_compat_cuda.h" -#include "../../kernel_math.h" -#include "../../kernel_types.h" -#include "../../kernel_globals.h" -#include "../../kernel_film.h" -#include "../../kernel_path.h" -#include "../../kernel_path_branched.h" -#include "../../kernel_bake.h" - -/* device data taken from CUDA occupancy calculator */ - #ifdef __CUDA_ARCH__ -/* 2.0 and 2.1 */ -#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 63 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 32 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40 - -/* 3.0 and 3.5 */ -#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 63 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 3.2 */ -#elif __CUDA_ARCH__ == 320 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 63 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 3.7 */ -#elif __CUDA_ARCH__ == 370 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 255 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 5.0, 5.2, 5.3, 6.0, 6.1 */ -#elif __CUDA_ARCH__ >= 500 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 255 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 48 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* unknown architecture */ -#else -# error "Unknown or unsupported CUDA architecture, can't determine launch bounds" -#endif - -/* compute number of threads per block and minimum blocks per multiprocessor - * given the maximum number of registers per thread */ - -#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \ - __launch_bounds__( \ - threads_block_width*threads_block_width, \ - CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \ - ) - -/* sanity checks */ - -#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS -# error "Maximum number of threads per block exceeded" -#endif - -#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS -# error "Maximum number of blocks per multiprocessor exceeded" -#endif +#include "kernel/kernel_compat_cuda.h" +#include "kernel_config.h" -#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS -# error "Maximum number of registers per thread exceeded" -#endif +#include "util/util_atomic.h" -#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS -# error "Maximum number of registers per thread exceeded" -#endif +#include "kernel/kernel_math.h" +#include "kernel/kernel_types.h" +#include "kernel/kernel_globals.h" +#include "kernel/kernels/cuda/kernel_cuda_image.h" +#include "kernel/kernel_film.h" +#include "kernel/kernel_path.h" +#include "kernel/kernel_path_branched.h" +#include "kernel/kernel_bake.h" +#include "kernel/kernel_work_stealing.h" /* kernels */ - extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride) +kernel_cuda_path_trace(WorkTile *tile, uint total_work_size) { - int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - int y = sy + blockDim.y*blockIdx.y + threadIdx.y; + int work_index = ccl_global_id(0); + + if(work_index < total_work_size) { + uint x, y, sample; + get_work_pixel(tile, work_index, &x, &y, &sample); - if(x < sx + sw && y < sy + sh) { KernelGlobals kg; - kernel_path_trace(&kg, buffer, rng_state, sample, x, y, offset, stride); + kernel_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride); } } #ifdef __BRANCHED_PATH__ extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS) -kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride) +kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size) { - int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - int y = sy + blockDim.y*blockIdx.y + threadIdx.y; + int work_index = ccl_global_id(0); + + if(work_index < total_work_size) { + uint x, y, sample; + get_work_pixel(tile, work_index, &x, &y, &sample); - if(x < sx + sw && y < sy + sh) { KernelGlobals kg; - kernel_branched_path_trace(&kg, buffer, rng_state, sample, x, y, offset, stride); + kernel_branched_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride); } } #endif @@ -177,26 +92,37 @@ kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scal extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_shader(uint4 *input, - float4 *output, - float *output_luma, - int type, - int sx, - int sw, - int offset, - int sample) +kernel_cuda_displace(uint4 *input, + float4 *output, + int type, + int sx, + int sw, + int offset, + int sample) +{ + int x = sx + blockDim.x*blockIdx.x + threadIdx.x; + + if(x < sx + sw) { + KernelGlobals kg; + kernel_displace_evaluate(&kg, input, output, x); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_background(uint4 *input, + float4 *output, + int type, + int sx, + int sw, + int offset, + int sample) { int x = sx + blockDim.x*blockIdx.x + threadIdx.x; if(x < sx + sw) { KernelGlobals kg; - kernel_shader_evaluate(&kg, - input, - output, - output_luma, - (ShaderEvalType)type, - x, - sample); + kernel_background_evaluate(&kg, input, output, x); } } diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h new file mode 100644 index 00000000000..f3d0d721c5c --- /dev/null +++ b/intern/cycles/kernel/kernels/cuda/kernel_config.h @@ -0,0 +1,109 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* device data taken from CUDA occupancy calculator */ + +/* 3.0 and 3.5 */ +#if __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 63 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + +/* 3.2 */ +#elif __CUDA_ARCH__ == 320 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 63 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + +/* 3.7 */ +#elif __CUDA_ARCH__ == 370 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 255 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 63 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + +/* 5.0, 5.2, 5.3, 6.0, 6.1 */ +#elif __CUDA_ARCH__ >= 500 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 255 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +/* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of registers */ +# if __CUDACC_VER_MAJOR__ == 9 && __CUDA_ARCH__ >= 600 +# define CUDA_KERNEL_MAX_REGISTERS 64 +# else +# define CUDA_KERNEL_MAX_REGISTERS 48 +# endif +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + + +/* unknown architecture */ +#else +# error "Unknown or unsupported CUDA architecture, can't determine launch bounds" +#endif + +/* For split kernel using all registers seems fastest for now, but this + * is unlikely to be optimal once we resolve other bottlenecks. */ + +#define CUDA_KERNEL_SPLIT_MAX_REGISTERS CUDA_THREAD_MAX_REGISTERS + +/* Compute number of threads per block and minimum blocks per multiprocessor + * given the maximum number of registers per thread. */ + +#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \ + __launch_bounds__( \ + threads_block_width*threads_block_width, \ + CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \ + ) + +/* sanity checks */ + +#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS +# error "Maximum number of threads per block exceeded" +#endif + +#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS +# error "Maximum number of blocks per multiprocessor exceeded" +#endif + +#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS +# error "Maximum number of registers per thread exceeded" +#endif + +#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS +# error "Maximum number of registers per thread exceeded" +#endif + diff --git a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h new file mode 100644 index 00000000000..91ad289a858 --- /dev/null +++ b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h @@ -0,0 +1,189 @@ +/* + * Copyright 2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* w0, w1, w2, and w3 are the four cubic B-spline basis functions. */ +ccl_device float cubic_w0(float a) +{ + return (1.0f/6.0f)*(a*(a*(-a + 3.0f) - 3.0f) + 1.0f); +} + +ccl_device float cubic_w1(float a) +{ + return (1.0f/6.0f)*(a*a*(3.0f*a - 6.0f) + 4.0f); +} + +ccl_device float cubic_w2(float a) +{ + return (1.0f/6.0f)*(a*(a*(-3.0f*a + 3.0f) + 3.0f) + 1.0f); +} + +ccl_device float cubic_w3(float a) +{ + return (1.0f/6.0f)*(a*a*a); +} + +/* g0 and g1 are the two amplitude functions. */ +ccl_device float cubic_g0(float a) +{ + return cubic_w0(a) + cubic_w1(a); +} + +ccl_device float cubic_g1(float a) +{ + return cubic_w2(a) + cubic_w3(a); +} + +/* h0 and h1 are the two offset functions */ +ccl_device float cubic_h0(float a) +{ + /* Note +0.5 offset to compensate for CUDA linear filtering convention. */ + return -1.0f + cubic_w1(a) / (cubic_w0(a) + cubic_w1(a)) + 0.5f; +} + +ccl_device float cubic_h1(float a) +{ + return 1.0f + cubic_w3(a) / (cubic_w2(a) + cubic_w3(a)) + 0.5f; +} + +/* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. */ +template<typename T> +ccl_device T kernel_tex_image_interp_bicubic(const TextureInfo& info, CUtexObject tex, float x, float y) +{ + x = (x * info.width) - 0.5f; + y = (y * info.height) - 0.5f; + + float px = floor(x); + float py = floor(y); + float fx = x - px; + float fy = y - py; + + float g0x = cubic_g0(fx); + float g1x = cubic_g1(fx); + float x0 = (px + cubic_h0(fx)) / info.width; + float x1 = (px + cubic_h1(fx)) / info.width; + float y0 = (py + cubic_h0(fy)) / info.height; + float y1 = (py + cubic_h1(fy)) / info.height; + + return cubic_g0(fy) * (g0x * tex2D<T>(tex, x0, y0) + + g1x * tex2D<T>(tex, x1, y0)) + + cubic_g1(fy) * (g0x * tex2D<T>(tex, x0, y1) + + g1x * tex2D<T>(tex, x1, y1)); +} + +/* Fast tricubic texture lookup using 8 trilinear lookups. */ +template<typename T> +ccl_device T kernel_tex_image_interp_bicubic_3d(const TextureInfo& info, CUtexObject tex, float x, float y, float z) +{ + x = (x * info.width) - 0.5f; + y = (y * info.height) - 0.5f; + z = (z * info.depth) - 0.5f; + + float px = floor(x); + float py = floor(y); + float pz = floor(z); + float fx = x - px; + float fy = y - py; + float fz = z - pz; + + float g0x = cubic_g0(fx); + float g1x = cubic_g1(fx); + float g0y = cubic_g0(fy); + float g1y = cubic_g1(fy); + float g0z = cubic_g0(fz); + float g1z = cubic_g1(fz); + + float x0 = (px + cubic_h0(fx)) / info.width; + float x1 = (px + cubic_h1(fx)) / info.width; + float y0 = (py + cubic_h0(fy)) / info.height; + float y1 = (py + cubic_h1(fy)) / info.height; + float z0 = (pz + cubic_h0(fz)) / info.depth; + float z1 = (pz + cubic_h1(fz)) / info.depth; + + return g0z * (g0y * (g0x * tex3D<T>(tex, x0, y0, z0) + + g1x * tex3D<T>(tex, x1, y0, z0)) + + g1y * (g0x * tex3D<T>(tex, x0, y1, z0) + + g1x * tex3D<T>(tex, x1, y1, z0))) + + g1z * (g0y * (g0x * tex3D<T>(tex, x0, y0, z1) + + g1x * tex3D<T>(tex, x1, y0, z1)) + + g1y * (g0x * tex3D<T>(tex, x0, y1, z1) + + g1x * tex3D<T>(tex, x1, y1, z1))); +} + +ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y) +{ + const TextureInfo& info = kernel_tex_fetch(__texture_info, id); + CUtexObject tex = (CUtexObject)info.data; + + /* float4, byte4 and half4 */ + const int texture_type = kernel_tex_type(id); + if(texture_type == IMAGE_DATA_TYPE_FLOAT4 || + texture_type == IMAGE_DATA_TYPE_BYTE4 || + texture_type == IMAGE_DATA_TYPE_HALF4) + { + if(info.interpolation == INTERPOLATION_CUBIC) { + return kernel_tex_image_interp_bicubic<float4>(info, tex, x, y); + } + else { + return tex2D<float4>(tex, x, y); + } + } + /* float, byte and half */ + else { + float f; + + if(info.interpolation == INTERPOLATION_CUBIC) { + f = kernel_tex_image_interp_bicubic<float>(info, tex, x, y); + } + else { + f = tex2D<float>(tex, x, y); + } + + return make_float4(f, f, f, 1.0f); + } +} + +ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z, InterpolationType interp) +{ + const TextureInfo& info = kernel_tex_fetch(__texture_info, id); + CUtexObject tex = (CUtexObject)info.data; + uint interpolation = (interp == INTERPOLATION_NONE)? info.interpolation: interp; + + const int texture_type = kernel_tex_type(id); + if(texture_type == IMAGE_DATA_TYPE_FLOAT4 || + texture_type == IMAGE_DATA_TYPE_BYTE4 || + texture_type == IMAGE_DATA_TYPE_HALF4) + { + if(interpolation == INTERPOLATION_CUBIC) { + return kernel_tex_image_interp_bicubic_3d<float4>(info, tex, x, y, z); + } + else { + return tex3D<float4>(tex, x, y, z); + } + } + else { + float f; + + if(interpolation == INTERPOLATION_CUBIC) { + f = kernel_tex_image_interp_bicubic_3d<float>(info, tex, x, y, z); + } + else { + f = tex3D<float>(tex, x, y, z); + } + + return make_float4(f, f, f, 1.0f); + } +} + diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu new file mode 100644 index 00000000000..43b3d0aa0e6 --- /dev/null +++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu @@ -0,0 +1,148 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CUDA split kernel entry points */ + +#ifdef __CUDA_ARCH__ + +#define __SPLIT_KERNEL__ + +#include "kernel/kernel_compat_cuda.h" +#include "kernel_config.h" + +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_data_init.h" +#include "kernel/split/kernel_path_init.h" +#include "kernel/split/kernel_scene_intersect.h" +#include "kernel/split/kernel_lamp_emission.h" +#include "kernel/split/kernel_do_volume.h" +#include "kernel/split/kernel_queue_enqueue.h" +#include "kernel/split/kernel_indirect_background.h" +#include "kernel/split/kernel_shader_setup.h" +#include "kernel/split/kernel_shader_sort.h" +#include "kernel/split/kernel_shader_eval.h" +#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" +#include "kernel/split/kernel_subsurface_scatter.h" +#include "kernel/split/kernel_direct_lighting.h" +#include "kernel/split/kernel_shadow_blocked_ao.h" +#include "kernel/split/kernel_shadow_blocked_dl.h" +#include "kernel/split/kernel_enqueue_inactive.h" +#include "kernel/split/kernel_next_iteration_setup.h" +#include "kernel/split/kernel_indirect_subsurface.h" +#include "kernel/split/kernel_buffer_update.h" + +#include "kernel/kernel_film.h" + +/* kernels */ +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_state_buffer_size(uint num_threads, uint64_t *size) +{ + *size = split_data_buffer_size(NULL, num_threads); +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_path_trace_data_init( + ccl_global void *split_data_buffer, + int num_elements, + ccl_global char *ray_state, + int start_sample, + int end_sample, + int sx, int sy, int sw, int sh, int offset, int stride, + ccl_global int *Queue_index, + int queuesize, + ccl_global char *use_queues_flag, + ccl_global unsigned int *work_pool_wgs, + unsigned int num_samples, + ccl_global float *buffer) +{ + kernel_data_init(NULL, + NULL, + split_data_buffer, + num_elements, + ray_state, + start_sample, + end_sample, + sx, sy, sw, sh, offset, stride, + Queue_index, + queuesize, + use_queues_flag, + work_pool_wgs, + num_samples, + buffer); +} + +#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ + extern "C" __global__ void \ + CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \ + kernel_cuda_##name() \ + { \ + kernel_##name(NULL); \ + } + +#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ + extern "C" __global__ void \ + CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \ + kernel_cuda_##name() \ + { \ + ccl_local type locals; \ + kernel_##name(NULL, &locals); \ + } + +DEFINE_SPLIT_KERNEL_FUNCTION(path_init) +DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect) +DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission) +DEFINE_SPLIT_KERNEL_FUNCTION(do_volume) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals) +DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals) +DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals) +DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint) +DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) +DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint) +DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint) + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) +{ + int x = sx + blockDim.x*blockIdx.x + threadIdx.x; + int y = sy + blockDim.y*blockIdx.y + threadIdx.y; + + if(x < sx + sw && y < sy + sh) + kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride); +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) +{ + int x = sx + blockDim.x*blockIdx.x + threadIdx.x; + int y = sy + blockDim.y*blockIdx.y + threadIdx.y; + + if(x < sx + sw && y < sy + sh) + kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride); +} + +#endif + diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl new file mode 100644 index 00000000000..2b77807c38b --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/filter.cl @@ -0,0 +1,302 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* OpenCL kernel entry points */ + +#include "kernel/kernel_compat_opencl.h" + +#include "kernel/filter/filter_kernel.h" + +/* kernels */ + +__kernel void kernel_ocl_filter_divide_shadow(int sample, + ccl_global TilesInfo *tiles, + ccl_global float *unfilteredA, + ccl_global float *unfilteredB, + ccl_global float *sampleVariance, + ccl_global float *sampleVarianceV, + ccl_global float *bufferVariance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset) +{ + int x = prefilter_rect.x + get_global_id(0); + int y = prefilter_rect.y + get_global_id(1); + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_divide_shadow(sample, + tiles, + x, y, + unfilteredA, + unfilteredB, + sampleVariance, + sampleVarianceV, + bufferVariance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset); + } +} + +__kernel void kernel_ocl_filter_get_feature(int sample, + ccl_global TilesInfo *tiles, + int m_offset, + int v_offset, + ccl_global float *mean, + ccl_global float *variance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset) +{ + int x = prefilter_rect.x + get_global_id(0); + int y = prefilter_rect.y + get_global_id(1); + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_get_feature(sample, + tiles, + m_offset, v_offset, + x, y, + mean, variance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset); + } +} + +__kernel void kernel_ocl_filter_detect_outliers(ccl_global float *image, + ccl_global float *variance, + ccl_global float *depth, + ccl_global float *output, + int4 prefilter_rect, + int pass_stride) +{ + int x = prefilter_rect.x + get_global_id(0); + int y = prefilter_rect.y + get_global_id(1); + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride); + } +} + +__kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean, + ccl_global float *variance, + ccl_global float *a, + ccl_global float *b, + int4 prefilter_rect, + int r) +{ + int x = prefilter_rect.x + get_global_id(0); + int y = prefilter_rect.y + get_global_id(1); + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r); + } +} + +__kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_restrict buffer, + ccl_global float *transform, + ccl_global int *rank, + int4 filter_area, + int4 rect, + int pass_stride, + int radius, + float pca_threshold) +{ + int x = get_global_id(0); + int y = get_global_id(1); + if(x < filter_area.z && y < filter_area.w) { + ccl_global int *l_rank = rank + y*filter_area.z + x; + ccl_global float *l_transform = transform + y*filter_area.z + x; + kernel_filter_construct_transform(buffer, + x + filter_area.x, y + filter_area.y, + rect, pass_stride, + l_transform, l_rank, + radius, pca_threshold, + filter_area.z*filter_area.w, + get_local_id(1)*get_local_size(0) + get_local_id(0)); + } +} + +__kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_restrict weight_image, + const ccl_global float *ccl_restrict variance_image, + ccl_global float *difference_image, + int w, + int h, + int stride, + int shift_stride, + int r, + int channel_offset, + float a, + float k_2) +{ + int4 co, rect; + int ofs; + if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w, + weight_image, + variance_image, + difference_image + ofs, + rect, stride, + channel_offset, a, k_2); + } +} + +__kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict difference_image, + ccl_global float *out_image, + int w, + int h, + int stride, + int shift_stride, + int r, + int f) +{ + int4 co, rect; + int ofs; + if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + kernel_filter_nlm_blur(co.x, co.y, + difference_image + ofs, + out_image + ofs, + rect, stride, f); + } +} + +__kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_restrict difference_image, + ccl_global float *out_image, + int w, + int h, + int stride, + int shift_stride, + int r, + int f) +{ + int4 co, rect; + int ofs; + if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + kernel_filter_nlm_calc_weight(co.x, co.y, + difference_image + ofs, + out_image + ofs, + rect, stride, f); + } +} + +__kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_restrict difference_image, + const ccl_global float *ccl_restrict image, + ccl_global float *out_image, + ccl_global float *accum_image, + int w, + int h, + int stride, + int shift_stride, + int r, + int f) +{ + int4 co, rect; + int ofs; + if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w, + difference_image + ofs, + image, + out_image, + accum_image, + rect, stride, f); + } +} + +__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image, + const ccl_global float *ccl_restrict accum_image, + int w, + int h, + int stride) +{ + int x = get_global_id(0); + int y = get_global_id(1); + if(x < w && y < h) { + kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride); + } +} + +__kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *ccl_restrict difference_image, + const ccl_global float *ccl_restrict buffer, + const ccl_global float *ccl_restrict transform, + ccl_global int *rank, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 filter_window, + int w, + int h, + int stride, + int shift_stride, + int r, + int f, + int pass_stride) +{ + int4 co, rect; + int ofs; + if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) { + kernel_filter_nlm_construct_gramian(co.x, co.y, + co.z, co.w, + difference_image + ofs, + buffer, + transform, rank, + XtWX, XtWY, + rect, filter_window, + stride, f, + pass_stride, + get_local_id(1)*get_local_size(0) + get_local_id(0)); + } +} + +__kernel void kernel_ocl_filter_finalize(ccl_global float *buffer, + ccl_global int *rank, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 filter_area, + int4 buffer_params, + int sample) +{ + int x = get_global_id(0); + int y = get_global_id(1); + if(x < filter_area.z && y < filter_area.w) { + int storage_ofs = y*filter_area.z+x; + rank += storage_ofs; + XtWX += storage_ofs; + XtWY += storage_ofs; + kernel_filter_finalize(x, y, buffer, rank, + filter_area.z*filter_area.w, + XtWX, XtWY, + buffer_params, sample); + } +} + +__kernel void kernel_ocl_filter_set_tiles(ccl_global TilesInfo* tiles, + ccl_global float *buffer_1, + ccl_global float *buffer_2, + ccl_global float *buffer_3, + ccl_global float *buffer_4, + ccl_global float *buffer_5, + ccl_global float *buffer_6, + ccl_global float *buffer_7, + ccl_global float *buffer_8, + ccl_global float *buffer_9) +{ + if((get_global_id(0) == 0) && (get_global_id(1) == 0)) { + tiles->buffers[0] = buffer_1; + tiles->buffers[1] = buffer_2; + tiles->buffers[2] = buffer_3; + tiles->buffers[3] = buffer_4; + tiles->buffers[4] = buffer_5; + tiles->buffers[5] = buffer_6; + tiles->buffers[6] = buffer_7; + tiles->buffers[7] = buffer_8; + tiles->buffers[8] = buffer_9; + } +} diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl index a68f97857b6..9d5d784e140 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel.cl @@ -16,45 +16,42 @@ /* OpenCL kernel entry points - unfinished */ -#include "../../kernel_compat_opencl.h" -#include "../../kernel_math.h" -#include "../../kernel_types.h" -#include "../../kernel_globals.h" -#include "../../kernel_image_opencl.h" +#include "kernel/kernel_compat_opencl.h" +#include "kernel/kernel_math.h" +#include "kernel/kernel_types.h" +#include "kernel/kernel_globals.h" +#include "kernel/kernels/opencl/kernel_opencl_image.h" -#include "../../kernel_film.h" +#include "kernel/kernel_film.h" #if defined(__COMPILE_ONLY_MEGAKERNEL__) || !defined(__NO_BAKING__) -# include "../../kernel_path.h" -# include "../../kernel_path_branched.h" +# include "kernel/kernel_path.h" +# include "kernel/kernel_path_branched.h" #else /* __COMPILE_ONLY_MEGAKERNEL__ */ /* Include only actually used headers for the case * when path tracing kernels are not needed. */ -# include "../../kernel_random.h" -# include "../../kernel_differential.h" -# include "../../kernel_montecarlo.h" -# include "../../kernel_projection.h" -# include "../../geom/geom.h" -# include "../../bvh/bvh.h" - -# include "../../kernel_accumulate.h" -# include "../../kernel_camera.h" -# include "../../kernel_shader.h" +# include "kernel/kernel_random.h" +# include "kernel/kernel_differential.h" +# include "kernel/kernel_montecarlo.h" +# include "kernel/kernel_projection.h" +# include "kernel/geom/geom.h" +# include "kernel/bvh/bvh.h" + +# include "kernel/kernel_accumulate.h" +# include "kernel/kernel_camera.h" +# include "kernel/kernel_shader.h" #endif /* defined(__COMPILE_ONLY_MEGAKERNEL__) || !defined(__NO_BAKING__) */ -#include "../../kernel_bake.h" +#include "kernel/kernel_bake.h" #ifdef __COMPILE_ONLY_MEGAKERNEL__ __kernel void kernel_ocl_path_trace( ccl_constant KernelData *data, ccl_global float *buffer, - ccl_global uint *rng_state, -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "../../kernel_textures.h" + KERNEL_BUFFER_PARAMS, int sample, int sx, int sy, int sw, int sh, int offset, int stride) @@ -63,28 +60,24 @@ __kernel void kernel_ocl_path_trace( kg->data = data; -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "../../kernel_textures.h" + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); - int x = sx + get_global_id(0); - int y = sy + get_global_id(1); + int x = sx + ccl_global_id(0); + int y = sy + ccl_global_id(1); if(x < sx + sw && y < sy + sh) - kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); + kernel_path_trace(kg, buffer, sample, x, y, offset, stride); } #else /* __COMPILE_ONLY_MEGAKERNEL__ */ -__kernel void kernel_ocl_shader( +__kernel void kernel_ocl_displace( ccl_constant KernelData *data, ccl_global uint4 *input, ccl_global float4 *output, - ccl_global float *output_luma, -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "../../kernel_textures.h" + KERNEL_BUFFER_PARAMS, int type, int sx, int sw, int offset, int sample) { @@ -92,20 +85,35 @@ __kernel void kernel_ocl_shader( kg->data = data; -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "../../kernel_textures.h" + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); - int x = sx + get_global_id(0); + int x = sx + ccl_global_id(0); if(x < sx + sw) { - kernel_shader_evaluate(kg, - input, - output, - output_luma, - (ShaderEvalType)type, - x, - sample); + kernel_displace_evaluate(kg, input, output, x); + } +} +__kernel void kernel_ocl_background( + ccl_constant KernelData *data, + ccl_global uint4 *input, + ccl_global float4 *output, + + KERNEL_BUFFER_PARAMS, + + int type, int sx, int sw, int offset, int sample) +{ + KernelGlobals kglobals, *kg = &kglobals; + + kg->data = data; + + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); + + int x = sx + ccl_global_id(0); + + if(x < sx + sw) { + kernel_background_evaluate(kg, input, output, x); } } @@ -114,9 +122,7 @@ __kernel void kernel_ocl_bake( ccl_global uint4 *input, ccl_global float4 *output, -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "../../kernel_textures.h" + KERNEL_BUFFER_PARAMS, int type, int filter, int sx, int sw, int offset, int sample) { @@ -124,11 +130,10 @@ __kernel void kernel_ocl_bake( kg->data = data; -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "../../kernel_textures.h" + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); - int x = sx + get_global_id(0); + int x = sx + ccl_global_id(0); if(x < sx + sw) { #ifdef __NO_BAKING__ @@ -144,9 +149,7 @@ __kernel void kernel_ocl_convert_to_byte( ccl_global uchar4 *rgba, ccl_global float *buffer, -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "../../kernel_textures.h" + KERNEL_BUFFER_PARAMS, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) @@ -155,12 +158,11 @@ __kernel void kernel_ocl_convert_to_byte( kg->data = data; -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "../../kernel_textures.h" + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); - int x = sx + get_global_id(0); - int y = sy + get_global_id(1); + int x = sx + ccl_global_id(0); + int y = sy + ccl_global_id(1); if(x < sx + sw && y < sy + sh) kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride); @@ -171,9 +173,7 @@ __kernel void kernel_ocl_convert_to_half_float( ccl_global uchar4 *rgba, ccl_global float *buffer, -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "../../kernel_textures.h" + KERNEL_BUFFER_PARAMS, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) @@ -182,15 +182,30 @@ __kernel void kernel_ocl_convert_to_half_float( kg->data = data; -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "../../kernel_textures.h" + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); - int x = sx + get_global_id(0); - int y = sy + get_global_id(1); + int x = sx + ccl_global_id(0); + int y = sy + ccl_global_id(1); if(x < sx + sw && y < sy + sh) kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride); } +__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, uint64_t size, uint64_t offset) +{ + size_t i = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0); + + if(i < size / sizeof(float4)) { + buffer[i+offset/sizeof(float4)] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + else if(i == size / sizeof(float4)) { + ccl_global uchar *b = (ccl_global uchar*)&buffer[i+offset/sizeof(float4)]; + + for(i = 0; i < size % sizeof(float4); i++) { + *(b++) = 0; + } + } +} + #endif /* __COMPILE_ONLY_MEGAKERNEL__ */ diff --git a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl deleted file mode 100644 index 1914d241eb1..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "split/kernel_background_buffer_update.h" - -__kernel void kernel_ocl_path_trace_background_buffer_update( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global float *per_sample_output_buffers, - ccl_global uint *rng_state, - ccl_global uint *rng_coop, /* Required for buffer Update */ - ccl_global float3 *throughput_coop, /* Required for background hit processing */ - PathRadiance *PathRadiance_coop, /* Required for background hit processing and buffer Update */ - ccl_global Ray *Ray_coop, /* Required for background hit processing */ - ccl_global PathState *PathState_coop, /* Required for background hit processing */ - ccl_global float *L_transparent_coop, /* Required for background hit processing and buffer Update */ - ccl_global char *ray_state, /* Stores information on the current state of a ray */ - int sw, int sh, int sx, int sy, int stride, - int rng_state_offset_x, - int rng_state_offset_y, - int rng_state_stride, - ccl_global unsigned int *work_array, /* Denotes work of each ray */ - ccl_global int *Queue_data, /* Queues memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize, /* Size (capacity) of each queue */ - int end_sample, - int start_sample, -#ifdef __WORK_STEALING__ - ccl_global unsigned int *work_pool_wgs, - unsigned int num_samples, -#endif -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int parallel_samples) /* Number of samples to be processed in parallel */ -{ - ccl_local unsigned int local_queue_atomics; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - if(ray_index == 0) { - /* We will empty this queue in this kernel. */ - Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; - } - char enqueue_flag = 0; - ray_index = get_ray_index(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - Queue_data, - queuesize, - 1); - -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - enqueue_flag = - kernel_background_buffer_update((KernelGlobals *)kg, - per_sample_output_buffers, - rng_state, - rng_coop, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - L_transparent_coop, - ray_state, - sw, sh, sx, sy, stride, - rng_state_offset_x, - rng_state_offset_y, - rng_state_stride, - work_array, - end_sample, - start_sample, -#ifdef __WORK_STEALING__ - work_pool_wgs, - num_samples, -#endif -#ifdef __KERNEL_DEBUG__ - debugdata_coop, -#endif - parallel_samples, - ray_index); -#ifndef __COMPUTE_DEVICE_GPU__ - } -#endif - - /* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS; - * These rays will be made active during next SceneIntersectkernel. - */ - enqueue_ray_index_local(ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - enqueue_flag, - queuesize, - &local_queue_atomics, - Queue_data, - Queue_index); -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl new file mode 100644 index 00000000000..dcea2630aef --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl @@ -0,0 +1,26 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_buffer_update.h" + +#define KERNEL_NAME buffer_update +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl index 18139687eab..7125348a49f 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl @@ -14,77 +14,40 @@ * limitations under the License. */ -#include "split/kernel_data_init.h" +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_data_init.h" __kernel void kernel_ocl_path_trace_data_init( - ccl_global char *globals, - ccl_global char *sd_DL_shadow, + ccl_global char *kg, ccl_constant KernelData *data, - ccl_global float *per_sample_output_buffers, - ccl_global uint *rng_state, - ccl_global uint *rng_coop, /* rng array to store rng values for all rays */ - ccl_global float3 *throughput_coop, /* throughput array to store throughput values for all rays */ - ccl_global float *L_transparent_coop, /* L_transparent array to store L_transparent values for all rays */ - PathRadiance *PathRadiance_coop, /* PathRadiance array to store PathRadiance values for all rays */ - ccl_global Ray *Ray_coop, /* Ray array to store Ray information for all rays */ - ccl_global PathState *PathState_coop, /* PathState array to store PathState information for all rays */ - Intersection *Intersection_coop_shadow, - ccl_global char *ray_state, /* Stores information on current state of a ray */ - -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "../../kernel_textures.h" - - int start_sample, int sx, int sy, int sw, int sh, int offset, int stride, - int rng_state_offset_x, - int rng_state_offset_y, - int rng_state_stride, - ccl_global int *Queue_data, /* Memory for queues */ + ccl_global void *split_data_buffer, + int num_elements, + ccl_global char *ray_state, + KERNEL_BUFFER_PARAMS, + int start_sample, + int end_sample, + int sx, int sy, int sw, int sh, int offset, int stride, ccl_global int *Queue_index, /* Tracks the number of elements in queues */ int queuesize, /* size (capacity) of the queue */ ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues to fetch ray index */ - ccl_global unsigned int *work_array, /* work array to store which work each ray belongs to */ -#ifdef __WORK_STEALING__ ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */ unsigned int num_samples, /* Total number of samples per pixel */ -#endif -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int parallel_samples) /* Number of samples to be processed in parallel */ + ccl_global float *buffer) { - kernel_data_init((KernelGlobals *)globals, - (ShaderData *)sd_DL_shadow, + kernel_data_init((KernelGlobals*)kg, data, - per_sample_output_buffers, - rng_state, - rng_coop, - throughput_coop, - L_transparent_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - Intersection_coop_shadow, + split_data_buffer, + num_elements, ray_state, - -#define KERNEL_TEX(type, ttype, name) name, -#include "../../kernel_textures.h" - - start_sample, sx, sy, sw, sh, offset, stride, - rng_state_offset_x, - rng_state_offset_y, - rng_state_stride, - Queue_data, + KERNEL_BUFFER_ARGS, + start_sample, + end_sample, + sx, sy, sw, sh, offset, stride, Queue_index, queuesize, use_queues_flag, - work_array, -#ifdef __WORK_STEALING__ work_pool_wgs, num_samples, -#endif -#ifdef __KERNEL_DEBUG__ - debugdata_coop, -#endif - parallel_samples); + buffer); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl index c6a2c8d050c..ed64ae01aae 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl @@ -14,74 +14,13 @@ * limitations under the License. */ -#include "split/kernel_direct_lighting.h" +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_direct_lighting.h" -__kernel void kernel_ocl_path_trace_direct_lighting( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global char *sd, /* Required for direct lighting */ - ccl_global uint *rng_coop, /* Required for direct lighting */ - ccl_global PathState *PathState_coop, /* Required for direct lighting */ - ccl_global int *ISLamp_coop, /* Required for direct lighting */ - ccl_global Ray *LightRay_coop, /* Required for direct lighting */ - ccl_global BsdfEval *BSDFEval_coop, /* Required for direct lighting */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize) /* Size (capacity) of each queue */ -{ - ccl_local unsigned int local_queue_atomics; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); +#define KERNEL_NAME direct_lighting +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE - char enqueue_flag = 0; - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 0); - -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - enqueue_flag = kernel_direct_lighting((KernelGlobals *)kg, - (ShaderData *)sd, - rng_coop, - PathState_coop, - ISLamp_coop, - LightRay_coop, - BSDFEval_coop, - ray_state, - ray_index); - -#ifndef __COMPUTE_DEVICE_GPU__ - } -#endif - -#ifdef __EMISSION__ - /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */ - enqueue_ray_index_local(ray_index, - QUEUE_SHADOW_RAY_CAST_DL_RAYS, - enqueue_flag, - queuesize, - &local_queue_atomics, - Queue_data, - Queue_index); -#endif -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl new file mode 100644 index 00000000000..8afaa686e28 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl @@ -0,0 +1,24 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_do_volume.h" + +#define KERNEL_NAME do_volume +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl b/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl new file mode 100644 index 00000000000..e68d4104a91 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl @@ -0,0 +1,26 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_enqueue_inactive.h" + +#define KERNEL_NAME enqueue_inactive +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl index e063614da1a..9e1e57beba6 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl @@ -14,110 +14,13 @@ * limitations under the License. */ -#include "split/kernel_holdout_emission_blurring_pathtermination_ao.h" +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" -__kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global char *sd, /* Required throughout the kernel except probabilistic path termination and AO */ - ccl_global float *per_sample_output_buffers, - ccl_global uint *rng_coop, /* Required for "kernel_write_data_passes" and AO */ - ccl_global float3 *throughput_coop, /* Required for handling holdout material and AO */ - ccl_global float *L_transparent_coop, /* Required for handling holdout material */ - PathRadiance *PathRadiance_coop, /* Required for "kernel_write_data_passes" and indirect primitive emission */ - ccl_global PathState *PathState_coop, /* Required throughout the kernel and AO */ - Intersection *Intersection_coop, /* Required for indirect primitive emission */ - ccl_global float3 *AOAlpha_coop, /* Required for AO */ - ccl_global float3 *AOBSDF_coop, /* Required for AO */ - ccl_global Ray *AOLightRay_coop, /* Required for AO */ - int sw, int sh, int sx, int sy, int stride, - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global unsigned int *work_array, /* Denotes the work that each ray belongs to */ - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize, /* Size (capacity) of each queue */ -#ifdef __WORK_STEALING__ - unsigned int start_sample, -#endif - int parallel_samples) /* Number of samples to be processed in parallel */ -{ - ccl_local unsigned int local_queue_atomics_bg; - ccl_local unsigned int local_queue_atomics_ao; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics_bg = 0; - local_queue_atomics_ao = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); +#define KERNEL_NAME holdout_emission_blurring_pathtermination_ao +#define LOCALS_TYPE BackgroundAOLocals +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE - char enqueue_flag = 0; - char enqueue_flag_AO_SHADOW_RAY_CAST = 0; - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 0); - -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif /* __COMPUTE_DEVICE_GPU__ */ - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - kernel_holdout_emission_blurring_pathtermination_ao( - (KernelGlobals *)kg, - (ShaderData *)sd, - per_sample_output_buffers, - rng_coop, - throughput_coop, - L_transparent_coop, - PathRadiance_coop, - PathState_coop, - Intersection_coop, - AOAlpha_coop, - AOBSDF_coop, - AOLightRay_coop, - sw, sh, sx, sy, stride, - ray_state, - work_array, -#ifdef __WORK_STEALING__ - start_sample, -#endif - parallel_samples, - ray_index, - &enqueue_flag, - &enqueue_flag_AO_SHADOW_RAY_CAST); -#ifndef __COMPUTE_DEVICE_GPU__ - } -#endif - - /* Enqueue RAY_UPDATE_BUFFER rays. */ - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - queuesize, - &local_queue_atomics_bg, - Queue_data, - Queue_index); - -#ifdef __AO__ - /* Enqueue to-shadow-ray-cast rays. */ - enqueue_ray_index_local(ray_index, - QUEUE_SHADOW_RAY_CAST_AO_RAYS, - enqueue_flag_AO_SHADOW_RAY_CAST, - queuesize, - &local_queue_atomics_ao, - Queue_data, - Queue_index); -#endif -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl new file mode 100644 index 00000000000..192d01444ba --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl @@ -0,0 +1,24 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_indirect_background.h" + +#define KERNEL_NAME indirect_background +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl new file mode 100644 index 00000000000..84938b889e5 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl @@ -0,0 +1,24 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_indirect_subsurface.h" + +#define KERNEL_NAME indirect_subsurface +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl index 267bddc2ffc..c314dc96c33 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl @@ -14,67 +14,11 @@ * limitations under the License. */ -#include "split/kernel_lamp_emission.h" +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_lamp_emission.h" -__kernel void kernel_ocl_path_trace_lamp_emission( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global float3 *throughput_coop, /* Required for lamp emission */ - PathRadiance *PathRadiance_coop, /* Required for lamp emission */ - ccl_global Ray *Ray_coop, /* Required for lamp emission */ - ccl_global PathState *PathState_coop, /* Required for lamp emission */ - Intersection *Intersection_coop, /* Required for lamp emission */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int sw, int sh, - ccl_global int *Queue_data, /* Memory for queues */ - ccl_global int *Queue_index, /* Tracks the number of elements in queues */ - int queuesize, /* Size (capacity) of queues */ - ccl_global char *use_queues_flag, /* Used to decide if this kernel should use - * queues to fetch ray index - */ - int parallel_samples) /* Number of samples to be processed in parallel */ -{ - int x = get_global_id(0); - int y = get_global_id(1); +#define KERNEL_NAME lamp_emission +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME - /* We will empty this queue in this kernel. */ - if(get_global_id(0) == 0 && get_global_id(1) == 0) { - Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; - } - /* Fetch use_queues_flag. */ - ccl_local char local_use_queues_flag; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_use_queues_flag = use_queues_flag[0]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int ray_index; - if(local_use_queues_flag) { - int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(thread_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 1); - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } - } else { - if(x < (sw * parallel_samples) && y < sh) { - ray_index = x + y * (sw * parallel_samples); - } else { - return; - } - } - - kernel_lamp_emission((KernelGlobals *)kg, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - Intersection_coop, - ray_state, - sw, sh, - use_queues_flag, - ray_index); -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl index 6d49b6294a8..8b1332bf013 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl @@ -14,101 +14,13 @@ * limitations under the License. */ -#include "split/kernel_next_iteration_setup.h" +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_next_iteration_setup.h" -__kernel void kernel_ocl_path_trace_next_iteration_setup( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global char *sd, /* Required for setting up ray for next iteration */ - ccl_global uint *rng_coop, /* Required for setting up ray for next iteration */ - ccl_global float3 *throughput_coop, /* Required for setting up ray for next iteration */ - PathRadiance *PathRadiance_coop, /* Required for setting up ray for next iteration */ - ccl_global Ray *Ray_coop, /* Required for setting up ray for next iteration */ - ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */ - ccl_global Ray *LightRay_dl_coop, /* Required for radiance update - direct lighting */ - ccl_global int *ISLamp_coop, /* Required for radiance update - direct lighting */ - ccl_global BsdfEval *BSDFEval_coop, /* Required for radiance update - direct lighting */ - ccl_global Ray *LightRay_ao_coop, /* Required for radiance update - AO */ - ccl_global float3 *AOBSDF_coop, /* Required for radiance update - AO */ - ccl_global float3 *AOAlpha_coop, /* Required for radiance update - AO */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize, /* Size (capacity) of each queue */ - ccl_global char *use_queues_flag) /* flag to decide if scene_intersect kernel should - * use queues to fetch ray index */ -{ - ccl_local unsigned int local_queue_atomics; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); +#define KERNEL_NAME next_iteration_setup +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE - if(get_global_id(0) == 0 && get_global_id(1) == 0) { - /* If we are here, then it means that scene-intersect kernel - * has already been executed atleast once. From the next time, - * scene-intersect kernel may operate on queues to fetch ray index - */ - use_queues_flag[0] = 1; - - /* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and - * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the - * previous kernel. - */ - Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; - Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; - } - - char enqueue_flag = 0; - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 0); - -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - enqueue_flag = kernel_next_iteration_setup((KernelGlobals *)kg, - (ShaderData *)sd, - rng_coop, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - LightRay_dl_coop, - ISLamp_coop, - BSDFEval_coop, - LightRay_ao_coop, - AOBSDF_coop, - AOAlpha_coop, - ray_state, - use_queues_flag, - ray_index); -#ifndef __COMPUTE_DEVICE_GPU__ - } -#endif - - /* Enqueue RAY_UPDATE_BUFFER rays. */ - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - queuesize, - &local_queue_atomics, - Queue_data, - Queue_index); -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h new file mode 100644 index 00000000000..faa9dd66d0e --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h @@ -0,0 +1,246 @@ +/* + * Copyright 2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* For OpenCL we do manual lookup and interpolation. */ + +ccl_device_inline ccl_global TextureInfo* kernel_tex_info(KernelGlobals *kg, uint id) { + const uint tex_offset = id +#define KERNEL_TEX(type, name) + 1 +#include "kernel/kernel_textures.h" + ; + + return &((ccl_global TextureInfo*)kg->buffers[0])[tex_offset]; +} + +#define tex_fetch(type, info, index) ((ccl_global type*)(kg->buffers[info->cl_buffer] + info->data))[(index)] + +ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width) +{ + x %= width; + if(x < 0) + x += width; + return x; +} + +ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width) +{ + return clamp(x, 0, width-1); +} + +ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, const ccl_global TextureInfo *info, int id, int offset) +{ + const int texture_type = kernel_tex_type(id); + + /* Float4 */ + if(texture_type == IMAGE_DATA_TYPE_FLOAT4) { + return tex_fetch(float4, info, offset); + } + /* Byte4 */ + else if(texture_type == IMAGE_DATA_TYPE_BYTE4) { + uchar4 r = tex_fetch(uchar4, info, offset); + float f = 1.0f/255.0f; + return make_float4(r.x*f, r.y*f, r.z*f, r.w*f); + } + /* Float */ + else if(texture_type == IMAGE_DATA_TYPE_FLOAT) { + float f = tex_fetch(float, info, offset); + return make_float4(f, f, f, 1.0f); + } + /* Byte */ + else { + uchar r = tex_fetch(uchar, info, offset); + float f = r * (1.0f/255.0f); + return make_float4(f, f, f, 1.0f); + } +} + +ccl_device_inline float4 svm_image_texture_read_2d(KernelGlobals *kg, int id, int x, int y) +{ + const ccl_global TextureInfo *info = kernel_tex_info(kg, id); + + /* Wrap */ + if(info->extension == EXTENSION_REPEAT) { + x = svm_image_texture_wrap_periodic(x, info->width); + y = svm_image_texture_wrap_periodic(y, info->height); + } + else { + x = svm_image_texture_wrap_clamp(x, info->width); + y = svm_image_texture_wrap_clamp(y, info->height); + } + + int offset = x + info->width * y; + return svm_image_texture_read(kg, info, id, offset); +} + +ccl_device_inline float4 svm_image_texture_read_3d(KernelGlobals *kg, int id, int x, int y, int z) +{ + const ccl_global TextureInfo *info = kernel_tex_info(kg, id); + + /* Wrap */ + if(info->extension == EXTENSION_REPEAT) { + x = svm_image_texture_wrap_periodic(x, info->width); + y = svm_image_texture_wrap_periodic(y, info->height); + z = svm_image_texture_wrap_periodic(z, info->depth); + } + else { + x = svm_image_texture_wrap_clamp(x, info->width); + y = svm_image_texture_wrap_clamp(y, info->height); + z = svm_image_texture_wrap_clamp(z, info->depth); + } + + int offset = x + info->width * y + info->width * info->height * z; + return svm_image_texture_read(kg, info, id, offset); +} + + +ccl_device_inline float svm_image_texture_frac(float x, int *ix) +{ + int i = float_to_int(x) - ((x < 0.0f)? 1: 0); + *ix = i; + return x - (float)i; +} + +#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \ + { \ + u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \ + u[1] = (( 0.5f * t - 1.0f) * t ) * t + (2.0f/3.0f); \ + u[2] = (( -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \ + u[3] = (1.0f / 6.0f) * t * t * t; \ + } (void)0 + +ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y) +{ + const ccl_global TextureInfo *info = kernel_tex_info(kg, id); + + if(info->extension == EXTENSION_CLIP) { + if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + } + + if(info->interpolation == INTERPOLATION_CLOSEST) { + /* Closest interpolation. */ + int ix, iy; + svm_image_texture_frac(x*info->width, &ix); + svm_image_texture_frac(y*info->height, &iy); + + return svm_image_texture_read_2d(kg, id, ix, iy); + } + else if(info->interpolation == INTERPOLATION_LINEAR) { + /* Bilinear interpolation. */ + int ix, iy; + float tx = svm_image_texture_frac(x*info->width - 0.5f, &ix); + float ty = svm_image_texture_frac(y*info->height - 0.5f, &iy); + + float4 r; + r = (1.0f - ty)*(1.0f - tx)*svm_image_texture_read_2d(kg, id, ix, iy); + r += (1.0f - ty)*tx*svm_image_texture_read_2d(kg, id, ix+1, iy); + r += ty*(1.0f - tx)*svm_image_texture_read_2d(kg, id, ix, iy+1); + r += ty*tx*svm_image_texture_read_2d(kg, id, ix+1, iy+1); + return r; + } + else { + /* Bicubic interpolation. */ + int ix, iy; + float tx = svm_image_texture_frac(x*info->width - 0.5f, &ix); + float ty = svm_image_texture_frac(y*info->height - 0.5f, &iy); + + float u[4], v[4]; + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); + + float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + + for(int y = 0; y < 4; y++) { + for(int x = 0; x < 4; x++) { + float weight = u[x]*v[y]; + r += weight*svm_image_texture_read_2d(kg, id, ix+x-1, iy+y-1); + } + } + return r; + } +} + + +ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z, int interp) +{ + const ccl_global TextureInfo *info = kernel_tex_info(kg, id); + + if(info->extension == EXTENSION_CLIP) { + if(x < 0.0f || y < 0.0f || z < 0.0f || + x > 1.0f || y > 1.0f || z > 1.0f) + { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + } + + uint interpolation = (interp == INTERPOLATION_NONE)? info->interpolation: interp; + + if(interpolation == INTERPOLATION_CLOSEST) { + /* Closest interpolation. */ + int ix, iy, iz; + svm_image_texture_frac(x*info->width, &ix); + svm_image_texture_frac(y*info->height, &iy); + svm_image_texture_frac(z*info->depth, &iz); + + return svm_image_texture_read_3d(kg, id, ix, iy, iz); + } + else if(interpolation == INTERPOLATION_LINEAR) { + /* Bilinear interpolation. */ + int ix, iy, iz; + float tx = svm_image_texture_frac(x*info->width - 0.5f, &ix); + float ty = svm_image_texture_frac(y*info->height - 0.5f, &iy); + float tz = svm_image_texture_frac(z*info->depth - 0.5f, &iz); + + float4 r; + r = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read_3d(kg, id, ix, iy, iz); + r += (1.0f - tz)*(1.0f - ty)*tx*svm_image_texture_read_3d(kg, id, ix+1, iy, iz); + r += (1.0f - tz)*ty*(1.0f - tx)*svm_image_texture_read_3d(kg, id, ix, iy+1, iz); + r += (1.0f - tz)*ty*tx*svm_image_texture_read_3d(kg, id, ix+1, iy+1, iz); + + r += tz*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read_3d(kg, id, ix, iy, iz+1); + r += tz*(1.0f - ty)*tx*svm_image_texture_read_3d(kg, id, ix+1, iy, iz+1); + r += tz*ty*(1.0f - tx)*svm_image_texture_read_3d(kg, id, ix, iy+1, iz+1); + r += tz*ty*tx*svm_image_texture_read_3d(kg, id, ix+1, iy+1, iz+1); + return r; + } + else { + /* Bicubic interpolation. */ + int ix, iy, iz; + float tx = svm_image_texture_frac(x*info->width - 0.5f, &ix); + float ty = svm_image_texture_frac(y*info->height - 0.5f, &iy); + float tz = svm_image_texture_frac(z*info->depth - 0.5f, &iz); + + float u[4], v[4], w[4]; + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); + SET_CUBIC_SPLINE_WEIGHTS(w, tz); + + float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + + for(int z = 0; z < 4; z++) { + for(int y = 0; y < 4; y++) { + for(int x = 0; x < 4; x++) { + float weight = u[x]*v[y]*w[z]; + r += weight*svm_image_texture_read_3d(kg, id, ix+x-1, iy+y-1, iz+z-1); + } + } + } + return r; + } +} + +#undef SET_CUBIC_SPLINE_WEIGHTS diff --git a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl new file mode 100644 index 00000000000..fa210e747c0 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl @@ -0,0 +1,24 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_path_init.h" + +#define KERNEL_NAME path_init +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl index 3156dc255fb..68ee6f1d536 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl @@ -14,93 +14,13 @@ * limitations under the License. */ -#include "../../kernel_compat_opencl.h" -#include "../../kernel_math.h" -#include "../../kernel_types.h" -#include "../../kernel_globals.h" -#include "../../kernel_queues.h" +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_queue_enqueue.h" -/* - * The kernel "kernel_queue_enqueue" enqueues rays of - * different ray state into their appropriate Queues; - * 1. Rays that have been determined to hit the background from the - * "kernel_scene_intersect" kernel - * are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; - * 2. Rays that have been determined to be actively participating in path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS. - * - * The input and output of the kernel is as follows, - * - * ray_state -------------------------------------------|--- kernel_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS) - * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------| |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS) - * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| | - * queuesize -------------------------------------------| | - * - * Note on Queues : - * State of queues during the first time this kernel is called : - * At entry, - * Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty. - * At exit, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays - * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays. - * - * State of queue during other times this kernel is called : - * At entry, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty. - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays. - * At exit, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays. - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays. - */ -__kernel void kernel_ocl_path_trace_queue_enqueue( - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int queuesize) /* Size (capacity) of each queue */ -{ - /* We have only 2 cases (Hit/Not-Hit) */ - ccl_local unsigned int local_queue_atomics[2]; - - int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0); - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - - if(lidx < 2 ) { - local_queue_atomics[lidx] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int queue_number = -1; - - if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { - queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; - } - else if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS; - } - - unsigned int my_lqidx; - if(queue_number != -1) { - my_lqidx = get_local_queue_index(queue_number, local_queue_atomics); - } - barrier(CLK_LOCAL_MEM_FENCE); - - if(lidx == 0) { - local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = - get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS, - local_queue_atomics, - Queue_index); - local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = - get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - local_queue_atomics, - Queue_index); - } - barrier(CLK_LOCAL_MEM_FENCE); +#define KERNEL_NAME queue_enqueue +#define LOCALS_TYPE QueueEnqueueLocals +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE - unsigned int my_gqidx; - if(queue_number != -1) { - my_gqidx = get_global_queue_index(queue_number, - queuesize, - my_lqidx, - local_queue_atomics); - Queue_data[my_gqidx] = ray_index; - } -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl index 7f3f433c7a6..10d09377ba9 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl @@ -14,67 +14,11 @@ * limitations under the License. */ -#include "split/kernel_scene_intersect.h" +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_scene_intersect.h" -__kernel void kernel_ocl_path_trace_scene_intersect( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global uint *rng_coop, - ccl_global Ray *Ray_coop, /* Required for scene_intersect */ - ccl_global PathState *PathState_coop, /* Required for scene_intersect */ - Intersection *Intersection_coop, /* Required for scene_intersect */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int sw, int sh, - ccl_global int *Queue_data, /* Memory for queues */ - ccl_global int *Queue_index, /* Tracks the number of elements in queues */ - int queuesize, /* Size (capacity) of queues */ - ccl_global char *use_queues_flag, /* used to decide if this kernel should use - * queues to fetch ray index */ -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int parallel_samples) /* Number of samples to be processed in parallel */ -{ - int x = get_global_id(0); - int y = get_global_id(1); +#define KERNEL_NAME scene_intersect +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME - /* Fetch use_queues_flag */ - ccl_local char local_use_queues_flag; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_use_queues_flag = use_queues_flag[0]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int ray_index; - if(local_use_queues_flag) { - int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(thread_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 0); - - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } - } else { - if(x < (sw * parallel_samples) && y < sh) { - ray_index = x + y * (sw * parallel_samples); - } else { - return; - } - } - - kernel_scene_intersect((KernelGlobals *)kg, - rng_coop, - Ray_coop, - PathState_coop, - Intersection_coop, - ray_state, - sw, sh, - use_queues_flag, -#ifdef __KERNEL_DEBUG__ - debugdata_coop, -#endif - ray_index); -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl index c37856c8f30..40eaa561863 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl @@ -14,55 +14,11 @@ * limitations under the License. */ -#include "split/kernel_shader_eval.h" +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_shader_eval.h" -__kernel void kernel_ocl_path_trace_shader_eval( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global char *sd, /* Output ShaderData structure to be filled */ - ccl_global uint *rng_coop, /* Required for rbsdf calculation */ - ccl_global Ray *Ray_coop, /* Required for setting up shader from ray */ - ccl_global PathState *PathState_coop, /* Required for all functions in this kernel */ - Intersection *Intersection_coop, /* Required for setting up shader from ray */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global int *Queue_data, /* queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize) /* Size (capacity) of each queue */ -{ - /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */ - ccl_local unsigned int local_queue_atomics; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); +#define KERNEL_NAME shader_eval +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 0); - - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } - - char enqueue_flag = (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0; - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - queuesize, - &local_queue_atomics, - Queue_data, - Queue_index); - - /* Continue on with shader evaluation. */ - kernel_shader_eval((KernelGlobals *)kg, - (ShaderData *)sd, - rng_coop, - Ray_coop, - PathState_coop, - Intersection_coop, - ray_state, - ray_index); -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl new file mode 100644 index 00000000000..8c36100f762 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl @@ -0,0 +1,26 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_shader_setup.h" + +#define KERNEL_NAME shader_setup +#define LOCALS_TYPE unsigned int +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl new file mode 100644 index 00000000000..bcacaa4a054 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl @@ -0,0 +1,27 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_shader_sort.h" + +__attribute__((reqd_work_group_size(64, 1, 1))) +#define KERNEL_NAME shader_sort +#define LOCALS_TYPE ShaderSortLocals +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME +#undef LOCALS_TYPE + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl deleted file mode 100644 index edf76fba714..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "split/kernel_shadow_blocked.h" - -__kernel void kernel_ocl_path_trace_shadow_blocked( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global PathState *PathState_coop, /* Required for shadow blocked */ - ccl_global Ray *LightRay_dl_coop, /* Required for direct lighting's shadow blocked */ - ccl_global Ray *LightRay_ao_coop, /* Required for AO's shadow blocked */ - ccl_global char *ray_state, - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize) /* Size (capacity) of each queue */ -{ - int lidx = get_local_id(1) * get_local_id(0) + get_local_id(0); - - ccl_local unsigned int ao_queue_length; - ccl_local unsigned int dl_queue_length; - if(lidx == 0) { - ao_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS]; - dl_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - /* flag determining if the current ray is to process shadow ray for AO or DL */ - char shadow_blocked_type = -1; - - int ray_index = QUEUE_EMPTY_SLOT; - int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - if(thread_index < ao_queue_length + dl_queue_length) { - if(thread_index < ao_queue_length) { - ray_index = get_ray_index(thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, Queue_data, queuesize, 1); - shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO; - } else { - ray_index = get_ray_index(thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS, Queue_data, queuesize, 1); - shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL; - } - } - - if(ray_index == QUEUE_EMPTY_SLOT) - return; - - kernel_shadow_blocked((KernelGlobals *)kg, - PathState_coop, - LightRay_dl_coop, - LightRay_ao_coop, - ray_state, - shadow_blocked_type, - ray_index); -} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl new file mode 100644 index 00000000000..8de250a375c --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl @@ -0,0 +1,24 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_shadow_blocked_ao.h" + +#define KERNEL_NAME shadow_blocked_ao +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl new file mode 100644 index 00000000000..29da77022ed --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl @@ -0,0 +1,24 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_shadow_blocked_dl.h" + +#define KERNEL_NAME shadow_blocked_dl +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split.cl b/intern/cycles/kernel/kernels/opencl/kernel_split.cl new file mode 100644 index 00000000000..4cbda1bc2e7 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_split.cl @@ -0,0 +1,41 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" // PRECOMPILED +#include "kernel/split/kernel_split_common.h" // PRECOMPILED + +#include "kernel/kernels/opencl/kernel_state_buffer_size.cl" +#include "kernel/kernels/opencl/kernel_data_init.cl" +#include "kernel/kernels/opencl/kernel_path_init.cl" + +#include "kernel/kernels/opencl/kernel_scene_intersect.cl" +#include "kernel/kernels/opencl/kernel_lamp_emission.cl" +#include "kernel/kernels/opencl/kernel_do_volume.cl" +#include "kernel/kernels/opencl/kernel_indirect_background.cl" +#include "kernel/kernels/opencl/kernel_queue_enqueue.cl" +#include "kernel/kernels/opencl/kernel_shader_setup.cl" +#include "kernel/kernels/opencl/kernel_shader_sort.cl" +#include "kernel/kernels/opencl/kernel_shader_eval.cl" +#include "kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" +#include "kernel/kernels/opencl/kernel_subsurface_scatter.cl" +#include "kernel/kernels/opencl/kernel_direct_lighting.cl" +#include "kernel/kernels/opencl/kernel_shadow_blocked_ao.cl" +#include "kernel/kernels/opencl/kernel_shadow_blocked_dl.cl" +#include "kernel/kernels/opencl/kernel_enqueue_inactive.cl" +#include "kernel/kernels/opencl/kernel_next_iteration_setup.cl" +#include "kernel/kernels/opencl/kernel_indirect_subsurface.cl" +#include "kernel/kernels/opencl/kernel_buffer_update.cl" + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h new file mode 100644 index 00000000000..6aa7681cbed --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h @@ -0,0 +1,65 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define KERNEL_NAME_JOIN(a, b) a ## _ ## b +#define KERNEL_NAME_EVAL(a, b) KERNEL_NAME_JOIN(a, b) + +__kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)( + ccl_global char *kg_global, + ccl_constant KernelData *data, + + ccl_global void *split_data_buffer, + ccl_global char *ray_state, + + KERNEL_BUFFER_PARAMS, + + ccl_global int *queue_index, + ccl_global char *use_queues_flag, + ccl_global unsigned int *work_pools, + ccl_global float *buffer + ) +{ +#ifdef LOCALS_TYPE + ccl_local LOCALS_TYPE locals; +#endif + + KernelGlobals *kg = (KernelGlobals*)kg_global; + + if(ccl_local_id(0) + ccl_local_id(1) == 0) { + kg->data = data; + + kernel_split_params.queue_index = queue_index; + kernel_split_params.use_queues_flag = use_queues_flag; + kernel_split_params.work_pools = work_pools; + kernel_split_params.tile.buffer = buffer; + + split_data_init(kg, &kernel_split_state, ccl_global_size(0)*ccl_global_size(1), split_data_buffer, ray_state); + + } + + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + + KERNEL_NAME_EVAL(kernel, KERNEL_NAME)( + kg +#ifdef LOCALS_TYPE + , &locals +#endif + ); +} + +#undef KERNEL_NAME_JOIN +#undef KERNEL_NAME_EVAL + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl b/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl new file mode 100644 index 00000000000..c10ecc426c6 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl @@ -0,0 +1,29 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" + +__kernel void kernel_ocl_path_trace_state_buffer_size( + ccl_global char *kg, + ccl_constant KernelData *data, + uint num_threads, + ccl_global uint64_t *size) +{ + ((KernelGlobals*)kg)->data = data; + *size = split_data_buffer_size((KernelGlobals*)kg, num_threads); +} + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl new file mode 100644 index 00000000000..2b3be38df84 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl @@ -0,0 +1,24 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_subsurface_scatter.h" + +#define KERNEL_NAME subsurface_scatter +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl b/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl deleted file mode 100644 index 88a1ed830af..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "split/kernel_sum_all_radiance.h" - -__kernel void kernel_ocl_path_trace_sum_all_radiance( - ccl_constant KernelData *data, /* To get pass_stride to offet into buffer */ - ccl_global float *buffer, /* Output buffer of RenderTile */ - ccl_global float *per_sample_output_buffer, /* Radiance contributed by all samples */ - int parallel_samples, int sw, int sh, int stride, - int buffer_offset_x, - int buffer_offset_y, - int buffer_stride, - int start_sample) -{ - kernel_sum_all_radiance(data, - buffer, - per_sample_output_buffer, - parallel_samples, - sw, sh, stride, - buffer_offset_x, - buffer_offset_y, - buffer_stride, - start_sample); -} diff --git a/intern/cycles/kernel/osl/CMakeLists.txt b/intern/cycles/kernel/osl/CMakeLists.txt index 98de40e5a8a..159de63a044 100644 --- a/intern/cycles/kernel/osl/CMakeLists.txt +++ b/intern/cycles/kernel/osl/CMakeLists.txt @@ -1,12 +1,6 @@ set(INC - . - .. - ../svm - ../../graph - ../../render - ../../util - ../../device + ../.. ) set(INC_SYS @@ -36,5 +30,5 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${RTTI_DISABLE_FLAGS}") include_directories(${INC}) include_directories(SYSTEM ${INC_SYS}) -add_library(cycles_kernel_osl ${SRC} ${HEADER_SRC}) +cycles_add_library(cycles_kernel_osl ${SRC} ${HEADER_SRC}) diff --git a/intern/cycles/kernel/osl/background.cpp b/intern/cycles/kernel/osl/background.cpp index d835f9be45c..8fff19407d9 100644 --- a/intern/cycles/kernel/osl/background.cpp +++ b/intern/cycles/kernel/osl/background.cpp @@ -34,10 +34,11 @@ #include <OSL/genclosure.h> -#include "osl_closures.h" +#include "kernel/osl/osl_closures.h" -#include "kernel_compat_cpu.h" -#include "closure/alloc.h" +#include "kernel/kernel_compat_cpu.h" +#include "kernel/closure/alloc.h" +#include "kernel/closure/emissive.h" CCL_NAMESPACE_BEGIN @@ -53,7 +54,7 @@ class GenericBackgroundClosure : public CClosurePrimitive { public: void setup(ShaderData *sd, int /* path_flag */, float3 weight) { - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, weight); + background_setup(sd, weight); } }; diff --git a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp index bc26f42b559..ea18f2c8c86 100644 --- a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp +++ b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp @@ -34,13 +34,13 @@ #include <OSL/genclosure.h> -#include "kernel_compat_cpu.h" -#include "osl_closures.h" +#include "kernel/kernel_compat_cpu.h" +#include "kernel/osl/osl_closures.h" -#include "kernel_types.h" -#include "kernel_montecarlo.h" -#include "closure/alloc.h" -#include "closure/bsdf_diffuse_ramp.h" +#include "kernel/kernel_types.h" +#include "kernel/kernel_montecarlo.h" +#include "kernel/closure/alloc.h" +#include "kernel/closure/bsdf_diffuse_ramp.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp index 14c7644936e..a26671eb09e 100644 --- a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp +++ b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp @@ -34,12 +34,12 @@ #include <OSL/genclosure.h> -#include "kernel_compat_cpu.h" -#include "osl_closures.h" +#include "kernel/kernel_compat_cpu.h" +#include "kernel/osl/osl_closures.h" -#include "kernel_types.h" -#include "closure/alloc.h" -#include "closure/bsdf_phong_ramp.h" +#include "kernel/kernel_types.h" +#include "kernel/closure/alloc.h" +#include "kernel/closure/bsdf_phong_ramp.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/kernel/osl/emissive.cpp b/intern/cycles/kernel/osl/emissive.cpp index 3f13e08b302..6162786b527 100644 --- a/intern/cycles/kernel/osl/emissive.cpp +++ b/intern/cycles/kernel/osl/emissive.cpp @@ -34,12 +34,12 @@ #include <OSL/genclosure.h> -#include "osl_closures.h" +#include "kernel/osl/osl_closures.h" -#include "kernel_compat_cpu.h" -#include "kernel_types.h" -#include "closure/alloc.h" -#include "closure/emissive.h" +#include "kernel/kernel_compat_cpu.h" +#include "kernel/kernel_types.h" +#include "kernel/closure/alloc.h" +#include "kernel/closure/emissive.h" CCL_NAMESPACE_BEGIN @@ -56,8 +56,7 @@ class GenericEmissiveClosure : public CClosurePrimitive { public: void setup(ShaderData *sd, int /* path_flag */, float3 weight) { - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, weight); - sd->flag |= SD_EMISSION; + emission_setup(sd, weight); } }; diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp index 3614717e28c..da7368bbc61 100644 --- a/intern/cycles/kernel/osl/osl_bssrdf.cpp +++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp @@ -32,153 +32,104 @@ #include <OSL/genclosure.h> -#include "kernel_compat_cpu.h" -#include "osl_closures.h" +#include "kernel/kernel_compat_cpu.h" +#include "kernel/osl/osl_closures.h" -#include "kernel_types.h" -#include "kernel_montecarlo.h" +#include "kernel/kernel_types.h" +#include "kernel/kernel_montecarlo.h" -#include "closure/alloc.h" -#include "closure/bsdf_diffuse.h" -#include "closure/bssrdf.h" +#include "kernel/closure/alloc.h" +#include "kernel/closure/bsdf_util.h" +#include "kernel/closure/bsdf_diffuse.h" +#include "kernel/closure/bsdf_principled_diffuse.h" +#include "kernel/closure/bssrdf.h" CCL_NAMESPACE_BEGIN using namespace OSL; +static ustring u_cubic("cubic"); +static ustring u_gaussian("gaussian"); +static ustring u_burley("burley"); +static ustring u_principled("principled"); +static ustring u_random_walk("random_walk"); +static ustring u_principled_random_walk("principled_random_walk"); + class CBSSRDFClosure : public CClosurePrimitive { public: Bssrdf params; - float3 radius; - float3 albedo; + ustring method; - void alloc(ShaderData *sd, int path_flag, float3 weight, ClosureType type) + CBSSRDFClosure() { - float sample_weight = fabsf(average(weight)); - - /* disable in case of diffuse ancestor, can't see it well then and - * adds considerably noise due to probabilities of continuing path - * getting lower and lower */ - if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR) { - radius = make_float3(0.0f, 0.0f, 0.0f); - } - - if(sample_weight > CLOSURE_WEIGHT_CUTOFF) { - /* sharpness */ - float sharpness = params.sharpness; - /* texture color blur */ - float texture_blur = params.texture_blur; - - /* create one closure per color channel */ - Bssrdf *bssrdf = bssrdf_alloc(sd, make_float3(weight.x, 0.0f, 0.0f)); - if(bssrdf) { - bssrdf->sample_weight = sample_weight; - bssrdf->radius = radius.x; - bssrdf->texture_blur = texture_blur; - bssrdf->albedo = albedo.x; - bssrdf->sharpness = sharpness; - bssrdf->N = params.N; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); - } - - bssrdf = bssrdf_alloc(sd, make_float3(0.0f, weight.y, 0.0f)); - if(bssrdf) { - bssrdf->sample_weight = sample_weight; - bssrdf->radius = radius.y; - bssrdf->texture_blur = texture_blur; - bssrdf->albedo = albedo.y; - bssrdf->sharpness = sharpness; - bssrdf->N = params.N; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); - } - - bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, weight.z)); - if(bssrdf) { - bssrdf->sample_weight = sample_weight; - bssrdf->radius = radius.z; - bssrdf->texture_blur = texture_blur; - bssrdf->albedo = albedo.z; - bssrdf->sharpness = sharpness; - bssrdf->N = params.N; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); - } - } + params.texture_blur = 0.0f; + params.sharpness = 0.0f; + params.roughness = 0.0f; } -}; - -/* Cubic */ -class CubicBSSRDFClosure : public CBSSRDFClosure { -public: void setup(ShaderData *sd, int path_flag, float3 weight) { - alloc(sd, path_flag, weight, CLOSURE_BSSRDF_CUBIC_ID); + if (method == u_cubic) { + alloc(sd, path_flag, weight, CLOSURE_BSSRDF_CUBIC_ID); + } + else if (method == u_gaussian) { + alloc(sd, path_flag, weight, CLOSURE_BSSRDF_GAUSSIAN_ID); + } + else if (method == u_burley) { + alloc(sd, path_flag, weight, CLOSURE_BSSRDF_BURLEY_ID); + } + else if (method == u_principled) { + alloc(sd, path_flag, weight, CLOSURE_BSSRDF_PRINCIPLED_ID); + } + else if (method == u_random_walk) { + alloc(sd, path_flag, weight, CLOSURE_BSSRDF_RANDOM_WALK_ID); + } + else if (method == u_principled_random_walk) { + alloc(sd, path_flag, weight, CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID); + } } -}; - -ClosureParam *closure_bssrdf_cubic_params() -{ - static ClosureParam params[] = { - CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, params.N), - CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, radius), - CLOSURE_FLOAT_PARAM(CubicBSSRDFClosure, params.texture_blur), - CLOSURE_FLOAT_PARAM(CubicBSSRDFClosure, params.sharpness), - CLOSURE_STRING_KEYPARAM(CubicBSSRDFClosure, label, "label"), - CLOSURE_FINISH_PARAM(CubicBSSRDFClosure) - }; - return params; -} -CCLOSURE_PREPARE(closure_bssrdf_cubic_prepare, CubicBSSRDFClosure) - -/* Gaussian */ - -class GaussianBSSRDFClosure : public CBSSRDFClosure { -public: - void setup(ShaderData *sd, int path_flag, float3 weight) + void alloc(ShaderData *sd, int path_flag, float3 weight, ClosureType type) { - alloc(sd, path_flag, weight, CLOSURE_BSSRDF_GAUSSIAN_ID); - } -}; - -ClosureParam *closure_bssrdf_gaussian_params() -{ - static ClosureParam params[] = { - CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, params.N), - CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, radius), - CLOSURE_FLOAT_PARAM(GaussianBSSRDFClosure, params.texture_blur), - CLOSURE_STRING_KEYPARAM(GaussianBSSRDFClosure, label, "label"), - CLOSURE_FINISH_PARAM(GaussianBSSRDFClosure) - }; - return params; -} - -CCLOSURE_PREPARE(closure_bssrdf_gaussian_prepare, GaussianBSSRDFClosure) - -/* Burley */ + Bssrdf *bssrdf = bssrdf_alloc(sd, weight); + + if(bssrdf) { + /* disable in case of diffuse ancestor, can't see it well then and + * adds considerably noise due to probabilities of continuing path + * getting lower and lower */ + if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR) { + params.radius = make_float3(0.0f, 0.0f, 0.0f); + } -class BurleyBSSRDFClosure : public CBSSRDFClosure { -public: - void setup(ShaderData *sd, int path_flag, float3 weight) - { - alloc(sd, path_flag, weight, CLOSURE_BSSRDF_BURLEY_ID); + /* create one closure per color channel */ + bssrdf->radius = params.radius; + bssrdf->albedo = params.albedo; + bssrdf->texture_blur = params.texture_blur; + bssrdf->sharpness = params.sharpness; + bssrdf->N = params.N; + bssrdf->roughness = params.roughness; + sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type); + } } }; -ClosureParam *closure_bssrdf_burley_params() +ClosureParam *closure_bssrdf_params() { static ClosureParam params[] = { - CLOSURE_FLOAT3_PARAM(BurleyBSSRDFClosure, params.N), - CLOSURE_FLOAT3_PARAM(BurleyBSSRDFClosure, radius), - CLOSURE_FLOAT_PARAM(BurleyBSSRDFClosure, params.texture_blur), - CLOSURE_FLOAT3_PARAM(BurleyBSSRDFClosure, albedo), - CLOSURE_STRING_KEYPARAM(BurleyBSSRDFClosure, label, "label"), - CLOSURE_FINISH_PARAM(BurleyBSSRDFClosure) + CLOSURE_STRING_PARAM(CBSSRDFClosure, method), + CLOSURE_FLOAT3_PARAM(CBSSRDFClosure, params.N), + CLOSURE_FLOAT3_PARAM(CBSSRDFClosure, params.radius), + CLOSURE_FLOAT3_PARAM(CBSSRDFClosure, params.albedo), + CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.texture_blur, "texture_blur"), + CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.sharpness, "sharpness"), + CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.roughness, "roughness"), + CLOSURE_STRING_KEYPARAM(CBSSRDFClosure, label, "label"), + CLOSURE_FINISH_PARAM(CBSSRDFClosure) }; return params; } -CCLOSURE_PREPARE(closure_bssrdf_burley_prepare, BurleyBSSRDFClosure) +CCLOSURE_PREPARE(closure_bssrdf_prepare, CBSSRDFClosure) CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp index 94de782dca0..ee16ddaf0fd 100644 --- a/intern/cycles/kernel/osl/osl_closures.cpp +++ b/intern/cycles/kernel/osl/osl_closures.cpp @@ -33,33 +33,35 @@ #include <OSL/genclosure.h> #include <OSL/oslclosure.h> -#include "osl_closures.h" -#include "osl_shader.h" - -#include "util_debug.h" -#include "util_math.h" -#include "util_param.h" - -#include "kernel_types.h" -#include "kernel_compat_cpu.h" -#include "kernel_globals.h" -#include "kernel_montecarlo.h" -#include "kernel_random.h" - -#include "closure/alloc.h" -#include "closure/bsdf_util.h" -#include "closure/bsdf_ashikhmin_velvet.h" -#include "closure/bsdf_diffuse.h" -#include "closure/bsdf_microfacet.h" -#include "closure/bsdf_microfacet_multi.h" -#include "closure/bsdf_oren_nayar.h" -#include "closure/bsdf_reflection.h" -#include "closure/bsdf_refraction.h" -#include "closure/bsdf_transparent.h" -#include "closure/bsdf_ashikhmin_shirley.h" -#include "closure/bsdf_toon.h" -#include "closure/bsdf_hair.h" -#include "closure/volume.h" +#include "kernel/osl/osl_closures.h" +#include "kernel/osl/osl_shader.h" + +#include "util/util_math.h" +#include "util/util_param.h" + +#include "kernel/kernel_types.h" +#include "kernel/kernel_compat_cpu.h" +#include "kernel/split/kernel_split_data_types.h" +#include "kernel/kernel_globals.h" +#include "kernel/kernel_montecarlo.h" +#include "kernel/kernel_random.h" + +#include "kernel/closure/alloc.h" +#include "kernel/closure/bsdf_util.h" +#include "kernel/closure/bsdf_ashikhmin_velvet.h" +#include "kernel/closure/bsdf_diffuse.h" +#include "kernel/closure/bsdf_microfacet.h" +#include "kernel/closure/bsdf_microfacet_multi.h" +#include "kernel/closure/bsdf_oren_nayar.h" +#include "kernel/closure/bsdf_reflection.h" +#include "kernel/closure/bsdf_refraction.h" +#include "kernel/closure/bsdf_transparent.h" +#include "kernel/closure/bsdf_ashikhmin_shirley.h" +#include "kernel/closure/bsdf_toon.h" +#include "kernel/closure/bsdf_hair.h" +#include "kernel/closure/bsdf_principled_diffuse.h" +#include "kernel/closure/bsdf_principled_sheen.h" +#include "kernel/closure/volume.h" CCL_NAMESPACE_BEGIN @@ -89,9 +91,6 @@ BSDF_CLOSURE_CLASS_BEGIN(Refraction, refraction, MicrofacetBsdf, LABEL_SINGULAR) CLOSURE_FLOAT_PARAM(RefractionClosure, params.ior), BSDF_CLOSURE_CLASS_END(Refraction, refraction) -BSDF_CLOSURE_CLASS_BEGIN(Transparent, transparent, ShaderClosure, LABEL_SINGULAR) -BSDF_CLOSURE_CLASS_END(Transparent, transparent) - BSDF_CLOSURE_CLASS_BEGIN(AshikhminVelvet, ashikhmin_velvet, VelvetBsdf, LABEL_DIFFUSE) CLOSURE_FLOAT3_PARAM(AshikhminVelvetClosure, params.N), CLOSURE_FLOAT_PARAM(AshikhminVelvetClosure, params.sigma), @@ -153,7 +152,7 @@ BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannRefraction, microfacet_beckmann_refra BSDF_CLOSURE_CLASS_END(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction) BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, HairBsdf, LABEL_GLOSSY) - CLOSURE_FLOAT3_PARAM(HairReflectionClosure, unused), + CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.N), CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.roughness1), CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.roughness2), CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.T), @@ -161,19 +160,75 @@ BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, HairBsdf, LABEL_GLOSSY BSDF_CLOSURE_CLASS_END(HairReflection, hair_reflection) BSDF_CLOSURE_CLASS_BEGIN(HairTransmission, hair_transmission, HairBsdf, LABEL_GLOSSY) - CLOSURE_FLOAT3_PARAM(HairTransmissionClosure, unused), + CLOSURE_FLOAT3_PARAM(HairTransmissionClosure, params.N), CLOSURE_FLOAT_PARAM(HairTransmissionClosure, params.roughness1), CLOSURE_FLOAT_PARAM(HairTransmissionClosure, params.roughness2), CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.T), CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.offset), BSDF_CLOSURE_CLASS_END(HairTransmission, hair_transmission) -VOLUME_CLOSURE_CLASS_BEGIN(VolumeHenyeyGreenstein, henyey_greenstein, HenyeyGreensteinVolume, LABEL_VOLUME_SCATTER) - CLOSURE_FLOAT_PARAM(VolumeHenyeyGreensteinClosure, params.g), -VOLUME_CLOSURE_CLASS_END(VolumeHenyeyGreenstein, henyey_greenstein) +BSDF_CLOSURE_CLASS_BEGIN(PrincipledDiffuse, principled_diffuse, PrincipledDiffuseBsdf, LABEL_DIFFUSE) + CLOSURE_FLOAT3_PARAM(PrincipledDiffuseClosure, params.N), + CLOSURE_FLOAT_PARAM(PrincipledDiffuseClosure, params.roughness), +BSDF_CLOSURE_CLASS_END(PrincipledDiffuse, principled_diffuse) + +BSDF_CLOSURE_CLASS_BEGIN(PrincipledSheen, principled_sheen, PrincipledSheenBsdf, LABEL_DIFFUSE) + CLOSURE_FLOAT3_PARAM(PrincipledSheenClosure, params.N), +BSDF_CLOSURE_CLASS_END(PrincipledSheen, principled_sheen) + +/* DISNEY PRINCIPLED CLEARCOAT */ +class PrincipledClearcoatClosure : public CBSDFClosure { +public: + MicrofacetBsdf params; + float clearcoat, clearcoat_roughness; + + MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, ¶ms); + if(!bsdf) { + return NULL; + } + + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + if(!extra) { + return NULL; + } + + bsdf->T = make_float3(0.0f, 0.0f, 0.0f); + bsdf->extra = extra; + bsdf->ior = 1.5f; + bsdf->alpha_x = clearcoat_roughness; + bsdf->alpha_y = clearcoat_roughness; + bsdf->extra->color = make_float3(0.0f, 0.0f, 0.0f); + bsdf->extra->cspec0 = make_float3(0.04f, 0.04f, 0.04f); + bsdf->extra->clearcoat = clearcoat; + return bsdf; + } + + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + if(!bsdf) { + return; + } + + sd->flag |= bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd); + } +}; + +ClosureParam *closure_bsdf_principled_clearcoat_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(PrincipledClearcoatClosure, params.N), + CLOSURE_FLOAT_PARAM(PrincipledClearcoatClosure, clearcoat), + CLOSURE_FLOAT_PARAM(PrincipledClearcoatClosure, clearcoat_roughness), + CLOSURE_STRING_KEYPARAM(PrincipledClearcoatClosure, label, "label"), + CLOSURE_FINISH_PARAM(PrincipledClearcoatClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_principled_clearcoat_prepare, PrincipledClearcoatClosure) -VOLUME_CLOSURE_CLASS_BEGIN(VolumeAbsorption, absorption, ShaderClosure, LABEL_SINGULAR) -VOLUME_CLOSURE_CLASS_END(VolumeAbsorption, absorption) /* Registration */ @@ -182,7 +237,11 @@ static void register_closure(OSL::ShadingSystem *ss, const char *name, int id, O /* optimization: it's possible to not use a prepare function at all and * only initialize the actual class when accessing the closure component * data, but then we need to map the id to the class somehow */ +#if OSL_LIBRARY_VERSION_CODE >= 10900 + ss->register_closure(name, id, params, prepare, NULL); +#else ss->register_closure(name, id, params, prepare, NULL, 16); +#endif } void OSLShader::register_closures(OSLShadingSystem *ss_) @@ -201,7 +260,7 @@ void OSLShader::register_closures(OSLShadingSystem *ss_) register_closure(ss, "refraction", id++, bsdf_refraction_params(), bsdf_refraction_prepare); register_closure(ss, "transparent", id++, - bsdf_transparent_params(), bsdf_transparent_prepare); + closure_bsdf_transparent_params(), closure_bsdf_transparent_prepare); register_closure(ss, "microfacet_ggx", id++, bsdf_microfacet_ggx_params(), bsdf_microfacet_ggx_prepare); register_closure(ss, "microfacet_ggx_aniso", id++, @@ -214,6 +273,16 @@ void OSLShader::register_closures(OSLShadingSystem *ss_) closure_bsdf_microfacet_multi_ggx_glass_params(), closure_bsdf_microfacet_multi_ggx_glass_prepare); register_closure(ss, "microfacet_multi_ggx_aniso", id++, closure_bsdf_microfacet_multi_ggx_aniso_params(), closure_bsdf_microfacet_multi_ggx_aniso_prepare); + register_closure(ss, "microfacet_ggx_fresnel", id++, + closure_bsdf_microfacet_ggx_fresnel_params(), closure_bsdf_microfacet_ggx_fresnel_prepare); + register_closure(ss, "microfacet_ggx_aniso_fresnel", id++, + closure_bsdf_microfacet_ggx_aniso_fresnel_params(), closure_bsdf_microfacet_ggx_aniso_fresnel_prepare); + register_closure(ss, "microfacet_multi_ggx_fresnel", id++, + closure_bsdf_microfacet_multi_ggx_fresnel_params(), closure_bsdf_microfacet_multi_ggx_fresnel_prepare); + register_closure(ss, "microfacet_multi_ggx_glass_fresnel", id++, + closure_bsdf_microfacet_multi_ggx_glass_fresnel_params(), closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare); + register_closure(ss, "microfacet_multi_ggx_aniso_fresnel", id++, + closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params(), closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare); register_closure(ss, "microfacet_beckmann", id++, bsdf_microfacet_beckmann_params(), bsdf_microfacet_beckmann_prepare); register_closure(ss, "microfacet_beckmann_aniso", id++, @@ -228,6 +297,12 @@ void OSLShader::register_closures(OSLShadingSystem *ss_) bsdf_diffuse_toon_params(), bsdf_diffuse_toon_prepare); register_closure(ss, "glossy_toon", id++, bsdf_glossy_toon_params(), bsdf_glossy_toon_prepare); + register_closure(ss, "principled_diffuse", id++, + bsdf_principled_diffuse_params(), bsdf_principled_diffuse_prepare); + register_closure(ss, "principled_sheen", id++, + bsdf_principled_sheen_params(), bsdf_principled_sheen_prepare); + register_closure(ss, "principled_clearcoat", id++, + closure_bsdf_principled_clearcoat_params(), closure_bsdf_principled_clearcoat_prepare); register_closure(ss, "emission", id++, closure_emission_params(), closure_emission_prepare); @@ -241,12 +316,8 @@ void OSLShader::register_closures(OSLShadingSystem *ss_) closure_bsdf_diffuse_ramp_params(), closure_bsdf_diffuse_ramp_prepare); register_closure(ss, "phong_ramp", id++, closure_bsdf_phong_ramp_params(), closure_bsdf_phong_ramp_prepare); - register_closure(ss, "bssrdf_cubic", id++, - closure_bssrdf_cubic_params(), closure_bssrdf_cubic_prepare); - register_closure(ss, "bssrdf_gaussian", id++, - closure_bssrdf_gaussian_params(), closure_bssrdf_gaussian_prepare); - register_closure(ss, "bssrdf_burley", id++, - closure_bssrdf_burley_params(), closure_bssrdf_burley_prepare); + register_closure(ss, "bssrdf", id++, + closure_bssrdf_params(), closure_bssrdf_prepare); register_closure(ss, "hair_reflection", id++, bsdf_hair_reflection_params(), bsdf_hair_reflection_prepare); @@ -254,9 +325,9 @@ void OSLShader::register_closures(OSLShadingSystem *ss_) bsdf_hair_transmission_params(), bsdf_hair_transmission_prepare); register_closure(ss, "henyey_greenstein", id++, - volume_henyey_greenstein_params(), volume_henyey_greenstein_prepare); + closure_henyey_greenstein_params(), closure_henyey_greenstein_prepare); register_closure(ss, "absorption", id++, - volume_absorption_params(), volume_absorption_prepare); + closure_absorption_params(), closure_absorption_prepare); } /* BSDF Closure */ @@ -277,6 +348,103 @@ bool CBSDFClosure::skip(const ShaderData *sd, int path_flag, int scattering) return false; } + +/* GGX closures with Fresnel */ + +class MicrofacetFresnelClosure : public CBSDFClosure { +public: + MicrofacetBsdf params; + float3 color; + float3 cspec0; + + MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight) + { + /* Technically, the MultiGGX Glass closure may also transmit. However, + * since this is set statically and only used for caustic flags, this + * is probably as good as it gets. */ + if(skip(sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) { + return NULL; + } + + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, ¶ms); + if(!bsdf) { + return NULL; + } + + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + if(!extra) { + return NULL; + } + + bsdf->extra = extra; + bsdf->extra->color = color; + bsdf->extra->cspec0 = cspec0; + bsdf->extra->clearcoat = 0.0f; + return bsdf; + } +}; + +class MicrofacetGGXFresnelClosure : public MicrofacetFresnelClosure { +public: + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + if(!bsdf) { + return; + } + + bsdf->T = make_float3(0.0f, 0.0f, 0.0f); + bsdf->alpha_y = bsdf->alpha_x; + sd->flag |= bsdf_microfacet_ggx_fresnel_setup(bsdf, sd); + } +}; + +ClosureParam *closure_bsdf_microfacet_ggx_fresnel_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, params.N), + CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.alpha_x), + CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.ior), + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, color), + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, cspec0), + CLOSURE_STRING_KEYPARAM(MicrofacetGGXFresnelClosure, label, "label"), + CLOSURE_FINISH_PARAM(MicrofacetGGXFresnelClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_microfacet_ggx_fresnel_prepare, MicrofacetGGXFresnelClosure); + +class MicrofacetGGXAnisoFresnelClosure : public MicrofacetFresnelClosure { +public: + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + if(!bsdf) { + return; + } + + sd->flag |= bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf, sd); + } +}; + +ClosureParam *closure_bsdf_microfacet_ggx_aniso_fresnel_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, params.N), + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, params.T), + CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.alpha_x), + CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.alpha_y), + CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.ior), + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, color), + CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, cspec0), + CLOSURE_STRING_KEYPARAM(MicrofacetGGXFresnelClosure, label, "label"), + CLOSURE_FINISH_PARAM(MicrofacetGGXFresnelClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_microfacet_ggx_aniso_fresnel_prepare, MicrofacetGGXAnisoFresnelClosure); + + /* Multiscattering GGX closures */ class MicrofacetMultiClosure : public CBSDFClosure { @@ -286,20 +454,28 @@ public: MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight) { - /* Technically, the MultiGGX Glass closure may also transmit. However, + /* Technically, the MultiGGX closure may also transmit. However, * since this is set statically and only used for caustic flags, this * is probably as good as it gets. */ - if(!skip(sd, path_flag, LABEL_GLOSSY|LABEL_REFLECT)) { - MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, ¶ms); - MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); - if(bsdf && extra) { - bsdf->extra = extra; - bsdf->extra->color = color; - return bsdf; - } + if(skip(sd, path_flag, LABEL_GLOSSY|LABEL_REFLECT)) { + return NULL; + } + + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, ¶ms); + if(!bsdf) { + return NULL; + } + + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + if(!extra) { + return NULL; } - return NULL; + bsdf->extra = extra; + bsdf->extra->color = color; + bsdf->extra->cspec0 = make_float3(0.0f, 0.0f, 0.0f); + bsdf->extra->clearcoat = 0.0f; + return bsdf; } }; @@ -308,7 +484,14 @@ public: void setup(ShaderData *sd, int path_flag, float3 weight) { MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); - sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_setup(bsdf) : 0; + if(!bsdf) { + return; + } + + bsdf->ior = 0.0f; + bsdf->T = make_float3(0.0f, 0.0f, 0.0f); + bsdf->alpha_y = bsdf->alpha_x; + sd->flag |= bsdf_microfacet_multi_ggx_setup(bsdf); } }; @@ -330,7 +513,12 @@ public: void setup(ShaderData *sd, int path_flag, float3 weight) { MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); - sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_aniso_setup(bsdf) : 0; + if(!bsdf) { + return; + } + + bsdf->ior = 0.0f; + sd->flag |= bsdf_microfacet_multi_ggx_aniso_setup(bsdf); } }; @@ -356,7 +544,13 @@ public: void setup(ShaderData *sd, int path_flag, float3 weight) { MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); - sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_glass_setup(bsdf) : 0; + if(!bsdf) { + return; + } + + bsdf->T = make_float3(0.0f, 0.0f, 0.0f); + bsdf->alpha_y = bsdf->alpha_x; + sd->flag |= bsdf_microfacet_multi_ggx_glass_setup(bsdf); } }; @@ -374,5 +568,208 @@ ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_params() } CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_glass_prepare, MicrofacetMultiGGXGlassClosure); + +/* Multiscattering GGX closures with Fresnel */ + +class MicrofacetMultiFresnelClosure : public CBSDFClosure { +public: + MicrofacetBsdf params; + float3 color; + float3 cspec0; + + MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight) + { + /* Technically, the MultiGGX closure may also transmit. However, + * since this is set statically and only used for caustic flags, this + * is probably as good as it gets. */ + if(skip(sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) { + return NULL; + } + + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, ¶ms); + if(!bsdf) { + return NULL; + } + + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + if(!extra) { + return NULL; + } + + bsdf->extra = extra; + bsdf->extra->color = color; + bsdf->extra->cspec0 = cspec0; + bsdf->extra->clearcoat = 0.0f; + return bsdf; + } +}; + +class MicrofacetMultiGGXFresnelClosure : public MicrofacetMultiFresnelClosure { +public: + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + if(!bsdf) { + return; + } + + bsdf->T = make_float3(0.0f, 0.0f, 0.0f); + bsdf->alpha_y = bsdf->alpha_x; + sd->flag |= bsdf_microfacet_multi_ggx_fresnel_setup(bsdf, sd); + } +}; + +ClosureParam *closure_bsdf_microfacet_multi_ggx_fresnel_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.N), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_x), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.ior), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, color), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, cspec0), + CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXFresnelClosure, label, "label"), + CLOSURE_FINISH_PARAM(MicrofacetMultiGGXFresnelClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_fresnel_prepare, MicrofacetMultiGGXFresnelClosure); + +class MicrofacetMultiGGXAnisoFresnelClosure : public MicrofacetMultiFresnelClosure { +public: + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + if(!bsdf) { + return; + } + + sd->flag |= bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf, sd); + } +}; + +ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.N), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.T), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_x), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_y), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.ior), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, color), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, cspec0), + CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXFresnelClosure, label, "label"), + CLOSURE_FINISH_PARAM(MicrofacetMultiGGXFresnelClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare, MicrofacetMultiGGXAnisoFresnelClosure); + +class MicrofacetMultiGGXGlassFresnelClosure : public MicrofacetMultiFresnelClosure { +public: + MicrofacetMultiGGXGlassFresnelClosure() : MicrofacetMultiFresnelClosure() {} + + void setup(ShaderData *sd, int path_flag, float3 weight) + { + MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight); + if(!bsdf) { + return; + } + + bsdf->T = make_float3(0.0f, 0.0f, 0.0f); + bsdf->alpha_y = bsdf->alpha_x; + sd->flag |= bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf, sd); + } +}; + +ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_fresnel_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.N), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_x), + CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.ior), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, color), + CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, cspec0), + CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXFresnelClosure, label, "label"), + CLOSURE_FINISH_PARAM(MicrofacetMultiGGXFresnelClosure) + }; + return params; +} +CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare, MicrofacetMultiGGXGlassFresnelClosure); + +/* Transparent */ + +class TransparentClosure : public CBSDFClosure { +public: + ShaderClosure params; + float3 unused; + + void setup(ShaderData *sd, int path_flag, float3 weight) + { + bsdf_transparent_setup(sd, weight, path_flag); + } +}; + +ClosureParam *closure_bsdf_transparent_params() +{ + static ClosureParam params[] = { + CLOSURE_STRING_KEYPARAM(TransparentClosure, label, "label"), + CLOSURE_FINISH_PARAM(TransparentClosure) + }; + return params; +} + +CCLOSURE_PREPARE(closure_bsdf_transparent_prepare, TransparentClosure) + +/* Volume */ + +class VolumeAbsorptionClosure : public CBSDFClosure { +public: + void setup(ShaderData *sd, int path_flag, float3 weight) + { + volume_extinction_setup(sd, weight); + } +}; + +ClosureParam *closure_absorption_params() +{ + static ClosureParam params[] = { + CLOSURE_STRING_KEYPARAM(VolumeAbsorptionClosure, label, "label"), + CLOSURE_FINISH_PARAM(VolumeAbsorptionClosure) + }; + return params; +} + +CCLOSURE_PREPARE(closure_absorption_prepare, VolumeAbsorptionClosure) + +class VolumeHenyeyGreensteinClosure : public CBSDFClosure { +public: + HenyeyGreensteinVolume params; + + void setup(ShaderData *sd, int path_flag, float3 weight) + { + volume_extinction_setup(sd, weight); + + HenyeyGreensteinVolume *volume = (HenyeyGreensteinVolume*)bsdf_alloc_osl(sd, sizeof(HenyeyGreensteinVolume), weight, ¶ms); + if(!volume) { + return; + } + + sd->flag |= volume_henyey_greenstein_setup(volume); + } +}; + +ClosureParam *closure_henyey_greenstein_params() +{ + static ClosureParam params[] = { + CLOSURE_FLOAT_PARAM(VolumeHenyeyGreensteinClosure, params.g), + CLOSURE_STRING_KEYPARAM(VolumeHenyeyGreensteinClosure, label, "label"), + CLOSURE_FINISH_PARAM(VolumeHenyeyGreensteinClosure) + }; + return params; +} + +CCLOSURE_PREPARE(closure_henyey_greenstein_prepare, VolumeHenyeyGreensteinClosure) + + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h index cd7b33703ff..dca7e74f154 100644 --- a/intern/cycles/kernel/osl/osl_closures.h +++ b/intern/cycles/kernel/osl/osl_closures.h @@ -33,8 +33,8 @@ #ifndef __OSL_CLOSURES_H__ #define __OSL_CLOSURES_H__ -#include "util_types.h" -#include "kernel_types.h" +#include "util/util_types.h" +#include "kernel/kernel_types.h" #include <OSL/oslclosure.h> #include <OSL/oslexec.h> @@ -48,13 +48,19 @@ OSL::ClosureParam *closure_holdout_params(); OSL::ClosureParam *closure_ambient_occlusion_params(); OSL::ClosureParam *closure_bsdf_diffuse_ramp_params(); OSL::ClosureParam *closure_bsdf_phong_ramp_params(); -OSL::ClosureParam *closure_bssrdf_cubic_params(); -OSL::ClosureParam *closure_bssrdf_gaussian_params(); -OSL::ClosureParam *closure_bssrdf_burley_params(); -OSL::ClosureParam *closure_henyey_greenstein_volume_params(); +OSL::ClosureParam *closure_bsdf_transparent_params(); +OSL::ClosureParam *closure_bssrdf_params(); +OSL::ClosureParam *closure_absorption_params(); +OSL::ClosureParam *closure_henyey_greenstein_params(); OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_params(); OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_params(); OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_params(); +OSL::ClosureParam *closure_bsdf_microfacet_ggx_fresnel_params(); +OSL::ClosureParam *closure_bsdf_microfacet_ggx_aniso_fresnel_params(); +OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_fresnel_params(); +OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_fresnel_params(); +OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params(); +OSL::ClosureParam *closure_bsdf_principled_clearcoat_params(); void closure_emission_prepare(OSL::RendererServices *, int id, void *data); void closure_background_prepare(OSL::RendererServices *, int id, void *data); @@ -62,13 +68,19 @@ void closure_holdout_prepare(OSL::RendererServices *, int id, void *data); void closure_ambient_occlusion_prepare(OSL::RendererServices *, int id, void *data); void closure_bsdf_diffuse_ramp_prepare(OSL::RendererServices *, int id, void *data); void closure_bsdf_phong_ramp_prepare(OSL::RendererServices *, int id, void *data); -void closure_bssrdf_cubic_prepare(OSL::RendererServices *, int id, void *data); -void closure_bssrdf_gaussian_prepare(OSL::RendererServices *, int id, void *data); -void closure_bssrdf_burley_prepare(OSL::RendererServices *, int id, void *data); -void closure_henyey_greenstein_volume_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_transparent_prepare(OSL::RendererServices *, int id, void *data); +void closure_bssrdf_prepare(OSL::RendererServices *, int id, void *data); +void closure_absorption_prepare(OSL::RendererServices *, int id, void *data); +void closure_henyey_greenstein_prepare(OSL::RendererServices *, int id, void *data); void closure_bsdf_microfacet_multi_ggx_prepare(OSL::RendererServices *, int id, void *data); void closure_bsdf_microfacet_multi_ggx_glass_prepare(OSL::RendererServices *, int id, void *data); void closure_bsdf_microfacet_multi_ggx_aniso_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_microfacet_ggx_fresnel_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_microfacet_ggx_aniso_fresnel_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_microfacet_multi_ggx_fresnel_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare(OSL::RendererServices *, int id, void *data); +void closure_bsdf_principled_clearcoat_prepare(OSL::RendererServices *, int id, void *data); #define CCLOSURE_PREPARE(name, classname) \ void name(RendererServices *, int id, void *data) \ @@ -133,36 +145,6 @@ static ClosureParam *bsdf_##lower##_params() \ \ CCLOSURE_PREPARE_STATIC(bsdf_##lower##_prepare, Upper##Closure) -/* Volume */ - -#define VOLUME_CLOSURE_CLASS_BEGIN(Upper, lower, structname, TYPE) \ -\ -class Upper##Closure : public CBSDFClosure { \ -public: \ - structname params; \ -\ - void setup(ShaderData *sd, int path_flag, float3 weight) \ - { \ - structname *volume = (structname*)bsdf_alloc_osl(sd, sizeof(structname), weight, ¶ms); \ - sd->flag |= (volume) ? volume_##lower##_setup(volume) : 0; \ - } \ -}; \ -\ -static ClosureParam *volume_##lower##_params() \ -{ \ - static ClosureParam params[] = { - -/* parameters */ - -#define VOLUME_CLOSURE_CLASS_END(Upper, lower) \ - CLOSURE_STRING_KEYPARAM(Upper##Closure, label, "label"), \ - CLOSURE_FINISH_PARAM(Upper##Closure) \ - }; \ - return params; \ -} \ -\ -CCLOSURE_PREPARE_STATIC(volume_##lower##_prepare, Upper##Closure) - CCL_NAMESPACE_END #endif /* __OSL_CLOSURES_H__ */ diff --git a/intern/cycles/kernel/osl/osl_globals.h b/intern/cycles/kernel/osl/osl_globals.h index 65cb7ecc6b4..9585d9f4825 100644 --- a/intern/cycles/kernel/osl/osl_globals.h +++ b/intern/cycles/kernel/osl/osl_globals.h @@ -21,10 +21,10 @@ #include <OSL/oslexec.h> -#include "util_map.h" -#include "util_param.h" -#include "util_thread.h" -#include "util_vector.h" +#include "util/util_map.h" +#include "util/util_param.h" +#include "util/util_thread.h" +#include "util/util_vector.h" #ifndef WIN32 using std::isfinite; @@ -86,7 +86,7 @@ struct OSLThreadData { OSL::ShaderGlobals globals; OSL::PerThreadInfo *osl_thread_info; OSLTraceData tracedata; - OSL::ShadingContext *context[SHADER_CONTEXT_NUM]; + OSL::ShadingContext *context; OIIO::TextureSystem::Perthread *oiio_thread_info; }; diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp index 58bbdc33920..0c5e5e30e47 100644 --- a/intern/cycles/kernel/osl/osl_services.cpp +++ b/intern/cycles/kernel/osl/osl_services.cpp @@ -25,33 +25,34 @@ #include <string.h> -#include "mesh.h" -#include "object.h" -#include "scene.h" - -#include "osl_closures.h" -#include "osl_globals.h" -#include "osl_services.h" -#include "osl_shader.h" - -#include "util_foreach.h" -#include "util_logging.h" -#include "util_string.h" - -#include "kernel_compat_cpu.h" -#include "kernel_globals.h" -#include "kernel_random.h" -#include "kernel_projection.h" -#include "kernel_differential.h" -#include "kernel_montecarlo.h" -#include "kernel_camera.h" -#include "kernels/cpu/kernel_cpu_image.h" -#include "geom/geom.h" -#include "bvh/bvh.h" - -#include "kernel_projection.h" -#include "kernel_accumulate.h" -#include "kernel_shader.h" +#include "render/mesh.h" +#include "render/object.h" +#include "render/scene.h" + +#include "kernel/osl/osl_closures.h" +#include "kernel/osl/osl_globals.h" +#include "kernel/osl/osl_services.h" +#include "kernel/osl/osl_shader.h" + +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_string.h" + +#include "kernel/kernel_compat_cpu.h" +#include "kernel/split/kernel_split_data_types.h" +#include "kernel/kernel_globals.h" +#include "kernel/kernel_random.h" +#include "kernel/kernel_projection.h" +#include "kernel/kernel_differential.h" +#include "kernel/kernel_montecarlo.h" +#include "kernel/kernel_camera.h" +#include "kernel/kernels/cpu/kernel_cpu_image.h" +#include "kernel/geom/geom.h" +#include "kernel/bvh/bvh.h" + +#include "kernel/kernel_projection.h" +#include "kernel/kernel_accumulate.h" +#include "kernel/kernel_shader.h" #ifdef WITH_PTEX # include <Ptexture.h> @@ -61,11 +62,17 @@ CCL_NAMESPACE_BEGIN /* RenderServices implementation */ -#define COPY_MATRIX44(m1, m2) { \ - CHECK_TYPE(m1, OSL::Matrix44*); \ - CHECK_TYPE(m2, Transform*); \ - memcpy(m1, m2, sizeof(*m2)); \ -} (void)0 +static void copy_matrix(OSL::Matrix44& m, const Transform& tfm) +{ + ProjectionTransform t = projection_transpose(ProjectionTransform(tfm)); + memcpy(&m, &t, sizeof(m)); +} + +static void copy_matrix(OSL::Matrix44& m, const ProjectionTransform& tfm) +{ + ProjectionTransform t = projection_transpose(tfm); + memcpy(&m, &t, sizeof(m)); +} /* static ustrings */ ustring OSLRenderServices::u_distance("distance"); @@ -82,6 +89,7 @@ ustring OSLRenderServices::u_geom_dupli_uv("geom:dupli_uv"); ustring OSLRenderServices::u_material_index("material:index"); ustring OSLRenderServices::u_object_random("object:random"); ustring OSLRenderServices::u_particle_index("particle:index"); +ustring OSLRenderServices::u_particle_random("particle:random"); ustring OSLRenderServices::u_particle_age("particle:age"); ustring OSLRenderServices::u_particle_lifetime("particle:lifetime"); ustring OSLRenderServices::u_particle_location("particle:location"); @@ -95,11 +103,10 @@ ustring OSLRenderServices::u_geom_polyvertices("geom:polyvertices"); ustring OSLRenderServices::u_geom_name("geom:name"); ustring OSLRenderServices::u_geom_undisplaced("geom:undisplaced"); ustring OSLRenderServices::u_is_smooth("geom:is_smooth"); -#ifdef __HAIR__ ustring OSLRenderServices::u_is_curve("geom:is_curve"); ustring OSLRenderServices::u_curve_thickness("geom:curve_thickness"); ustring OSLRenderServices::u_curve_tangent_normal("geom:curve_tangent_normal"); -#endif +ustring OSLRenderServices::u_curve_random("geom:curve_random"); ustring OSLRenderServices::u_path_ray_length("path:ray_length"); ustring OSLRenderServices::u_path_ray_depth("path:ray_depth"); ustring OSLRenderServices::u_path_diffuse_depth("path:diffuse_depth"); @@ -116,6 +123,7 @@ ustring OSLRenderServices::u_I("I"); ustring OSLRenderServices::u_u("u"); ustring OSLRenderServices::u_v("v"); ustring OSLRenderServices::u_empty; +ustring OSLRenderServices::u_at_bevel("@bevel"); OSLRenderServices::OSLRenderServices() { @@ -165,14 +173,12 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result #else Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM); #endif - tfm = transform_transpose(tfm); - COPY_MATRIX44(&result, &tfm); + copy_matrix(result, tfm); return true; } else if(sd->type == PRIMITIVE_LAMP) { - Transform tfm = transform_transpose(sd->ob_tfm); - COPY_MATRIX44(&result, &tfm); + copy_matrix(result, sd->ob_tfm); return true; } @@ -201,14 +207,12 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 #else Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); #endif - itfm = transform_transpose(itfm); - COPY_MATRIX44(&result, &itfm); + copy_matrix(result, itfm); return true; } else if(sd->type == PRIMITIVE_LAMP) { - Transform tfm = transform_transpose(sd->ob_itfm); - COPY_MATRIX44(&result, &tfm); + copy_matrix(result, sd->ob_itfm); return true; } @@ -222,23 +226,19 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result KernelGlobals *kg = kernel_globals; if(from == u_ndc) { - Transform tfm = transform_transpose(transform_quick_inverse(kernel_data.cam.worldtondc)); - COPY_MATRIX44(&result, &tfm); + copy_matrix(result, kernel_data.cam.ndctoworld); return true; } else if(from == u_raster) { - Transform tfm = transform_transpose(kernel_data.cam.rastertoworld); - COPY_MATRIX44(&result, &tfm); + copy_matrix(result, kernel_data.cam.rastertoworld); return true; } else if(from == u_screen) { - Transform tfm = transform_transpose(kernel_data.cam.screentoworld); - COPY_MATRIX44(&result, &tfm); + copy_matrix(result, kernel_data.cam.screentoworld); return true; } else if(from == u_camera) { - Transform tfm = transform_transpose(kernel_data.cam.cameratoworld); - COPY_MATRIX44(&result, &tfm); + copy_matrix(result, kernel_data.cam.cameratoworld); return true; } else if(from == u_world) { @@ -254,23 +254,19 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 KernelGlobals *kg = kernel_globals; if(to == u_ndc) { - Transform tfm = transform_transpose(kernel_data.cam.worldtondc); - COPY_MATRIX44(&result, &tfm); + copy_matrix(result, kernel_data.cam.worldtondc); return true; } else if(to == u_raster) { - Transform tfm = transform_transpose(kernel_data.cam.worldtoraster); - COPY_MATRIX44(&result, &tfm); + copy_matrix(result, kernel_data.cam.worldtoraster); return true; } else if(to == u_screen) { - Transform tfm = transform_transpose(kernel_data.cam.worldtoscreen); - COPY_MATRIX44(&result, &tfm); + copy_matrix(result, kernel_data.cam.worldtoscreen); return true; } else if(to == u_camera) { - Transform tfm = transform_transpose(kernel_data.cam.worldtocamera); - COPY_MATRIX44(&result, &tfm); + copy_matrix(result, kernel_data.cam.worldtocamera); return true; } else if(to == u_world) { @@ -296,14 +292,12 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result KernelGlobals *kg = sd->osl_globals; Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM); #endif - tfm = transform_transpose(tfm); - COPY_MATRIX44(&result, &tfm); + copy_matrix(result, tfm); return true; } else if(sd->type == PRIMITIVE_LAMP) { - Transform tfm = transform_transpose(sd->ob_tfm); - COPY_MATRIX44(&result, &tfm); + copy_matrix(result, sd->ob_tfm); return true; } @@ -327,14 +321,12 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 KernelGlobals *kg = sd->osl_globals; Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); #endif - tfm = transform_transpose(tfm); - COPY_MATRIX44(&result, &tfm); + copy_matrix(result, tfm); return true; } else if(sd->type == PRIMITIVE_LAMP) { - Transform tfm = transform_transpose(sd->ob_itfm); - COPY_MATRIX44(&result, &tfm); + copy_matrix(result, sd->ob_itfm); return true; } @@ -348,23 +340,19 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result KernelGlobals *kg = kernel_globals; if(from == u_ndc) { - Transform tfm = transform_transpose(transform_quick_inverse(kernel_data.cam.worldtondc)); - COPY_MATRIX44(&result, &tfm); + copy_matrix(result, kernel_data.cam.ndctoworld); return true; } else if(from == u_raster) { - Transform tfm = transform_transpose(kernel_data.cam.rastertoworld); - COPY_MATRIX44(&result, &tfm); + copy_matrix(result, kernel_data.cam.rastertoworld); return true; } else if(from == u_screen) { - Transform tfm = transform_transpose(kernel_data.cam.screentoworld); - COPY_MATRIX44(&result, &tfm); + copy_matrix(result, kernel_data.cam.screentoworld); return true; } else if(from == u_camera) { - Transform tfm = transform_transpose(kernel_data.cam.cameratoworld); - COPY_MATRIX44(&result, &tfm); + copy_matrix(result, kernel_data.cam.cameratoworld); return true; } @@ -376,23 +364,19 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 KernelGlobals *kg = kernel_globals; if(to == u_ndc) { - Transform tfm = transform_transpose(kernel_data.cam.worldtondc); - COPY_MATRIX44(&result, &tfm); + copy_matrix(result, kernel_data.cam.worldtondc); return true; } else if(to == u_raster) { - Transform tfm = transform_transpose(kernel_data.cam.worldtoraster); - COPY_MATRIX44(&result, &tfm); + copy_matrix(result, kernel_data.cam.worldtoraster); return true; } else if(to == u_screen) { - Transform tfm = transform_transpose(kernel_data.cam.worldtoscreen); - COPY_MATRIX44(&result, &tfm); + copy_matrix(result, kernel_data.cam.worldtoscreen); return true; } else if(to == u_camera) { - Transform tfm = transform_transpose(kernel_data.cam.worldtocamera); - COPY_MATRIX44(&result, &tfm); + copy_matrix(result, kernel_data.cam.worldtocamera); return true; } @@ -568,8 +552,7 @@ static bool set_attribute_float3_3(float3 P[3], TypeDesc type, bool derivatives, static bool set_attribute_matrix(const Transform& tfm, TypeDesc type, void *val) { if(type == TypeDesc::TypeMatrix) { - Transform transpose = transform_transpose(tfm); - memcpy(val, &transpose, sizeof(Transform)); + copy_matrix(*(OSL::Matrix44*)val, tfm); return true; } @@ -656,6 +639,12 @@ bool OSLRenderServices::get_object_standard_attribute(KernelGlobals *kg, ShaderD float f = particle_index(kg, particle_id); return set_attribute_float(f, type, derivatives, val); } + else if(name == u_particle_random) { + int particle_id = object_particle_id(kg, sd->object); + float f = hash_int_01(particle_index(kg, particle_id)); + return set_attribute_float(f, type, derivatives, val); + } + else if(name == u_particle_age) { int particle_id = object_particle_id(kg, sd->object); float f = particle_age(kg, particle_id); @@ -699,11 +688,7 @@ bool OSLRenderServices::get_object_standard_attribute(KernelGlobals *kg, ShaderD return set_attribute_int(3, type, derivatives, val); } else if((name == u_geom_trianglevertices || name == u_geom_polyvertices) -#ifdef __HAIR__ && sd->type & PRIMITIVE_ALL_TRIANGLE) -#else - ) -#endif { float3 P[3]; @@ -728,7 +713,6 @@ bool OSLRenderServices::get_object_standard_attribute(KernelGlobals *kg, ShaderD float f = ((sd->shader & SHADER_SMOOTH_NORMAL) != 0); return set_attribute_float(f, type, derivatives, val); } -#ifdef __HAIR__ /* Hair Attributes */ else if(name == u_is_curve) { float f = (sd->type & PRIMITIVE_ALL_CURVE) != 0; @@ -742,7 +726,6 @@ bool OSLRenderServices::get_object_standard_attribute(KernelGlobals *kg, ShaderD float3 f = curve_tangent_normal(kg, sd); return set_attribute_float3(f, type, derivatives, val); } -#endif else return false; } @@ -823,7 +806,7 @@ bool OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData * bool OSLRenderServices::get_attribute(OSL::ShaderGlobals *sg, bool derivatives, ustring object_name, TypeDesc type, ustring name, void *val) { - if(sg->renderstate == NULL) + if(sg == NULL || sg->renderstate == NULL) return false; ShaderData *sd = (ShaderData *)(sg->renderstate); @@ -957,20 +940,36 @@ bool OSLRenderServices::texture(ustring filename, return true; } #endif - bool status; + bool status = false; if(filename.length() && filename[0] == '@') { - int slot = atoi(filename.c_str() + 1); - float4 rgba = kernel_tex_image_interp(slot, s, 1.0f - t); - - result[0] = rgba[0]; - if(nchannels > 1) - result[1] = rgba[1]; - if(nchannels > 2) - result[2] = rgba[2]; - if(nchannels > 3) - result[3] = rgba[3]; - status = true; + if(filename == u_at_bevel) { + /* Bevel shader hack. */ + if(nchannels >= 3) { + PathState *state = sd->osl_path_state; + int num_samples = (int)s; + float radius = t; + float3 N = svm_bevel(kg, sd, state, radius, num_samples); + result[0] = N.x; + result[1] = N.y; + result[2] = N.z; + status = true; + } + } + else { + /* Packed texture. */ + int slot = atoi(filename.c_str() + 1); + float4 rgba = kernel_tex_image_interp(kg, slot, s, 1.0f - t); + + result[0] = rgba[0]; + if(nchannels > 1) + result[1] = rgba[1]; + if(nchannels > 2) + result[2] = rgba[2]; + if(nchannels > 3) + result[3] = rgba[3]; + status = true; + } } else { if(texture_handle != NULL) { @@ -1042,7 +1041,7 @@ bool OSLRenderServices::texture3d(ustring filename, bool status; if(filename.length() && filename[0] == '@') { int slot = atoi(filename.c_str() + 1); - float4 rgba = kernel_tex_image_interp_3d(slot, P.x, P.y, P.z); + float4 rgba = kernel_tex_image_interp_3d(kg, slot, P.x, P.y, P.z, INTERPOLATION_NONE); result[0] = rgba[0]; if(nchannels > 1) @@ -1196,8 +1195,9 @@ bool OSLRenderServices::trace(TraceOpt &options, OSL::ShaderGlobals *sg, tracedata->init = true; tracedata->sd.osl_globals = sd->osl_globals; - /* raytrace */ - return scene_intersect(sd->osl_globals, ray, PATH_RAY_ALL_VISIBILITY, &tracedata->isect, NULL, 0.0f, 0.0f); + /* Raytrace, leaving out shadow opaque to avoid early exit. */ + uint visibility = PATH_RAY_ALL_VISIBILITY - PATH_RAY_SHADOW_OPAQUE; + return scene_intersect(sd->osl_globals, ray, visibility, &tracedata->isect, NULL, 0.0f, 0.0f); } diff --git a/intern/cycles/kernel/osl/osl_services.h b/intern/cycles/kernel/osl/osl_services.h index ec34ca77115..d96048e26f2 100644 --- a/intern/cycles/kernel/osl/osl_services.h +++ b/intern/cycles/kernel/osl/osl_services.h @@ -147,6 +147,7 @@ public: static ustring u_material_index; static ustring u_object_random; static ustring u_particle_index; + static ustring u_particle_random; static ustring u_particle_age; static ustring u_particle_lifetime; static ustring u_particle_location; @@ -163,6 +164,7 @@ public: static ustring u_is_curve; static ustring u_curve_thickness; static ustring u_curve_tangent_normal; + static ustring u_curve_random; static ustring u_path_ray_length; static ustring u_path_ray_depth; static ustring u_path_diffuse_depth; @@ -179,6 +181,7 @@ public: static ustring u_u; static ustring u_v; static ustring u_empty; + static ustring u_at_bevel; private: KernelGlobals *kernel_globals; diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp index 0d762bbdb38..6b3a996ca12 100644 --- a/intern/cycles/kernel/osl/osl_shader.cpp +++ b/intern/cycles/kernel/osl/osl_shader.cpp @@ -16,21 +16,22 @@ #include <OSL/oslexec.h> -#include "kernel_compat_cpu.h" -#include "kernel_montecarlo.h" -#include "kernel_types.h" -#include "kernel_globals.h" +#include "kernel/kernel_compat_cpu.h" +#include "kernel/kernel_montecarlo.h" +#include "kernel/kernel_types.h" +#include "kernel/split/kernel_split_data_types.h" +#include "kernel/kernel_globals.h" -#include "geom/geom_object.h" +#include "kernel/geom/geom_object.h" -#include "osl_closures.h" -#include "osl_globals.h" -#include "osl_services.h" -#include "osl_shader.h" +#include "kernel/osl/osl_closures.h" +#include "kernel/osl/osl_globals.h" +#include "kernel/osl/osl_services.h" +#include "kernel/osl/osl_shader.h" -#include "util_foreach.h" +#include "util/util_foreach.h" -#include "attribute.h" +#include "render/attribute.h" CCL_NAMESPACE_BEGIN @@ -56,9 +57,7 @@ void OSLShader::thread_init(KernelGlobals *kg, KernelGlobals *kernel_globals, OS tdata->globals.tracedata = &tdata->tracedata; tdata->globals.flipHandedness = false; tdata->osl_thread_info = ss->create_thread_info(); - - for(int i = 0; i < SHADER_CONTEXT_NUM; i++) - tdata->context[i] = ss->get_context(tdata->osl_thread_info); + tdata->context = ss->get_context(tdata->osl_thread_info); tdata->oiio_thread_info = osl_globals->ts->get_perthread_info(); @@ -73,9 +72,7 @@ void OSLShader::thread_free(KernelGlobals *kg) OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss; OSLThreadData *tdata = kg->osl_tdata; - - for(int i = 0; i < SHADER_CONTEXT_NUM; i++) - ss->release_context(tdata->context[i]); + ss->release_context(tdata->context); ss->destroy_thread_info(tdata->osl_thread_info); @@ -172,7 +169,7 @@ static void flatten_surface_closure_tree(ShaderData *sd, } } -void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx) +void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag) { /* setup shader globals from shader data */ OSLThreadData *tdata = kg->osl_tdata; @@ -181,7 +178,7 @@ void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state /* execute shader for this point */ OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss; OSL::ShaderGlobals *globals = &tdata->globals; - OSL::ShadingContext *octx = tdata->context[(int)ctx]; + OSL::ShadingContext *octx = tdata->context; int shader = sd->shader & SHADER_MASK; /* automatic bump shader */ @@ -273,7 +270,7 @@ static void flatten_background_closure_tree(ShaderData *sd, } } -void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx) +void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag) { /* setup shader globals from shader data */ OSLThreadData *tdata = kg->osl_tdata; @@ -282,7 +279,7 @@ void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *st /* execute shader for this point */ OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss; OSL::ShaderGlobals *globals = &tdata->globals; - OSL::ShadingContext *octx = tdata->context[(int)ctx]; + OSL::ShadingContext *octx = tdata->context; if(kg->osl->background_state) { ss->execute(octx, *(kg->osl->background_state), *globals); @@ -328,7 +325,7 @@ static void flatten_volume_closure_tree(ShaderData *sd, } } -void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx) +void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag) { /* setup shader globals from shader data */ OSLThreadData *tdata = kg->osl_tdata; @@ -337,7 +334,7 @@ void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, /* execute shader */ OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss; OSL::ShaderGlobals *globals = &tdata->globals; - OSL::ShadingContext *octx = tdata->context[(int)ctx]; + OSL::ShadingContext *octx = tdata->context; int shader = sd->shader & SHADER_MASK; if(kg->osl->volume_state[shader]) { @@ -351,19 +348,17 @@ void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, /* Displacement */ -void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderContext ctx) +void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState *state) { /* setup shader globals from shader data */ OSLThreadData *tdata = kg->osl_tdata; - PathState state = {0}; - - shaderdata_to_shaderglobals(kg, sd, &state, 0, tdata); + shaderdata_to_shaderglobals(kg, sd, state, 0, tdata); /* execute shader */ OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss; OSL::ShaderGlobals *globals = &tdata->globals; - OSL::ShadingContext *octx = tdata->context[(int)ctx]; + OSL::ShadingContext *octx = tdata->context; int shader = sd->shader & SHADER_MASK; if(kg->osl->displacement_state[shader]) { diff --git a/intern/cycles/kernel/osl/osl_shader.h b/intern/cycles/kernel/osl/osl_shader.h index ad06dd6929d..6b392b25cf7 100644 --- a/intern/cycles/kernel/osl/osl_shader.h +++ b/intern/cycles/kernel/osl/osl_shader.h @@ -29,7 +29,7 @@ * This means no thread state must be passed along in the kernel itself. */ -#include "kernel_types.h" +#include "kernel/kernel_types.h" CCL_NAMESPACE_BEGIN @@ -53,10 +53,10 @@ public: static void thread_free(KernelGlobals *kg); /* eval */ - static void eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx); - static void eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx); - static void eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx); - static void eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderContext ctx); + static void eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag); + static void eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag); + static void eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag); + static void eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState *state); /* attributes */ static int find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id, AttributeDescriptor *desc); diff --git a/intern/cycles/kernel/shaders/CMakeLists.txt b/intern/cycles/kernel/shaders/CMakeLists.txt index b43f8402d42..6ec651a96d8 100644 --- a/intern/cycles/kernel/shaders/CMakeLists.txt +++ b/intern/cycles/kernel/shaders/CMakeLists.txt @@ -7,6 +7,7 @@ set(SRC_OSL node_anisotropic_bsdf.osl node_attribute.osl node_background.osl + node_bevel.osl node_brick_texture.osl node_brightness.osl node_bump.osl @@ -22,6 +23,8 @@ set(SRC_OSL node_convert_from_point.osl node_convert_from_vector.osl node_diffuse_bsdf.osl + node_displacement.osl + node_vector_displacement.osl node_emission.osl node_environment_texture.osl node_fresnel.osl @@ -33,6 +36,7 @@ set(SRC_OSL node_hair_info.osl node_scatter_volume.osl node_absorption_volume.osl + node_principled_volume.osl node_holdout.osl node_hsv.osl node_image_texture.osl @@ -81,13 +85,15 @@ set(SRC_OSL node_wireframe.osl node_hair_bsdf.osl node_uv_map.osl + node_principled_bsdf.osl node_rgb_to_bw.osl ) set(SRC_OSL_HEADERS - node_texture.h node_color.h node_fresnel.h + node_ramp_util.h + node_texture.h stdosl.h oslutil.h ) @@ -99,6 +105,7 @@ set(SRC_OSO # TODO, add a module to compile OSL foreach(_file ${SRC_OSL}) set(_OSL_FILE ${CMAKE_CURRENT_SOURCE_DIR}/${_file}) + set_source_files_properties(${_file} PROPERTIES HEADER_FILE_ONLY TRUE) string(REPLACE ".osl" ".oso" _OSO_FILE ${_OSL_FILE}) string(REPLACE ${CMAKE_SOURCE_DIR} ${CMAKE_BINARY_DIR} _OSO_FILE ${_OSO_FILE}) add_custom_command( @@ -113,7 +120,8 @@ foreach(_file ${SRC_OSL}) unset(_OSO_FILE) endforeach() -add_custom_target(cycles_osl_shaders ALL DEPENDS ${SRC_OSO} ${SRC_OSL_HEADERS} ${OSL_COMPILER}) +add_custom_target(cycles_osl_shaders ALL DEPENDS ${SRC_OSO} ${SRC_OSL_HEADERS} ${OSL_COMPILER} SOURCES ${SRC_OSL}) +cycles_set_solution_folder(cycles_osl_shaders) # CMAKE_CURRENT_SOURCE_DIR is already included in OSO paths delayed_install("" "${SRC_OSO}" ${CYCLES_INSTALL_PATH}/shader) diff --git a/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl b/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl index bef6d7e8809..21e28ece65d 100644 --- a/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl +++ b/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl @@ -33,27 +33,28 @@ shader node_anisotropic_bsdf( T = rotate(T, Rotation * M_2PI, point(0.0, 0.0, 0.0), Normal); /* compute roughness */ - float RoughnessU, RoughnessV; + float roughness = Roughness * Roughness; + float roughness_u, roughness_v; float aniso = clamp(Anisotropy, -0.99, 0.99); if (aniso < 0.0) { - RoughnessU = Roughness / (1.0 + aniso); - RoughnessV = Roughness * (1.0 + aniso); + roughness_u = roughness / (1.0 + aniso); + roughness_v = roughness * (1.0 + aniso); } else { - RoughnessU = Roughness * (1.0 - aniso); - RoughnessV = Roughness / (1.0 - aniso); + roughness_u = roughness * (1.0 - aniso); + roughness_v = roughness / (1.0 - aniso); } if (distribution == "sharp") BSDF = Color * reflection(Normal); else if (distribution == "beckmann") - BSDF = Color * microfacet_beckmann_aniso(Normal, T, RoughnessU, RoughnessV); + BSDF = Color * microfacet_beckmann_aniso(Normal, T, roughness_u, roughness_v); else if (distribution == "GGX") - BSDF = Color * microfacet_ggx_aniso(Normal, T, RoughnessU, RoughnessV); + BSDF = Color * microfacet_ggx_aniso(Normal, T, roughness_u, roughness_v); else if (distribution == "Multiscatter GGX") - BSDF = Color * microfacet_multi_ggx_aniso(Normal, T, RoughnessU, RoughnessV, Color); + BSDF = Color * microfacet_multi_ggx_aniso(Normal, T, roughness_u, roughness_v, Color); else - BSDF = Color * ashikhmin_shirley(Normal, T, RoughnessU, RoughnessV); + BSDF = Color * ashikhmin_shirley(Normal, T, roughness_u, roughness_v); } diff --git a/intern/cycles/kernel/shaders/node_bevel.osl b/intern/cycles/kernel/shaders/node_bevel.osl new file mode 100644 index 00000000000..9c4ca15be17 --- /dev/null +++ b/intern/cycles/kernel/shaders/node_bevel.osl @@ -0,0 +1,31 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "stdosl.h" + +shader node_bevel( + int samples = 4, + float Radius = 0.05, + normal NormalIn = N, + output normal NormalOut = N) +{ + /* Abuse texture call with special @bevel token. */ + vector bevel_N = (normal)(color)texture("@bevel", samples, Radius); + + /* Preserve input normal. */ + NormalOut = normalize(NormalIn + (bevel_N - N)); +} + diff --git a/intern/cycles/kernel/shaders/node_displacement.osl b/intern/cycles/kernel/shaders/node_displacement.osl new file mode 100644 index 00000000000..89f35841527 --- /dev/null +++ b/intern/cycles/kernel/shaders/node_displacement.osl @@ -0,0 +1,38 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "stdosl.h" + +shader node_displacement( + string space = "object", + float Height = 0.0, + float Midlevel = 0.5, + float Scale = 1.0, + normal Normal = N, + output vector Displacement = vector(0.0, 0.0, 0.0)) +{ + Displacement = Normal; + if(space == "object") { + Displacement = transform("object", Displacement); + } + + Displacement = normalize(Displacement) * (Height - Midlevel) * Scale; + + if(space == "object") { + Displacement = transform("object", "world", Displacement); + } +} + diff --git a/intern/cycles/kernel/shaders/node_glass_bsdf.osl b/intern/cycles/kernel/shaders/node_glass_bsdf.osl index a9723a8300a..2e713861c58 100644 --- a/intern/cycles/kernel/shaders/node_glass_bsdf.osl +++ b/intern/cycles/kernel/shaders/node_glass_bsdf.osl @@ -29,16 +29,17 @@ shader node_glass_bsdf( float eta = backfacing() ? 1.0 / f : f; float cosi = dot(I, Normal); float Fr = fresnel_dielectric_cos(cosi, eta); + float roughness = Roughness * Roughness; if (distribution == "sharp") BSDF = Color * (Fr * reflection(Normal) + (1.0 - Fr) * refraction(Normal, eta)); else if (distribution == "beckmann") - BSDF = Color * (Fr * microfacet_beckmann(Normal, Roughness) + - (1.0 - Fr) * microfacet_beckmann_refraction(Normal, Roughness, eta)); + BSDF = Color * (Fr * microfacet_beckmann(Normal, roughness) + + (1.0 - Fr) * microfacet_beckmann_refraction(Normal, roughness, eta)); else if (distribution == "Multiscatter GGX") - BSDF = Color * microfacet_multi_ggx_glass(Normal, Roughness, eta, Color); + BSDF = Color * microfacet_multi_ggx_glass(Normal, roughness, eta, Color); else if (distribution == "GGX") - BSDF = Color * (Fr * microfacet_ggx(Normal, Roughness) + - (1.0 - Fr) * microfacet_ggx_refraction(Normal, Roughness, eta)); + BSDF = Color * (Fr * microfacet_ggx(Normal, roughness) + + (1.0 - Fr) * microfacet_ggx_refraction(Normal, roughness, eta)); } diff --git a/intern/cycles/kernel/shaders/node_glossy_bsdf.osl b/intern/cycles/kernel/shaders/node_glossy_bsdf.osl index f4ea7e7dc6a..7415211b56d 100644 --- a/intern/cycles/kernel/shaders/node_glossy_bsdf.osl +++ b/intern/cycles/kernel/shaders/node_glossy_bsdf.osl @@ -24,16 +24,18 @@ shader node_glossy_bsdf( normal Normal = N, output closure color BSDF = 0) { + float roughness = Roughness * Roughness; + if (distribution == "sharp") BSDF = Color * reflection(Normal); else if (distribution == "beckmann") - BSDF = Color * microfacet_beckmann(Normal, Roughness); + BSDF = Color * microfacet_beckmann(Normal, roughness); else if (distribution == "GGX") - BSDF = Color * microfacet_ggx(Normal, Roughness); + BSDF = Color * microfacet_ggx(Normal, roughness); else if (distribution == "Multiscatter GGX") - BSDF = Color * microfacet_multi_ggx(Normal, Roughness, Color); + BSDF = Color * microfacet_multi_ggx(Normal, roughness, Color); else - BSDF = Color * ashikhmin_shirley(Normal, vector(0, 0, 0), Roughness, Roughness); + BSDF = Color * ashikhmin_shirley(Normal, vector(0, 0, 0), roughness, roughness); } diff --git a/intern/cycles/kernel/shaders/node_hair_info.osl b/intern/cycles/kernel/shaders/node_hair_info.osl index 965d2a3c7f7..19216f67579 100644 --- a/intern/cycles/kernel/shaders/node_hair_info.osl +++ b/intern/cycles/kernel/shaders/node_hair_info.osl @@ -20,11 +20,13 @@ shader node_hair_info( output float IsStrand = 0.0, output float Intercept = 0.0, output float Thickness = 0.0, - output normal TangentNormal = N) + output normal TangentNormal = N, + output float Random = 0) { getattribute("geom:is_curve", IsStrand); getattribute("geom:curve_intercept", Intercept); getattribute("geom:curve_thickness", Thickness); getattribute("geom:curve_tangent_normal", TangentNormal); + getattribute("geom:curve_random", Random); } diff --git a/intern/cycles/kernel/shaders/node_output_displacement.osl b/intern/cycles/kernel/shaders/node_output_displacement.osl index 294b8dd6bf2..5dbef0244fe 100644 --- a/intern/cycles/kernel/shaders/node_output_displacement.osl +++ b/intern/cycles/kernel/shaders/node_output_displacement.osl @@ -16,10 +16,8 @@ #include "stdosl.h" -displacement node_output_displacement(float Displacement = 0.0) +displacement node_output_displacement(vector Displacement = 0.0) { - vector dP = normalize(transform("object", N)); - dP *= Displacement * 0.1; /* todo: get rid of this factor */ - P += transform("object", "world", dP); + P += Displacement; } diff --git a/intern/cycles/kernel/shaders/node_particle_info.osl b/intern/cycles/kernel/shaders/node_particle_info.osl index 768b7753d02..2a0252d5e45 100644 --- a/intern/cycles/kernel/shaders/node_particle_info.osl +++ b/intern/cycles/kernel/shaders/node_particle_info.osl @@ -18,6 +18,7 @@ shader node_particle_info( output float Index = 0.0, + output float Random = 0.0, output float Age = 0.0, output float Lifetime = 0.0, output point Location = point(0.0, 0.0, 0.0), @@ -26,6 +27,7 @@ shader node_particle_info( output vector AngularVelocity = point(0.0, 0.0, 0.0)) { getattribute("particle:index", Index); + getattribute("particle:random", Random); getattribute("particle:age", Age); getattribute("particle:lifetime", Lifetime); getattribute("particle:location", Location); diff --git a/intern/cycles/kernel/shaders/node_principled_bsdf.osl b/intern/cycles/kernel/shaders/node_principled_bsdf.osl new file mode 100644 index 00000000000..6f54ba3a462 --- /dev/null +++ b/intern/cycles/kernel/shaders/node_principled_bsdf.osl @@ -0,0 +1,127 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "stdosl.h" +#include "node_fresnel.h" + +shader node_principled_bsdf( + string distribution = "Multiscatter GGX", + string subsurface_method = "burley", + color BaseColor = color(0.8, 0.8, 0.8), + float Subsurface = 0.0, + vector SubsurfaceRadius = vector(1.0, 1.0, 1.0), + color SubsurfaceColor = color(0.7, 0.1, 0.1), + float Metallic = 0.0, + float Specular = 0.5, + float SpecularTint = 0.0, + float Roughness = 0.5, + float Anisotropic = 0.0, + float AnisotropicRotation = 0.0, + float Sheen = 0.0, + float SheenTint = 0.5, + float Clearcoat = 0.0, + float ClearcoatRoughness = 0.03, + float IOR = 1.45, + float Transmission = 0.0, + float TransmissionRoughness = 0.0, + normal Normal = N, + normal ClearcoatNormal = N, + normal Tangent = normalize(dPdu), + output closure color BSDF = 0) +{ + float f = max(IOR, 1e-5); + float diffuse_weight = (1.0 - clamp(Metallic, 0.0, 1.0)) * (1.0 - clamp(Transmission, 0.0, 1.0)); + float final_transmission = clamp(Transmission, 0.0, 1.0) * (1.0 - clamp(Metallic, 0.0, 1.0)); + float specular_weight = (1.0 - final_transmission); + + vector T = Tangent; + + float m_cdlum = luminance(BaseColor); + color m_ctint = m_cdlum > 0.0 ? BaseColor / m_cdlum : color(0.0, 0.0, 0.0); // normalize lum. to isolate hue+sat + + /* rotate tangent */ + if (AnisotropicRotation != 0.0) + T = rotate(T, AnisotropicRotation * M_2PI, point(0.0, 0.0, 0.0), Normal); + + if (diffuse_weight > 1e-5) { + if (Subsurface > 1e-5) { + color mixed_ss_base_color = SubsurfaceColor * Subsurface + BaseColor * (1.0 - Subsurface); + if (subsurface_method == "burley") { + BSDF = mixed_ss_base_color * bssrdf("principled", Normal, Subsurface * SubsurfaceRadius, SubsurfaceColor, "roughness", Roughness); + } + else { + BSDF = mixed_ss_base_color * bssrdf("principled_random_walk", Normal, Subsurface * SubsurfaceRadius, mixed_ss_base_color, "roughness", Roughness); + } + } + else { + BSDF = BaseColor * principled_diffuse(Normal, Roughness); + } + + if (Sheen > 1e-5) { + color sheen_color = color(1.0, 1.0, 1.0) * (1.0 - SheenTint) + m_ctint * SheenTint; + + BSDF = BSDF + sheen_color * Sheen * principled_sheen(Normal); + } + + BSDF = BSDF * diffuse_weight; + } + + if (specular_weight > 1e-5) { + float aspect = sqrt(1.0 - Anisotropic * 0.9); + float r2 = Roughness * Roughness; + + float alpha_x = r2 / aspect; + float alpha_y = r2 * aspect; + + color tmp_col = color(1.0, 1.0, 1.0) * (1.0 - SpecularTint) + m_ctint * SpecularTint; + + color Cspec0 = (Specular * 0.08 * tmp_col) * (1.0 - Metallic) + BaseColor * Metallic; + + if (distribution == "GGX" || Roughness <= 0.075) { + BSDF = BSDF + specular_weight * microfacet_ggx_aniso_fresnel(Normal, T, alpha_x, alpha_y, (2.0 / (1.0 - sqrt(0.08 * Specular))) - 1.0, BaseColor, Cspec0); + } else { + BSDF = BSDF + specular_weight * microfacet_multi_ggx_aniso_fresnel(Normal, T, alpha_x, alpha_y, (2.0 / (1.0 - sqrt(0.08 * Specular))) - 1.0, BaseColor, Cspec0); + } + } + + if (final_transmission > 1e-5) { + color Cspec0 = BaseColor * SpecularTint + color(1.0, 1.0, 1.0) * (1.0 - SpecularTint); + float eta = backfacing() ? 1.0 / f : f; + + if (distribution == "GGX" || Roughness <= 5e-2) { + float cosNO = dot(Normal, I); + float Fr = fresnel_dielectric_cos(cosNO, eta); + + float refl_roughness = Roughness; + if (Roughness <= 1e-2) + refl_roughness = 0.0; + + float transmission_roughness = refl_roughness; + if (distribution == "GGX") + transmission_roughness = 1.0 - (1.0 - refl_roughness) * (1.0 - TransmissionRoughness); + + BSDF = BSDF + final_transmission * (Fr * microfacet_ggx_fresnel(Normal, refl_roughness * refl_roughness, eta, BaseColor, Cspec0) + + (1.0 - Fr) * BaseColor * microfacet_ggx_refraction(Normal, transmission_roughness * transmission_roughness, eta)); + } else { + BSDF = BSDF + final_transmission * microfacet_multi_ggx_glass_fresnel(Normal, Roughness * Roughness, eta, BaseColor, Cspec0); + } + } + + if (Clearcoat > 1e-5) { + BSDF = BSDF + principled_clearcoat(ClearcoatNormal, Clearcoat, ClearcoatRoughness * ClearcoatRoughness); + } +} + diff --git a/intern/cycles/kernel/shaders/node_principled_volume.osl b/intern/cycles/kernel/shaders/node_principled_volume.osl new file mode 100644 index 00000000000..ea8d6ab12c5 --- /dev/null +++ b/intern/cycles/kernel/shaders/node_principled_volume.osl @@ -0,0 +1,95 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "stdosl.h" + +shader node_principled_volume( + color Color = color(0.5, 0.5, 0.5), + float Density = 1.0, + float Anisotropy = 0.0, + color AbsorptionColor = color(0.0, 0.0, 0.0), + float EmissionStrength = 0.0, + color EmissionColor = color(1.0, 1.0, 1.0), + float BlackbodyIntensity = 0.0, + color BlackbodyTint = color(1.0, 1.0, 1.0), + float Temperature = 1500.0, + string DensityAttribute = "geom:density", + string ColorAttribute = "geom:color", + string TemperatureAttribute = "geom:temperature", + output closure color Volume = 0) +{ + /* Compute density. */ + float primitive_density = 1.0; + float density = max(Density, 0.0); + + if(density > 1e-5) { + if(getattribute(DensityAttribute, primitive_density)) { + density = max(density * primitive_density, 0.0); + } + } + + if(density > 1e-5) { + /* Compute scattering color. */ + color scatter_color = Color; + color primitive_color; + if(getattribute(ColorAttribute, primitive_color)) { + scatter_color *= primitive_color; + } + + /* Add scattering and absorption closures. */ + color scatter_coeff = scatter_color; + color absorption_color = sqrt(max(AbsorptionColor, 0.0)); + color absorption_coeff = max(1.0 - scatter_color, 0.0) * max(1.0 - absorption_color, 0.0); + Volume = scatter_coeff * density * henyey_greenstein(Anisotropy) + + absorption_coeff * density * absorption(); + } + + /* Compute emission. */ + float emission_strength = max(EmissionStrength, 0.0); + float blackbody_intensity = BlackbodyIntensity; + + if(emission_strength > 1e-5) { + Volume += emission_strength * EmissionColor * emission(); + } + + if(blackbody_intensity > 1e-3) { + float T = Temperature; + + /* Add temperature from attribute if available. */ + float temperature; + if(getattribute(TemperatureAttribute, temperature)) { + T *= max(temperature, 0.0); + } + + T = max(T, 0.0); + + /* Stefan-Boltzman law. */ + float T4 = (T * T) * (T * T); + float sigma = 5.670373e-8 * 1e-6 / M_PI; + float intensity = sigma * mix(1.0, T4, blackbody_intensity); + + if(intensity > 1e-5) { + color bb = blackbody(T); + float l = luminance(bb); + + if(l != 0.0) { + bb *= BlackbodyTint * intensity / l; + Volume += bb * emission(); + } + } + } +} + diff --git a/intern/cycles/kernel/shaders/node_refraction_bsdf.osl b/intern/cycles/kernel/shaders/node_refraction_bsdf.osl index 828becf1818..eaab7282243 100644 --- a/intern/cycles/kernel/shaders/node_refraction_bsdf.osl +++ b/intern/cycles/kernel/shaders/node_refraction_bsdf.osl @@ -26,12 +26,13 @@ shader node_refraction_bsdf( { float f = max(IOR, 1e-5); float eta = backfacing() ? 1.0 / f : f; + float roughness = Roughness * Roughness; if (distribution == "sharp") BSDF = Color * refraction(Normal, eta); else if (distribution == "beckmann") - BSDF = Color * microfacet_beckmann_refraction(Normal, Roughness, eta); + BSDF = Color * microfacet_beckmann_refraction(Normal, roughness, eta); else if (distribution == "GGX") - BSDF = Color * microfacet_ggx_refraction(Normal, Roughness, eta); + BSDF = Color * microfacet_ggx_refraction(Normal, roughness, eta); } diff --git a/intern/cycles/kernel/shaders/node_subsurface_scattering.osl b/intern/cycles/kernel/shaders/node_subsurface_scattering.osl index 5ba8f34021d..0df3256e1fd 100644 --- a/intern/cycles/kernel/shaders/node_subsurface_scattering.osl +++ b/intern/cycles/kernel/shaders/node_subsurface_scattering.osl @@ -27,10 +27,12 @@ shader node_subsurface_scattering( output closure color BSSRDF = 0) { if (falloff == "gaussian") - BSSRDF = Color * bssrdf_gaussian(Normal, Scale * Radius, TextureBlur); + BSSRDF = Color * bssrdf("gaussian", Normal, Scale * Radius, Color, "texture_blur", TextureBlur); else if (falloff == "cubic") - BSSRDF = Color * bssrdf_cubic(Normal, Scale * Radius, TextureBlur, Sharpness); + BSSRDF = Color * bssrdf("cubic", Normal, Scale * Radius, Color, "texture_blur", TextureBlur, "sharpness", Sharpness); + else if (falloff == "burley") + BSSRDF = Color * bssrdf("burley", Normal, Scale * Radius, Color, "texture_blur", TextureBlur); else - BSSRDF = Color * bssrdf_burley(Normal, Scale * Radius, TextureBlur, Color); + BSSRDF = Color * bssrdf("random_walk", Normal, Scale * Radius, Color, "texture_blur", TextureBlur); } diff --git a/intern/cycles/kernel/shaders/node_vector_displacement.osl b/intern/cycles/kernel/shaders/node_vector_displacement.osl new file mode 100644 index 00000000000..b19bc228e37 --- /dev/null +++ b/intern/cycles/kernel/shaders/node_vector_displacement.osl @@ -0,0 +1,60 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "stdosl.h" + +shader node_vector_displacement( + color Vector = color(0.0, 0.0, 0.0), + float Midlevel = 0.0, + float Scale = 1.0, + string space = "tangent", + string attr_name = "geom:tangent", + string attr_sign_name = "geom:tangent_sign", + output vector Displacement = vector(0.0, 0.0, 0.0)) +{ + vector offset = (Vector - vector(Midlevel)) * Scale; + + if(space == "tangent") { + /* Tangent space. */ + vector N_object = normalize(transform("world", "object", N)); + + vector T_object; + if(getattribute(attr_name, T_object)) { + T_object = normalize(T_object); + } + else { + T_object = normalize(dPdu); + } + + vector B_object = normalize(cross(N_object, T_object)); + float tangent_sign; + if(getattribute(attr_sign_name, tangent_sign)) { + B_object *= tangent_sign; + } + + Displacement = T_object*offset[0] + N_object*offset[1] + B_object*offset[2]; + } + else { + /* Object or world space. */ + Displacement = offset; + } + + if(space != "world") { + /* Tangent or object space. */ + Displacement = transform("object", "world", Displacement); + } +} + diff --git a/intern/cycles/kernel/shaders/stdosl.h b/intern/cycles/kernel/shaders/stdosl.h index a8dda8a12c9..091ade4a60d 100644 --- a/intern/cycles/kernel/shaders/stdosl.h +++ b/intern/cycles/kernel/shaders/stdosl.h @@ -530,6 +530,11 @@ closure color microfacet_ggx_refraction(normal N, float ag, float eta) BUILTIN; closure color microfacet_multi_ggx(normal N, float ag, color C) BUILTIN; closure color microfacet_multi_ggx_aniso(normal N, vector T, float ax, float ay, color C) BUILTIN; closure color microfacet_multi_ggx_glass(normal N, float ag, float eta, color C) BUILTIN; +closure color microfacet_ggx_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN; +closure color microfacet_ggx_aniso_fresnel(normal N, vector T, float ax, float ay, float eta, color C, color Cspec0) BUILTIN; +closure color microfacet_multi_ggx_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN; +closure color microfacet_multi_ggx_aniso_fresnel(normal N, vector T, float ax, float ay, float eta, color C, color Cspec0) BUILTIN; +closure color microfacet_multi_ggx_glass_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN; closure color microfacet_beckmann(normal N, float ab) BUILTIN; closure color microfacet_beckmann_aniso(normal N, vector T, float ax, float ay) BUILTIN; closure color microfacet_beckmann_refraction(normal N, float ab, float eta) BUILTIN; @@ -539,11 +544,12 @@ closure color emission() BUILTIN; closure color background() BUILTIN; closure color holdout() BUILTIN; closure color ambient_occlusion() BUILTIN; +closure color principled_diffuse(normal N, float roughness) BUILTIN; +closure color principled_sheen(normal N) BUILTIN; +closure color principled_clearcoat(normal N, float clearcoat, float clearcoat_roughness) BUILTIN; // BSSRDF -closure color bssrdf_cubic(normal N, vector radius, float texture_blur, float sharpness) BUILTIN; -closure color bssrdf_gaussian(normal N, vector radius, float texture_blur) BUILTIN; -closure color bssrdf_burley(normal N, vector radius, float texture_blur, color albedo) BUILTIN; +closure color bssrdf(string method, normal N, vector radius, color albedo) BUILTIN; // Hair closure color hair_reflection(normal N, float roughnessu, float roughnessv, vector T, float offset) BUILTIN; diff --git a/intern/cycles/kernel/split/kernel_background_buffer_update.h b/intern/cycles/kernel/split/kernel_background_buffer_update.h deleted file mode 100644 index 9bfa71c75ef..00000000000 --- a/intern/cycles/kernel/split/kernel_background_buffer_update.h +++ /dev/null @@ -1,248 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "kernel_split_common.h" - -/* Note on kernel_background_buffer_update kernel. - * This is the fourth kernel in the ray tracing logic, and the third - * of the path iteration kernels. This kernel takes care of rays that hit - * the background (sceneintersect kernel), and for the rays of - * state RAY_UPDATE_BUFFER it updates the ray's accumulated radiance in - * the output buffer. This kernel also takes care of rays that have been determined - * to-be-regenerated. - * - * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel - * - * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER - * will be eventually set to RAY_TO_REGENERATE state in this kernel. Finally all rays of ray_state - * RAY_TO_REGENERATE will be regenerated and put in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS. - * - * The input and output are as follows, - * - * rng_coop ---------------------------------------------|--- kernel_background_buffer_update --|--- PathRadiance_coop - * throughput_coop --------------------------------------| |--- L_transparent_coop - * per_sample_output_buffers ----------------------------| |--- per_sample_output_buffers - * Ray_coop ---------------------------------------------| |--- ray_state - * PathState_coop ---------------------------------------| |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) - * L_transparent_coop -----------------------------------| |--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) - * ray_state --------------------------------------------| |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) - * Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ----| |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) - * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------| |--- work_array - * parallel_samples -------------------------------------| |--- PathState_coop - * end_sample -------------------------------------------| |--- throughput_coop - * kg (globals) -----------------------------------------| |--- rng_coop - * rng_state --------------------------------------------| |--- Ray - * PathRadiance_coop ------------------------------------| | - * sw ---------------------------------------------------| | - * sh ---------------------------------------------------| | - * sx ---------------------------------------------------| | - * sy ---------------------------------------------------| | - * stride -----------------------------------------------| | - * work_array -------------------------------------------| |--- work_array - * queuesize --------------------------------------------| | - * start_sample -----------------------------------------| |--- work_pool_wgs - * work_pool_wgs ----------------------------------------| | - * num_samples ------------------------------------------| | - * - * note on sd : sd argument is neither an input nor an output for this kernel. It is just filled and consumed here itself. - * Note on Queues : - * This kernel fetches rays from QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. - * - * State of queues when this kernel is called : - * At entry, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays - * At exit, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty - */ -ccl_device char kernel_background_buffer_update( - KernelGlobals *kg, - ccl_global float *per_sample_output_buffers, - ccl_global uint *rng_state, - ccl_global uint *rng_coop, /* Required for buffer Update */ - ccl_global float3 *throughput_coop, /* Required for background hit processing */ - PathRadiance *PathRadiance_coop, /* Required for background hit processing and buffer Update */ - ccl_global Ray *Ray_coop, /* Required for background hit processing */ - ccl_global PathState *PathState_coop, /* Required for background hit processing */ - ccl_global float *L_transparent_coop, /* Required for background hit processing and buffer Update */ - ccl_global char *ray_state, /* Stores information on the current state of a ray */ - int sw, int sh, int sx, int sy, int stride, - int rng_state_offset_x, - int rng_state_offset_y, - int rng_state_stride, - ccl_global unsigned int *work_array, /* Denotes work of each ray */ - int end_sample, - int start_sample, -#ifdef __WORK_STEALING__ - ccl_global unsigned int *work_pool_wgs, - unsigned int num_samples, -#endif -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int parallel_samples, /* Number of samples to be processed in parallel */ - int ray_index) -{ - char enqueue_flag = 0; -#ifdef __KERNEL_DEBUG__ - DebugData *debug_data = &debugdata_coop[ray_index]; -#endif - ccl_global PathState *state = &PathState_coop[ray_index]; - PathRadiance *L = L = &PathRadiance_coop[ray_index]; - ccl_global Ray *ray = &Ray_coop[ray_index]; - ccl_global float3 *throughput = &throughput_coop[ray_index]; - ccl_global float *L_transparent = &L_transparent_coop[ray_index]; - ccl_global uint *rng = &rng_coop[ray_index]; - -#ifdef __WORK_STEALING__ - unsigned int my_work; - ccl_global float *initial_per_sample_output_buffers; - ccl_global uint *initial_rng; -#endif - unsigned int sample; - unsigned int tile_x; - unsigned int tile_y; - unsigned int pixel_x; - unsigned int pixel_y; - unsigned int my_sample_tile; - -#ifdef __WORK_STEALING__ - my_work = work_array[ray_index]; - sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; - get_pixel_tile_position(&pixel_x, &pixel_y, - &tile_x, &tile_y, - my_work, - sw, sh, sx, sy, - parallel_samples, - ray_index); - my_sample_tile = 0; - initial_per_sample_output_buffers = per_sample_output_buffers; - initial_rng = rng_state; -#else /* __WORK_STEALING__ */ - sample = work_array[ray_index]; - int tile_index = ray_index / parallel_samples; - /* buffer and rng_state's stride is "stride". Find x and y using ray_index */ - tile_x = tile_index % sw; - tile_y = tile_index / sw; - my_sample_tile = ray_index - (tile_index * parallel_samples); -#endif /* __WORK_STEALING__ */ - - rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride; - per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride; - - if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { - /* eval background shader if nothing hit */ - if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) { - *L_transparent = (*L_transparent) + average((*throughput)); -#ifdef __PASSES__ - if(!(kernel_data.film.pass_flag & PASS_BACKGROUND)) -#endif - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - } - - if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { -#ifdef __BACKGROUND__ - /* sample background shader */ - float3 L_background = indirect_background(kg, kg->sd_input, state, ray); - path_radiance_accum_background(L, (*throughput), L_background, state->bounce); -#endif - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - } - } - - if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { - float3 L_sum = path_radiance_clamp_and_sum(kg, L); - kernel_write_light_passes(kg, per_sample_output_buffers, L, sample); -#ifdef __KERNEL_DEBUG__ - kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample); -#endif - float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent)); - - /* accumulate result in output buffer */ - kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad); - path_rng_end(kg, rng_state, *rng); - - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); - } - - if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { -#ifdef __WORK_STEALING__ - /* We have completed current work; So get next work */ - int valid_work = get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index); - if(!valid_work) { - /* If work is invalid, this means no more work is available and the thread may exit */ - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); - } -#else /* __WORK_STEALING__ */ - if((sample + parallel_samples) >= end_sample) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); - } -#endif /* __WORK_STEALING__ */ - - if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { -#ifdef __WORK_STEALING__ - work_array[ray_index] = my_work; - /* Get the sample associated with the current work */ - sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; - /* Get pixel and tile position associated with current work */ - get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index); - my_sample_tile = 0; - - /* Remap rng_state according to the current work */ - rng_state = initial_rng + ((rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride); - /* Remap per_sample_output_buffers according to the current work */ - per_sample_output_buffers = initial_per_sample_output_buffers - + (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride; -#else /* __WORK_STEALING__ */ - work_array[ray_index] = sample + parallel_samples; - sample = work_array[ray_index]; - - /* Get ray position from ray index */ - pixel_x = sx + ((ray_index / parallel_samples) % sw); - pixel_y = sy + ((ray_index / parallel_samples) / sw); -#endif /* __WORK_STEALING__ */ - - /* Initialize random numbers and ray. */ - kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray); - - if(ray->t != 0.0f) { - /* Initialize throughput, L_transparent, Ray, PathState; - * These rays proceed with path-iteration. - */ - *throughput = make_float3(1.0f, 1.0f, 1.0f); - *L_transparent = 0.0f; - path_radiance_init(L, kernel_data.film.use_light_pass); - path_state_init(kg, kg->sd_input, state, rng, sample, ray); -#ifdef __KERNEL_DEBUG__ - debug_data_init(debug_data); -#endif - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); - enqueue_flag = 1; - } - else { - /* These rays do not participate in path-iteration. */ - float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - /* Accumulate result in output buffer. */ - kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad); - path_rng_end(kg, rng_state, *rng); - - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); - } - } - } - return enqueue_flag; -} diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h new file mode 100644 index 00000000000..368a4395760 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_branched.h @@ -0,0 +1,234 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +#ifdef __BRANCHED_PATH__ + +/* sets up the various state needed to do an indirect loop */ +ccl_device_inline void kernel_split_branched_path_indirect_loop_init(KernelGlobals *kg, int ray_index) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + /* save a copy of the state to restore later */ +#define BRANCHED_STORE(name) \ + branched_state->name = kernel_split_state.name[ray_index]; + + BRANCHED_STORE(path_state); + BRANCHED_STORE(throughput); + BRANCHED_STORE(ray); + BRANCHED_STORE(isect); + BRANCHED_STORE(ray_state); + + *kernel_split_sd(branched_state_sd, ray_index) = *kernel_split_sd(sd, ray_index); + for(int i = 0; i < kernel_split_sd(branched_state_sd, ray_index)->num_closure; i++) { + kernel_split_sd(branched_state_sd, ray_index)->closure[i] = kernel_split_sd(sd, ray_index)->closure[i]; + } + +#undef BRANCHED_STORE + + /* set loop counters to intial position */ + branched_state->next_closure = 0; + branched_state->next_sample = 0; +} + +/* ends an indirect loop and restores the previous state */ +ccl_device_inline void kernel_split_branched_path_indirect_loop_end(KernelGlobals *kg, int ray_index) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + /* restore state */ +#define BRANCHED_RESTORE(name) \ + kernel_split_state.name[ray_index] = branched_state->name; + + BRANCHED_RESTORE(path_state); + BRANCHED_RESTORE(throughput); + BRANCHED_RESTORE(ray); + BRANCHED_RESTORE(isect); + BRANCHED_RESTORE(ray_state); + + *kernel_split_sd(sd, ray_index) = *kernel_split_sd(branched_state_sd, ray_index); + for(int i = 0; i < kernel_split_sd(branched_state_sd, ray_index)->num_closure; i++) { + kernel_split_sd(sd, ray_index)->closure[i] = kernel_split_sd(branched_state_sd, ray_index)->closure[i]; + } + +#undef BRANCHED_RESTORE + + /* leave indirect loop */ + REMOVE_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT); +} + +ccl_device_inline bool kernel_split_branched_indirect_start_shared(KernelGlobals *kg, int ray_index) +{ + ccl_global char *ray_state = kernel_split_state.ray_state; + + int inactive_ray = dequeue_ray_index(QUEUE_INACTIVE_RAYS, + kernel_split_state.queue_data, kernel_split_params.queue_size, kernel_split_params.queue_index); + + if(!IS_STATE(ray_state, inactive_ray, RAY_INACTIVE)) { + return false; + } + +#define SPLIT_DATA_ENTRY(type, name, num) \ + if(num) { \ + kernel_split_state.name[inactive_ray] = kernel_split_state.name[ray_index]; \ + } + SPLIT_DATA_ENTRIES_BRANCHED_SHARED +#undef SPLIT_DATA_ENTRY + + *kernel_split_sd(sd, inactive_ray) = *kernel_split_sd(sd, ray_index); + for(int i = 0; i < kernel_split_sd(sd, ray_index)->num_closure; i++) { + kernel_split_sd(sd, inactive_ray)->closure[i] = kernel_split_sd(sd, ray_index)->closure[i]; + } + + kernel_split_state.branched_state[inactive_ray].shared_sample_count = 0; + kernel_split_state.branched_state[inactive_ray].original_ray = ray_index; + kernel_split_state.branched_state[inactive_ray].waiting_on_shared_samples = false; + + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + PathRadiance *inactive_L = &kernel_split_state.path_radiance[inactive_ray]; + + path_radiance_init(inactive_L, kernel_data.film.use_light_pass); + path_radiance_copy_indirect(inactive_L, L); + + ray_state[inactive_ray] = RAY_REGENERATED; + ADD_RAY_FLAG(ray_state, inactive_ray, RAY_BRANCHED_INDIRECT_SHARED); + ADD_RAY_FLAG(ray_state, inactive_ray, IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)); + + atomic_fetch_and_inc_uint32((ccl_global uint*)&kernel_split_state.branched_state[ray_index].shared_sample_count); + + return true; +} + +/* bounce off surface and integrate indirect light */ +ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(KernelGlobals *kg, + int ray_index, + float num_samples_adjust, + ShaderData *saved_sd, + bool reset_path_state, + bool wait_for_shared) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + ShaderData *sd = saved_sd; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + float3 throughput = branched_state->throughput; + ccl_global PathState *ps = &kernel_split_state.path_state[ray_index]; + + float sum_sample_weight = 0.0f; +#ifdef __DENOISING_FEATURES__ + if(ps->denoising_feature_weight > 0.0f) { + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + /* transparency is not handled here, but in outer loop */ + if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { + continue; + } + + sum_sample_weight += sc->sample_weight; + } + } + else { + sum_sample_weight = 1.0f; + } +#endif /* __DENOISING_FEATURES__ */ + + for(int i = branched_state->next_closure; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + if(!CLOSURE_IS_BSDF(sc->type)) + continue; + /* transparency is not handled here, but in outer loop */ + if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) + continue; + + int num_samples; + + if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) + num_samples = kernel_data.integrator.diffuse_samples; + else if(CLOSURE_IS_BSDF_BSSRDF(sc->type)) + num_samples = 1; + else if(CLOSURE_IS_BSDF_GLOSSY(sc->type)) + num_samples = kernel_data.integrator.glossy_samples; + else + num_samples = kernel_data.integrator.transmission_samples; + + num_samples = ceil_to_int(num_samples_adjust*num_samples); + + float num_samples_inv = num_samples_adjust/num_samples; + + for(int j = branched_state->next_sample; j < num_samples; j++) { + if(reset_path_state) { + *ps = branched_state->path_state; + } + + ps->rng_hash = cmj_hash(branched_state->path_state.rng_hash, i); + + ccl_global float3 *tp = &kernel_split_state.throughput[ray_index]; + *tp = throughput; + + ccl_global Ray *bsdf_ray = &kernel_split_state.ray[ray_index]; + + if(!kernel_branched_path_surface_bounce(kg, + sd, + sc, + j, + num_samples, + tp, + ps, + &L->state, + bsdf_ray, + sum_sample_weight)) + { + continue; + } + + ps->rng_hash = branched_state->path_state.rng_hash; + + /* update state for next iteration */ + branched_state->next_closure = i; + branched_state->next_sample = j+1; + + /* start the indirect path */ + *tp *= num_samples_inv; + + if(kernel_split_branched_indirect_start_shared(kg, ray_index)) { + continue; + } + + return true; + } + + branched_state->next_sample = 0; + } + + branched_state->next_closure = sd->num_closure; + + if(wait_for_shared) { + branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0); + if(branched_state->waiting_on_shared_samples) { + return true; + } + } + + return false; +} + +#endif /* __BRANCHED_PATH__ */ + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h new file mode 100644 index 00000000000..180c0b57077 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_buffer_update.h @@ -0,0 +1,159 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* This kernel takes care of rays that hit the background (sceneintersect + * kernel), and for the rays of state RAY_UPDATE_BUFFER it updates the ray's + * accumulated radiance in the output buffer. This kernel also takes care of + * rays that have been determined to-be-regenerated. + * + * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel. + * + * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER + * will be eventually set to RAY_TO_REGENERATE state in this kernel. + * Finally all rays of ray_state RAY_TO_REGENERATE will be regenerated and put + * in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS. + * + * State of queues when this kernel is called: + * At entry, + * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays. + * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with + * RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays. + * At exit, + * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and + * RAY_REGENERATED rays. + * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty. + */ +ccl_device void kernel_buffer_update(KernelGlobals *kg, + ccl_local_param unsigned int *local_queue_atomics) +{ + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(ray_index == 0) { + /* We will empty this queue in this kernel. */ + kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; + } + char enqueue_flag = 0; + ray_index = get_ray_index(kg, ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not + * required. + * + * If we are executing on a CPU device, then we need to keep all threads + * active since we have barrier() calls later in the kernel. CPU devices, + * expect all threads to execute barrier statement. + */ + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } +#endif + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif + + ccl_global char *ray_state = kernel_split_state.ray_state; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + + if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { + uint sample = state->sample; + uint buffer_offset = kernel_split_state.buffer_offset[ray_index]; + ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset; + + /* accumulate result in output buffer */ + kernel_write_result(kg, buffer, sample, L); + + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); + } + + if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { + /* We have completed current work; So get next work */ + ccl_global uint *work_pools = kernel_split_params.work_pools; + uint total_work_size = kernel_split_params.total_work_size; + uint work_index; + + if(!get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) { + /* If work is invalid, this means no more work is available and the thread may exit */ + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); + } + + if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { + ccl_global WorkTile *tile = &kernel_split_params.tile; + uint x, y, sample; + get_work_pixel(tile, work_index, &x, &y, &sample); + + /* Store buffer offset for writing to passes. */ + uint buffer_offset = (tile->offset + x + y*tile->stride) * kernel_data.film.pass_stride; + kernel_split_state.buffer_offset[ray_index] = buffer_offset; + + /* Initialize random numbers and ray. */ + uint rng_hash; + kernel_path_trace_setup(kg, sample, x, y, &rng_hash, ray); + + if(ray->t != 0.0f) { + /* Initialize throughput, path radiance, Ray, PathState; + * These rays proceed with path-iteration. + */ + *throughput = make_float3(1.0f, 1.0f, 1.0f); + path_radiance_init(L, kernel_data.film.use_light_pass); + path_state_init(kg, + AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]), + state, + rng_hash, + sample, + ray); +#ifdef __SUBSURFACE__ + kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]); +#endif + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + enqueue_flag = 1; + } + else { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); + } + } + } + +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + + /* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS; + * These rays will be made active during next SceneIntersectkernel. + */ + enqueue_ray_index_local(ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h index 6e158d53d23..77fb61b80a8 100644 --- a/intern/cycles/kernel/split/kernel_data_init.h +++ b/intern/cycles/kernel/split/kernel_data_init.h @@ -14,221 +14,96 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN -/* Note on kernel_data_initialization kernel - * This kernel Initializes structures needed in path-iteration kernels. - * This is the first kernel in ray-tracing logic. +/* This kernel Initializes structures needed in path-iteration kernels. * - * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE - * - * Its input and output are as follows, - * - * Un-initialized rng---------------|--- kernel_data_initialization ---|--- Initialized rng - * Un-initialized throughput -------| |--- Initialized throughput - * Un-initialized L_transparent ----| |--- Initialized L_transparent - * Un-initialized PathRadiance -----| |--- Initialized PathRadiance - * Un-initialized Ray --------------| |--- Initialized Ray - * Un-initialized PathState --------| |--- Initialized PathState - * Un-initialized QueueData --------| |--- Initialized QueueData (to QUEUE_EMPTY_SLOT) - * Un-initialized QueueIndex -------| |--- Initialized QueueIndex (to 0) - * Un-initialized use_queues_flag---| |--- Initialized use_queues_flag (to false) - * Un-initialized ray_state --------| |--- Initialized ray_state - * parallel_samples --------------- | |--- Initialized per_sample_output_buffers - * rng_state -----------------------| |--- Initialized work_array - * data ----------------------------| |--- Initialized work_pool_wgs - * start_sample --------------------| | - * sx ------------------------------| | - * sy ------------------------------| | - * sw ------------------------------| | - * sh ------------------------------| | - * stride --------------------------| | - * queuesize -----------------------| | - * num_samples ---------------------| | - * - * Note on Queues : + * Note on Queues: * All slots in queues are initialized to queue empty slot; * The number of elements in the queues is initialized to 0; */ + +#ifndef __KERNEL_CPU__ ccl_device void kernel_data_init( +#else +void KERNEL_FUNCTION_FULL_NAME(data_init)( +#endif KernelGlobals *kg, - ShaderData *sd_DL_shadow, ccl_constant KernelData *data, - ccl_global float *per_sample_output_buffers, - ccl_global uint *rng_state, - ccl_global uint *rng_coop, /* rng array to store rng values for all rays */ - ccl_global float3 *throughput_coop, /* throughput array to store throughput values for all rays */ - ccl_global float *L_transparent_coop, /* L_transparent array to store L_transparent values for all rays */ - PathRadiance *PathRadiance_coop, /* PathRadiance array to store PathRadiance values for all rays */ - ccl_global Ray *Ray_coop, /* Ray array to store Ray information for all rays */ - ccl_global PathState *PathState_coop, /* PathState array to store PathState information for all rays */ - Intersection *Intersection_coop_shadow, - ccl_global char *ray_state, /* Stores information on current state of a ray */ - -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name, -#include "../kernel_textures.h" - - int start_sample, int sx, int sy, int sw, int sh, int offset, int stride, - int rng_state_offset_x, - int rng_state_offset_y, - int rng_state_stride, - ccl_global int *Queue_data, /* Memory for queues */ + ccl_global void *split_data_buffer, + int num_elements, + ccl_global char *ray_state, + +#ifdef __KERNEL_OPENCL__ + KERNEL_BUFFER_PARAMS, +#endif + + int start_sample, + int end_sample, + int sx, int sy, int sw, int sh, int offset, int stride, ccl_global int *Queue_index, /* Tracks the number of elements in queues */ int queuesize, /* size (capacity) of the queue */ ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues to fetch ray index */ - ccl_global unsigned int *work_array, /* work array to store which work each ray belongs to */ -#ifdef __WORK_STEALING__ - ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */ - unsigned int num_samples, /* Total number of samples per pixel */ -#endif -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int parallel_samples) /* Number of samples to be processed in parallel */ + ccl_global unsigned int *work_pools, /* Work pool for each work group */ + unsigned int num_samples, + ccl_global float *buffer) { +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, data_init); +#else + +#ifdef __KERNEL_OPENCL__ kg->data = data; - kg->sd_input = sd_DL_shadow; - kg->isect_shadow = Intersection_coop_shadow; -#define KERNEL_TEX(type, ttype, name) \ - kg->name = name; -#include "../kernel_textures.h" - - int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - -#ifdef __WORK_STEALING__ - int lid = get_local_id(1) * get_local_size(0) + get_local_id(0); - /* Initialize work_pool_wgs */ - if(lid == 0) { - int group_index = get_group_id(1) * get_num_groups(0) + get_group_id(0); - work_pool_wgs[group_index] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); -#endif /* __WORK_STEALING__ */ +#endif - /* Initialize queue data and queue index. */ - if(thread_index < queuesize) { - /* Initialize active ray queue. */ - Queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; - /* Initialize background and buffer update queue. */ - Queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; - /* Initialize shadow ray cast of AO queue. */ - Queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; - /* Initialize shadow ray cast of direct lighting queue. */ - Queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; - } + kernel_split_params.tile.x = sx; + kernel_split_params.tile.y = sy; + kernel_split_params.tile.w = sw; + kernel_split_params.tile.h = sh; - if(thread_index == 0) { - Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; - Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; - Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; - Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; - /* The scene-intersect kernel should not use the queues very first time. - * since the queue would be empty. - */ - use_queues_flag[0] = 0; - } + kernel_split_params.tile.start_sample = start_sample; + kernel_split_params.tile.num_samples = num_samples; - int x = get_global_id(0); - int y = get_global_id(1); + kernel_split_params.tile.offset = offset; + kernel_split_params.tile.stride = stride; - if(x < (sw * parallel_samples) && y < sh) { - int ray_index = x + y * (sw * parallel_samples); + kernel_split_params.tile.buffer = buffer; - /* This is the first assignment to ray_state; - * So we dont use ASSIGN_RAY_STATE macro. - */ - ray_state[ray_index] = RAY_ACTIVE; - - unsigned int my_sample; - unsigned int pixel_x; - unsigned int pixel_y; - unsigned int tile_x; - unsigned int tile_y; - unsigned int my_sample_tile; - -#ifdef __WORK_STEALING__ - unsigned int my_work = 0; - /* Get work. */ - get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index); - /* Get the sample associated with the work. */ - my_sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; - - my_sample_tile = 0; - - /* Get pixel and tile position associated with the work. */ - get_pixel_tile_position(&pixel_x, &pixel_y, - &tile_x, &tile_y, - my_work, - sw, sh, sx, sy, - parallel_samples, - ray_index); - work_array[ray_index] = my_work; -#else /* __WORK_STEALING__ */ - unsigned int tile_index = ray_index / parallel_samples; - tile_x = tile_index % sw; - tile_y = tile_index / sw; - my_sample_tile = ray_index - (tile_index * parallel_samples); - my_sample = my_sample_tile + start_sample; - - /* Initialize work array. */ - work_array[ray_index] = my_sample ; - - /* Calculate pixel position of this ray. */ - pixel_x = sx + tile_x; - pixel_y = sy + tile_y; -#endif /* __WORK_STEALING__ */ - - rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride; - - /* Initialise per_sample_output_buffers to all zeros. */ - per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + (my_sample_tile)) * kernel_data.film.pass_stride; - int per_sample_output_buffers_iterator = 0; - for(per_sample_output_buffers_iterator = 0; - per_sample_output_buffers_iterator < kernel_data.film.pass_stride; - per_sample_output_buffers_iterator++) - { - per_sample_output_buffers[per_sample_output_buffers_iterator] = 0.0f; - } + kernel_split_params.total_work_size = sw * sh * num_samples; + + kernel_split_params.work_pools = work_pools; - /* Initialize random numbers and ray. */ - kernel_path_trace_setup(kg, - rng_state, - my_sample, - pixel_x, pixel_y, - &rng_coop[ray_index], - &Ray_coop[ray_index]); - - if(Ray_coop[ray_index].t != 0.0f) { - /* Initialize throughput, L_transparent, Ray, PathState; - * These rays proceed with path-iteration. - */ - throughput_coop[ray_index] = make_float3(1.0f, 1.0f, 1.0f); - L_transparent_coop[ray_index] = 0.0f; - path_radiance_init(&PathRadiance_coop[ray_index], kernel_data.film.use_light_pass); - path_state_init(kg, - kg->sd_input, - &PathState_coop[ray_index], - &rng_coop[ray_index], - my_sample, - &Ray_coop[ray_index]); -#ifdef __KERNEL_DEBUG__ - debug_data_init(&debugdata_coop[ray_index]); + kernel_split_params.queue_index = Queue_index; + kernel_split_params.queue_size = queuesize; + kernel_split_params.use_queues_flag = use_queues_flag; + + split_data_init(kg, &kernel_split_state, num_elements, split_data_buffer, ray_state); + +#ifdef __KERNEL_OPENCL__ + kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS); + kernel_set_buffer_info(kg); #endif - } - else { - /* These rays do not participate in path-iteration. */ - float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - /* Accumulate result in output buffer. */ - kernel_write_pass_float4(per_sample_output_buffers, my_sample, L_rad); - path_rng_end(kg, rng_state, rng_coop[ray_index]); - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); + + int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + + /* Initialize queue data and queue index. */ + if(thread_index < queuesize) { + for(int i = 0; i < NUM_QUEUES; i++) { + kernel_split_state.queue_data[i * queuesize + thread_index] = QUEUE_EMPTY_SLOT; } } - /* Mark rest of the ray-state indices as RAY_INACTIVE. */ - if(thread_index < (get_global_size(0) * get_global_size(1)) - (sh * (sw * parallel_samples))) { - /* First assignment, hence we dont use ASSIGN_RAY_STATE macro */ - ray_state[((sw * parallel_samples) * sh) + thread_index] = RAY_INACTIVE; + if(thread_index == 0) { + for(int i = 0; i < NUM_QUEUES; i++) { + Queue_index[i] = 0; + } + + /* The scene-intersect kernel should not use the queues very first time. + * since the queue would be empty. + */ + *use_queues_flag = 0; } +#endif /* KERENL_STUB */ } + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h index 82ca18829d3..ca79602c565 100644 --- a/intern/cycles/kernel/split/kernel_direct_lighting.h +++ b/intern/cycles/kernel/split/kernel_direct_lighting.h @@ -14,95 +14,145 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN -/* Note on kernel_direct_lighting kernel. - * This is the eighth kernel in the ray tracing logic. This is the seventh - * of the path iteration kernels. This kernel takes care of direct lighting - * logic. However, the "shadow ray cast" part of direct lighting is handled +/* This kernel takes care of direct lighting logic. + * However, the "shadow ray cast" part of direct lighting is handled * in the next kernel. * - * This kernels determines the rays for which a shadow_blocked() function associated with direct lighting should be executed. - * Those rays for which a shadow_blocked() function for direct-lighting must be executed, are marked with flag RAY_SHADOW_RAY_CAST_DL and - * enqueued into the queue QUEUE_SHADOW_RAY_CAST_DL_RAYS + * This kernels determines the rays for which a shadow_blocked() function + * associated with direct lighting should be executed. Those rays for which + * a shadow_blocked() function for direct-lighting must be executed, are + * marked with flag RAY_SHADOW_RAY_CAST_DL and enqueued into the queue + * QUEUE_SHADOW_RAY_CAST_DL_RAYS * - * The input and output are as follows, + * Note on Queues: + * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue + * and processes only the rays of state RAY_ACTIVE; If a ray needs to execute + * the corresponding shadow_blocked part, after direct lighting, the ray is + * marked with RAY_SHADOW_RAY_CAST_DL flag. * - * rng_coop -----------------------------------------|--- kernel_direct_lighting --|--- BSDFEval_coop - * PathState_coop -----------------------------------| |--- ISLamp_coop - * sd -----------------------------------------------| |--- LightRay_coop - * ray_state ----------------------------------------| |--- ray_state - * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---| | - * kg (globals) -------------------------------------| | - * queuesize ----------------------------------------| | - * - * Note on Queues : - * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes - * only the rays of state RAY_ACTIVE; If a ray needs to execute the corresponding shadow_blocked - * part, after direct lighting, the ray is marked with RAY_SHADOW_RAY_CAST_DL flag. - * - * State of queues when this kernel is called : - * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same - * before and after this kernel call. - * QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a shadow_blocked function must be executed, after this - * kernel call. Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty. + * State of queues when this kernel is called: + * - State of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same before and after this + * kernel call. + * - QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a + * shadow_blocked function must be executed, after this kernel call + * Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty. */ -ccl_device char kernel_direct_lighting( - KernelGlobals *kg, - ShaderData *sd, /* Required for direct lighting */ - ccl_global uint *rng_coop, /* Required for direct lighting */ - ccl_global PathState *PathState_coop, /* Required for direct lighting */ - ccl_global int *ISLamp_coop, /* Required for direct lighting */ - ccl_global Ray *LightRay_coop, /* Required for direct lighting */ - ccl_global BsdfEval *BSDFEval_coop, /* Required for direct lighting */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int ray_index) +ccl_device void kernel_direct_lighting(KernelGlobals *kg, + ccl_local_param unsigned int *local_queue_atomics) { + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + char enqueue_flag = 0; - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - ccl_global PathState *state = &PathState_coop[ray_index]; + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + ShaderData *sd = kernel_split_sd(sd, ray_index); /* direct lighting */ #ifdef __EMISSION__ - if((kernel_data.integrator.use_direct_light && - (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))) - { + bool flag = (kernel_data.integrator.use_direct_light && + (sd->flag & SD_BSDF_HAS_EVAL)); + +# ifdef __BRANCHED_PATH__ + if(flag && kernel_data.integrator.branched) { + flag = false; + enqueue_flag = 1; + } +# endif /* __BRANCHED_PATH__ */ + +# ifdef __SHADOW_TRICKS__ + if(flag && state->flag & PATH_RAY_SHADOW_CATCHER) { + flag = false; + enqueue_flag = 1; + } +# endif /* __SHADOW_TRICKS__ */ + + if(flag) { /* Sample illumination from lights to find path contribution. */ - ccl_global RNG* rng = &rng_coop[ray_index]; - float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); float light_u, light_v; - path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); - float terminate = path_state_rng_light_termination(kg, rng, state); + path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v); + float terminate = path_state_rng_light_termination(kg, state); LightSample ls; if(light_sample(kg, - light_t, light_u, light_v, - ccl_fetch(sd, time), - ccl_fetch(sd, P), + light_u, light_v, + sd->time, + sd->P, state->bounce, &ls)) { Ray light_ray; -#ifdef __OBJECT_MOTION__ - light_ray.time = ccl_fetch(sd, time); -#endif + light_ray.time = sd->time; BsdfEval L_light; bool is_lamp; - if(direct_emission(kg, sd, kg->sd_input, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { + if(direct_emission(kg, + sd, + AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]), + &ls, + state, + &light_ray, + &L_light, + &is_lamp, + terminate)) + { /* Write intermediate data to global memory to access from * the next kernel. */ - LightRay_coop[ray_index] = light_ray; - BSDFEval_coop[ray_index] = L_light; - ISLamp_coop[ray_index] = is_lamp; + kernel_split_state.light_ray[ray_index] = light_ray; + kernel_split_state.bsdf_eval[ray_index] = L_light; + kernel_split_state.is_lamp[ray_index] = is_lamp; /* Mark ray state for next shadow kernel. */ - ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); enqueue_flag = 1; } } } #endif /* __EMISSION__ */ } - return enqueue_flag; + +#ifdef __EMISSION__ + /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */ + enqueue_ray_index_local(ray_index, + QUEUE_SHADOW_RAY_CAST_DL_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); +#endif + +#ifdef __BRANCHED_PATH__ + /* Enqueue RAY_LIGHT_INDIRECT_NEXT_ITER rays + * this is the last kernel before next_iteration_setup that uses local atomics so we do this here + */ + ccl_barrier(CCL_LOCAL_MEM_FENCE); + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + enqueue_ray_index_local(ray_index, + QUEUE_LIGHT_INDIRECT_ITER, + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER), + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + +#endif /* __BRANCHED_PATH__ */ } + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h new file mode 100644 index 00000000000..fb5bd3d48dd --- /dev/null +++ b/intern/cycles/kernel/split/kernel_do_volume.h @@ -0,0 +1,220 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +#if defined(__BRANCHED_PATH__) && defined(__VOLUME__) + +ccl_device_inline void kernel_split_branched_path_volume_indirect_light_init(KernelGlobals *kg, int ray_index) +{ + kernel_split_branched_path_indirect_loop_init(kg, ray_index); + + ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT); +} + +ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(KernelGlobals *kg, int ray_index) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + ShaderData *sd = kernel_split_sd(sd, ray_index); + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]); + + /* GPU: no decoupled ray marching, scatter probalistically */ + int num_samples = kernel_data.integrator.volume_samples; + float num_samples_inv = 1.0f/num_samples; + + Ray volume_ray = branched_state->ray; + volume_ray.t = (!IS_STATE(&branched_state->ray_state, 0, RAY_HIT_BACKGROUND)) ? branched_state->isect.t : FLT_MAX; + + bool heterogeneous = volume_stack_is_heterogeneous(kg, branched_state->path_state.volume_stack); + + for(int j = branched_state->next_sample; j < num_samples; j++) { + ccl_global PathState *ps = &kernel_split_state.path_state[ray_index]; + *ps = branched_state->path_state; + + ccl_global Ray *pray = &kernel_split_state.ray[ray_index]; + *pray = branched_state->ray; + + ccl_global float3 *tp = &kernel_split_state.throughput[ray_index]; + *tp = branched_state->throughput * num_samples_inv; + + /* branch RNG state */ + path_state_branch(ps, j, num_samples); + + /* integrate along volume segment with distance sampling */ + VolumeIntegrateResult result = kernel_volume_integrate( + kg, ps, sd, &volume_ray, L, tp, heterogeneous); + +# ifdef __VOLUME_SCATTER__ + if(result == VOLUME_PATH_SCATTERED) { + /* direct lighting */ + kernel_path_volume_connect_light(kg, sd, emission_sd, *tp, &branched_state->path_state, L); + + /* indirect light bounce */ + if(!kernel_path_volume_bounce(kg, sd, tp, ps, &L->state, pray)) { + continue; + } + + /* start the indirect path */ + branched_state->next_closure = 0; + branched_state->next_sample = j+1; + + /* Attempting to share too many samples is slow for volumes as it causes us to + * loop here more and have many calls to kernel_volume_integrate which evaluates + * shaders. The many expensive shader evaluations cause the work load to become + * unbalanced and many threads to become idle in this kernel. Limiting the + * number of shared samples here helps quite a lot. + */ + if(branched_state->shared_sample_count < 2) { + if(kernel_split_branched_indirect_start_shared(kg, ray_index)) { + continue; + } + } + + return true; + } +# endif + } + + branched_state->next_sample = num_samples; + + branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0); + if(branched_state->waiting_on_shared_samples) { + return true; + } + + kernel_split_branched_path_indirect_loop_end(kg, ray_index); + + /* todo: avoid this calculation using decoupled ray marching */ + float3 throughput = kernel_split_state.throughput[ray_index]; + kernel_volume_shadow(kg, emission_sd, &kernel_split_state.path_state[ray_index], &volume_ray, &throughput); + kernel_split_state.throughput[ray_index] = throughput; + + return false; +} + +#endif /* __BRANCHED_PATH__ && __VOLUME__ */ + +ccl_device void kernel_do_volume(KernelGlobals *kg) +{ +#ifdef __VOLUME__ + /* We will empty this queue in this kernel. */ + if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { + kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; +# ifdef __BRANCHED_PATH__ + kernel_split_params.queue_index[QUEUE_VOLUME_INDIRECT_ITER] = 0; +# endif /* __BRANCHED_PATH__ */ + } + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + + if(*kernel_split_params.use_queues_flag) { + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + } + + ccl_global char *ray_state = kernel_split_state.ray_state; + + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) || + IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + ccl_global Intersection *isect = &kernel_split_state.isect[ray_index]; + ShaderData *sd = kernel_split_sd(sd, ray_index); + ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]); + + bool hit = ! IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND); + + /* Sanitize volume stack. */ + if(!hit) { + kernel_volume_clean_stack(kg, state->volume_stack); + } + /* volume attenuation, emission, scatter */ + if(state->volume_stack[0].shader != SHADER_NONE) { + Ray volume_ray = *ray; + volume_ray.t = (hit)? isect->t: FLT_MAX; + +# ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { +# endif /* __BRANCHED_PATH__ */ + bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack); + + { + /* integrate along volume segment with distance sampling */ + VolumeIntegrateResult result = kernel_volume_integrate( + kg, state, sd, &volume_ray, L, throughput, heterogeneous); + +# ifdef __VOLUME_SCATTER__ + if(result == VOLUME_PATH_SCATTERED) { + /* direct lighting */ + kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L); + + /* indirect light bounce */ + if(kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + else { + kernel_split_path_end(kg, ray_index); + } + } +# endif /* __VOLUME_SCATTER__ */ + } + +# ifdef __BRANCHED_PATH__ + } + else { + kernel_split_branched_path_volume_indirect_light_init(kg, ray_index); + + if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + } +# endif /* __BRANCHED_PATH__ */ + } + } + +# ifdef __BRANCHED_PATH__ + /* iter loop */ + ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0), + QUEUE_VOLUME_INDIRECT_ITER, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + + if(IS_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER)) { + /* for render passes, sum and reset indirect light pass variables + * for the next samples */ + path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]); + path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]); + + if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + } +# endif /* __BRANCHED_PATH__ */ + +#endif /* __VOLUME__ */ +} + + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_enqueue_inactive.h b/intern/cycles/kernel/split/kernel_enqueue_inactive.h new file mode 100644 index 00000000000..496355bbc3a --- /dev/null +++ b/intern/cycles/kernel/split/kernel_enqueue_inactive.h @@ -0,0 +1,46 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_enqueue_inactive(KernelGlobals *kg, + ccl_local_param unsigned int *local_queue_atomics) +{ +#ifdef __BRANCHED_PATH__ + /* Enqeueue RAY_INACTIVE rays into QUEUE_INACTIVE_RAYS queue. */ + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + + char enqueue_flag = 0; + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_INACTIVE)) { + enqueue_flag = 1; + } + + enqueue_ray_index_local(ray_index, + QUEUE_INACTIVE_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); +#endif /* __BRANCHED_PATH__ */ +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h index 5d951b972ed..88919f47c7a 100644 --- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h +++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h @@ -14,247 +14,161 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN -/* Note on kernel_holdout_emission_blurring_pathtermination_ao kernel. - * This is the sixth kernel in the ray tracing logic. This is the fifth - * of the path iteration kernels. This kernel takes care of the logic to process - * "material of type holdout", indirect primitive emission, bsdf blurring, - * probabilistic path termination and AO. +/* This kernel takes care of the logic to process "material of type holdout", + * indirect primitive emission, bsdf blurring, probabilistic path termination + * and AO. * - * This kernels determines the rays for which a shadow_blocked() function associated with AO should be executed. - * Those rays for which a shadow_blocked() function for AO must be executed are marked with flag RAY_SHADOW_RAY_CAST_ao and - * enqueued into the queue QUEUE_SHADOW_RAY_CAST_AO_RAYS + * This kernels determines the rays for which a shadow_blocked() function + * associated with AO should be executed. Those rays for which a + * shadow_blocked() function for AO must be executed are marked with flag + * RAY_SHADOW_RAY_CAST_ao and enqueued into the queue + * QUEUE_SHADOW_RAY_CAST_AO_RAYS * * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER * - * The input and output are as follows, + * Note on Queues: + * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS + * and processes only the rays of state RAY_ACTIVE. + * There are different points in this kernel where a ray may terminate and + * reach RAY_UPDATE_BUFFER state. These rays are enqueued into + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will still be present + * in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has + * been changed to RAY_UPDATE_BUFFER, there is no problem. * - * rng_coop ---------------------------------------------|--- kernel_holdout_emission_blurring_pathtermination_ao ---|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) - * throughput_coop --------------------------------------| |--- PathState_coop - * PathRadiance_coop ------------------------------------| |--- throughput_coop - * Intersection_coop ------------------------------------| |--- L_transparent_coop - * PathState_coop ---------------------------------------| |--- per_sample_output_buffers - * L_transparent_coop -----------------------------------| |--- PathRadiance_coop - * sd ---------------------------------------------------| |--- ShaderData - * ray_state --------------------------------------------| |--- ray_state - * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -------| |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) - * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| |--- AOAlpha_coop - * kg (globals) -----------------------------------------| |--- AOBSDF_coop - * parallel_samples -------------------------------------| |--- AOLightRay_coop - * per_sample_output_buffers ----------------------------| | - * sw ---------------------------------------------------| | - * sh ---------------------------------------------------| | - * sx ---------------------------------------------------| | - * sy ---------------------------------------------------| | - * stride -----------------------------------------------| | - * work_array -------------------------------------------| | - * queuesize --------------------------------------------| | - * start_sample -----------------------------------------| | - * - * Note on Queues : - * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only - * the rays of state RAY_ACTIVE. - * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFFER - * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will - * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been - * changed to RAY_UPDATE_BUFFER, there is no problem. - * - * State of queues when this kernel is called : + * State of queues when this kernel is called: * At entry, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays. - * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty. + * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and + * RAY_REGENERATED rays + * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with + * RAY_TO_REGENERATE rays. + * - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty. * At exit, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and RAY_UPDATE_BUFFER rays - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays - * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with flag RAY_SHADOW_RAY_CAST_AO + * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, + * RAY_REGENERATED and RAY_UPDATE_BUFFER rays. + * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with + * RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays. + * - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with + * flag RAY_SHADOW_RAY_CAST_AO */ + ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( KernelGlobals *kg, - ShaderData *sd, /* Required throughout the kernel except probabilistic path termination and AO */ - ccl_global float *per_sample_output_buffers, - ccl_global uint *rng_coop, /* Required for "kernel_write_data_passes" and AO */ - ccl_global float3 *throughput_coop, /* Required for handling holdout material and AO */ - ccl_global float *L_transparent_coop, /* Required for handling holdout material */ - PathRadiance *PathRadiance_coop, /* Required for "kernel_write_data_passes" and indirect primitive emission */ - ccl_global PathState *PathState_coop, /* Required throughout the kernel and AO */ - Intersection *Intersection_coop, /* Required for indirect primitive emission */ - ccl_global float3 *AOAlpha_coop, /* Required for AO */ - ccl_global float3 *AOBSDF_coop, /* Required for AO */ - ccl_global Ray *AOLightRay_coop, /* Required for AO */ - int sw, int sh, int sx, int sy, int stride, - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global unsigned int *work_array, /* Denotes the work that each ray belongs to */ -#ifdef __WORK_STEALING__ - unsigned int start_sample, -#endif - int parallel_samples, /* Number of samples to be processed in parallel */ - int ray_index, - char *enqueue_flag, - char *enqueue_flag_AO_SHADOW_RAY_CAST) + ccl_local_param BackgroundAOLocals *locals) { -#ifdef __WORK_STEALING__ - unsigned int my_work; - unsigned int pixel_x; - unsigned int pixel_y; + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + locals->queue_atomics_bg = 0; + locals->queue_atomics_ao = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + +#ifdef __AO__ + char enqueue_flag = 0; +#endif + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not + * required. + * + * If we are executing on a CPU device, then we need to keep all threads + * active since we have barrier() calls later in the kernel. CPU devices, + * expect all threads to execute barrier statement. + */ + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } +#endif /* __COMPUTE_DEVICE_GPU__ */ + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { #endif - unsigned int tile_x; - unsigned int tile_y; - int my_sample_tile; - unsigned int sample; - ccl_global RNG *rng = 0x0; ccl_global PathState *state = 0x0; float3 throughput; - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - - throughput = throughput_coop[ray_index]; - state = &PathState_coop[ray_index]; - rng = &rng_coop[ray_index]; -#ifdef __WORK_STEALING__ - my_work = work_array[ray_index]; - sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; - get_pixel_tile_position(&pixel_x, &pixel_y, - &tile_x, &tile_y, - my_work, - sw, sh, sx, sy, - parallel_samples, - ray_index); - my_sample_tile = 0; -#else /* __WORK_STEALING__ */ - sample = work_array[ray_index]; - /* Buffer's stride is "stride"; Find x and y using ray_index. */ - int tile_index = ray_index / parallel_samples; - tile_x = tile_index % sw; - tile_y = tile_index / sw; - my_sample_tile = ray_index - (tile_index * parallel_samples); -#endif /* __WORK_STEALING__ */ - per_sample_output_buffers += - (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * - kernel_data.film.pass_stride; + ccl_global char *ray_state = kernel_split_state.ray_state; + ShaderData *sd = kernel_split_sd(sd, ray_index); - /* holdout */ -#ifdef __HOLDOUT__ - if(((ccl_fetch(sd, flag) & SD_HOLDOUT) || - (ccl_fetch(sd, object_flag) & SD_OBJECT_HOLDOUT_MASK)) && - (state->flag & PATH_RAY_CAMERA)) + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + uint buffer_offset = kernel_split_state.buffer_offset[ray_index]; + ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset; + + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]); + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + + throughput = kernel_split_state.throughput[ray_index]; + state = &kernel_split_state.path_state[ray_index]; + + if(!kernel_path_shader_apply(kg, + sd, + state, + ray, + throughput, + emission_sd, + L, + buffer)) { - if(kernel_data.background.transparent) { - float3 holdout_weight; - if(ccl_fetch(sd, object_flag) & SD_OBJECT_HOLDOUT_MASK) { - holdout_weight = make_float3(1.0f, 1.0f, 1.0f); - } - else { - holdout_weight = shader_holdout_eval(kg, sd); - } - /* any throughput is ok, should all be identical here */ - L_transparent_coop[ray_index] += average(holdout_weight*throughput); - } - if(ccl_fetch(sd, object_flag) & SD_OBJECT_HOLDOUT_MASK) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - *enqueue_flag = 1; - } + kernel_split_path_end(kg, ray_index); } -#endif /* __HOLDOUT__ */ } if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - PathRadiance *L = &PathRadiance_coop[ray_index]; - /* Holdout mask objects do not write data passes. */ - kernel_write_data_passes(kg, - per_sample_output_buffers, - L, - sd, - sample, - state, - throughput); - /* Blurring of bsdf after bounces, for rays that have a small likelihood - * of following this particular path (diffuse, rough glossy. - */ - if(kernel_data.integrator.filter_glossy != FLT_MAX) { - float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf; - if(blur_pdf < 1.0f) { - float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f; - shader_bsdf_blur(kg, sd, blur_roughness); - } - } - -#ifdef __EMISSION__ - /* emission */ - if(ccl_fetch(sd, flag) & SD_EMISSION) { - /* TODO(sergey): is isect.t wrong here for transparent surfaces? */ - float3 emission = indirect_primitive_emission( - kg, - sd, - Intersection_coop[ray_index].t, - state->flag, - state->ray_pdf); - path_radiance_accum_emission(L, throughput, emission, state->bounce); - } -#endif /* __EMISSION__ */ - /* Path termination. this is a strange place to put the termination, it's * mainly due to the mixed in MIS that we use. gives too many unneeded * shader evaluations, only need emission if we are going to terminate. */ - float probability = path_state_terminate_probability(kg, state, throughput); + float probability = path_state_continuation_probability(kg, state, throughput); if(probability == 0.0f) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - *enqueue_flag = 1; + kernel_split_path_end(kg, ray_index); + } + else if(probability < 1.0f) { + float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE); + if(terminate >= probability) { + kernel_split_path_end(kg, ray_index); + } + else { + kernel_split_state.throughput[ray_index] = throughput/probability; + } } if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - if(probability != 1.0f) { - float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE); - if(terminate >= probability) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - *enqueue_flag = 1; - } - else { - throughput_coop[ray_index] = throughput/probability; - } - } + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + kernel_update_denoising_features(kg, sd, state, L); } } #ifdef __AO__ if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { /* ambient occlusion */ - if(kernel_data.integrator.use_ambient_occlusion || - (ccl_fetch(sd, flag) & SD_AO)) - { - /* todo: solve correlation */ - float bsdf_u, bsdf_v; - path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); - - float ao_factor = kernel_data.background.ao_factor; - float3 ao_N; - AOBSDF_coop[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); - AOAlpha_coop[ray_index] = shader_bsdf_alpha(kg, sd); - - float3 ao_D; - float ao_pdf; - sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - - if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) { - Ray _ray; - _ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng)); - _ray.D = ao_D; - _ray.t = kernel_data.background.ao_distance; -#ifdef __OBJECT_MOTION__ - _ray.time = ccl_fetch(sd, time); -#endif - _ray.dP = ccl_fetch(sd, dP); - _ray.dD = differential3_zero(); - AOLightRay_coop[ray_index] = _ray; - - ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO); - *enqueue_flag_AO_SHADOW_RAY_CAST = 1; - } + if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) { + enqueue_flag = 1; } } #endif /* __AO__ */ + +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + +#ifdef __AO__ + /* Enqueue to-shadow-ray-cast rays. */ + enqueue_ray_index_local(ray_index, + QUEUE_SHADOW_RAY_CAST_AO_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + &locals->queue_atomics_ao, + kernel_split_state.queue_data, + kernel_split_params.queue_index); +#endif } + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_indirect_background.h b/intern/cycles/kernel/split/kernel_indirect_background.h new file mode 100644 index 00000000000..4cf88a02590 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_indirect_background.h @@ -0,0 +1,65 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_indirect_background(KernelGlobals *kg) +{ + ccl_global char *ray_state = kernel_split_state.ray_state; + + int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + int ray_index; + + if(kernel_data.integrator.ao_bounces != INT_MAX) { + ray_index = get_ray_index(kg, thread_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + + if(ray_index != QUEUE_EMPTY_SLOT) { + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + if(path_state_ao_bounce(kg, state)) { + kernel_split_path_end(kg, ray_index); + } + } + } + } + + ray_index = get_ray_index(kg, thread_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + + if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + float3 throughput = kernel_split_state.throughput[ray_index]; + ShaderData *sd = kernel_split_sd(sd, ray_index); + + kernel_path_background(kg, state, ray, throughput, sd, L); + kernel_split_path_end(kg, ray_index); + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_indirect_subsurface.h b/intern/cycles/kernel/split/kernel_indirect_subsurface.h new file mode 100644 index 00000000000..236c94e983c --- /dev/null +++ b/intern/cycles/kernel/split/kernel_indirect_subsurface.h @@ -0,0 +1,70 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_indirect_subsurface(KernelGlobals *kg) +{ + int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(thread_index == 0) { + /* We will empty both queues in this kernel. */ + kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; + kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; + } + + int ray_index; + get_ray_index(kg, thread_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + ray_index = get_ray_index(kg, thread_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + +#ifdef __SUBSURFACE__ + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + + ccl_global char *ray_state = kernel_split_state.ray_state; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + + if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { + ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; + + /* Trace indirect subsurface rays by restarting the loop. this uses less + * stack memory than invoking kernel_path_indirect. + */ + if(ss_indirect->num_rays) { + kernel_path_subsurface_setup_indirect(kg, + ss_indirect, + state, + ray, + L, + throughput); + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + } +#endif /* __SUBSURFACE__ */ +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h index 3bd0e361078..c14f66f664f 100644 --- a/intern/cycles/kernel/split/kernel_lamp_emission.h +++ b/intern/cycles/kernel/split/kernel_lamp_emission.h @@ -14,70 +14,55 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN -/* Note on kernel_lamp_emission - * This is the 3rd kernel in the ray-tracing logic. This is the second of the - * path-iteration kernels. This kernel takes care of the indirect lamp emission logic. - * This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS. It processes rays of state RAY_ACTIVE - * and RAY_HIT_BACKGROUND. +/* This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS. + * It processes rays of state RAY_ACTIVE and RAY_HIT_BACKGROUND. * We will empty QUEUE_ACTIVE_AND_REGENERATED_RAYS queue in this kernel. - * The input/output of the kernel is as follows, - * Throughput_coop ------------------------------------|--- kernel_lamp_emission --|--- PathRadiance_coop - * Ray_coop -------------------------------------------| |--- Queue_data(QUEUE_ACTIVE_AND_REGENERATED_RAYS) - * PathState_coop -------------------------------------| |--- Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) - * kg (globals) ---------------------------------------| | - * Intersection_coop ----------------------------------| | - * ray_state ------------------------------------------| | - * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -----| | - * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ----| | - * queuesize ------------------------------------------| | - * use_queues_flag ------------------------------------| | - * sw -------------------------------------------------| | - * sh -------------------------------------------------| | */ -ccl_device void kernel_lamp_emission( - KernelGlobals *kg, - ccl_global float3 *throughput_coop, /* Required for lamp emission */ - PathRadiance *PathRadiance_coop, /* Required for lamp emission */ - ccl_global Ray *Ray_coop, /* Required for lamp emission */ - ccl_global PathState *PathState_coop, /* Required for lamp emission */ - Intersection *Intersection_coop, /* Required for lamp emission */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int sw, int sh, - ccl_global char *use_queues_flag, /* Used to decide if this kernel should use - * queues to fetch ray index - */ - int ray_index) +ccl_device void kernel_lamp_emission(KernelGlobals *kg) { - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) || - IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) - { - PathRadiance *L = &PathRadiance_coop[ray_index]; - ccl_global PathState *state = &PathState_coop[ray_index]; +#ifndef __VOLUME__ + /* We will empty this queue in this kernel. */ + if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { + kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; + } +#endif + /* Fetch use_queues_flag. */ + char local_use_queues_flag = *kernel_split_params.use_queues_flag; + ccl_barrier(CCL_LOCAL_MEM_FENCE); - float3 throughput = throughput_coop[ray_index]; - Ray ray = Ray_coop[ray_index]; + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(local_use_queues_flag) { + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, +#ifndef __VOLUME__ + 1 +#else + 0 +#endif + ); + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + } -#ifdef __LAMP_MIS__ - if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) { - /* ray starting from previous non-transparent bounce */ - Ray light_ray; + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) || + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND)) + { + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - light_ray.P = ray.P - state->ray_t*ray.D; - state->ray_t += Intersection_coop[ray_index].t; - light_ray.D = ray.D; - light_ray.t = state->ray_t; - light_ray.time = ray.time; - light_ray.dD = ray.dD; - light_ray.dP = ray.dP; - /* intersect with lamp */ - float3 emission; + float3 throughput = kernel_split_state.throughput[ray_index]; + Ray ray = kernel_split_state.ray[ray_index]; + ccl_global Intersection *isect = &kernel_split_state.isect[ray_index]; + ShaderData *sd = kernel_split_sd(sd, ray_index); - if(indirect_lamp_emission(kg, kg->sd_input, state, &light_ray, &emission)) { - path_radiance_accum_emission(L, throughput, emission, state->bounce); - } - } -#endif /* __LAMP_MIS__ */ + kernel_path_lamp_emission(kg, state, &ray, throughput, isect, sd, L); } } + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h index 816f3a6fbff..e388955f1af 100644 --- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h +++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h @@ -14,128 +14,251 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN -/* Note on kernel_setup_next_iteration kernel. - * This is the tenth kernel in the ray tracing logic. This is the ninth - * of the path iteration kernels. This kernel takes care of setting up - * Ray for the next iteration of path-iteration and accumulating radiance - * corresponding to AO and direct-lighting +/*This kernel takes care of setting up ray for the next iteration of + * path-iteration and accumulating radiance corresponding to AO and + * direct-lighting * - * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER + * Ray state of rays that are terminated in this kernel are changed + * to RAY_UPDATE_BUFFER. * - * The input and output are as follows, + * Note on queues: + * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS + * and processes only the rays of state RAY_ACTIVE. + * There are different points in this kernel where a ray may terminate and + * reach RAY_UPDATE_BUFF state. These rays are enqueued into + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will still be present + * in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has + * been changed to RAY_UPDATE_BUFF, there is no problem. * - * rng_coop ---------------------------------------------|--- kernel_next_iteration_setup -|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) - * throughput_coop --------------------------------------| |--- Queue_data (QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS) - * PathRadiance_coop ------------------------------------| |--- throughput_coop - * PathState_coop ---------------------------------------| |--- PathRadiance_coop - * sd ---------------------------------------------------| |--- PathState_coop - * ray_state --------------------------------------------| |--- ray_state - * Queue_data (QUEUE_ACTIVE_AND_REGENERATD_RAYS) --------| |--- Ray_coop - * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| |--- use_queues_flag - * Ray_coop ---------------------------------------------| | - * kg (globals) -----------------------------------------| | - * LightRay_dl_coop -------------------------------------| - * ISLamp_coop ------------------------------------------| - * BSDFEval_coop ----------------------------------------| - * LightRay_ao_coop -------------------------------------| - * AOBSDF_coop ------------------------------------------| - * AOAlpha_coop -----------------------------------------| - * - * Note on queues, - * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only - * the rays of state RAY_ACTIVE. - * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFF - * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will - * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been - * changed to RAY_UPDATE_BUFF, there is no problem. - * - * State of queues when this kernel is called : + * State of queues when this kernel is called: * At entry, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED, RAY_UPDATE_BUFFER rays. - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays + * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, + * RAY_REGENERATED, RAY_UPDATE_BUFFER rays. + * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with + * RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays. * At exit, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and more RAY_UPDATE_BUFFER rays. - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays + * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, + * RAY_REGENERATED and more RAY_UPDATE_BUFFER rays. + * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with + * RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays. */ -ccl_device char kernel_next_iteration_setup( - KernelGlobals *kg, - ShaderData *sd, /* Required for setting up ray for next iteration */ - ccl_global uint *rng_coop, /* Required for setting up ray for next iteration */ - ccl_global float3 *throughput_coop, /* Required for setting up ray for next iteration */ - PathRadiance *PathRadiance_coop, /* Required for setting up ray for next iteration */ - ccl_global Ray *Ray_coop, /* Required for setting up ray for next iteration */ - ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */ - ccl_global Ray *LightRay_dl_coop, /* Required for radiance update - direct lighting */ - ccl_global int *ISLamp_coop, /* Required for radiance update - direct lighting */ - ccl_global BsdfEval *BSDFEval_coop, /* Required for radiance update - direct lighting */ - ccl_global Ray *LightRay_ao_coop, /* Required for radiance update - AO */ - ccl_global float3 *AOBSDF_coop, /* Required for radiance update - AO */ - ccl_global float3 *AOAlpha_coop, /* Required for radiance update - AO */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global char *use_queues_flag, /* flag to decide if scene_intersect kernel should - * use queues to fetch ray index */ - int ray_index) + +#ifdef __BRANCHED_PATH__ +ccl_device_inline void kernel_split_branched_indirect_light_init(KernelGlobals *kg, int ray_index) +{ + kernel_split_branched_path_indirect_loop_init(kg, ray_index); + + ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT); +} + +ccl_device void kernel_split_branched_transparent_bounce(KernelGlobals *kg, int ray_index) { - char enqueue_flag = 0; - - /* Load ShaderData structure. */ - PathRadiance *L = NULL; - ccl_global PathState *state = NULL; - - /* Path radiance update for AO/Direct_lighting's shadow blocked. */ - if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || - IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) - { - state = &PathState_coop[ray_index]; - L = &PathRadiance_coop[ray_index]; - float3 _throughput = throughput_coop[ray_index]; - - if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { - float3 shadow = LightRay_ao_coop[ray_index].P; - char update_path_radiance = LightRay_ao_coop[ray_index].t; - if(update_path_radiance) { - path_radiance_accum_ao(L, - _throughput, - AOAlpha_coop[ray_index], - AOBSDF_coop[ray_index], - shadow, - state->bounce); + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ShaderData *sd = kernel_split_sd(sd, ray_index); + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + +# ifdef __VOLUME__ + if(!(sd->flag & SD_HAS_ONLY_VOLUME)) { +# endif + /* continue in case of transparency */ + *throughput *= shader_bsdf_transparency(kg, sd); + + if(is_zero(*throughput)) { + kernel_split_path_end(kg, ray_index); + return; + } + + /* Update Path State */ + path_state_next(kg, state, LABEL_TRANSPARENT); +# ifdef __VOLUME__ + } + else { + if(!path_state_volume_next(kg, state)) { + kernel_split_path_end(kg, ray_index); + return; + } + } +# endif + + ray->P = ray_offset(sd->P, -sd->Ng); + ray->t -= sd->ray_length; /* clipping works through transparent */ + +# ifdef __RAY_DIFFERENTIALS__ + ray->dP = sd->dP; + ray->dD.dx = -sd->dI.dx; + ray->dD.dy = -sd->dI.dy; +# endif /* __RAY_DIFFERENTIALS__ */ + +# ifdef __VOLUME__ + /* enter/exit volume */ + kernel_volume_stack_enter_exit(kg, sd, state->volume_stack); +# endif /* __VOLUME__ */ +} +#endif /* __BRANCHED_PATH__ */ + +ccl_device void kernel_next_iteration_setup(KernelGlobals *kg, + ccl_local_param unsigned int *local_queue_atomics) +{ + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { + /* If we are here, then it means that scene-intersect kernel + * has already been executed atleast once. From the next time, + * scene-intersect kernel may operate on queues to fetch ray index + */ + *kernel_split_params.use_queues_flag = 1; + + /* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and + * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the + * previous kernel. + */ + kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; + kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; + } + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + + ccl_global char *ray_state = kernel_split_state.ray_state; + +# ifdef __VOLUME__ + /* Reactivate only volume rays here, most surface work was skipped. */ + if(IS_STATE(ray_state, ray_index, RAY_HAS_ONLY_VOLUME)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE); + } +# endif + + bool active = IS_STATE(ray_state, ray_index, RAY_ACTIVE); + if(active) { + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + ShaderData *sd = kernel_split_sd(sd, ray_index); + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + +#ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { +#endif + /* Compute direct lighting and next bounce. */ + if(!kernel_path_surface_bounce(kg, sd, throughput, state, &L->state, ray)) { + kernel_split_path_end(kg, ray_index); } - REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO); +#ifdef __BRANCHED_PATH__ + } + else if(sd->flag & SD_HAS_ONLY_VOLUME) { + kernel_split_branched_transparent_bounce(kg, ray_index); } + else { + kernel_split_branched_indirect_light_init(kg, ray_index); - if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) { - float3 shadow = LightRay_dl_coop[ray_index].P; - char update_path_radiance = LightRay_dl_coop[ray_index].t; - if(update_path_radiance) { - BsdfEval L_light = BSDFEval_coop[ray_index]; - path_radiance_accum_light(L, - _throughput, - &L_light, - shadow, - 1.0f, - state->bounce, - ISLamp_coop[ray_index]); + if(kernel_split_branched_path_surface_indirect_light_iter(kg, + ray_index, + 1.0f, + kernel_split_sd(branched_state_sd, ray_index), + true, + true)) + { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + else { + kernel_split_branched_path_indirect_loop_end(kg, ray_index); + kernel_split_branched_transparent_bounce(kg, ray_index); } - REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); } +#endif /* __BRANCHED_PATH__ */ + } + + /* Enqueue RAY_UPDATE_BUFFER rays. */ + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER) && active, + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + +#ifdef __BRANCHED_PATH__ + /* iter loop */ + if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { + kernel_split_params.queue_index[QUEUE_LIGHT_INDIRECT_ITER] = 0; } - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - ccl_global float3 *throughput = &throughput_coop[ray_index]; - ccl_global Ray *ray = &Ray_coop[ray_index]; - ccl_global RNG *rng = &rng_coop[ray_index]; - state = &PathState_coop[ray_index]; - L = &PathRadiance_coop[ray_index]; - - /* Compute direct lighting and next bounce. */ - if(!kernel_path_surface_bounce(kg, rng, sd, throughput, state, L, ray)) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - enqueue_flag = 1; + ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0), + QUEUE_LIGHT_INDIRECT_ITER, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + + if(IS_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER)) { + /* for render passes, sum and reset indirect light pass variables + * for the next samples */ + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + + path_radiance_sum_indirect(L); + path_radiance_reset_indirect(L); + + if(kernel_split_branched_path_surface_indirect_light_iter(kg, + ray_index, + 1.0f, + kernel_split_sd(branched_state_sd, ray_index), + true, + true)) + { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); } + else { + kernel_split_branched_path_indirect_loop_end(kg, ray_index); + kernel_split_branched_transparent_bounce(kg, ray_index); + } + } + +# ifdef __VOLUME__ + /* Enqueue RAY_VOLUME_INDIRECT_NEXT_ITER rays */ + ccl_barrier(CCL_LOCAL_MEM_FENCE); + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; } + ccl_barrier(CCL_LOCAL_MEM_FENCE); - return enqueue_flag; + ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + enqueue_ray_index_local(ray_index, + QUEUE_VOLUME_INDIRECT_ITER, + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER), + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + +# endif /* __VOLUME__ */ + +# ifdef __SUBSURFACE__ + /* Enqueue RAY_SUBSURFACE_INDIRECT_NEXT_ITER rays */ + ccl_barrier(CCL_LOCAL_MEM_FENCE); + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + enqueue_ray_index_local(ray_index, + QUEUE_SUBSURFACE_INDIRECT_ITER, + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER), + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); +# endif /* __SUBSURFACE__ */ +#endif /* __BRANCHED_PATH__ */ } + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h new file mode 100644 index 00000000000..fdd54225b07 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_path_init.h @@ -0,0 +1,81 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* This kernel initializes structures needed in path-iteration kernels. + * This is the first kernel in ray-tracing logic. + * + * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE + */ +ccl_device void kernel_path_init(KernelGlobals *kg) { + int ray_index = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0); + + /* This is the first assignment to ray_state; + * So we dont use ASSIGN_RAY_STATE macro. + */ + kernel_split_state.ray_state[ray_index] = RAY_ACTIVE; + + /* Get work. */ + ccl_global uint *work_pools = kernel_split_params.work_pools; + uint total_work_size = kernel_split_params.total_work_size; + uint work_index; + + if(!get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) { + /* No more work, mark ray as inactive */ + kernel_split_state.ray_state[ray_index] = RAY_INACTIVE; + + return; + } + + ccl_global WorkTile *tile = &kernel_split_params.tile; + uint x, y, sample; + get_work_pixel(tile, work_index, &x, &y, &sample); + + /* Store buffer offset for writing to passes. */ + uint buffer_offset = (tile->offset + x + y*tile->stride) * kernel_data.film.pass_stride; + kernel_split_state.buffer_offset[ray_index] = buffer_offset; + + /* Initialize random numbers and ray. */ + uint rng_hash; + kernel_path_trace_setup(kg, + sample, + x, y, + &rng_hash, + &kernel_split_state.ray[ray_index]); + + if(kernel_split_state.ray[ray_index].t != 0.0f) { + /* Initialize throughput, path radiance, Ray, PathState; + * These rays proceed with path-iteration. + */ + kernel_split_state.throughput[ray_index] = make_float3(1.0f, 1.0f, 1.0f); + path_radiance_init(&kernel_split_state.path_radiance[ray_index], kernel_data.film.use_light_pass); + path_state_init(kg, + AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]), + &kernel_split_state.path_state[ray_index], + rng_hash, + sample, + &kernel_split_state.ray[ray_index]); +#ifdef __SUBSURFACE__ + kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]); +#endif + } + else { + ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE); + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h new file mode 100644 index 00000000000..df67fabab19 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_queue_enqueue.h @@ -0,0 +1,92 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* This kernel enqueues rays of different ray state into their + * appropriate queues: + * + * 1. Rays that have been determined to hit the background from the + * "kernel_scene_intersect" kernel are enqueued in + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; + * 2. Rays that have been determined to be actively participating in pat + * -iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS. + * + * State of queue during other times this kernel is called: + * At entry, + * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty. + * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE + * and RAY_UPDATE_BUFFER rays. + * At exit, + * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays. + * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with + * RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays. + */ +ccl_device void kernel_queue_enqueue(KernelGlobals *kg, + ccl_local_param QueueEnqueueLocals *locals) +{ + /* We have only 2 cases (Hit/Not-Hit) */ + int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0); + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + + if(lidx == 0) { + locals->queue_atomics[0] = 0; + locals->queue_atomics[1] = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int queue_number = -1; + + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND) || + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER) || + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) { + queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; + } + else if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) || + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HAS_ONLY_VOLUME) || + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) { + queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS; + } + + unsigned int my_lqidx; + if(queue_number != -1) { + my_lqidx = get_local_queue_index(queue_number, locals->queue_atomics); + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + if(lidx == 0) { + locals->queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = + get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS, + locals->queue_atomics, + kernel_split_params.queue_index); + locals->queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = + get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + locals->queue_atomics, + kernel_split_params.queue_index); + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + unsigned int my_gqidx; + if(queue_number != -1) { + my_gqidx = get_global_queue_index(queue_number, + kernel_split_params.queue_size, + my_lqidx, + locals->queue_atomics); + kernel_split_state.queue_data[my_gqidx] = ray_index; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h index 2388580051f..f5378bc172b 100644 --- a/intern/cycles/kernel/split/kernel_scene_intersect.h +++ b/intern/cycles/kernel/split/kernel_scene_intersect.h @@ -14,120 +14,66 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN -/* Note on kernel_scene_intersect kernel. - * This is the second kernel in the ray tracing logic. This is the first - * of the path iteration kernels. This kernel takes care of scene_intersect function. +/* This kernel takes care of scene_intersect function. * * This kernel changes the ray_state of RAY_REGENERATED rays to RAY_ACTIVE. * This kernel processes rays of ray state RAY_ACTIVE - * This kernel determines the rays that have hit the background and changes their ray state to RAY_HIT_BACKGROUND. - * - * The input and output are as follows, - * - * Ray_coop ---------------------------------------|--------- kernel_scene_intersect----------|--- PathState - * PathState_coop ---------------------------------| |--- Intersection - * ray_state --------------------------------------| |--- ray_state - * use_queues_flag --------------------------------| | - * QueueData(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---| | - * kg (globals) -----------------------------------| | - * rng_coop ---------------------------------------| | - * sw ---------------------------------------------| | - * sh ---------------------------------------------| | - * queuesize --------------------------------------| | - * - * Note on Queues : - * Ideally we would want kernel_scene_intersect to work on queues. - * But during the very first time, the queues will be empty and hence we perform a direct mapping - * between ray-index and thread-index; From the next time onward, the queue will be filled and - * we may start operating on queues. - * - * State of queue during the first time this kernel is called : - * QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.before and after this kernel - * - * State of queues during other times this kernel is called : - * At entry, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will have a mix of RAY_ACTIVE, RAY_UPDATE_BUFFER and RAY_REGENERATED rays; - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays ; - * (The rays that are in the state RAY_UPDATE_BUFFER in both the queues are actually the same rays; These - * are the rays that were in RAY_ACTIVE state during the initial enqueue but on further processing - * , by different kernels, have turned into RAY_UPDATE_BUFFER rays. Since all kernel, even after fetching from - * QUEUE_ACTIVE_AND_REGENERATED_RAYS, proceed further based on ray state information, RAY_UPDATE_BUFFER rays - * being present in QUEUE_ACTIVE_AND_REGENERATED_RAYS does not cause any logical issues) - * At exit, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS - All RAY_REGENERATED rays will have been converted to RAY_ACTIVE and - * Some rays in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue will move to state RAY_HIT_BACKGROUND - * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS - no change + * This kernel determines the rays that have hit the background and changes + * their ray state to RAY_HIT_BACKGROUND. */ - -ccl_device void kernel_scene_intersect( - KernelGlobals *kg, - ccl_global uint *rng_coop, - ccl_global Ray *Ray_coop, /* Required for scene_intersect */ - ccl_global PathState *PathState_coop, /* Required for scene_intersect */ - Intersection *Intersection_coop, /* Required for scene_intersect */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int sw, int sh, - ccl_global char *use_queues_flag, /* used to decide if this kernel should use - * queues to fetch ray index */ -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int ray_index) +ccl_device void kernel_scene_intersect(KernelGlobals *kg) { - /* All regenerated rays become active here */ - if(IS_STATE(ray_state, ray_index, RAY_REGENERATED)) - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE); - - if(!IS_STATE(ray_state, ray_index, RAY_ACTIVE)) - return; + /* Fetch use_queues_flag */ + char local_use_queues_flag = *kernel_split_params.use_queues_flag; + ccl_barrier(CCL_LOCAL_MEM_FENCE); -#ifdef __KERNEL_DEBUG__ - DebugData *debug_data = &debugdata_coop[ray_index]; -#endif - Intersection *isect = &Intersection_coop[ray_index]; - PathState state = PathState_coop[ray_index]; - Ray ray = Ray_coop[ray_index]; + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(local_use_queues_flag) { + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); - /* intersect scene */ - uint visibility = path_state_ray_visibility(kg, &state); - -#ifdef __HAIR__ - float difl = 0.0f, extmax = 0.0f; - uint lcg_state = 0; - RNG rng = rng_coop[ray_index]; + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + } - if(kernel_data.bvh.have_curves) { - if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) { - float3 pixdiff = ray.dD.dx + ray.dD.dy; - /*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/ - difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f; + /* All regenerated rays become active here */ + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) { +#ifdef __BRANCHED_PATH__ + if(kernel_split_state.branched_state[ray_index].waiting_on_shared_samples) { + kernel_split_path_end(kg, ray_index); + } + else +#endif /* __BRANCHED_PATH__ */ + { + ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE); } + } - extmax = kernel_data.curve.maximum_width; - lcg_state = lcg_state_init(&rng, &state, 0x51633e2d); + if(!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { + return; } - bool hit = scene_intersect(kg, ray, visibility, isect, &lcg_state, difl, extmax); -#else - bool hit = scene_intersect(kg, ray, visibility, isect, NULL, 0.0f, 0.0f); -#endif + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + Ray ray = kernel_split_state.ray[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; -#ifdef __KERNEL_DEBUG__ - if(state.flag & PATH_RAY_CAMERA) { - debug_data->num_bvh_traversed_nodes += isect->num_traversed_nodes; - debug_data->num_bvh_traversed_instances += isect->num_traversed_instances; - debug_data->num_bvh_intersections += isect->num_intersections; - } - debug_data->num_ray_bounces++; -#endif + Intersection isect; + bool hit = kernel_path_scene_intersect(kg, state, &ray, &isect, L); + kernel_split_state.isect[ray_index] = isect; if(!hit) { /* Change the state of rays that hit the background; * These rays undergo special processing in the * background_bufferUpdate kernel. */ - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND); + ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND); } } + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h index cef64bf5f36..2bc2d300699 100644 --- a/intern/cycles/kernel/split/kernel_shader_eval.h +++ b/intern/cycles/kernel/split/kernel_shader_eval.h @@ -1,5 +1,5 @@ /* - * Copyright 2011-2015 Blender Foundation + * Copyright 2011-2017 Blender Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,57 +14,53 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN -/* Note on kernel_shader_eval kernel - * This kernel is the 5th kernel in the ray tracing logic. This is - * the 4rd kernel in path iteration. This kernel sets up the ShaderData - * structure from the values computed by the previous kernels. It also identifies - * the rays of state RAY_TO_REGENERATE and enqueues them in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. - * - * The input and output of the kernel is as follows, - * rng_coop -------------------------------------------|--- kernel_shader_eval --|--- sd - * Ray_coop -------------------------------------------| |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) - * PathState_coop -------------------------------------| |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) - * Intersection_coop ----------------------------------| | - * Queue_data (QUEUE_ACTIVE_AND_REGENERATD_RAYS)-------| | - * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)---| | - * ray_state ------------------------------------------| | - * kg (globals) ---------------------------------------| | - * queuesize ------------------------------------------| | - * - * Note on Queues : - * This kernel reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes - * only the rays of state RAY_ACTIVE; - * State of queues when this kernel is called, - * at entry, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty. - * at exit, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays +/* This kernel evaluates ShaderData structure from the values computed + * by the previous kernels. */ -ccl_device void kernel_shader_eval( - KernelGlobals *kg, - ShaderData *sd, /* Output ShaderData structure to be filled */ - ccl_global uint *rng_coop, /* Required for rbsdf calculation */ - ccl_global Ray *Ray_coop, /* Required for setting up shader from ray */ - ccl_global PathState *PathState_coop, /* Required for all functions in this kernel */ - Intersection *Intersection_coop, /* Required for setting up shader from ray */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int ray_index) +ccl_device void kernel_shader_eval(KernelGlobals *kg) { + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + /* Sorting on cuda split is not implemented */ +#ifdef __KERNEL_CUDA__ + int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS]; +#else + int queue_index = kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS]; +#endif + if(ray_index >= queue_index) { + return; + } + ray_index = get_ray_index(kg, ray_index, +#ifdef __KERNEL_CUDA__ + QUEUE_ACTIVE_AND_REGENERATED_RAYS, +#else + QUEUE_SHADER_SORTED_RAYS, +#endif + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + + ccl_global char *ray_state = kernel_split_state.ray_state; if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - Intersection *isect = &Intersection_coop[ray_index]; - ccl_global uint *rng = &rng_coop[ray_index]; - ccl_global PathState *state = &PathState_coop[ray_index]; - Ray ray = Ray_coop[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - shader_setup_from_ray(kg, - sd, - isect, - &ray); - float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF); - shader_eval_surface(kg, sd, rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN); + shader_eval_surface(kg, kernel_split_sd(sd, ray_index), state, state->flag); +#ifdef __BRANCHED_PATH__ + if(kernel_data.integrator.branched) { + shader_merge_closures(kernel_split_sd(sd, ray_index)); + } + else +#endif + { + shader_prepare_closures(kernel_split_sd(sd, ray_index), state); + } } } + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shader_setup.h b/intern/cycles/kernel/split/kernel_shader_setup.h new file mode 100644 index 00000000000..ea3ec2ec83f --- /dev/null +++ b/intern/cycles/kernel/split/kernel_shader_setup.h @@ -0,0 +1,78 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* This kernel sets up the ShaderData structure from the values computed + * by the previous kernels. + * + * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them + * in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. + */ +ccl_device void kernel_shader_setup(KernelGlobals *kg, + ccl_local_param unsigned int *local_queue_atomics) +{ + /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */ + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + *local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS]; + if(ray_index >= queue_index) { + return; + } + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + + char enqueue_flag = (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0; + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + + /* Continue on with shader evaluation. */ + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { + Intersection isect = kernel_split_state.isect[ray_index]; + Ray ray = kernel_split_state.ray[ray_index]; + ShaderData *sd = kernel_split_sd(sd, ray_index); + + shader_setup_from_ray(kg, + sd, + &isect, + &ray); + +#ifdef __VOLUME__ + if(sd->flag & SD_HAS_ONLY_VOLUME) { + ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_HAS_ONLY_VOLUME); + } +#endif + } + +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shader_sort.h b/intern/cycles/kernel/split/kernel_shader_sort.h new file mode 100644 index 00000000000..2132c42220f --- /dev/null +++ b/intern/cycles/kernel/split/kernel_shader_sort.h @@ -0,0 +1,97 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + + +ccl_device void kernel_shader_sort(KernelGlobals *kg, + ccl_local_param ShaderSortLocals *locals) +{ +#ifndef __KERNEL_CUDA__ + int tid = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + uint qsize = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS]; + if(tid == 0) { + kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS] = qsize; + } + + uint offset = (tid/SHADER_SORT_LOCAL_SIZE)*SHADER_SORT_BLOCK_SIZE; + if(offset >= qsize) { + return; + } + + int lid = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0); + uint input = QUEUE_ACTIVE_AND_REGENERATED_RAYS * (kernel_split_params.queue_size); + uint output = QUEUE_SHADER_SORTED_RAYS * (kernel_split_params.queue_size); + ccl_local uint *local_value = &locals->local_value[0]; + ccl_local ushort *local_index = &locals->local_index[0]; + + /* copy to local memory */ + for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { + uint idx = offset + i + lid; + uint add = input + idx; + uint value = (~0); + if(idx < qsize) { + int ray_index = kernel_split_state.queue_data[add]; + bool valid = (ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE); + if(valid) { + value = kernel_split_sd(sd, ray_index)->shader & SHADER_MASK; + } + } + local_value[i + lid] = value; + local_index[i + lid] = i + lid; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + /* skip sorting for cpu split kernel */ +# ifdef __KERNEL_OPENCL__ + + /* bitonic sort */ + for(uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) { + for(uint inc = length; inc > 0; inc >>= 1) { + for(uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) { + uint i = lid + ii; + bool direction = ((i & (length << 1)) != 0); + uint j = i ^ inc; + ushort ioff = local_index[i]; + ushort joff = local_index[j]; + uint iKey = local_value[ioff]; + uint jKey = local_value[joff]; + bool smaller = (jKey < iKey) || (jKey == iKey && j < i); + bool swap = smaller ^ (j < i) ^ direction; + ccl_barrier(CCL_LOCAL_MEM_FENCE); + local_index[i] = (swap) ? joff : ioff; + local_index[j] = (swap) ? ioff : joff; + ccl_barrier(CCL_LOCAL_MEM_FENCE); + } + } + } +# endif /* __KERNEL_OPENCL__ */ + + /* copy to destination */ + for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) { + uint idx = offset + i + lid; + uint lidx = local_index[i + lid]; + uint outi = output + idx; + uint ini = input + offset + lidx; + uint value = local_value[lidx]; + if(idx < qsize) { + kernel_split_state.queue_data[outi] = (value == (~0)) ? QUEUE_EMPTY_SLOT : kernel_split_state.queue_data[ini]; + } + } +#endif /* __KERNEL_CUDA__ */ +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked.h b/intern/cycles/kernel/split/kernel_shadow_blocked.h deleted file mode 100644 index 6153af47f96..00000000000 --- a/intern/cycles/kernel/split/kernel_shadow_blocked.h +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "kernel_split_common.h" - -/* Note on kernel_shadow_blocked kernel. - * This is the ninth kernel in the ray tracing logic. This is the eighth - * of the path iteration kernels. This kernel takes care of "shadow ray cast" - * logic of the direct lighting and AO part of ray tracing. - * - * The input and output are as follows, - * - * PathState_coop ----------------------------------|--- kernel_shadow_blocked --| - * LightRay_dl_coop --------------------------------| |--- LightRay_dl_coop - * LightRay_ao_coop --------------------------------| |--- LightRay_ao_coop - * ray_state ---------------------------------------| |--- ray_state - * Queue_data(QUEUE_SHADOW_RAY_CAST_AO_RAYS & | |--- Queue_data (QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_AO_RAYS) - QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------| | - * Queue_index(QUEUE_SHADOW_RAY_CAST_AO_RAYS& - QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------| | - * kg (globals) ------------------------------------| | - * queuesize ---------------------------------------| | - * - * Note on sd_shadow : sd_shadow is neither input nor output to this kernel. sd_shadow is filled and consumed in this kernel itself. - * Note on queues : - * The kernel fetches from QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS queues. We will empty - * these queues this kernel. - * State of queues when this kernel is called : - * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same - * before and after this kernel call. - * QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_DL_RAYS will be filled with rays marked with flags RAY_SHADOW_RAY_CAST_AO - * and RAY_SHADOW_RAY_CAST_DL respectively, during kernel entry. - * QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty at kernel exit. - */ -ccl_device void kernel_shadow_blocked( - KernelGlobals *kg, - ccl_global PathState *PathState_coop, /* Required for shadow blocked */ - ccl_global Ray *LightRay_dl_coop, /* Required for direct lighting's shadow blocked */ - ccl_global Ray *LightRay_ao_coop, /* Required for AO's shadow blocked */ - ccl_global char *ray_state, - char shadow_blocked_type, - int ray_index) -{ - /* Flag determining if we need to update L. */ - char update_path_radiance = 0; - - if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || - IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) - { - ccl_global PathState *state = &PathState_coop[ray_index]; - ccl_global Ray *light_ray_dl_global = &LightRay_dl_coop[ray_index]; - ccl_global Ray *light_ray_ao_global = &LightRay_ao_coop[ray_index]; - - ccl_global Ray *light_ray_global = - shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO - ? light_ray_ao_global - : light_ray_dl_global; - - float3 shadow; - update_path_radiance = !(shadow_blocked(kg, - kg->sd_input, - state, - light_ray_global, - &shadow)); - - /* We use light_ray_global's P and t to store shadow and - * update_path_radiance. - */ - light_ray_global->P = shadow; - light_ray_global->t = update_path_radiance; - } -} diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h new file mode 100644 index 00000000000..a4cffd77eff --- /dev/null +++ b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h @@ -0,0 +1,55 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* Shadow ray cast for AO. */ +ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg) +{ + unsigned int ao_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS]; + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = QUEUE_EMPTY_SLOT; + int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(thread_index < ao_queue_length) { + ray_index = get_ray_index(kg, thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, + kernel_split_state.queue_data, kernel_split_params.queue_size, 1); + } + + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + + ShaderData *sd = kernel_split_sd(sd, ray_index); + ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]); + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + float3 throughput = kernel_split_state.throughput[ray_index]; + +#ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched || IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { +#endif + kernel_path_ao(kg, sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, sd)); +#ifdef __BRANCHED_PATH__ + } + else { + kernel_branched_path_ao(kg, sd, emission_sd, L, state, throughput); + } +#endif +} + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h new file mode 100644 index 00000000000..da072fd5f1a --- /dev/null +++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h @@ -0,0 +1,107 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* Shadow ray cast for direct visible light. */ +ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg) +{ + unsigned int dl_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS]; + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = QUEUE_EMPTY_SLOT; + int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(thread_index < dl_queue_length) { + ray_index = get_ray_index(kg, thread_index, QUEUE_SHADOW_RAY_CAST_DL_RAYS, + kernel_split_state.queue_data, kernel_split_params.queue_size, 1); + } + +#ifdef __BRANCHED_PATH__ + /* TODO(mai): move this somewhere else? */ + if(thread_index == 0) { + /* Clear QUEUE_INACTIVE_RAYS before next kernel. */ + kernel_split_params.queue_index[QUEUE_INACTIVE_RAYS] = 0; + } +#endif /* __BRANCHED_PATH__ */ + + if(ray_index == QUEUE_EMPTY_SLOT) + return; + + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + Ray ray = kernel_split_state.light_ray[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ShaderData *sd = kernel_split_sd(sd, ray_index); + float3 throughput = kernel_split_state.throughput[ray_index]; + + BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index]; + ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]); + bool is_lamp = kernel_split_state.is_lamp[ray_index]; + +# if defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__) + bool use_branched = false; + int all = 0; + + if(state->flag & PATH_RAY_SHADOW_CATCHER) { + use_branched = true; + all = 1; + } +# if defined(__BRANCHED_PATH__) + else if(kernel_data.integrator.branched) { + use_branched = true; + + if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { + all = (kernel_data.integrator.sample_all_lights_indirect); + } + else + { + all = (kernel_data.integrator.sample_all_lights_direct); + } + } +# endif /* __BRANCHED_PATH__ */ + + if(use_branched) { + kernel_branched_path_surface_connect_light(kg, + sd, + emission_sd, + state, + throughput, + 1.0f, + L, + all); + } + else +# endif /* defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)*/ + { + /* trace shadow ray */ + float3 shadow; + + if(!shadow_blocked(kg, + sd, + emission_sd, + state, + &ray, + &shadow)) + { + /* accumulate */ + path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp); + } + else { + path_radiance_accum_total_light(L, state, throughput, &L_light); + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h index 2135ee22b2e..b52e7bddc82 100644 --- a/intern/cycles/kernel/split/kernel_split_common.h +++ b/intern/cycles/kernel/split/kernel_split_common.h @@ -17,48 +17,83 @@ #ifndef __KERNEL_SPLIT_H__ #define __KERNEL_SPLIT_H__ -#include "kernel_compat_opencl.h" -#include "kernel_math.h" -#include "kernel_types.h" -#include "kernel_globals.h" -#include "kernel_image_opencl.h" - -#include "util_atomic.h" - -#include "kernel_random.h" -#include "kernel_projection.h" -#include "kernel_montecarlo.h" -#include "kernel_differential.h" -#include "kernel_camera.h" - -#include "geom/geom.h" -#include "bvh/bvh.h" - -#include "kernel_accumulate.h" -#include "kernel_shader.h" -#include "kernel_light.h" -#include "kernel_passes.h" - -#ifdef __SUBSURFACE__ -#include "kernel_subsurface.h" +#include "kernel/kernel_math.h" +#include "kernel/kernel_types.h" + +#include "kernel/split/kernel_split_data.h" + +#include "kernel/kernel_globals.h" + +#ifdef __OSL__ +# include "kernel/osl/osl_shader.h" #endif -#ifdef __VOLUME__ -#include "kernel_volume.h" +#ifdef __KERNEL_OPENCL__ +# include "kernel/kernels/opencl/kernel_opencl_image.h" +#endif +#ifdef __KERNEL_CUDA__ +# include "kernel/kernels/cuda/kernel_cuda_image.h" +#endif +#ifdef __KERNEL_CPU__ +# include "kernel/kernels/cpu/kernel_cpu_image.h" #endif -#include "kernel_path_state.h" -#include "kernel_shadow.h" -#include "kernel_emission.h" -#include "kernel_path_common.h" -#include "kernel_path_surface.h" -#include "kernel_path_volume.h" +#include "util/util_atomic.h" + +#include "kernel/kernel_path.h" +#ifdef __BRANCHED_PATH__ +# include "kernel/kernel_path_branched.h" +#endif + +#include "kernel/kernel_queues.h" +#include "kernel/kernel_work_stealing.h" + +#ifdef __BRANCHED_PATH__ +# include "kernel/split/kernel_branched.h" +#endif + +CCL_NAMESPACE_BEGIN + +ccl_device_inline void kernel_split_path_end(KernelGlobals *kg, int ray_index) +{ + ccl_global char *ray_state = kernel_split_state.ray_state; + +#ifdef __BRANCHED_PATH__ + ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; + + if(ss_indirect->num_rays) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + } + else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT_SHARED)) { + int orig_ray = kernel_split_state.branched_state[ray_index].original_ray; + + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + PathRadiance *orig_ray_L = &kernel_split_state.path_radiance[orig_ray]; + + path_radiance_sum_indirect(L); + path_radiance_accum_sample(orig_ray_L, L); + + atomic_fetch_and_dec_uint32((ccl_global uint*)&kernel_split_state.branched_state[orig_ray].shared_sample_count); -#ifdef __KERNEL_DEBUG__ -#include "kernel_debug.h" + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); + } + else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER); + } + else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER); + } + else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER); + } + else { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + } +#else + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); #endif +} -#include "kernel_queues.h" -#include "kernel_work_stealing.h" +CCL_NAMESPACE_END #endif /* __KERNEL_SPLIT_H__ */ diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h new file mode 100644 index 00000000000..9297e1e0ad5 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_split_data.h @@ -0,0 +1,75 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __KERNEL_SPLIT_DATA_H__ +#define __KERNEL_SPLIT_DATA_H__ + +#include "kernel/split/kernel_split_data_types.h" +#include "kernel/kernel_globals.h" + +CCL_NAMESPACE_BEGIN + +ccl_device_inline uint64_t split_data_buffer_size(KernelGlobals *kg, size_t num_elements) +{ + (void)kg; /* Unused on CPU. */ + + uint64_t size = 0; +#define SPLIT_DATA_ENTRY(type, name, num) + align_up(num_elements * num * sizeof(type), 16) + size = size SPLIT_DATA_ENTRIES; +#undef SPLIT_DATA_ENTRY + + uint64_t closure_size = sizeof(ShaderClosure) * (kernel_data.integrator.max_closures-1); + +#ifdef __BRANCHED_PATH__ + size += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16); +#endif + + size += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16); + + return size; +} + +ccl_device_inline void split_data_init(KernelGlobals *kg, + ccl_global SplitData *split_data, + size_t num_elements, + ccl_global void *data, + ccl_global char *ray_state) +{ + (void)kg; /* Unused on CPU. */ + + ccl_global char *p = (ccl_global char*)data; + +#define SPLIT_DATA_ENTRY(type, name, num) \ + split_data->name = (type*)p; p += align_up(num_elements * num * sizeof(type), 16); + SPLIT_DATA_ENTRIES; +#undef SPLIT_DATA_ENTRY + + uint64_t closure_size = sizeof(ShaderClosure) * (kernel_data.integrator.max_closures-1); + +#ifdef __BRANCHED_PATH__ + split_data->_branched_state_sd = (ShaderData*)p; + p += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16); +#endif + + split_data->_sd = (ShaderData*)p; + p += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16); + + split_data->ray_state = ray_state; +} + +CCL_NAMESPACE_END + +#endif /* __KERNEL_SPLIT_DATA_H__ */ diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h new file mode 100644 index 00000000000..56194d9f857 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_split_data_types.h @@ -0,0 +1,177 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __KERNEL_SPLIT_DATA_TYPES_H__ +#define __KERNEL_SPLIT_DATA_TYPES_H__ + +CCL_NAMESPACE_BEGIN + +/* parameters used by the split kernels, we use a single struct to avoid passing these to each kernel */ + +typedef struct SplitParams { + WorkTile tile; + uint total_work_size; + + ccl_global unsigned int *work_pools; + + ccl_global int *queue_index; + int queue_size; + ccl_global char *use_queues_flag; + + /* Place for storing sd->flag. AMD GPU OpenCL compiler workaround */ + int dummy_sd_flag; +} SplitParams; + +/* Global memory variables [porting]; These memory is used for + * co-operation between different kernels; Data written by one + * kernel will be available to another kernel via this global + * memory. + */ + +/* SPLIT_DATA_ENTRY(type, name, num) */ + +#ifdef __BRANCHED_PATH__ + +typedef ccl_global struct SplitBranchedState { + /* various state that must be kept and restored after an indirect loop */ + PathState path_state; + float3 throughput; + Ray ray; + + Intersection isect; + + char ray_state; + + /* indirect loop state */ + int next_closure; + int next_sample; + +#ifdef __SUBSURFACE__ + int ss_next_closure; + int ss_next_sample; + int next_hit; + int num_hits; + + uint lcg_state; + LocalIntersection ss_isect; +#endif /*__SUBSURFACE__ */ + + int shared_sample_count; /* number of branched samples shared with other threads */ + int original_ray; /* index of original ray when sharing branched samples */ + bool waiting_on_shared_samples; +} SplitBranchedState; + +#define SPLIT_DATA_BRANCHED_ENTRIES \ + SPLIT_DATA_ENTRY( SplitBranchedState, branched_state, 1) \ + SPLIT_DATA_ENTRY(ShaderData, _branched_state_sd, 0) +#else +#define SPLIT_DATA_BRANCHED_ENTRIES +#endif /* __BRANCHED_PATH__ */ + +#ifdef __SUBSURFACE__ +# define SPLIT_DATA_SUBSURFACE_ENTRIES \ + SPLIT_DATA_ENTRY(ccl_global SubsurfaceIndirectRays, ss_rays, 1) +#else +# define SPLIT_DATA_SUBSURFACE_ENTRIES +#endif /* __SUBSURFACE__ */ + +#ifdef __VOLUME__ +# define SPLIT_DATA_VOLUME_ENTRIES \ + SPLIT_DATA_ENTRY(ccl_global PathState, state_shadow, 1) +#else +# define SPLIT_DATA_VOLUME_ENTRIES +#endif /* __VOLUME__ */ + +#define SPLIT_DATA_ENTRIES \ + SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \ + SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \ + SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \ + SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \ + SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \ + SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \ + SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \ + SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \ + SPLIT_DATA_ENTRY(ccl_global int, queue_data, (NUM_QUEUES*2)) /* TODO(mai): this is too large? */ \ + SPLIT_DATA_ENTRY(ccl_global uint, buffer_offset, 1) \ + SPLIT_DATA_ENTRY(ShaderDataTinyStorage, sd_DL_shadow, 1) \ + SPLIT_DATA_SUBSURFACE_ENTRIES \ + SPLIT_DATA_VOLUME_ENTRIES \ + SPLIT_DATA_BRANCHED_ENTRIES \ + SPLIT_DATA_ENTRY(ShaderData, _sd, 0) + +/* entries to be copied to inactive rays when sharing branched samples (TODO: which are actually needed?) */ +#define SPLIT_DATA_ENTRIES_BRANCHED_SHARED \ + SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \ + SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \ + SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \ + SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \ + SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \ + SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \ + SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \ + SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \ + SPLIT_DATA_ENTRY(ShaderDataTinyStorage, sd_DL_shadow, 1) \ + SPLIT_DATA_SUBSURFACE_ENTRIES \ + SPLIT_DATA_VOLUME_ENTRIES \ + SPLIT_DATA_BRANCHED_ENTRIES \ + SPLIT_DATA_ENTRY(ShaderData, _sd, 0) + +/* struct that holds pointers to data in the shared state buffer */ +typedef struct SplitData { +#define SPLIT_DATA_ENTRY(type, name, num) type *name; + SPLIT_DATA_ENTRIES +#undef SPLIT_DATA_ENTRY + + /* this is actually in a separate buffer from the rest of the split state data (so it can be read back from + * the host easily) but is still used the same as the other data so we have it here in this struct as well + */ + ccl_global char *ray_state; +} SplitData; + +#ifndef __KERNEL_CUDA__ +# define kernel_split_state (kg->split_data) +# define kernel_split_params (kg->split_param_data) +#else +__device__ SplitData __split_data; +# define kernel_split_state (__split_data) +__device__ SplitParams __split_param_data; +# define kernel_split_params (__split_param_data) +#endif /* __KERNEL_CUDA__ */ + +#define kernel_split_sd(sd, ray_index) ((ShaderData*) \ + ( \ + ((ccl_global char*)kernel_split_state._##sd) + \ + (sizeof(ShaderData) + sizeof(ShaderClosure)*(kernel_data.integrator.max_closures-1)) * (ray_index) \ + )) + +/* Local storage for queue_enqueue kernel. */ +typedef struct QueueEnqueueLocals { + uint queue_atomics[2]; +} QueueEnqueueLocals; + +/* Local storage for holdout_emission_blurring_pathtermination_ao kernel. */ +typedef struct BackgroundAOLocals { + uint queue_atomics_bg; + uint queue_atomics_ao; +} BackgroundAOLocals; + +typedef struct ShaderSortLocals { + uint local_value[SHADER_SORT_BLOCK_SIZE]; + ushort local_index[SHADER_SORT_BLOCK_SIZE]; +} ShaderSortLocals; + +CCL_NAMESPACE_END + +#endif /* __KERNEL_SPLIT_DATA_TYPES_H__ */ diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h new file mode 100644 index 00000000000..af0303d8608 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_subsurface_scatter.h @@ -0,0 +1,287 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +#if defined(__BRANCHED_PATH__) && defined(__SUBSURFACE__) + +ccl_device_inline void kernel_split_branched_path_subsurface_indirect_light_init(KernelGlobals *kg, int ray_index) +{ + kernel_split_branched_path_indirect_loop_init(kg, ray_index); + + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + branched_state->ss_next_closure = 0; + branched_state->ss_next_sample = 0; + + branched_state->num_hits = 0; + branched_state->next_hit = 0; + + ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT); +} + +ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_iter(KernelGlobals *kg, int ray_index) +{ + SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index]; + + ShaderData *sd = kernel_split_sd(branched_state_sd, ray_index); + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]); + + for(int i = branched_state->ss_next_closure; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + + if(!CLOSURE_IS_BSSRDF(sc->type)) + continue; + + /* set up random number generator */ + if(branched_state->ss_next_sample == 0 && branched_state->next_hit == 0 && + branched_state->next_closure == 0 && branched_state->next_sample == 0) + { + branched_state->lcg_state = lcg_state_init_addrspace(&branched_state->path_state, + 0x68bc21eb); + } + int num_samples = kernel_data.integrator.subsurface_samples * 3; + float num_samples_inv = 1.0f/num_samples; + uint bssrdf_rng_hash = cmj_hash(branched_state->path_state.rng_hash, i); + + /* do subsurface scatter step with copy of shader data, this will + * replace the BSSRDF with a diffuse BSDF closure */ + for(int j = branched_state->ss_next_sample; j < num_samples; j++) { + ccl_global PathState *hit_state = &kernel_split_state.path_state[ray_index]; + *hit_state = branched_state->path_state; + hit_state->rng_hash = bssrdf_rng_hash; + path_state_branch(hit_state, j, num_samples); + + ccl_global LocalIntersection *ss_isect = &branched_state->ss_isect; + float bssrdf_u, bssrdf_v; + path_branched_rng_2D(kg, + bssrdf_rng_hash, + hit_state, + j, + num_samples, + PRNG_BSDF_U, + &bssrdf_u, + &bssrdf_v); + + /* intersection is expensive so avoid doing multiple times for the same input */ + if(branched_state->next_hit == 0 && branched_state->next_closure == 0 && branched_state->next_sample == 0) { + uint lcg_state = branched_state->lcg_state; + LocalIntersection ss_isect_private; + + branched_state->num_hits = subsurface_scatter_multi_intersect(kg, + &ss_isect_private, + sd, + hit_state, + sc, + &lcg_state, + bssrdf_u, bssrdf_v, + true); + + branched_state->lcg_state = lcg_state; + *ss_isect = ss_isect_private; + } + + hit_state->rng_offset += PRNG_BOUNCE_NUM; + +#ifdef __VOLUME__ + Ray volume_ray = branched_state->ray; + bool need_update_volume_stack = + kernel_data.integrator.use_volumes && + sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME; +#endif /* __VOLUME__ */ + + /* compute lighting with the BSDF closure */ + for(int hit = branched_state->next_hit; hit < branched_state->num_hits; hit++) { + ShaderData *bssrdf_sd = kernel_split_sd(sd, ray_index); + *bssrdf_sd = *sd; /* note: copy happens each iteration of inner loop, this is + * important as the indirect path will write into bssrdf_sd */ + + LocalIntersection ss_isect_private = *ss_isect; + subsurface_scatter_multi_setup(kg, + &ss_isect_private, + hit, + bssrdf_sd, + hit_state, + sc); + *ss_isect = ss_isect_private; + +#ifdef __VOLUME__ + if(need_update_volume_stack) { + /* Setup ray from previous surface point to the new one. */ + float3 P = ray_offset(bssrdf_sd->P, -bssrdf_sd->Ng); + volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t); + + for(int k = 0; k < VOLUME_STACK_SIZE; k++) { + hit_state->volume_stack[k] = branched_state->path_state.volume_stack[k]; + } + + kernel_volume_stack_update_for_subsurface(kg, + emission_sd, + &volume_ray, + hit_state->volume_stack); + } +#endif /* __VOLUME__ */ + +#ifdef __EMISSION__ + if(branched_state->next_closure == 0 && branched_state->next_sample == 0) { + /* direct light */ + if(kernel_data.integrator.use_direct_light) { + int all = (kernel_data.integrator.sample_all_lights_direct) || + (hit_state->flag & PATH_RAY_SHADOW_CATCHER); + kernel_branched_path_surface_connect_light(kg, + bssrdf_sd, + emission_sd, + hit_state, + branched_state->throughput, + num_samples_inv, + L, + all); + } + } +#endif /* __EMISSION__ */ + + /* indirect light */ + if(kernel_split_branched_path_surface_indirect_light_iter(kg, + ray_index, + num_samples_inv, + bssrdf_sd, + false, + false)) + { + branched_state->ss_next_closure = i; + branched_state->ss_next_sample = j; + branched_state->next_hit = hit; + + return true; + } + + branched_state->next_closure = 0; + } + + branched_state->next_hit = 0; + } + + branched_state->ss_next_sample = 0; + } + + branched_state->ss_next_closure = sd->num_closure; + + branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0); + if(branched_state->waiting_on_shared_samples) { + return true; + } + + kernel_split_branched_path_indirect_loop_end(kg, ray_index); + + return false; +} + +#endif /* __BRANCHED_PATH__ && __SUBSURFACE__ */ + +ccl_device void kernel_subsurface_scatter(KernelGlobals *kg) +{ + int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(thread_index == 0) { + /* We will empty both queues in this kernel. */ + kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; + kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; + } + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + get_ray_index(kg, thread_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + +#ifdef __SUBSURFACE__ + ccl_global char *ray_state = kernel_split_state.ray_state; + + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index]; + ShaderData *sd = kernel_split_sd(sd, ray_index); + ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]); + + if(sd->flag & SD_BSSRDF) { + +#ifdef __BRANCHED_PATH__ + if(!kernel_data.integrator.branched || + IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) + { +#endif + if(kernel_path_subsurface_scatter(kg, + sd, + emission_sd, + L, + state, + ray, + throughput, + ss_indirect)) + { + kernel_split_path_end(kg, ray_index); + } +#ifdef __BRANCHED_PATH__ + } + else { + kernel_split_branched_path_subsurface_indirect_light_init(kg, ray_index); + + if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + } +#endif + } + } + +# ifdef __BRANCHED_PATH__ + if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { + kernel_split_params.queue_index[QUEUE_SUBSURFACE_INDIRECT_ITER] = 0; + } + + /* iter loop */ + ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0), + QUEUE_SUBSURFACE_INDIRECT_ITER, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + + if(IS_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER)) { + /* for render passes, sum and reset indirect light pass variables + * for the next samples */ + path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]); + path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]); + + if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + } + } +# endif /* __BRANCHED_PATH__ */ + +#endif /* __SUBSURFACE__ */ + +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_sum_all_radiance.h b/intern/cycles/kernel/split/kernel_sum_all_radiance.h deleted file mode 100644 index a21e9b6a0b1..00000000000 --- a/intern/cycles/kernel/split/kernel_sum_all_radiance.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "../kernel_compat_opencl.h" -#include "../kernel_math.h" -#include "../kernel_types.h" -#include "../kernel_globals.h" - -/* Since we process various samples in parallel; The output radiance of different samples - * are stored in different locations; This kernel combines the output radiance contributed - * by all different samples and stores them in the RenderTile's output buffer. - */ -ccl_device void kernel_sum_all_radiance( - ccl_constant KernelData *data, /* To get pass_stride to offet into buffer */ - ccl_global float *buffer, /* Output buffer of RenderTile */ - ccl_global float *per_sample_output_buffer, /* Radiance contributed by all samples */ - int parallel_samples, int sw, int sh, int stride, - int buffer_offset_x, - int buffer_offset_y, - int buffer_stride, - int start_sample) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if(x < sw && y < sh) { - buffer += ((buffer_offset_x + x) + (buffer_offset_y + y) * buffer_stride) * (data->film.pass_stride); - per_sample_output_buffer += ((x + y * stride) * parallel_samples) * (data->film.pass_stride); - - int sample_stride = (data->film.pass_stride); - - int sample_iterator = 0; - int pass_stride_iterator = 0; - int num_floats = data->film.pass_stride; - - for(sample_iterator = 0; sample_iterator < parallel_samples; sample_iterator++) { - for(pass_stride_iterator = 0; pass_stride_iterator < num_floats; pass_stride_iterator++) { - *(buffer + pass_stride_iterator) = - (start_sample == 0 && sample_iterator == 0) - ? *(per_sample_output_buffer + pass_stride_iterator) - : *(buffer + pass_stride_iterator) + *(per_sample_output_buffer + pass_stride_iterator); - } - per_sample_output_buffer += sample_stride; - } - } -} diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h index 88ec7fe6fcc..39cd5da7b12 100644 --- a/intern/cycles/kernel/svm/svm.h +++ b/intern/cycles/kernel/svm/svm.h @@ -30,8 +30,7 @@ * in local memory on the GPU, as it would take too many register and indexes in * ways not known at compile time. This seems the only solution even though it * may be slow, with two positive factors. If the same shader is being executed, - * memory access will be coalesced, and on fermi cards, memory will actually be - * cached. + * memory access will be coalesced and cached. * * The result of shader execution will be a single closure. This means the * closure type, associated label, data and weight. Sampling from multiple @@ -39,7 +38,7 @@ * mostly taken care of in the SVM compiler. */ -#include "svm_types.h" +#include "kernel/svm/svm_types.h" CCL_NAMESPACE_BEGIN @@ -139,49 +138,53 @@ CCL_NAMESPACE_END /* Nodes */ -#include "svm_noise.h" +#include "kernel/svm/svm_noise.h" #include "svm_texture.h" -#include "svm_color_util.h" -#include "svm_math_util.h" - -#include "svm_attribute.h" -#include "svm_gradient.h" -#include "svm_blackbody.h" -#include "svm_closure.h" -#include "svm_noisetex.h" -#include "svm_convert.h" -#include "svm_displace.h" -#include "svm_fresnel.h" -#include "svm_wireframe.h" -#include "svm_wavelength.h" -#include "svm_camera.h" -#include "svm_geometry.h" -#include "svm_hsv.h" -#include "svm_image.h" -#include "svm_gamma.h" -#include "svm_brightness.h" -#include "svm_invert.h" -#include "svm_light_path.h" -#include "svm_magic.h" -#include "svm_mapping.h" -#include "svm_normal.h" -#include "svm_wave.h" -#include "svm_math.h" -#include "svm_mix.h" -#include "svm_ramp.h" -#include "svm_sepcomb_hsv.h" -#include "svm_sepcomb_vector.h" -#include "svm_musgrave.h" -#include "svm_sky.h" -#include "svm_tex_coord.h" -#include "svm_value.h" -#include "svm_voronoi.h" -#include "svm_checker.h" -#include "svm_brick.h" -#include "svm_vector_transform.h" -#include "svm_voxel.h" -#include "svm_bump.h" +#include "kernel/svm/svm_color_util.h" +#include "kernel/svm/svm_math_util.h" + +#include "kernel/svm/svm_attribute.h" +#include "kernel/svm/svm_gradient.h" +#include "kernel/svm/svm_blackbody.h" +#include "kernel/svm/svm_closure.h" +#include "kernel/svm/svm_noisetex.h" +#include "kernel/svm/svm_convert.h" +#include "kernel/svm/svm_displace.h" +#include "kernel/svm/svm_fresnel.h" +#include "kernel/svm/svm_wireframe.h" +#include "kernel/svm/svm_wavelength.h" +#include "kernel/svm/svm_camera.h" +#include "kernel/svm/svm_geometry.h" +#include "kernel/svm/svm_hsv.h" +#include "kernel/svm/svm_image.h" +#include "kernel/svm/svm_gamma.h" +#include "kernel/svm/svm_brightness.h" +#include "kernel/svm/svm_invert.h" +#include "kernel/svm/svm_light_path.h" +#include "kernel/svm/svm_magic.h" +#include "kernel/svm/svm_mapping.h" +#include "kernel/svm/svm_normal.h" +#include "kernel/svm/svm_wave.h" +#include "kernel/svm/svm_math.h" +#include "kernel/svm/svm_mix.h" +#include "kernel/svm/svm_ramp.h" +#include "kernel/svm/svm_sepcomb_hsv.h" +#include "kernel/svm/svm_sepcomb_vector.h" +#include "kernel/svm/svm_musgrave.h" +#include "kernel/svm/svm_sky.h" +#include "kernel/svm/svm_tex_coord.h" +#include "kernel/svm/svm_value.h" +#include "kernel/svm/svm_voronoi.h" +#include "kernel/svm/svm_checker.h" +#include "kernel/svm/svm_brick.h" +#include "kernel/svm/svm_vector_transform.h" +#include "kernel/svm/svm_voxel.h" +#include "kernel/svm/svm_bump.h" + +#ifdef __SHADER_RAYTRACE__ +# include "kernel/svm/svm_bevel.h" +#endif CCL_NAMESPACE_BEGIN @@ -192,7 +195,7 @@ CCL_NAMESPACE_BEGIN ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, ShaderType type, int path_flag) { float stack[SVM_STACK_SIZE]; - int offset = ccl_fetch(sd, shader) & SHADER_MASK; + int offset = sd->shader & SHADER_MASK; while(1) { uint4 node = read_node(kg, &offset); @@ -207,7 +210,7 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ccl_a break; } case NODE_CLOSURE_BSDF: - svm_node_closure_bsdf(kg, sd, stack, node, path_flag, &offset); + svm_node_closure_bsdf(kg, sd, stack, node, type, path_flag, &offset); break; case NODE_CLOSURE_EMISSION: svm_node_closure_emission(sd, stack, node); @@ -263,6 +266,12 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ccl_a case NODE_SET_DISPLACEMENT: svm_node_set_displacement(kg, sd, stack, node.y); break; + case NODE_DISPLACEMENT: + svm_node_displacement(kg, sd, stack, node); + break; + case NODE_VECTOR_DISPLACEMENT: + svm_node_vector_displacement(kg, sd, stack, node, &offset); + break; # endif /* NODES_FEATURE(NODE_FEATURE_BUMP) */ # ifdef __TEXTURES__ case NODE_TEX_IMAGE: @@ -325,7 +334,10 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ccl_a break; # if NODES_FEATURE(NODE_FEATURE_VOLUME) case NODE_CLOSURE_VOLUME: - svm_node_closure_volume(kg, sd, stack, node, path_flag); + svm_node_closure_volume(kg, sd, stack, node, type); + break; + case NODE_PRINCIPLED_VOLUME: + svm_node_principled_volume(kg, sd, stack, node, type, path_flag, &offset); break; # endif /* NODES_FEATURE(NODE_FEATURE_VOLUME) */ # ifdef __EXTRA_NODES__ @@ -460,6 +472,11 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ccl_a svm_node_tex_voxel(kg, sd, stack, node, &offset); break; # endif /* NODES_FEATURE(NODE_FEATURE_VOLUME) */ +# ifdef __SHADER_RAYTRACE__ + case NODE_BEVEL: + svm_node_bevel(kg, sd, state, stack, node); + break; +# endif /* __SHADER_RAYTRACE__ */ #endif /* NODES_GROUP(NODE_GROUP_LEVEL_3) */ case NODE_END: return; diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h index 0e55c99ae97..229a3f20421 100644 --- a/intern/cycles/kernel/svm/svm_attribute.h +++ b/intern/cycles/kernel/svm/svm_attribute.h @@ -27,7 +27,7 @@ ccl_device AttributeDescriptor svm_node_attr_init(KernelGlobals *kg, ShaderData AttributeDescriptor desc; - if(ccl_fetch(sd, object) != OBJECT_NONE) { + if(sd->object != OBJECT_NONE) { desc = find_attribute(kg, sd, node.y); if(desc.offset == ATTR_STD_NOT_FOUND) { desc = attribute_not_found(); diff --git a/intern/cycles/kernel/svm/svm_bevel.h b/intern/cycles/kernel/svm/svm_bevel.h new file mode 100644 index 00000000000..89f4a98e846 --- /dev/null +++ b/intern/cycles/kernel/svm/svm_bevel.h @@ -0,0 +1,226 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* Bevel shader averaging normals from nearby surfaces. + * + * Sampling strategy from: BSSRDF Importance Sampling, SIGGRAPH 2013 + * http://library.imageworks.com/pdfs/imageworks-library-BSSRDF-sampling.pdf + */ + +ccl_device_noinline float3 svm_bevel( + KernelGlobals *kg, + ShaderData *sd, + ccl_addr_space PathState *state, + float radius, + int num_samples) +{ + /* Early out if no sampling needed. */ + if(radius <= 0.0f || num_samples < 1 || sd->object == OBJECT_NONE) { + return sd->N; + } + + /* Don't bevel for blurry indirect rays. */ + if(state->min_ray_pdf < 8.0f) { + return sd->N; + } + + /* Setup for multi intersection. */ + LocalIntersection isect; + uint lcg_state = lcg_state_init_addrspace(state, 0x64c6a40e); + + /* Sample normals from surrounding points on surface. */ + float3 sum_N = make_float3(0.0f, 0.0f, 0.0f); + + for(int sample = 0; sample < num_samples; sample++) { + float disk_u, disk_v; + path_branched_rng_2D(kg, state->rng_hash, state, sample, num_samples, + PRNG_BEVEL_U, &disk_u, &disk_v); + + /* Pick random axis in local frame and point on disk. */ + float3 disk_N, disk_T, disk_B; + float pick_pdf_N, pick_pdf_T, pick_pdf_B; + + disk_N = sd->Ng; + make_orthonormals(disk_N, &disk_T, &disk_B); + + float axisu = disk_u; + + if(axisu < 0.5f) { + pick_pdf_N = 0.5f; + pick_pdf_T = 0.25f; + pick_pdf_B = 0.25f; + disk_u *= 2.0f; + } + else if(axisu < 0.75f) { + float3 tmp = disk_N; + disk_N = disk_T; + disk_T = tmp; + pick_pdf_N = 0.25f; + pick_pdf_T = 0.5f; + pick_pdf_B = 0.25f; + disk_u = (disk_u - 0.5f)*4.0f; + } + else { + float3 tmp = disk_N; + disk_N = disk_B; + disk_B = tmp; + pick_pdf_N = 0.25f; + pick_pdf_T = 0.25f; + pick_pdf_B = 0.5f; + disk_u = (disk_u - 0.75f)*4.0f; + } + + /* Sample point on disk. */ + float phi = M_2PI_F * disk_u; + float disk_r = disk_v; + float disk_height; + + /* Perhaps find something better than Cubic BSSRDF, but happens to work well. */ + bssrdf_cubic_sample(radius, 0.0f, disk_r, &disk_r, &disk_height); + + float3 disk_P = (disk_r*cosf(phi)) * disk_T + (disk_r*sinf(phi)) * disk_B; + + /* Create ray. */ + Ray *ray = &isect.ray; + ray->P = sd->P + disk_N*disk_height + disk_P; + ray->D = -disk_N; + ray->t = 2.0f*disk_height; + ray->dP = sd->dP; + ray->dD = differential3_zero(); + ray->time = sd->time; + + /* Intersect with the same object. if multiple intersections are found it + * will use at most LOCAL_MAX_HITS hits, a random subset of all hits. */ + scene_intersect_local(kg, + *ray, + &isect, + sd->object, + &lcg_state, + LOCAL_MAX_HITS); + + int num_eval_hits = min(isect.num_hits, LOCAL_MAX_HITS); + + for(int hit = 0; hit < num_eval_hits; hit++) { + /* Quickly retrieve P and Ng without setting up ShaderData. */ + float3 hit_P; + if(sd->type & PRIMITIVE_TRIANGLE) { + hit_P = triangle_refine_local(kg, + sd, + &isect.hits[hit], + ray); + } +#ifdef __OBJECT_MOTION__ + else if(sd->type & PRIMITIVE_MOTION_TRIANGLE) { + float3 verts[3]; + motion_triangle_vertices( + kg, + sd->object, + kernel_tex_fetch(__prim_index, isect.hits[hit].prim), + sd->time, + verts); + hit_P = motion_triangle_refine_local(kg, + sd, + &isect.hits[hit], + ray, + verts); + } +#endif /* __OBJECT_MOTION__ */ + + float3 hit_Ng = isect.Ng[hit]; + + /* Compute smooth normal. */ + float3 N = hit_Ng; + int prim = kernel_tex_fetch(__prim_index, isect.hits[hit].prim); + int shader = kernel_tex_fetch(__tri_shader, prim); + + if (shader & SHADER_SMOOTH_NORMAL) { + float u = isect.hits[hit].u; + float v = isect.hits[hit].v; + + if (sd->type & PRIMITIVE_TRIANGLE) { + N = triangle_smooth_normal(kg, N, prim, u, v); + } +#ifdef __OBJECT_MOTION__ + else if(sd->type & PRIMITIVE_MOTION_TRIANGLE) { + N = motion_triangle_smooth_normal(kg, N, sd->object, prim, u, v, sd->time); + } +#endif /* __OBJECT_MOTION__ */ + } + + /* Transform normals to world space. */ + if(isect.hits[hit].object != OBJECT_NONE) { + object_normal_transform(kg, sd, &N); + object_normal_transform(kg, sd, &hit_Ng); + } + + /* Probability densities for local frame axes. */ + float pdf_N = pick_pdf_N * fabsf(dot(disk_N, hit_Ng)); + float pdf_T = pick_pdf_T * fabsf(dot(disk_T, hit_Ng)); + float pdf_B = pick_pdf_B * fabsf(dot(disk_B, hit_Ng)); + + /* Multiple importance sample between 3 axes, power heuristic + * found to be slightly better than balance heuristic. pdf_N + * in the MIS weight and denominator cancelled out. */ + float w = pdf_N / (sqr(pdf_N) + sqr(pdf_T) + sqr(pdf_B)); + if(isect.num_hits > LOCAL_MAX_HITS) { + w *= isect.num_hits/(float)LOCAL_MAX_HITS; + } + + /* Real distance to sampled point. */ + float r = len(hit_P - sd->P); + + /* Compute weight. */ + float pdf = bssrdf_cubic_pdf(radius, 0.0f, r); + float disk_pdf = bssrdf_cubic_pdf(radius, 0.0f, disk_r); + + w *= pdf / disk_pdf; + + /* Sum normal and weight. */ + sum_N += w * N; + } + } + + /* Normalize. */ + float3 N = safe_normalize(sum_N); + return is_zero(N) ? sd->N : (sd->flag & SD_BACKFACING) ? -N : N; +} + +ccl_device void svm_node_bevel( + KernelGlobals *kg, + ShaderData *sd, + ccl_addr_space PathState *state, + float *stack, + uint4 node) +{ + uint num_samples, radius_offset, normal_offset, out_offset; + decode_node_uchar4(node.y, &num_samples, &radius_offset, &normal_offset, &out_offset); + + float radius = stack_load_float(stack, radius_offset); + float3 bevel_N = svm_bevel(kg, sd, state, radius, num_samples); + + if(stack_valid(normal_offset)) { + /* Preserve input normal. */ + float3 ref_N = stack_load_float3(stack, normal_offset); + bevel_N = normalize(ref_N + (bevel_N - sd->N)); + } + + stack_store_float3(stack, out_offset, bevel_N); +} + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/svm/svm_blackbody.h b/intern/cycles/kernel/svm/svm_blackbody.h index b750ad87b7f..51590b18505 100644 --- a/intern/cycles/kernel/svm/svm_blackbody.h +++ b/intern/cycles/kernel/svm/svm_blackbody.h @@ -41,8 +41,7 @@ ccl_device void svm_node_blackbody(KernelGlobals *kg, ShaderData *sd, float *sta float3 color_rgb = svm_math_blackbody_color(temperature); - if(stack_valid(col_offset)) - stack_store_float3(stack, col_offset, color_rgb); + stack_store_float3(stack, col_offset, color_rgb); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_brick.h b/intern/cycles/kernel/svm/svm_brick.h index 14245cf0522..90fa2a99b67 100644 --- a/intern/cycles/kernel/svm/svm_brick.h +++ b/intern/cycles/kernel/svm/svm_brick.h @@ -18,9 +18,9 @@ CCL_NAMESPACE_BEGIN /* Brick */ -ccl_device_noinline float brick_noise(int n) /* fast integer noise */ +ccl_device_noinline float brick_noise(uint n) /* fast integer noise */ { - int nn; + uint nn; n = (n + 1013) & 0x7fffffff; n = (n >> 13) ^ n; nn = (n * (n * n * 60493 + 19990303) + 1376312589) & 0x7fffffff; diff --git a/intern/cycles/kernel/svm/svm_bump.h b/intern/cycles/kernel/svm/svm_bump.h index 04a8c7b64e5..610d9af9e1f 100644 --- a/intern/cycles/kernel/svm/svm_bump.h +++ b/intern/cycles/kernel/svm/svm_bump.h @@ -21,9 +21,9 @@ CCL_NAMESPACE_BEGIN ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg, ShaderData *sd, float *stack, uint offset) { /* save state */ - stack_store_float3(stack, offset+0, ccl_fetch(sd, P)); - stack_store_float3(stack, offset+3, ccl_fetch(sd, dP).dx); - stack_store_float3(stack, offset+6, ccl_fetch(sd, dP).dy); + stack_store_float3(stack, offset+0, sd->P); + stack_store_float3(stack, offset+3, sd->dP.dx); + stack_store_float3(stack, offset+6, sd->dP.dy); /* set state as if undisplaced */ const AttributeDescriptor desc = find_attribute(kg, sd, ATTR_STD_POSITION_UNDISPLACED); @@ -36,18 +36,18 @@ ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg, ShaderData *sd, floa object_dir_transform(kg, sd, &dPdx); object_dir_transform(kg, sd, &dPdy); - ccl_fetch(sd, P) = P; - ccl_fetch(sd, dP).dx = dPdx; - ccl_fetch(sd, dP).dy = dPdy; + sd->P = P; + sd->dP.dx = dPdx; + sd->dP.dy = dPdy; } } ccl_device void svm_node_leave_bump_eval(KernelGlobals *kg, ShaderData *sd, float *stack, uint offset) { /* restore state */ - ccl_fetch(sd, P) = stack_load_float3(stack, offset+0); - ccl_fetch(sd, dP).dx = stack_load_float3(stack, offset+3); - ccl_fetch(sd, dP).dy = stack_load_float3(stack, offset+6); + sd->P = stack_load_float3(stack, offset+0); + sd->dP.dx = stack_load_float3(stack, offset+3); + sd->dP.dy = stack_load_float3(stack, offset+6); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_camera.h b/intern/cycles/kernel/svm/svm_camera.h index 00678a49d70..90249dfd978 100644 --- a/intern/cycles/kernel/svm/svm_camera.h +++ b/intern/cycles/kernel/svm/svm_camera.h @@ -23,7 +23,7 @@ ccl_device void svm_node_camera(KernelGlobals *kg, ShaderData *sd, float *stack, float3 vector; Transform tfm = kernel_data.cam.worldtocamera; - vector = transform_point(&tfm, ccl_fetch(sd, P)); + vector = transform_point(&tfm, sd->P); zdepth = vector.z; distance = len(vector); diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h index 017d697f9f8..886a1333fa3 100644 --- a/intern/cycles/kernel/svm/svm_closure.h +++ b/intern/cycles/kernel/svm/svm_closure.h @@ -25,13 +25,13 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t bsdf->alpha_y = 0.0f; bsdf->alpha_x = 0.0f; bsdf->ior = eta; - ccl_fetch(sd, flag) |= bsdf_refraction_setup(bsdf); + sd->flag |= bsdf_refraction_setup(bsdf); } else { bsdf->alpha_y = 0.0f; bsdf->alpha_x = 0.0f; bsdf->ior = 0.0f; - ccl_fetch(sd, flag) |= bsdf_reflection_setup(bsdf); + sd->flag |= bsdf_reflection_setup(bsdf); } } else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID) { @@ -40,9 +40,9 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t bsdf->ior = eta; if(refract) - ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(bsdf); + sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf); else - ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(bsdf); + sd->flag |= bsdf_microfacet_beckmann_setup(bsdf); } else { bsdf->alpha_x = roughness; @@ -50,13 +50,13 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t bsdf->ior = eta; if(refract) - ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf); else - ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_setup(bsdf); } } -ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int path_flag, int *offset) +ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, ShaderType shader_type, int path_flag, int *offset) { uint type, param1_offset, param2_offset; @@ -67,17 +67,359 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * /* note we read this extra node before weight check, so offset is added */ uint4 data_node = read_node(kg, offset); - if(mix_weight == 0.0f) + /* Only compute BSDF for surfaces, transparent variable is shared with volume extinction. */ + if(mix_weight == 0.0f || shader_type != SHADER_TYPE_SURFACE) { + if(type == CLOSURE_BSDF_PRINCIPLED_ID) { + /* Read all principled BSDF extra data to get the right offset. */ + read_node(kg, offset); + read_node(kg, offset); + read_node(kg, offset); + read_node(kg, offset); + } + return; + } - float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): ccl_fetch(sd, N); + float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): sd->N; float param1 = (stack_valid(param1_offset))? stack_load_float(stack, param1_offset): __uint_as_float(node.z); float param2 = (stack_valid(param2_offset))? stack_load_float(stack, param2_offset): __uint_as_float(node.w); switch(type) { +#ifdef __PRINCIPLED__ + case CLOSURE_BSDF_PRINCIPLED_ID: { + uint specular_offset, roughness_offset, specular_tint_offset, anisotropic_offset, sheen_offset, + sheen_tint_offset, clearcoat_offset, clearcoat_roughness_offset, eta_offset, transmission_offset, + anisotropic_rotation_offset, transmission_roughness_offset; + uint4 data_node2 = read_node(kg, offset); + + float3 T = stack_load_float3(stack, data_node.y); + decode_node_uchar4(data_node.z, &specular_offset, &roughness_offset, &specular_tint_offset, &anisotropic_offset); + decode_node_uchar4(data_node.w, &sheen_offset, &sheen_tint_offset, &clearcoat_offset, &clearcoat_roughness_offset); + decode_node_uchar4(data_node2.x, &eta_offset, &transmission_offset, &anisotropic_rotation_offset, &transmission_roughness_offset); + + // get Disney principled parameters + float metallic = param1; + float subsurface = param2; + float specular = stack_load_float(stack, specular_offset); + float roughness = stack_load_float(stack, roughness_offset); + float specular_tint = stack_load_float(stack, specular_tint_offset); + float anisotropic = stack_load_float(stack, anisotropic_offset); + float sheen = stack_load_float(stack, sheen_offset); + float sheen_tint = stack_load_float(stack, sheen_tint_offset); + float clearcoat = stack_load_float(stack, clearcoat_offset); + float clearcoat_roughness = stack_load_float(stack, clearcoat_roughness_offset); + float transmission = stack_load_float(stack, transmission_offset); + float anisotropic_rotation = stack_load_float(stack, anisotropic_rotation_offset); + float transmission_roughness = stack_load_float(stack, transmission_roughness_offset); + float eta = fmaxf(stack_load_float(stack, eta_offset), 1e-5f); + + ClosureType distribution = (ClosureType) data_node2.y; + ClosureType subsurface_method = (ClosureType) data_node2.z; + + /* rotate tangent */ + if(anisotropic_rotation != 0.0f) + T = rotate_around_axis(T, N, anisotropic_rotation * M_2PI_F); + + /* calculate ior */ + float ior = (sd->flag & SD_BACKFACING) ? 1.0f / eta : eta; + + // calculate fresnel for refraction + float cosNO = dot(N, sd->I); + float fresnel = fresnel_dielectric_cos(cosNO, ior); + + // calculate weights of the diffuse and specular part + float diffuse_weight = (1.0f - saturate(metallic)) * (1.0f - saturate(transmission)); + + float final_transmission = saturate(transmission) * (1.0f - saturate(metallic)); + float specular_weight = (1.0f - final_transmission); + + // get the base color + uint4 data_base_color = read_node(kg, offset); + float3 base_color = stack_valid(data_base_color.x) ? stack_load_float3(stack, data_base_color.x) : + make_float3(__uint_as_float(data_base_color.y), __uint_as_float(data_base_color.z), __uint_as_float(data_base_color.w)); + + // get the additional clearcoat normal and subsurface scattering radius + uint4 data_cn_ssr = read_node(kg, offset); + float3 clearcoat_normal = stack_valid(data_cn_ssr.x) ? stack_load_float3(stack, data_cn_ssr.x) : sd->N; + float3 subsurface_radius = stack_valid(data_cn_ssr.y) ? stack_load_float3(stack, data_cn_ssr.y) : make_float3(1.0f, 1.0f, 1.0f); + + // get the subsurface color + uint4 data_subsurface_color = read_node(kg, offset); + float3 subsurface_color = stack_valid(data_subsurface_color.x) ? stack_load_float3(stack, data_subsurface_color.x) : + make_float3(__uint_as_float(data_subsurface_color.y), __uint_as_float(data_subsurface_color.z), __uint_as_float(data_subsurface_color.w)); + + float3 weight = sd->svm_closure_weight * mix_weight; + +#ifdef __SUBSURFACE__ + float3 mixed_ss_base_color = subsurface_color * subsurface + base_color * (1.0f - subsurface); + float3 subsurf_weight = weight * mixed_ss_base_color * diffuse_weight; + + /* disable in case of diffuse ancestor, can't see it well then and + * adds considerably noise due to probabilities of continuing path + * getting lower and lower */ + if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR) { + subsurface = 0.0f; + + /* need to set the base color in this case such that the + * rays get the correctly mixed color after transmitting + * the object */ + base_color = mixed_ss_base_color; + } + + /* diffuse */ + if(fabsf(average(mixed_ss_base_color)) > CLOSURE_WEIGHT_CUTOFF) { + if(subsurface <= CLOSURE_WEIGHT_CUTOFF && diffuse_weight > CLOSURE_WEIGHT_CUTOFF) { + float3 diff_weight = weight * base_color * diffuse_weight; + + PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), diff_weight); + + if(bsdf) { + bsdf->N = N; + bsdf->roughness = roughness; + + /* setup bsdf */ + sd->flag |= bsdf_principled_diffuse_setup(bsdf); + } + } + else if(subsurface > CLOSURE_WEIGHT_CUTOFF) { + Bssrdf *bssrdf = bssrdf_alloc(sd, subsurf_weight); + + if(bssrdf) { + bssrdf->radius = subsurface_radius * subsurface; + bssrdf->albedo = (subsurface_method == CLOSURE_BSSRDF_PRINCIPLED_ID)? subsurface_color: mixed_ss_base_color; + bssrdf->texture_blur = 0.0f; + bssrdf->sharpness = 0.0f; + bssrdf->N = N; + bssrdf->roughness = roughness; + + /* setup bsdf */ + sd->flag |= bssrdf_setup(sd, bssrdf, subsurface_method); + } + } + } +#else + /* diffuse */ + if(diffuse_weight > CLOSURE_WEIGHT_CUTOFF) { + float3 diff_weight = weight * base_color * diffuse_weight; + + PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), diff_weight); + + if(bsdf) { + bsdf->N = N; + bsdf->roughness = roughness; + + /* setup bsdf */ + sd->flag |= bsdf_principled_diffuse_setup(bsdf); + } + } +#endif + + /* sheen */ + if(diffuse_weight > CLOSURE_WEIGHT_CUTOFF && sheen > CLOSURE_WEIGHT_CUTOFF) { + float m_cdlum = linear_rgb_to_gray(base_color); + float3 m_ctint = m_cdlum > 0.0f ? base_color / m_cdlum : make_float3(1.0f, 1.0f, 1.0f); // normalize lum. to isolate hue+sat + + /* color of the sheen component */ + float3 sheen_color = make_float3(1.0f, 1.0f, 1.0f) * (1.0f - sheen_tint) + m_ctint * sheen_tint; + + float3 sheen_weight = weight * sheen * sheen_color * diffuse_weight; + + PrincipledSheenBsdf *bsdf = (PrincipledSheenBsdf*)bsdf_alloc(sd, sizeof(PrincipledSheenBsdf), sheen_weight); + + if(bsdf) { + bsdf->N = N; + + /* setup bsdf */ + sd->flag |= bsdf_principled_sheen_setup(bsdf); + } + } + + /* specular reflection */ +#ifdef __CAUSTICS_TRICKS__ + if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) { +#endif + if(specular_weight > CLOSURE_WEIGHT_CUTOFF && (specular > CLOSURE_WEIGHT_CUTOFF || metallic > CLOSURE_WEIGHT_CUTOFF)) { + float3 spec_weight = weight * specular_weight; + + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), spec_weight); + if(!bsdf){ + break; + } + + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + if(!extra) { + break; + } + + bsdf->N = N; + bsdf->ior = (2.0f / (1.0f - safe_sqrtf(0.08f * specular))) - 1.0f; + bsdf->T = T; + bsdf->extra = extra; + + float aspect = safe_sqrtf(1.0f - anisotropic * 0.9f); + float r2 = roughness * roughness; + + bsdf->alpha_x = r2 / aspect; + bsdf->alpha_y = r2 * aspect; + + float m_cdlum = 0.3f * base_color.x + 0.6f * base_color.y + 0.1f * base_color.z; // luminance approx. + float3 m_ctint = m_cdlum > 0.0f ? base_color / m_cdlum : make_float3(0.0f, 0.0f, 0.0f); // normalize lum. to isolate hue+sat + float3 tmp_col = make_float3(1.0f, 1.0f, 1.0f) * (1.0f - specular_tint) + m_ctint * specular_tint; + + bsdf->extra->cspec0 = (specular * 0.08f * tmp_col) * (1.0f - metallic) + base_color * metallic; + bsdf->extra->color = base_color; + bsdf->extra->clearcoat = 0.0f; + + /* setup bsdf */ + if(distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID || roughness <= 0.075f) /* use single-scatter GGX */ + sd->flag |= bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf, sd); + else /* use multi-scatter GGX */ + sd->flag |= bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf, sd); + } +#ifdef __CAUSTICS_TRICKS__ + } +#endif + + /* BSDF */ +#ifdef __CAUSTICS_TRICKS__ + if(kernel_data.integrator.caustics_reflective || kernel_data.integrator.caustics_refractive || (path_flag & PATH_RAY_DIFFUSE) == 0) { +#endif + if(final_transmission > CLOSURE_WEIGHT_CUTOFF) { + float3 glass_weight = weight * final_transmission; + float3 cspec0 = base_color * specular_tint + make_float3(1.0f, 1.0f, 1.0f) * (1.0f - specular_tint); + + if(roughness <= 5e-2f || distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID) { /* use single-scatter GGX */ + float refl_roughness = roughness; + + /* reflection */ +#ifdef __CAUSTICS_TRICKS__ + if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) +#endif + { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), glass_weight*fresnel); + if(!bsdf) { + break; + } + + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + if(!extra) { + break; + } + + bsdf->N = N; + bsdf->T = make_float3(0.0f, 0.0f, 0.0f); + bsdf->extra = extra; + + bsdf->alpha_x = refl_roughness * refl_roughness; + bsdf->alpha_y = refl_roughness * refl_roughness; + bsdf->ior = ior; + + bsdf->extra->color = base_color; + bsdf->extra->cspec0 = cspec0; + bsdf->extra->clearcoat = 0.0f; + + /* setup bsdf */ + sd->flag |= bsdf_microfacet_ggx_fresnel_setup(bsdf, sd); + } + + /* refraction */ +#ifdef __CAUSTICS_TRICKS__ + if(kernel_data.integrator.caustics_refractive || (path_flag & PATH_RAY_DIFFUSE) == 0) +#endif + { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), base_color*glass_weight*(1.0f - fresnel)); + if(!bsdf) { + break; + } + + bsdf->N = N; + bsdf->T = make_float3(0.0f, 0.0f, 0.0f); + bsdf->extra = NULL; + + if(distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID) + transmission_roughness = 1.0f - (1.0f - refl_roughness) * (1.0f - transmission_roughness); + else + transmission_roughness = refl_roughness; + + bsdf->alpha_x = transmission_roughness * transmission_roughness; + bsdf->alpha_y = transmission_roughness * transmission_roughness; + bsdf->ior = ior; + + /* setup bsdf */ + sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf); + } + } + else { /* use multi-scatter GGX */ + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), glass_weight); + if(!bsdf) { + break; + } + + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + if(!extra) { + break; + } + + bsdf->N = N; + bsdf->extra = extra; + bsdf->T = make_float3(0.0f, 0.0f, 0.0f); + + bsdf->alpha_x = roughness * roughness; + bsdf->alpha_y = roughness * roughness; + bsdf->ior = ior; + + bsdf->extra->color = base_color; + bsdf->extra->cspec0 = cspec0; + bsdf->extra->clearcoat = 0.0f; + + /* setup bsdf */ + sd->flag |= bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf, sd); + } + } +#ifdef __CAUSTICS_TRICKS__ + } +#endif + + /* clearcoat */ +#ifdef __CAUSTICS_TRICKS__ + if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) { +#endif + if(clearcoat > CLOSURE_WEIGHT_CUTOFF) { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight); + if(!bsdf) { + break; + } + + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + if(!extra) { + break; + } + + bsdf->N = clearcoat_normal; + bsdf->T = make_float3(0.0f, 0.0f, 0.0f); + bsdf->ior = 1.5f; + bsdf->extra = extra; + + bsdf->alpha_x = clearcoat_roughness * clearcoat_roughness; + bsdf->alpha_y = clearcoat_roughness * clearcoat_roughness; + + bsdf->extra->color = make_float3(0.0f, 0.0f, 0.0f); + bsdf->extra->cspec0 = make_float3(0.04f, 0.04f, 0.04f); + bsdf->extra->clearcoat = clearcoat; + + /* setup bsdf */ + sd->flag |= bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd); + } +#ifdef __CAUSTICS_TRICKS__ + } +#endif + + break; + } +#endif /* __PRINCIPLED__ */ case CLOSURE_BSDF_DIFFUSE_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; OrenNayarBsdf *bsdf = (OrenNayarBsdf*)bsdf_alloc(sd, sizeof(OrenNayarBsdf), weight); if(bsdf) { @@ -86,32 +428,28 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * float roughness = param1; if(roughness == 0.0f) { - ccl_fetch(sd, flag) |= bsdf_diffuse_setup((DiffuseBsdf*)bsdf); + sd->flag |= bsdf_diffuse_setup((DiffuseBsdf*)bsdf); } else { bsdf->roughness = roughness; - ccl_fetch(sd, flag) |= bsdf_oren_nayar_setup(bsdf); + sd->flag |= bsdf_oren_nayar_setup(bsdf); } } break; } case CLOSURE_BSDF_TRANSLUCENT_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight); if(bsdf) { bsdf->N = N; - ccl_fetch(sd, flag) |= bsdf_translucent_setup(bsdf); + sd->flag |= bsdf_translucent_setup(bsdf); } break; } case CLOSURE_BSDF_TRANSPARENT_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; - ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight); - - if(bsdf) { - ccl_fetch(sd, flag) |= bsdf_transparent_setup(bsdf); - } + float3 weight = sd->svm_closure_weight * mix_weight; + bsdf_transparent_setup(sd, weight, path_flag); break; } case CLOSURE_BSDF_REFLECTION_ID: @@ -123,33 +461,41 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE)) break; #endif - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight); - if(bsdf) { - bsdf->N = N; - bsdf->alpha_x = param1; - bsdf->alpha_y = param1; - bsdf->ior = 0.0f; - bsdf->extra = NULL; + if(!bsdf) { + break; + } - /* setup bsdf */ - if(type == CLOSURE_BSDF_REFLECTION_ID) - ccl_fetch(sd, flag) |= bsdf_reflection_setup(bsdf); - else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ID) - ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(bsdf); - else if(type == CLOSURE_BSDF_MICROFACET_GGX_ID) - ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(bsdf); - else if(type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID) { - kernel_assert(stack_valid(data_node.z)); - bsdf->extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); - if(bsdf->extra) { - bsdf->extra->color = stack_load_float3(stack, data_node.z); - ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_setup(bsdf); - } + float roughness = sqr(param1); + + bsdf->N = N; + bsdf->T = make_float3(0.0f, 0.0f, 0.0f); + bsdf->alpha_x = roughness; + bsdf->alpha_y = roughness; + bsdf->ior = 0.0f; + bsdf->extra = NULL; + + /* setup bsdf */ + if(type == CLOSURE_BSDF_REFLECTION_ID) + sd->flag |= bsdf_reflection_setup(bsdf); + else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ID) + sd->flag |= bsdf_microfacet_beckmann_setup(bsdf); + else if(type == CLOSURE_BSDF_MICROFACET_GGX_ID) + sd->flag |= bsdf_microfacet_ggx_setup(bsdf); + else if(type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID) { + kernel_assert(stack_valid(data_node.z)); + bsdf->extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + if(bsdf->extra) { + bsdf->extra->color = stack_load_float3(stack, data_node.z); + bsdf->extra->cspec0 = make_float3(0.0f, 0.0f, 0.0f); + bsdf->extra->clearcoat = 0.0f; + sd->flag |= bsdf_microfacet_multi_ggx_setup(bsdf); } - else - ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_setup(bsdf); + } + else { + sd->flag |= bsdf_ashikhmin_shirley_setup(bsdf); } break; @@ -161,15 +507,16 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(!kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE)) break; #endif - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight); if(bsdf) { bsdf->N = N; + bsdf->T = make_float3(0.0f, 0.0f, 0.0f); bsdf->extra = NULL; float eta = fmaxf(param2, 1e-5f); - eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta; + eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta; /* setup bsdf */ if(type == CLOSURE_BSDF_REFRACTION_ID) { @@ -177,17 +524,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->alpha_y = 0.0f; bsdf->ior = eta; - ccl_fetch(sd, flag) |= bsdf_refraction_setup(bsdf); + sd->flag |= bsdf_refraction_setup(bsdf); } else { - bsdf->alpha_x = param1; - bsdf->alpha_y = param1; + float roughness = sqr(param1); + bsdf->alpha_x = roughness; + bsdf->alpha_y = roughness; bsdf->ior = eta; if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID) - ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(bsdf); + sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf); else - ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf); } } @@ -203,16 +551,16 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * break; } #endif - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; /* index of refraction */ float eta = fmaxf(param2, 1e-5f); - eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta; + eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta; /* fresnel */ - float cosNO = dot(N, ccl_fetch(sd, I)); + float cosNO = dot(N, sd->I); float fresnel = fresnel_dielectric_cos(cosNO, eta); - float roughness = param1; + float roughness = sqr(param1); /* reflection */ #ifdef __CAUSTICS_TRICKS__ @@ -223,6 +571,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(bsdf) { bsdf->N = N; + bsdf->T = make_float3(0.0f, 0.0f, 0.0f); bsdf->extra = NULL; svm_node_glass_setup(sd, bsdf, type, eta, roughness, false); } @@ -237,6 +586,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(bsdf) { bsdf->N = N; + bsdf->T = make_float3(0.0f, 0.0f, 0.0f); bsdf->extra = NULL; svm_node_glass_setup(sd, bsdf, type, eta, roughness, true); } @@ -249,27 +599,34 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(!kernel_data.integrator.caustics_reflective && !kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE)) break; #endif - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight); - MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + if(!bsdf) { + break; + } - if(bsdf && extra) { - bsdf->N = N; - bsdf->extra = extra; - bsdf->T = make_float3(0.0f, 0.0f, 0.0f); + MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); + if(!extra) { + break; + } - bsdf->alpha_x = param1; - bsdf->alpha_y = param1; - float eta = fmaxf(param2, 1e-5f); - bsdf->ior = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta; + bsdf->N = N; + bsdf->extra = extra; + bsdf->T = make_float3(0.0f, 0.0f, 0.0f); - kernel_assert(stack_valid(data_node.z)); - bsdf->extra->color = stack_load_float3(stack, data_node.z); + float roughness = sqr(param1); + bsdf->alpha_x = roughness; + bsdf->alpha_y = roughness; + float eta = fmaxf(param2, 1e-5f); + bsdf->ior = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta; - /* setup bsdf */ - ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_glass_setup(bsdf); - } + kernel_assert(stack_valid(data_node.z)); + bsdf->extra->color = stack_load_float3(stack, data_node.z); + bsdf->extra->cspec0 = make_float3(0.0f, 0.0f, 0.0f); + bsdf->extra->clearcoat = 0.0f; + /* setup bsdf */ + sd->flag |= bsdf_microfacet_multi_ggx_glass_setup(bsdf); break; } case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: @@ -280,7 +637,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE)) break; #endif - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight); if(bsdf) { @@ -295,7 +652,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->T = rotate_around_axis(bsdf->T, bsdf->N, rotation * M_2PI_F); /* compute roughness */ - float roughness = param1; + float roughness = sqr(param1); float anisotropy = clamp(param2, -0.99f, 0.99f); if(anisotropy < 0.0f) { @@ -310,33 +667,35 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->ior = 0.0f; if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID) { - ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_aniso_setup(bsdf); + sd->flag |= bsdf_microfacet_beckmann_aniso_setup(bsdf); } else if(type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID) { - ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_aniso_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_aniso_setup(bsdf); } else if(type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID) { kernel_assert(stack_valid(data_node.w)); bsdf->extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); if(bsdf->extra) { bsdf->extra->color = stack_load_float3(stack, data_node.w); - ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_aniso_setup(bsdf); + bsdf->extra->cspec0 = make_float3(0.0f, 0.0f, 0.0f); + bsdf->extra->clearcoat = 0.0f; + sd->flag |= bsdf_microfacet_multi_ggx_aniso_setup(bsdf); } } else - ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_aniso_setup(bsdf); + sd->flag |= bsdf_ashikhmin_shirley_aniso_setup(bsdf); } break; } case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; VelvetBsdf *bsdf = (VelvetBsdf*)bsdf_alloc(sd, sizeof(VelvetBsdf), weight); if(bsdf) { bsdf->N = N; bsdf->sigma = saturate(param1); - ccl_fetch(sd, flag) |= bsdf_ashikhmin_velvet_setup(bsdf); + sd->flag |= bsdf_ashikhmin_velvet_setup(bsdf); } break; } @@ -344,9 +703,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * #ifdef __CAUSTICS_TRICKS__ if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE)) break; + ATTR_FALLTHROUGH; #endif case CLOSURE_BSDF_DIFFUSE_TOON_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; ToonBsdf *bsdf = (ToonBsdf*)bsdf_alloc(sd, sizeof(ToonBsdf), weight); if(bsdf) { @@ -355,34 +715,30 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->smooth = param2; if(type == CLOSURE_BSDF_DIFFUSE_TOON_ID) - ccl_fetch(sd, flag) |= bsdf_diffuse_toon_setup(bsdf); + sd->flag |= bsdf_diffuse_toon_setup(bsdf); else - ccl_fetch(sd, flag) |= bsdf_glossy_toon_setup(bsdf); + sd->flag |= bsdf_glossy_toon_setup(bsdf); } break; } #ifdef __HAIR__ case CLOSURE_BSDF_HAIR_REFLECTION_ID: case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; - if(ccl_fetch(sd, flag) & SD_BACKFACING && ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { - ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight); - - if(bsdf) { - /* todo: giving a fixed weight here will cause issues when - * mixing multiple BSDFS. energy will not be conserved and - * the throughput can blow up after multiple bounces. we - * better figure out a way to skip backfaces from rays - * spawned by transmission from the front */ - bsdf->weight = make_float3(1.0f, 1.0f, 1.0f); - ccl_fetch(sd, flag) |= bsdf_transparent_setup(bsdf); - } + if(sd->flag & SD_BACKFACING && sd->type & PRIMITIVE_ALL_CURVE) { + /* todo: giving a fixed weight here will cause issues when + * mixing multiple BSDFS. energy will not be conserved and + * the throughput can blow up after multiple bounces. we + * better figure out a way to skip backfaces from rays + * spawned by transmission from the front */ + bsdf_transparent_setup(sd, make_float3(1.0f, 1.0f, 1.0f), path_flag); } else { HairBsdf *bsdf = (HairBsdf*)bsdf_alloc(sd, sizeof(HairBsdf), weight); if(bsdf) { + bsdf->N = N; bsdf->roughness1 = param1; bsdf->roughness2 = param2; bsdf->offset = -stack_load_float(stack, data_node.z); @@ -390,18 +746,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(stack_valid(data_node.y)) { bsdf->T = normalize(stack_load_float3(stack, data_node.y)); } - else if(!(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)) { - bsdf->T = normalize(ccl_fetch(sd, dPdv)); + else if(!(sd->type & PRIMITIVE_ALL_CURVE)) { + bsdf->T = normalize(sd->dPdv); bsdf->offset = 0.0f; } else - bsdf->T = normalize(ccl_fetch(sd, dPdu)); + bsdf->T = normalize(sd->dPdu); if(type == CLOSURE_BSDF_HAIR_REFLECTION_ID) { - ccl_fetch(sd, flag) |= bsdf_hair_reflection_setup(bsdf); + sd->flag |= bsdf_hair_reflection_setup(bsdf); } else { - ccl_fetch(sd, flag) |= bsdf_hair_transmission_setup(bsdf); + sd->flag |= bsdf_hair_transmission_setup(bsdf); } } } @@ -413,58 +769,25 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * #ifdef __SUBSURFACE__ case CLOSURE_BSSRDF_CUBIC_ID: case CLOSURE_BSSRDF_GAUSSIAN_ID: - case CLOSURE_BSSRDF_BURLEY_ID: { - float3 albedo = ccl_fetch(sd, svm_closure_weight); - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; - float sample_weight = fabsf(average(weight)); - - /* disable in case of diffuse ancestor, can't see it well then and - * adds considerably noise due to probabilities of continuing path - * getting lower and lower */ - if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR) - param1 = 0.0f; - - if(sample_weight > CLOSURE_WEIGHT_CUTOFF) { - /* radius * scale */ - float3 radius = stack_load_float3(stack, data_node.z)*param1; - /* sharpness */ - float sharpness = stack_load_float(stack, data_node.w); - /* texture color blur */ - float texture_blur = param2; - - /* create one closure per color channel */ - Bssrdf *bssrdf = bssrdf_alloc(sd, make_float3(weight.x, 0.0f, 0.0f)); - if(bssrdf) { - bssrdf->sample_weight = sample_weight; - bssrdf->radius = radius.x; - bssrdf->texture_blur = texture_blur; - bssrdf->albedo = albedo.x; - bssrdf->sharpness = sharpness; - bssrdf->N = N; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); - } - - bssrdf = bssrdf_alloc(sd, make_float3(0.0f, weight.y, 0.0f)); - if(bssrdf) { - bssrdf->sample_weight = sample_weight; - bssrdf->radius = radius.y; - bssrdf->texture_blur = texture_blur; - bssrdf->albedo = albedo.y; - bssrdf->sharpness = sharpness; - bssrdf->N = N; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); - } - - bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, weight.z)); - if(bssrdf) { - bssrdf->sample_weight = sample_weight; - bssrdf->radius = radius.z; - bssrdf->texture_blur = texture_blur; - bssrdf->albedo = albedo.z; - bssrdf->sharpness = sharpness; - bssrdf->N = N; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); - } + case CLOSURE_BSSRDF_BURLEY_ID: + case CLOSURE_BSSRDF_RANDOM_WALK_ID: { + float3 weight = sd->svm_closure_weight * mix_weight; + Bssrdf *bssrdf = bssrdf_alloc(sd, weight); + + if(bssrdf) { + /* disable in case of diffuse ancestor, can't see it well then and + * adds considerably noise due to probabilities of continuing path + * getting lower and lower */ + if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR) + param1 = 0.0f; + + bssrdf->radius = stack_load_float3(stack, data_node.z)*param1; + bssrdf->albedo = sd->svm_closure_weight; + bssrdf->texture_blur = param2; + bssrdf->sharpness = stack_load_float(stack, data_node.w); + bssrdf->N = N; + bssrdf->roughness = 0.0f; + sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type); } break; @@ -475,44 +798,148 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * } } -ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int path_flag) +ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, ShaderType shader_type) { #ifdef __VOLUME__ - uint type, param1_offset, param2_offset; + /* Only sum extinction for volumes, variable is shared with surface transparency. */ + if(shader_type != SHADER_TYPE_VOLUME) { + return; + } + + uint type, density_offset, anisotropy_offset; uint mix_weight_offset; - decode_node_uchar4(node.y, &type, ¶m1_offset, ¶m2_offset, &mix_weight_offset); + decode_node_uchar4(node.y, &type, &density_offset, &anisotropy_offset, &mix_weight_offset); float mix_weight = (stack_valid(mix_weight_offset)? stack_load_float(stack, mix_weight_offset): 1.0f); - if(mix_weight == 0.0f) + if(mix_weight == 0.0f) { return; + } - float param1 = (stack_valid(param1_offset))? stack_load_float(stack, param1_offset): __uint_as_float(node.z); - float param2 = (stack_valid(param2_offset))? stack_load_float(stack, param2_offset): __uint_as_float(node.w); - float density = fmaxf(param1, 0.0f); + float density = (stack_valid(density_offset))? stack_load_float(stack, density_offset): __uint_as_float(node.z); + density = mix_weight * fmaxf(density, 0.0f); - switch(type) { - case CLOSURE_VOLUME_ABSORPTION_ID: { - float3 weight = (make_float3(1.0f, 1.0f, 1.0f) - ccl_fetch(sd, svm_closure_weight)) * mix_weight * density; - ShaderClosure *sc = closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_NONE_ID, weight); + /* Compute scattering coefficient. */ + float3 weight = sd->svm_closure_weight; - if(sc) { - ccl_fetch(sd, flag) |= volume_absorption_setup(sc); - } - break; + if(type == CLOSURE_VOLUME_ABSORPTION_ID) { + weight = make_float3(1.0f, 1.0f, 1.0f) - weight; + } + + weight *= density; + + /* Add closure for volume scattering. */ + if(type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) { + HenyeyGreensteinVolume *volume = (HenyeyGreensteinVolume*)bsdf_alloc(sd, sizeof(HenyeyGreensteinVolume), weight); + + if(volume) { + float anisotropy = (stack_valid(anisotropy_offset))? stack_load_float(stack, anisotropy_offset): __uint_as_float(node.w); + volume->g = anisotropy; /* g */ + sd->flag |= volume_henyey_greenstein_setup(volume); } - case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight * density; - HenyeyGreensteinVolume *volume = (HenyeyGreensteinVolume*)bsdf_alloc(sd, sizeof(HenyeyGreensteinVolume), weight); + } - if(volume) { - volume->g = param2; /* g */ - ccl_fetch(sd, flag) |= volume_henyey_greenstein_setup(volume); - } - break; + /* Sum total extinction weight. */ + volume_extinction_setup(sd, weight); +#endif +} + +ccl_device void svm_node_principled_volume(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, ShaderType shader_type, int path_flag, int *offset) +{ +#ifdef __VOLUME__ + uint4 value_node = read_node(kg, offset); + uint4 attr_node = read_node(kg, offset); + + /* Only sum extinction for volumes, variable is shared with surface transparency. */ + if(shader_type != SHADER_TYPE_VOLUME) { + return; + } + + uint density_offset, anisotropy_offset, absorption_color_offset, mix_weight_offset; + decode_node_uchar4(node.y, &density_offset, &anisotropy_offset, &absorption_color_offset, &mix_weight_offset); + float mix_weight = (stack_valid(mix_weight_offset)? stack_load_float(stack, mix_weight_offset): 1.0f); + + if(mix_weight == 0.0f) { + return; + } + + /* Compute density. */ + float primitive_density = 1.0f; + float density = (stack_valid(density_offset))? stack_load_float(stack, density_offset): __uint_as_float(value_node.x); + density = mix_weight * fmaxf(density, 0.0f); + + if(density > CLOSURE_WEIGHT_CUTOFF) { + /* Density and color attribute lookup if available. */ + const AttributeDescriptor attr_density = find_attribute(kg, sd, attr_node.x); + if(attr_density.offset != ATTR_STD_NOT_FOUND) { + primitive_density = primitive_attribute_float(kg, sd, attr_density, NULL, NULL); + density = fmaxf(density * primitive_density, 0.0f); + } + } + + if(density > CLOSURE_WEIGHT_CUTOFF) { + /* Compute scattering color. */ + float3 color = sd->svm_closure_weight; + + const AttributeDescriptor attr_color = find_attribute(kg, sd, attr_node.y); + if(attr_color.offset != ATTR_STD_NOT_FOUND) { + color *= primitive_attribute_float3(kg, sd, attr_color, NULL, NULL); + } + + /* Add closure for volume scattering. */ + HenyeyGreensteinVolume *volume = (HenyeyGreensteinVolume*)bsdf_alloc(sd, sizeof(HenyeyGreensteinVolume), color * density); + if(volume) { + float anisotropy = (stack_valid(anisotropy_offset))? stack_load_float(stack, anisotropy_offset): __uint_as_float(value_node.y); + volume->g = anisotropy; + sd->flag |= volume_henyey_greenstein_setup(volume); + } + + /* Add extinction weight. */ + float3 zero = make_float3(0.0f, 0.0f, 0.0f); + float3 one = make_float3(1.0f, 1.0f, 1.0f); + float3 absorption_color = max(sqrt(stack_load_float3(stack, absorption_color_offset)), zero); + float3 absorption = max(one - color, zero) * max(one - absorption_color, zero); + volume_extinction_setup(sd, (color + absorption) * density); + } + + /* Compute emission. */ + if(path_flag & PATH_RAY_SHADOW) { + /* Don't need emission for shadows. */ + return; + } + + uint emission_offset, emission_color_offset, blackbody_offset, temperature_offset; + decode_node_uchar4(node.z, &emission_offset, &emission_color_offset, &blackbody_offset, &temperature_offset); + float emission = (stack_valid(emission_offset))? stack_load_float(stack, emission_offset): __uint_as_float(value_node.z); + float blackbody = (stack_valid(blackbody_offset))? stack_load_float(stack, blackbody_offset): __uint_as_float(value_node.w); + + if(emission > CLOSURE_WEIGHT_CUTOFF) { + float3 emission_color = stack_load_float3(stack, emission_color_offset); + emission_setup(sd, emission * emission_color); + } + + if(blackbody > CLOSURE_WEIGHT_CUTOFF) { + float T = stack_load_float(stack, temperature_offset); + + /* Add flame temperature from attribute if available. */ + const AttributeDescriptor attr_temperature = find_attribute(kg, sd, attr_node.z); + if(attr_temperature.offset != ATTR_STD_NOT_FOUND) { + float temperature = primitive_attribute_float(kg, sd, attr_temperature, NULL, NULL); + T *= fmaxf(temperature, 0.0f); + } + + T = fmaxf(T, 0.0f); + + /* Stefan-Boltzmann law. */ + float T4 = sqr(sqr(T)); + float sigma = 5.670373e-8f * 1e-6f / M_PI_F; + float intensity = sigma * mix(1.0f, T4, blackbody); + + if(intensity > CLOSURE_WEIGHT_CUTOFF) { + float3 blackbody_tint = stack_load_float3(stack, node.w); + float3 bb = blackbody_tint * intensity * svm_math_blackbody_color(T); + emission_setup(sd, bb); } - default: - break; } #endif } @@ -520,6 +947,7 @@ ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 node) { uint mix_weight_offset = node.y; + float3 weight = sd->svm_closure_weight; if(stack_valid(mix_weight_offset)) { float mix_weight = stack_load_float(stack, mix_weight_offset); @@ -527,17 +955,16 @@ ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 no if(mix_weight == 0.0f) return; - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight); + weight *= mix_weight; } - else - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, ccl_fetch(sd, svm_closure_weight)); - ccl_fetch(sd, flag) |= SD_EMISSION; + emission_setup(sd, weight); } ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node) { uint mix_weight_offset = node.y; + float3 weight = sd->svm_closure_weight; if(stack_valid(mix_weight_offset)) { float mix_weight = stack_load_float(stack, mix_weight_offset); @@ -545,10 +972,10 @@ ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 if(mix_weight == 0.0f) return; - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight); + weight *= mix_weight; } - else - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, ccl_fetch(sd, svm_closure_weight)); + + background_setup(sd, weight); } ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 node) @@ -561,12 +988,12 @@ ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 nod if(mix_weight == 0.0f) return; - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, sd->svm_closure_weight * mix_weight); } else - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, ccl_fetch(sd, svm_closure_weight)); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, sd->svm_closure_weight); - ccl_fetch(sd, flag) |= SD_HOLDOUT; + sd->flag |= SD_HOLDOUT; } ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack, uint4 node) @@ -579,19 +1006,19 @@ ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack, if(mix_weight == 0.0f) return; - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, sd->svm_closure_weight * mix_weight); } else - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, ccl_fetch(sd, svm_closure_weight)); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, sd->svm_closure_weight); - ccl_fetch(sd, flag) |= SD_AO; + sd->flag |= SD_AO; } /* Closure Nodes */ ccl_device_inline void svm_node_closure_store_weight(ShaderData *sd, float3 weight) { - ccl_fetch(sd, svm_closure_weight) = weight; + sd->svm_closure_weight = weight; } ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint b) @@ -641,7 +1068,7 @@ ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node) ccl_device void svm_node_set_normal(KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal) { float3 normal = stack_load_float3(stack, in_direction); - ccl_fetch(sd, N) = normal; + sd->N = normal; stack_store_float3(stack, out_normal, normal); } diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h index 890ab41aaaa..533b7f065e6 100644 --- a/intern/cycles/kernel/svm/svm_displace.h +++ b/intern/cycles/kernel/svm/svm_displace.h @@ -22,13 +22,13 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac { #ifdef __RAY_DIFFERENTIALS__ /* get normal input */ - uint normal_offset, distance_offset, invert, use_object_space; - decode_node_uchar4(node.y, &normal_offset, &distance_offset, &invert, &use_object_space); + uint normal_offset, scale_offset, invert, use_object_space; + decode_node_uchar4(node.y, &normal_offset, &scale_offset, &invert, &use_object_space); - float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N); + float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N; - float3 dPdx = ccl_fetch(sd, dP).dx; - float3 dPdy = ccl_fetch(sd, dP).dy; + float3 dPdx = sd->dP.dx; + float3 dPdy = sd->dP.dy; if(use_object_space) { object_inverse_normal_transform(kg, sd, &normal_in); @@ -55,16 +55,21 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac float absdet = fabsf(det); float strength = stack_load_float(stack, strength_offset); - float distance = stack_load_float(stack, distance_offset); + float scale = stack_load_float(stack, scale_offset); if(invert) - distance *= -1.0f; + scale *= -1.0f; strength = max(strength, 0.0f); /* compute and output perturbed normal */ - float3 normal_out = normalize(absdet*normal_in - distance*signf(det)*surfgrad); - normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in); + float3 normal_out = safe_normalize(absdet*normal_in - scale*signf(det)*surfgrad); + if(is_zero(normal_out)) { + normal_out = normal_in; + } + else { + normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in); + } if(use_object_space) { object_normal_transform(kg, sd, &normal_out); @@ -78,16 +83,80 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac ccl_device void svm_node_set_displacement(KernelGlobals *kg, ShaderData *sd, float *stack, uint fac_offset) { - float d = stack_load_float(stack, fac_offset); + float3 dP = stack_load_float3(stack, fac_offset); + sd->P += dP; +} - float3 dP = ccl_fetch(sd, N); - object_inverse_normal_transform(kg, sd, &dP); +ccl_device void svm_node_displacement(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) +{ + uint height_offset, midlevel_offset, scale_offset, normal_offset; + decode_node_uchar4(node.y, &height_offset, &midlevel_offset, &scale_offset, &normal_offset); + + float height = stack_load_float(stack, height_offset); + float midlevel = stack_load_float(stack, midlevel_offset); + float scale = stack_load_float(stack, scale_offset); + float3 normal = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N; + uint space = node.w; + + float3 dP = normal; + + if(space == NODE_NORMAL_MAP_OBJECT) { + /* Object space. */ + object_inverse_normal_transform(kg, sd, &dP); + dP *= (height - midlevel) * scale; + object_dir_transform(kg, sd, &dP); + } + else { + /* World space. */ + dP *= (height - midlevel) * scale; + } - dP *= d*0.1f; /* todo: get rid of this factor */ + stack_store_float3(stack, node.z, dP); +} - object_dir_transform(kg, sd, &dP); +ccl_device void svm_node_vector_displacement(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) +{ + uint4 data_node = read_node(kg, offset); + uint space = data_node.x; + + uint vector_offset, midlevel_offset,scale_offset, displacement_offset; + decode_node_uchar4(node.y, &vector_offset, &midlevel_offset, &scale_offset, &displacement_offset); + + float3 vector = stack_load_float3(stack, vector_offset); + float midlevel = stack_load_float(stack, midlevel_offset); + float scale = stack_load_float(stack, scale_offset); + float3 dP = (vector - make_float3(midlevel, midlevel, midlevel)) * scale; + + if(space == NODE_NORMAL_MAP_TANGENT) { + /* Tangent space. */ + float3 normal = sd->N; + object_inverse_normal_transform(kg, sd, &normal); + + const AttributeDescriptor attr = find_attribute(kg, sd, node.z); + float3 tangent; + if(attr.offset != ATTR_STD_NOT_FOUND) { + tangent = primitive_attribute_float3(kg, sd, attr, NULL, NULL); + } + else { + tangent = normalize(sd->dPdu); + } + + float3 bitangent = normalize(cross(normal, tangent)); + const AttributeDescriptor attr_sign = find_attribute(kg, sd, node.w); + if(attr_sign.offset != ATTR_STD_NOT_FOUND) { + float sign = primitive_attribute_float(kg, sd, attr_sign, NULL, NULL); + bitangent *= sign; + } + + dP = tangent*dP.x + normal*dP.y + bitangent*dP.z; + } + + if(space != NODE_NORMAL_MAP_WORLD) { + /* Tangent or object space. */ + object_dir_transform(kg, sd, &dP); + } - ccl_fetch(sd, P) += dP; + stack_store_float3(stack, displacement_offset, dP); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_fresnel.h b/intern/cycles/kernel/svm/svm_fresnel.h index 23c97d80cb0..3703ec55015 100644 --- a/intern/cycles/kernel/svm/svm_fresnel.h +++ b/intern/cycles/kernel/svm/svm_fresnel.h @@ -23,12 +23,12 @@ ccl_device void svm_node_fresnel(ShaderData *sd, float *stack, uint ior_offset, uint normal_offset, out_offset; decode_node_uchar4(node, &normal_offset, &out_offset, NULL, NULL); float eta = (stack_valid(ior_offset))? stack_load_float(stack, ior_offset): __uint_as_float(ior_value); - float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N); + float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N; eta = fmaxf(eta, 1e-5f); - eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta; + eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta; - float f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta); + float f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta); stack_store_float(stack, out_offset, f); } @@ -44,18 +44,18 @@ ccl_device void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node) decode_node_uchar4(node.w, &type, &normal_offset, &out_offset, NULL); float blend = (stack_valid(blend_offset))? stack_load_float(stack, blend_offset): __uint_as_float(blend_value); - float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N); + float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): sd->N; float f; if(type == NODE_LAYER_WEIGHT_FRESNEL) { float eta = fmaxf(1.0f - blend, 1e-5f); - eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? eta: 1.0f/eta; + eta = (sd->flag & SD_BACKFACING)? eta: 1.0f/eta; - f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta); + f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta); } else { - f = fabsf(dot(ccl_fetch(sd, I), normal_in)); + f = fabsf(dot(sd->I, normal_in)); if(blend != 0.5f) { blend = clamp(blend, 0.0f, 1.0f-1e-5f); diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h index 7d512f7ff4d..81308d6f12b 100644 --- a/intern/cycles/kernel/svm/svm_geometry.h +++ b/intern/cycles/kernel/svm/svm_geometry.h @@ -27,16 +27,17 @@ ccl_device_inline void svm_node_geometry(KernelGlobals *kg, float3 data; switch(type) { - case NODE_GEOM_P: data = ccl_fetch(sd, P); break; - case NODE_GEOM_N: data = ccl_fetch(sd, N); break; + case NODE_GEOM_P: data = sd->P; break; + case NODE_GEOM_N: data = sd->N; break; #ifdef __DPDU__ case NODE_GEOM_T: data = primitive_tangent(kg, sd); break; #endif - case NODE_GEOM_I: data = ccl_fetch(sd, I); break; - case NODE_GEOM_Ng: data = ccl_fetch(sd, Ng); break; + case NODE_GEOM_I: data = sd->I; break; + case NODE_GEOM_Ng: data = sd->Ng; break; #ifdef __UV__ - case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u), ccl_fetch(sd, v), 0.0f); break; + case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break; #endif + default: data = make_float3(0.0f, 0.0f, 0.0f); } stack_store_float3(stack, out_offset, data); @@ -48,8 +49,8 @@ ccl_device void svm_node_geometry_bump_dx(KernelGlobals *kg, ShaderData *sd, flo float3 data; switch(type) { - case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; break; - case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dx, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dx, 0.0f); break; + case NODE_GEOM_P: data = sd->P + sd->dP.dx; break; + case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dx, sd->v + sd->dv.dx, 0.0f); break; default: svm_node_geometry(kg, sd, stack, type, out_offset); return; } @@ -65,8 +66,8 @@ ccl_device void svm_node_geometry_bump_dy(KernelGlobals *kg, ShaderData *sd, flo float3 data; switch(type) { - case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; break; - case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dy, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dy, 0.0f); break; + case NODE_GEOM_P: data = sd->P + sd->dP.dy; break; + case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dy, sd->v + sd->dv.dy, 0.0f); break; default: svm_node_geometry(kg, sd, stack, type, out_offset); return; } @@ -87,9 +88,17 @@ ccl_device void svm_node_object_info(KernelGlobals *kg, ShaderData *sd, float *s stack_store_float3(stack, out_offset, object_location(kg, sd)); return; } - case NODE_INFO_OB_INDEX: data = object_pass_id(kg, ccl_fetch(sd, object)); break; + case NODE_INFO_OB_INDEX: data = object_pass_id(kg, sd->object); break; case NODE_INFO_MAT_INDEX: data = shader_pass_id(kg, sd); break; - case NODE_INFO_OB_RANDOM: data = object_random_number(kg, ccl_fetch(sd, object)); break; + case NODE_INFO_OB_RANDOM: { + if(sd->lamp != LAMP_NONE) { + data = lamp_random_number(kg, sd->lamp); + } + else { + data = object_random_number(kg, sd->object); + } + break; + } default: data = 0.0f; break; } @@ -106,44 +115,50 @@ ccl_device void svm_node_particle_info(KernelGlobals *kg, { switch(type) { case NODE_INFO_PAR_INDEX: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float(stack, out_offset, particle_index(kg, particle_id)); break; } + case NODE_INFO_PAR_RANDOM: { + int particle_id = object_particle_id(kg, sd->object); + float random = hash_int_01(particle_index(kg, particle_id)); + stack_store_float(stack, out_offset, random); + break; + } case NODE_INFO_PAR_AGE: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float(stack, out_offset, particle_age(kg, particle_id)); break; } case NODE_INFO_PAR_LIFETIME: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float(stack, out_offset, particle_lifetime(kg, particle_id)); break; } case NODE_INFO_PAR_LOCATION: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float3(stack, out_offset, particle_location(kg, particle_id)); break; } #if 0 /* XXX float4 currently not supported in SVM stack */ case NODE_INFO_PAR_ROTATION: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float4(stack, out_offset, particle_rotation(kg, particle_id)); break; } #endif case NODE_INFO_PAR_SIZE: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float(stack, out_offset, particle_size(kg, particle_id)); break; } case NODE_INFO_PAR_VELOCITY: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float3(stack, out_offset, particle_velocity(kg, particle_id)); break; } case NODE_INFO_PAR_ANGULAR_VELOCITY: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float3(stack, out_offset, particle_angular_velocity(kg, particle_id)); break; } @@ -165,19 +180,21 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg, switch(type) { case NODE_INFO_CURVE_IS_STRAND: { - data = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) != 0; + data = (sd->type & PRIMITIVE_ALL_CURVE) != 0; stack_store_float(stack, out_offset, data); break; } case NODE_INFO_CURVE_INTERCEPT: break; /* handled as attribute */ + case NODE_INFO_CURVE_RANDOM: + break; /* handled as attribute */ case NODE_INFO_CURVE_THICKNESS: { data = curve_thickness(kg, sd); stack_store_float(stack, out_offset, data); break; } /*case NODE_INFO_CURVE_FADE: { - data = ccl_fetch(sd, curve_transparency); + data = sd->curve_transparency; stack_store_float(stack, out_offset, data); break; }*/ diff --git a/intern/cycles/kernel/svm/svm_gradient.h b/intern/cycles/kernel/svm/svm_gradient.h index 53d7b4f812c..74e36e70427 100644 --- a/intern/cycles/kernel/svm/svm_gradient.h +++ b/intern/cycles/kernel/svm/svm_gradient.h @@ -46,7 +46,10 @@ ccl_device float svm_gradient(float3 p, NodeGradientType type) return atan2f(y, x) / M_2PI_F + 0.5f; } else { - float r = fmaxf(1.0f - sqrtf(x*x + y*y + z*z), 0.0f); + /* Bias a little bit for the case where p is a unit length vector, + * to get exactly zero instead of a small random value depending + * on float precision. */ + float r = fmaxf(0.999999f - sqrtf(x*x + y*y + z*z), 0.0f); if(type == NODE_BLEND_QUADRATIC_SPHERE) return r*r; diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h index 0d6efb47223..4226e7adfe0 100644 --- a/intern/cycles/kernel/svm/svm_image.h +++ b/intern/cycles/kernel/svm/svm_image.h @@ -16,185 +16,25 @@ CCL_NAMESPACE_BEGIN -/* Float4 textures on various devices. */ -#if defined(__KERNEL_CPU__) -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CPU -#elif defined(__KERNEL_CUDA__) -# if __CUDA_ARCH__ < 300 -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CUDA -# else -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CUDA_KEPLER -# endif -#else -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_OPENCL -#endif - ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb, uint use_alpha) { -#ifdef __KERNEL_CPU__ -# ifdef __KERNEL_SSE2__ - ssef r_ssef; - float4 &r = (float4 &)r_ssef; - r = kernel_tex_image_interp(id, x, y); -# else - float4 r = kernel_tex_image_interp(id, x, y); -# endif -#elif defined(__KERNEL_OPENCL__) float4 r = kernel_tex_image_interp(kg, id, x, y); -#else - float4 r; - -# if __CUDA_ARCH__ < 300 - /* not particularly proud of this massive switch, what are the - * alternatives? - * - use a single big 1D texture, and do our own lookup/filtering - * - group by size and use a 3d texture, performance impact - * - group into larger texture with some padding for correct lerp - * - * also note that cuda has a textures limit (128 for Fermi, 256 for Kepler), - * and we cannot use all since we still need some for other storage */ - - switch(id) { - case 0: r = kernel_tex_image_interp(__tex_image_float4_000, x, y); break; - case 1: r = kernel_tex_image_interp(__tex_image_float4_001, x, y); break; - case 2: r = kernel_tex_image_interp(__tex_image_float4_002, x, y); break; - case 3: r = kernel_tex_image_interp(__tex_image_float4_003, x, y); break; - case 4: r = kernel_tex_image_interp(__tex_image_float4_004, x, y); break; - case 5: r = kernel_tex_image_interp(__tex_image_byte4_005, x, y); break; - case 6: r = kernel_tex_image_interp(__tex_image_byte4_006, x, y); break; - case 7: r = kernel_tex_image_interp(__tex_image_byte4_007, x, y); break; - case 8: r = kernel_tex_image_interp(__tex_image_byte4_008, x, y); break; - case 9: r = kernel_tex_image_interp(__tex_image_byte4_009, x, y); break; - case 10: r = kernel_tex_image_interp(__tex_image_byte4_010, x, y); break; - case 11: r = kernel_tex_image_interp(__tex_image_byte4_011, x, y); break; - case 12: r = kernel_tex_image_interp(__tex_image_byte4_012, x, y); break; - case 13: r = kernel_tex_image_interp(__tex_image_byte4_013, x, y); break; - case 14: r = kernel_tex_image_interp(__tex_image_byte4_014, x, y); break; - case 15: r = kernel_tex_image_interp(__tex_image_byte4_015, x, y); break; - case 16: r = kernel_tex_image_interp(__tex_image_byte4_016, x, y); break; - case 17: r = kernel_tex_image_interp(__tex_image_byte4_017, x, y); break; - case 18: r = kernel_tex_image_interp(__tex_image_byte4_018, x, y); break; - case 19: r = kernel_tex_image_interp(__tex_image_byte4_019, x, y); break; - case 20: r = kernel_tex_image_interp(__tex_image_byte4_020, x, y); break; - case 21: r = kernel_tex_image_interp(__tex_image_byte4_021, x, y); break; - case 22: r = kernel_tex_image_interp(__tex_image_byte4_022, x, y); break; - case 23: r = kernel_tex_image_interp(__tex_image_byte4_023, x, y); break; - case 24: r = kernel_tex_image_interp(__tex_image_byte4_024, x, y); break; - case 25: r = kernel_tex_image_interp(__tex_image_byte4_025, x, y); break; - case 26: r = kernel_tex_image_interp(__tex_image_byte4_026, x, y); break; - case 27: r = kernel_tex_image_interp(__tex_image_byte4_027, x, y); break; - case 28: r = kernel_tex_image_interp(__tex_image_byte4_028, x, y); break; - case 29: r = kernel_tex_image_interp(__tex_image_byte4_029, x, y); break; - case 30: r = kernel_tex_image_interp(__tex_image_byte4_030, x, y); break; - case 31: r = kernel_tex_image_interp(__tex_image_byte4_031, x, y); break; - case 32: r = kernel_tex_image_interp(__tex_image_byte4_032, x, y); break; - case 33: r = kernel_tex_image_interp(__tex_image_byte4_033, x, y); break; - case 34: r = kernel_tex_image_interp(__tex_image_byte4_034, x, y); break; - case 35: r = kernel_tex_image_interp(__tex_image_byte4_035, x, y); break; - case 36: r = kernel_tex_image_interp(__tex_image_byte4_036, x, y); break; - case 37: r = kernel_tex_image_interp(__tex_image_byte4_037, x, y); break; - case 38: r = kernel_tex_image_interp(__tex_image_byte4_038, x, y); break; - case 39: r = kernel_tex_image_interp(__tex_image_byte4_039, x, y); break; - case 40: r = kernel_tex_image_interp(__tex_image_byte4_040, x, y); break; - case 41: r = kernel_tex_image_interp(__tex_image_byte4_041, x, y); break; - case 42: r = kernel_tex_image_interp(__tex_image_byte4_042, x, y); break; - case 43: r = kernel_tex_image_interp(__tex_image_byte4_043, x, y); break; - case 44: r = kernel_tex_image_interp(__tex_image_byte4_044, x, y); break; - case 45: r = kernel_tex_image_interp(__tex_image_byte4_045, x, y); break; - case 46: r = kernel_tex_image_interp(__tex_image_byte4_046, x, y); break; - case 47: r = kernel_tex_image_interp(__tex_image_byte4_047, x, y); break; - case 48: r = kernel_tex_image_interp(__tex_image_byte4_048, x, y); break; - case 49: r = kernel_tex_image_interp(__tex_image_byte4_049, x, y); break; - case 50: r = kernel_tex_image_interp(__tex_image_byte4_050, x, y); break; - case 51: r = kernel_tex_image_interp(__tex_image_byte4_051, x, y); break; - case 52: r = kernel_tex_image_interp(__tex_image_byte4_052, x, y); break; - case 53: r = kernel_tex_image_interp(__tex_image_byte4_053, x, y); break; - case 54: r = kernel_tex_image_interp(__tex_image_byte4_054, x, y); break; - case 55: r = kernel_tex_image_interp(__tex_image_byte4_055, x, y); break; - case 56: r = kernel_tex_image_interp(__tex_image_byte4_056, x, y); break; - case 57: r = kernel_tex_image_interp(__tex_image_byte4_057, x, y); break; - case 58: r = kernel_tex_image_interp(__tex_image_byte4_058, x, y); break; - case 59: r = kernel_tex_image_interp(__tex_image_byte4_059, x, y); break; - case 60: r = kernel_tex_image_interp(__tex_image_byte4_060, x, y); break; - case 61: r = kernel_tex_image_interp(__tex_image_byte4_061, x, y); break; - case 62: r = kernel_tex_image_interp(__tex_image_byte4_062, x, y); break; - case 63: r = kernel_tex_image_interp(__tex_image_byte4_063, x, y); break; - case 64: r = kernel_tex_image_interp(__tex_image_byte4_064, x, y); break; - case 65: r = kernel_tex_image_interp(__tex_image_byte4_065, x, y); break; - case 66: r = kernel_tex_image_interp(__tex_image_byte4_066, x, y); break; - case 67: r = kernel_tex_image_interp(__tex_image_byte4_067, x, y); break; - case 68: r = kernel_tex_image_interp(__tex_image_byte4_068, x, y); break; - case 69: r = kernel_tex_image_interp(__tex_image_byte4_069, x, y); break; - case 70: r = kernel_tex_image_interp(__tex_image_byte4_070, x, y); break; - case 71: r = kernel_tex_image_interp(__tex_image_byte4_071, x, y); break; - case 72: r = kernel_tex_image_interp(__tex_image_byte4_072, x, y); break; - case 73: r = kernel_tex_image_interp(__tex_image_byte4_073, x, y); break; - case 74: r = kernel_tex_image_interp(__tex_image_byte4_074, x, y); break; - case 75: r = kernel_tex_image_interp(__tex_image_byte4_075, x, y); break; - case 76: r = kernel_tex_image_interp(__tex_image_byte4_076, x, y); break; - case 77: r = kernel_tex_image_interp(__tex_image_byte4_077, x, y); break; - case 78: r = kernel_tex_image_interp(__tex_image_byte4_078, x, y); break; - case 79: r = kernel_tex_image_interp(__tex_image_byte4_079, x, y); break; - case 80: r = kernel_tex_image_interp(__tex_image_byte4_080, x, y); break; - case 81: r = kernel_tex_image_interp(__tex_image_byte4_081, x, y); break; - case 82: r = kernel_tex_image_interp(__tex_image_byte4_082, x, y); break; - case 83: r = kernel_tex_image_interp(__tex_image_byte4_083, x, y); break; - case 84: r = kernel_tex_image_interp(__tex_image_byte4_084, x, y); break; - case 85: r = kernel_tex_image_interp(__tex_image_byte4_085, x, y); break; - case 86: r = kernel_tex_image_interp(__tex_image_byte4_086, x, y); break; - case 87: r = kernel_tex_image_interp(__tex_image_byte4_087, x, y); break; - case 88: r = kernel_tex_image_interp(__tex_image_byte4_088, x, y); break; - default: - kernel_assert(0); - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } -# else - CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id); - /* float4, byte4 and half4 */ - if(id < TEX_START_FLOAT_CUDA_KEPLER) - r = kernel_tex_image_interp_float4(tex, x, y); - /* float, byte and half */ - else { - float f = kernel_tex_image_interp_float(tex, x, y); - r = make_float4(f, f, f, 1.0f); - } -# endif -#endif - -#ifdef __KERNEL_SSE2__ - float alpha = r.w; + const float alpha = r.w; if(use_alpha && alpha != 1.0f && alpha != 0.0f) { - r_ssef = r_ssef / ssef(alpha); - if(id >= TEX_NUM_FLOAT4_IMAGES) - r_ssef = min(r_ssef, ssef(1.0f)); - r.w = alpha; - } - - if(srgb) { - r_ssef = color_srgb_to_scene_linear(r_ssef); - r.w = alpha; - } -#else - if(use_alpha && r.w != 1.0f && r.w != 0.0f) { - float invw = 1.0f/r.w; - r.x *= invw; - r.y *= invw; - r.z *= invw; - - if(id >= TEX_NUM_FLOAT4_IMAGES) { - r.x = min(r.x, 1.0f); - r.y = min(r.y, 1.0f); - r.z = min(r.z, 1.0f); + r /= alpha; + const int texture_type = kernel_tex_type(id); + if(texture_type == IMAGE_DATA_TYPE_BYTE4 || + texture_type == IMAGE_DATA_TYPE_BYTE) + { + r = min(r, make_float4(1.0f, 1.0f, 1.0f, 1.0f)); } + r.w = alpha; } if(srgb) { - r.x = color_srgb_to_scene_linear(r.x); - r.y = color_srgb_to_scene_linear(r.y); - r.z = color_srgb_to_scene_linear(r.z); + r = color_srgb_to_scene_linear_v4(r); } -#endif return r; } @@ -237,12 +77,14 @@ ccl_device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *sta ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) { /* get object space normal */ - float3 N = ccl_fetch(sd, N); + float3 N = sd->N; - N = ccl_fetch(sd, N); + N = sd->N; object_inverse_normal_transform(kg, sd, &N); /* project from direction vector to barycentric coordinates in triangles */ + float3 signed_N = N; + N.x = fabsf(N.x); N.y = fabsf(N.y); N.z = fabsf(N.z); @@ -312,12 +154,19 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float float4 f = make_float4(0.0f, 0.0f, 0.0f, 0.0f); uint use_alpha = stack_valid(alpha_offset); - if(weight.x > 0.0f) - f += weight.x*svm_image_texture(kg, id, co.y, co.z, srgb, use_alpha); - if(weight.y > 0.0f) - f += weight.y*svm_image_texture(kg, id, co.x, co.z, srgb, use_alpha); - if(weight.z > 0.0f) - f += weight.z*svm_image_texture(kg, id, co.y, co.x, srgb, use_alpha); + /* Map so that no textures are flipped, rotation is somewhat arbitrary. */ + if(weight.x > 0.0f) { + float2 uv = make_float2((signed_N.x < 0.0f)? 1.0f - co.y: co.y, co.z); + f += weight.x*svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha); + } + if(weight.y > 0.0f) { + float2 uv = make_float2((signed_N.y > 0.0f)? 1.0f - co.x: co.x, co.z); + f += weight.y*svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha); + } + if(weight.z > 0.0f) { + float2 uv = make_float2((signed_N.z > 0.0f)? 1.0f - co.y: co.y, co.x); + f += weight.z*svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha); + } if(stack_valid(out_offset)) stack_store_float3(stack, out_offset, make_float3(f.x, f.y, f.z)); @@ -336,8 +185,8 @@ ccl_device void svm_node_tex_environment(KernelGlobals *kg, ShaderData *sd, floa float3 co = stack_load_float3(stack, co_offset); float2 uv; - co = normalize(co); - + co = safe_normalize(co); + if(projection == 0) uv = direction_to_equirectangular(co); else diff --git a/intern/cycles/kernel/svm/svm_light_path.h b/intern/cycles/kernel/svm/svm_light_path.h index 04f6f623f18..1492e358608 100644 --- a/intern/cycles/kernel/svm/svm_light_path.h +++ b/intern/cycles/kernel/svm/svm_light_path.h @@ -31,8 +31,8 @@ ccl_device void svm_node_light_path(ShaderData *sd, ccl_addr_space PathState *st case NODE_LP_reflection: info = (path_flag & PATH_RAY_REFLECT)? 1.0f: 0.0f; break; case NODE_LP_transmission: info = (path_flag & PATH_RAY_TRANSMIT)? 1.0f: 0.0f; break; case NODE_LP_volume_scatter: info = (path_flag & PATH_RAY_VOLUME_SCATTER)? 1.0f: 0.0f; break; - case NODE_LP_backfacing: info = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f: 0.0f; break; - case NODE_LP_ray_length: info = ccl_fetch(sd, ray_length); break; + case NODE_LP_backfacing: info = (sd->flag & SD_BACKFACING)? 1.0f: 0.0f; break; + case NODE_LP_ray_length: info = sd->ray_length; break; case NODE_LP_ray_depth: info = (float)state->bounce; break; case NODE_LP_ray_diffuse: info = (float)state->diffuse_bounce; break; case NODE_LP_ray_glossy: info = (float)state->glossy_bounce; break; @@ -56,14 +56,14 @@ ccl_device void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node) switch(type) { case NODE_LIGHT_FALLOFF_QUADRATIC: break; - case NODE_LIGHT_FALLOFF_LINEAR: strength *= ccl_fetch(sd, ray_length); break; - case NODE_LIGHT_FALLOFF_CONSTANT: strength *= ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length); break; + case NODE_LIGHT_FALLOFF_LINEAR: strength *= sd->ray_length; break; + case NODE_LIGHT_FALLOFF_CONSTANT: strength *= sd->ray_length*sd->ray_length; break; } float smooth = stack_load_float(stack, smooth_offset); if(smooth > 0.0f) { - float squared = ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length); + float squared = sd->ray_length*sd->ray_length; /* Distant lamps set the ray length to FLT_MAX, which causes squared to overflow. */ if(isfinite(squared)) { strength *= squared/(smooth + squared); diff --git a/intern/cycles/kernel/svm/svm_mapping.h b/intern/cycles/kernel/svm/svm_mapping.h index 0a890545af4..42a7ae9946f 100644 --- a/intern/cycles/kernel/svm/svm_mapping.h +++ b/intern/cycles/kernel/svm/svm_mapping.h @@ -26,7 +26,6 @@ ccl_device void svm_node_mapping(KernelGlobals *kg, ShaderData *sd, float *stack tfm.x = read_node_float(kg, offset); tfm.y = read_node_float(kg, offset); tfm.z = read_node_float(kg, offset); - tfm.w = read_node_float(kg, offset); float3 r = transform_point(&tfm, v); stack_store_float3(stack, out_offset, r); diff --git a/intern/cycles/kernel/svm/svm_math_util.h b/intern/cycles/kernel/svm/svm_math_util.h index a7f15de7325..1ce7777aac3 100644 --- a/intern/cycles/kernel/svm/svm_math_util.h +++ b/intern/cycles/kernel/svm/svm_math_util.h @@ -100,71 +100,64 @@ ccl_device float svm_math(NodeMath type, float Fac1, float Fac2) return Fac; } -ccl_device float3 svm_math_blackbody_color(float t) { - /* Calculate color in range 800..12000 using an approximation - * a/x+bx+c for R and G and ((at + b)t + c)t + d) for B - * Max absolute error for RGB is (0.00095, 0.00077, 0.00057), - * which is enough to get the same 8 bit/channel color. - */ - - const float rc[6][3] = { - { 2.52432244e+03f, -1.06185848e-03f, 3.11067539e+00f }, - { 3.37763626e+03f, -4.34581697e-04f, 1.64843306e+00f }, - { 4.10671449e+03f, -8.61949938e-05f, 6.41423749e-01f }, - { 4.66849800e+03f, 2.85655028e-05f, 1.29075375e-01f }, - { 4.60124770e+03f, 2.89727618e-05f, 1.48001316e-01f }, - { 3.78765709e+03f, 9.36026367e-06f, 3.98995841e-01f }, - }; - - const float gc[6][3] = { - { -7.50343014e+02f, 3.15679613e-04f, 4.73464526e-01f }, - { -1.00402363e+03f, 1.29189794e-04f, 9.08181524e-01f }, - { -1.22075471e+03f, 2.56245413e-05f, 1.20753416e+00f }, - { -1.42546105e+03f, -4.01730887e-05f, 1.44002695e+00f }, - { -1.18134453e+03f, -2.18913373e-05f, 1.30656109e+00f }, - { -5.00279505e+02f, -4.59745390e-06f, 1.09090465e+00f }, - }; - - const float bc[6][4] = { - { 0.0f, 0.0f, 0.0f, 0.0f }, /* zeros should be optimized by compiler */ - { 0.0f, 0.0f, 0.0f, 0.0f }, - { 0.0f, 0.0f, 0.0f, 0.0f }, - { -2.02524603e-11f, 1.79435860e-07f, -2.60561875e-04f, -1.41761141e-02f }, - { -2.22463426e-13f, -1.55078698e-08f, 3.81675160e-04f, -7.30646033e-01f }, - { 6.72595954e-13f, -2.73059993e-08f, 4.24068546e-04f, -7.52204323e-01f }, - }; - - int i; +/* Calculate color in range 800..12000 using an approximation + * a/x+bx+c for R and G and ((at + b)t + c)t + d) for B + * Max absolute error for RGB is (0.00095, 0.00077, 0.00057), + * which is enough to get the same 8 bit/channel color. + */ + +ccl_static_constant float blackbody_table_r[6][3] = { + { 2.52432244e+03f, -1.06185848e-03f, 3.11067539e+00f }, + { 3.37763626e+03f, -4.34581697e-04f, 1.64843306e+00f }, + { 4.10671449e+03f, -8.61949938e-05f, 6.41423749e-01f }, + { 4.66849800e+03f, 2.85655028e-05f, 1.29075375e-01f }, + { 4.60124770e+03f, 2.89727618e-05f, 1.48001316e-01f }, + { 3.78765709e+03f, 9.36026367e-06f, 3.98995841e-01f }, +}; + +ccl_static_constant float blackbody_table_g[6][3] = { + { -7.50343014e+02f, 3.15679613e-04f, 4.73464526e-01f }, + { -1.00402363e+03f, 1.29189794e-04f, 9.08181524e-01f }, + { -1.22075471e+03f, 2.56245413e-05f, 1.20753416e+00f }, + { -1.42546105e+03f, -4.01730887e-05f, 1.44002695e+00f }, + { -1.18134453e+03f, -2.18913373e-05f, 1.30656109e+00f }, + { -5.00279505e+02f, -4.59745390e-06f, 1.09090465e+00f }, +}; + +ccl_static_constant float blackbody_table_b[6][4] = { + { 0.0f, 0.0f, 0.0f, 0.0f }, /* zeros should be optimized by compiler */ + { 0.0f, 0.0f, 0.0f, 0.0f }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + { -2.02524603e-11f, 1.79435860e-07f, -2.60561875e-04f, -1.41761141e-02f }, + { -2.22463426e-13f, -1.55078698e-08f, 3.81675160e-04f, -7.30646033e-01f }, + { 6.72595954e-13f, -2.73059993e-08f, 4.24068546e-04f, -7.52204323e-01f }, +}; + + +ccl_device float3 svm_math_blackbody_color(float t) +{ if(t >= 12000.0f) { return make_float3(0.826270103f, 0.994478524f, 1.56626022f); } - else if(t >= 6365.0f) { - i = 5; - } - else if(t >= 3315.0f) { - i = 4; - } - else if(t >= 1902.0f) { - i = 3; - } - else if(t >= 1449.0f) { - i = 2; - } - else if(t >= 1167.0f) { - i = 1; - } - else if(t >= 965.0f) { - i = 0; - } - else { + else if(t < 965.0f) { /* For 800 <= t < 965 color does not change in OSL implementation, so keep color the same */ return make_float3(4.70366907f, 0.0f, 0.0f); } + int i = (t >= 6365.0f)? 5: + (t >= 3315.0f)? 4: + (t >= 1902.0f)? 3: + (t >= 1449.0f)? 2: + (t >= 1167.0f)? 1: 0; + + ccl_constant float *r = blackbody_table_r[i]; + ccl_constant float *g = blackbody_table_g[i]; + ccl_constant float *b = blackbody_table_b[i]; + const float t_inv = 1.0f / t; - return make_float3(rc[i][0] * t_inv + rc[i][1] * t + rc[i][2], - gc[i][0] * t_inv + gc[i][1] * t + gc[i][2], - ((bc[i][0] * t + bc[i][1]) * t + bc[i][2]) * t + bc[i][3]); + return make_float3(r[0] * t_inv + r[1] * t + r[2], + g[0] * t_inv + g[1] * t + g[2], + ((b[0] * t + b[1]) * t + b[2]) * t + b[3]); } ccl_device_inline float3 svm_math_gamma_color(float3 color, float gamma) diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h index c0b01262212..6ff39e5f587 100644 --- a/intern/cycles/kernel/svm/svm_tex_coord.h +++ b/intern/cycles/kernel/svm/svm_tex_coord.h @@ -31,9 +31,9 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg, switch(type) { case NODE_TEXCO_OBJECT: { - data = ccl_fetch(sd, P); + data = sd->P; if(node.w == 0) { - if(ccl_fetch(sd, object) != OBJECT_NONE) { + if(sd->object != OBJECT_NONE) { object_inverse_position_transform(kg, sd, &data); } } @@ -42,53 +42,52 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg, tfm.x = read_node_float(kg, offset); tfm.y = read_node_float(kg, offset); tfm.z = read_node_float(kg, offset); - tfm.w = read_node_float(kg, offset); data = transform_point(&tfm, data); } break; } case NODE_TEXCO_NORMAL: { - data = ccl_fetch(sd, N); + data = sd->N; object_inverse_normal_transform(kg, sd, &data); break; } case NODE_TEXCO_CAMERA: { Transform tfm = kernel_data.cam.worldtocamera; - if(ccl_fetch(sd, object) != OBJECT_NONE) - data = transform_point(&tfm, ccl_fetch(sd, P)); + if(sd->object != OBJECT_NONE) + data = transform_point(&tfm, sd->P); else - data = transform_point(&tfm, ccl_fetch(sd, P) + camera_position(kg)); + data = transform_point(&tfm, sd->P + camera_position(kg)); break; } case NODE_TEXCO_WINDOW: { - if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P)); + if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) + data = camera_world_to_ndc(kg, sd, sd->ray_P); else - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P)); + data = camera_world_to_ndc(kg, sd, sd->P); data.z = 0.0f; break; } case NODE_TEXCO_REFLECTION: { - if(ccl_fetch(sd, object) != OBJECT_NONE) - data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I); + if(sd->object != OBJECT_NONE) + data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I; else - data = ccl_fetch(sd, I); + data = sd->I; break; } case NODE_TEXCO_DUPLI_GENERATED: { - data = object_dupli_generated(kg, ccl_fetch(sd, object)); + data = object_dupli_generated(kg, sd->object); break; } case NODE_TEXCO_DUPLI_UV: { - data = object_dupli_uv(kg, ccl_fetch(sd, object)); + data = object_dupli_uv(kg, sd->object); break; } case NODE_TEXCO_VOLUME_GENERATED: { - data = ccl_fetch(sd, P); + data = sd->P; #ifdef __VOLUME__ - if(ccl_fetch(sd, object) != OBJECT_NONE) + if(sd->object != OBJECT_NONE) data = volume_normalized_position(kg, sd, data); #endif break; @@ -112,9 +111,9 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg, switch(type) { case NODE_TEXCO_OBJECT: { - data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; + data = sd->P + sd->dP.dx; if(node.w == 0) { - if(ccl_fetch(sd, object) != OBJECT_NONE) { + if(sd->object != OBJECT_NONE) { object_inverse_position_transform(kg, sd, &data); } } @@ -123,53 +122,52 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg, tfm.x = read_node_float(kg, offset); tfm.y = read_node_float(kg, offset); tfm.z = read_node_float(kg, offset); - tfm.w = read_node_float(kg, offset); data = transform_point(&tfm, data); } break; } case NODE_TEXCO_NORMAL: { - data = ccl_fetch(sd, N); + data = sd->N; object_inverse_normal_transform(kg, sd, &data); break; } case NODE_TEXCO_CAMERA: { Transform tfm = kernel_data.cam.worldtocamera; - if(ccl_fetch(sd, object) != OBJECT_NONE) - data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx); + if(sd->object != OBJECT_NONE) + data = transform_point(&tfm, sd->P + sd->dP.dx); else - data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx + camera_position(kg)); + data = transform_point(&tfm, sd->P + sd->dP.dx + camera_position(kg)); break; } case NODE_TEXCO_WINDOW: { - if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dx); + if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) + data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx); else - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx); + data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dx); data.z = 0.0f; break; } case NODE_TEXCO_REFLECTION: { - if(ccl_fetch(sd, object) != OBJECT_NONE) - data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I); + if(sd->object != OBJECT_NONE) + data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I; else - data = ccl_fetch(sd, I); + data = sd->I; break; } case NODE_TEXCO_DUPLI_GENERATED: { - data = object_dupli_generated(kg, ccl_fetch(sd, object)); + data = object_dupli_generated(kg, sd->object); break; } case NODE_TEXCO_DUPLI_UV: { - data = object_dupli_uv(kg, ccl_fetch(sd, object)); + data = object_dupli_uv(kg, sd->object); break; } case NODE_TEXCO_VOLUME_GENERATED: { - data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; + data = sd->P + sd->dP.dx; #ifdef __VOLUME__ - if(ccl_fetch(sd, object) != OBJECT_NONE) + if(sd->object != OBJECT_NONE) data = volume_normalized_position(kg, sd, data); #endif break; @@ -196,9 +194,9 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg, switch(type) { case NODE_TEXCO_OBJECT: { - data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; + data = sd->P + sd->dP.dy; if(node.w == 0) { - if(ccl_fetch(sd, object) != OBJECT_NONE) { + if(sd->object != OBJECT_NONE) { object_inverse_position_transform(kg, sd, &data); } } @@ -207,53 +205,52 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg, tfm.x = read_node_float(kg, offset); tfm.y = read_node_float(kg, offset); tfm.z = read_node_float(kg, offset); - tfm.w = read_node_float(kg, offset); data = transform_point(&tfm, data); } break; } case NODE_TEXCO_NORMAL: { - data = ccl_fetch(sd, N); + data = sd->N; object_inverse_normal_transform(kg, sd, &data); break; } case NODE_TEXCO_CAMERA: { Transform tfm = kernel_data.cam.worldtocamera; - if(ccl_fetch(sd, object) != OBJECT_NONE) - data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy); + if(sd->object != OBJECT_NONE) + data = transform_point(&tfm, sd->P + sd->dP.dy); else - data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy + camera_position(kg)); + data = transform_point(&tfm, sd->P + sd->dP.dy + camera_position(kg)); break; } case NODE_TEXCO_WINDOW: { - if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dy); + if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) + data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy); else - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy); + data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dy); data.z = 0.0f; break; } case NODE_TEXCO_REFLECTION: { - if(ccl_fetch(sd, object) != OBJECT_NONE) - data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I); + if(sd->object != OBJECT_NONE) + data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I; else - data = ccl_fetch(sd, I); + data = sd->I; break; } case NODE_TEXCO_DUPLI_GENERATED: { - data = object_dupli_generated(kg, ccl_fetch(sd, object)); + data = object_dupli_generated(kg, sd->object); break; } case NODE_TEXCO_DUPLI_UV: { - data = object_dupli_uv(kg, ccl_fetch(sd, object)); + data = object_dupli_uv(kg, sd->object); break; } case NODE_TEXCO_VOLUME_GENERATED: { - data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; + data = sd->P + sd->dP.dy; #ifdef __VOLUME__ - if(ccl_fetch(sd, object) != OBJECT_NONE) + if(sd->object != OBJECT_NONE) data = volume_normalized_position(kg, sd, data); #endif break; @@ -274,12 +271,12 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st float3 color = stack_load_float3(stack, color_offset); color = 2.0f*make_float3(color.x - 0.5f, color.y - 0.5f, color.z - 0.5f); - bool is_backfacing = (ccl_fetch(sd, flag) & SD_BACKFACING) != 0; + bool is_backfacing = (sd->flag & SD_BACKFACING) != 0; float3 N; if(space == NODE_NORMAL_MAP_TANGENT) { /* tangent space */ - if(ccl_fetch(sd, object) == OBJECT_NONE) { + if(sd->object == OBJECT_NONE) { stack_store_float3(stack, normal_offset, make_float3(0.0f, 0.0f, 0.0f)); return; } @@ -299,11 +296,11 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st float sign = primitive_attribute_float(kg, sd, attr_sign, NULL, NULL); float3 normal; - if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) { + if(sd->shader & SHADER_SMOOTH_NORMAL) { normal = primitive_attribute_float3(kg, sd, attr_normal, NULL, NULL); } else { - normal = ccl_fetch(sd, Ng); + normal = sd->Ng; /* the normal is already inverted, which is too soon for the math here */ if(is_backfacing) { @@ -345,11 +342,11 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st if(strength != 1.0f) { strength = max(strength, 0.0f); - N = safe_normalize(ccl_fetch(sd, N) + (N - ccl_fetch(sd, N))*strength); + N = safe_normalize(sd->N + (N - sd->N)*strength); } if(is_zero(N)) { - N = ccl_fetch(sd, N); + N = sd->N; } stack_store_float3(stack, normal_offset, N); @@ -377,7 +374,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack float3 generated; if(desc.offset == ATTR_STD_NOT_FOUND) - generated = ccl_fetch(sd, P); + generated = sd->P; else generated = primitive_attribute_float3(kg, sd, desc, NULL, NULL); @@ -390,7 +387,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack } object_normal_transform(kg, sd, &tangent); - tangent = cross(ccl_fetch(sd, N), normalize(cross(tangent, ccl_fetch(sd, N)))); + tangent = cross(sd->N, normalize(cross(tangent, sd->N))); stack_store_float3(stack, tangent_offset, tangent); } diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h index 47209ddfbab..4c3a5975fb8 100644 --- a/intern/cycles/kernel/svm/svm_types.h +++ b/intern/cycles/kernel/svm/svm_types.h @@ -132,6 +132,10 @@ typedef enum ShaderNodeType { NODE_TEX_VOXEL, NODE_ENTER_BUMP_EVAL, NODE_LEAVE_BUMP_EVAL, + NODE_BEVEL, + NODE_DISPLACEMENT, + NODE_VECTOR_DISPLACEMENT, + NODE_PRINCIPLED_VOLUME, } ShaderNodeType; typedef enum NodeAttributeType { @@ -158,6 +162,7 @@ typedef enum NodeObjectInfo { typedef enum NodeParticleInfo { NODE_INFO_PAR_INDEX, + NODE_INFO_PAR_RANDOM, NODE_INFO_PAR_AGE, NODE_INFO_PAR_LIFETIME, NODE_INFO_PAR_LOCATION, @@ -173,7 +178,8 @@ typedef enum NodeHairInfo { NODE_INFO_CURVE_THICKNESS, /*fade for minimum hair width transpency*/ /*NODE_INFO_CURVE_FADE,*/ - NODE_INFO_CURVE_TANGENT_NORMAL + NODE_INFO_CURVE_TANGENT_NORMAL, + NODE_INFO_CURVE_RANDOM, } NodeHairInfo; typedef enum NodeLightPath { @@ -397,17 +403,23 @@ typedef enum ClosureType { CLOSURE_BSDF_DIFFUSE_ID, CLOSURE_BSDF_OREN_NAYAR_ID, CLOSURE_BSDF_DIFFUSE_RAMP_ID, + CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID, + CLOSURE_BSDF_PRINCIPLED_SHEEN_ID, CLOSURE_BSDF_DIFFUSE_TOON_ID, /* Glossy */ - CLOSURE_BSDF_GLOSSY_ID, CLOSURE_BSDF_REFLECTION_ID, CLOSURE_BSDF_MICROFACET_GGX_ID, + CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID, + CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID, CLOSURE_BSDF_MICROFACET_BECKMANN_ID, CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID, + CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID, CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID, CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID, + CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID, CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID, + CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_FRESNEL_ID, CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID, CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID, CLOSURE_BSDF_ASHIKHMIN_VELVET_ID, @@ -416,29 +428,31 @@ typedef enum ClosureType { CLOSURE_BSDF_HAIR_REFLECTION_ID, /* Transmission */ - CLOSURE_BSDF_TRANSMISSION_ID, CLOSURE_BSDF_TRANSLUCENT_ID, CLOSURE_BSDF_REFRACTION_ID, CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID, CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID, + CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID, CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID, CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID, - CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID, + CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID, CLOSURE_BSDF_SHARP_GLASS_ID, CLOSURE_BSDF_HAIR_TRANSMISSION_ID, /* Special cases */ CLOSURE_BSDF_BSSRDF_ID, + CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID, CLOSURE_BSDF_TRANSPARENT_ID, /* BSSRDF */ CLOSURE_BSSRDF_CUBIC_ID, CLOSURE_BSSRDF_GAUSSIAN_ID, + CLOSURE_BSSRDF_PRINCIPLED_ID, CLOSURE_BSSRDF_BURLEY_ID, + CLOSURE_BSSRDF_RANDOM_WALK_ID, + CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID, /* Other */ - CLOSURE_EMISSION_ID, - CLOSURE_BACKGROUND_ID, CLOSURE_HOLDOUT_ID, CLOSURE_AMBIENT_OCCLUSION_ID, @@ -447,28 +461,38 @@ typedef enum ClosureType { CLOSURE_VOLUME_ABSORPTION_ID, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, + CLOSURE_BSDF_PRINCIPLED_ID, + NBUILTIN_CLOSURES } ClosureType; /* watch this, being lazy with memory usage */ #define CLOSURE_IS_BSDF(type) (type <= CLOSURE_BSDF_TRANSPARENT_ID) #define CLOSURE_IS_BSDF_DIFFUSE(type) (type >= CLOSURE_BSDF_DIFFUSE_ID && type <= CLOSURE_BSDF_DIFFUSE_TOON_ID) -#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_GLOSSY_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID) -#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSMISSION_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID) -#define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID) -#define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type >= CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) +#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID) +#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSLUCENT_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID) +#define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID || type == CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID) +#define CLOSURE_IS_BSDF_SINGULAR(type) (type == CLOSURE_BSDF_REFLECTION_ID || \ + type == CLOSURE_BSDF_REFRACTION_ID || \ + type == CLOSURE_BSDF_TRANSPARENT_ID) +#define CLOSURE_IS_BSDF_TRANSPARENT(type) (type == CLOSURE_BSDF_TRANSPARENT_ID) #define CLOSURE_IS_BSDF_MULTISCATTER(type) (type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID ||\ type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID || \ - type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID) -#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_BURLEY_ID) -#define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID) + type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID) +#define CLOSURE_IS_BSDF_MICROFACET(type) ((type >= CLOSURE_BSDF_MICROFACET_GGX_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) ||\ + (type >= CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID && type <= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID) ||\ + (type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID)) +#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID) +#define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID) +#define CLOSURE_IS_DISK_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID) #define CLOSURE_IS_VOLUME(type) (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) -#define CLOSURE_IS_EMISSION(type) (type == CLOSURE_EMISSION_ID) +#define CLOSURE_IS_VOLUME_SCATTER(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) +#define CLOSURE_IS_VOLUME_ABSORPTION(type) (type == CLOSURE_VOLUME_ABSORPTION_ID) #define CLOSURE_IS_HOLDOUT(type) (type == CLOSURE_HOLDOUT_ID) -#define CLOSURE_IS_BACKGROUND(type) (type == CLOSURE_BACKGROUND_ID) #define CLOSURE_IS_AMBIENT_OCCLUSION(type) (type == CLOSURE_AMBIENT_OCCLUSION_ID) #define CLOSURE_IS_PHASE(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) -#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID) +#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID) +#define CLOSURE_IS_PRINCIPLED(type) (type == CLOSURE_BSDF_PRINCIPLED_ID) #define CLOSURE_WEIGHT_CUTOFF 1e-5f diff --git a/intern/cycles/kernel/svm/svm_vector_transform.h b/intern/cycles/kernel/svm/svm_vector_transform.h index 4c32130d06d..4e92f27acdb 100644 --- a/intern/cycles/kernel/svm/svm_vector_transform.h +++ b/intern/cycles/kernel/svm/svm_vector_transform.h @@ -33,7 +33,7 @@ ccl_device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, flo NodeVectorTransformConvertSpace to = (NodeVectorTransformConvertSpace)ito; Transform tfm; - bool is_object = (ccl_fetch(sd, object) != OBJECT_NONE); + bool is_object = (sd->object != OBJECT_NONE); bool is_direction = (type == NODE_VECTOR_TRANSFORM_TYPE_VECTOR || type == NODE_VECTOR_TRANSFORM_TYPE_NORMAL); /* From world */ diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h index a8b3604a8a7..43b433683e0 100644 --- a/intern/cycles/kernel/svm/svm_voxel.h +++ b/intern/cycles/kernel/svm/svm_voxel.h @@ -39,27 +39,10 @@ ccl_device void svm_node_tex_voxel(KernelGlobals *kg, tfm.x = read_node_float(kg, offset); tfm.y = read_node_float(kg, offset); tfm.z = read_node_float(kg, offset); - tfm.w = read_node_float(kg, offset); co = transform_point(&tfm, co); } - float4 r; -# if defined(__KERNEL_CUDA__) -# if __CUDA_ARCH__ >= 300 - CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id); - if(id < 2048) /* TODO(dingto): Make this a variable */ - r = kernel_tex_image_interp_3d_float4(tex, co.x, co.y, co.z); - else { - float f = kernel_tex_image_interp_3d_float(tex, co.x, co.y, co.z); - r = make_float4(f, f, f, 1.0f); - } -# else /* __CUDA_ARCH__ >= 300 */ - r = volume_image_texture_3d(id, co.x, co.y, co.z); -# endif -# elif defined(__KERNEL_OPENCL__) - r = kernel_tex_image_interp_3d(kg, id, co.x, co.y, co.z); -# else - r = kernel_tex_image_interp_3d(id, co.x, co.y, co.z); -# endif /* __KERNEL_CUDA__ */ + + float4 r = kernel_tex_image_interp_3d(kg, id, co.x, co.y, co.z, INTERPOLATION_NONE); #else float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); #endif diff --git a/intern/cycles/kernel/svm/svm_wavelength.h b/intern/cycles/kernel/svm/svm_wavelength.h index 57030f3979d..855b356b397 100644 --- a/intern/cycles/kernel/svm/svm_wavelength.h +++ b/intern/cycles/kernel/svm/svm_wavelength.h @@ -34,44 +34,44 @@ CCL_NAMESPACE_BEGIN /* Wavelength to RGB */ +// CIE colour matching functions xBar, yBar, and zBar for +// wavelengths from 380 through 780 nanometers, every 5 +// nanometers. For a wavelength lambda in this range: +// cie_colour_match[(lambda - 380) / 5][0] = xBar +// cie_colour_match[(lambda - 380) / 5][1] = yBar +// cie_colour_match[(lambda - 380) / 5][2] = zBar +ccl_static_constant float cie_colour_match[81][3] = { + {0.0014f,0.0000f,0.0065f}, {0.0022f,0.0001f,0.0105f}, {0.0042f,0.0001f,0.0201f}, + {0.0076f,0.0002f,0.0362f}, {0.0143f,0.0004f,0.0679f}, {0.0232f,0.0006f,0.1102f}, + {0.0435f,0.0012f,0.2074f}, {0.0776f,0.0022f,0.3713f}, {0.1344f,0.0040f,0.6456f}, + {0.2148f,0.0073f,1.0391f}, {0.2839f,0.0116f,1.3856f}, {0.3285f,0.0168f,1.6230f}, + {0.3483f,0.0230f,1.7471f}, {0.3481f,0.0298f,1.7826f}, {0.3362f,0.0380f,1.7721f}, + {0.3187f,0.0480f,1.7441f}, {0.2908f,0.0600f,1.6692f}, {0.2511f,0.0739f,1.5281f}, + {0.1954f,0.0910f,1.2876f}, {0.1421f,0.1126f,1.0419f}, {0.0956f,0.1390f,0.8130f}, + {0.0580f,0.1693f,0.6162f}, {0.0320f,0.2080f,0.4652f}, {0.0147f,0.2586f,0.3533f}, + {0.0049f,0.3230f,0.2720f}, {0.0024f,0.4073f,0.2123f}, {0.0093f,0.5030f,0.1582f}, + {0.0291f,0.6082f,0.1117f}, {0.0633f,0.7100f,0.0782f}, {0.1096f,0.7932f,0.0573f}, + {0.1655f,0.8620f,0.0422f}, {0.2257f,0.9149f,0.0298f}, {0.2904f,0.9540f,0.0203f}, + {0.3597f,0.9803f,0.0134f}, {0.4334f,0.9950f,0.0087f}, {0.5121f,1.0000f,0.0057f}, + {0.5945f,0.9950f,0.0039f}, {0.6784f,0.9786f,0.0027f}, {0.7621f,0.9520f,0.0021f}, + {0.8425f,0.9154f,0.0018f}, {0.9163f,0.8700f,0.0017f}, {0.9786f,0.8163f,0.0014f}, + {1.0263f,0.7570f,0.0011f}, {1.0567f,0.6949f,0.0010f}, {1.0622f,0.6310f,0.0008f}, + {1.0456f,0.5668f,0.0006f}, {1.0026f,0.5030f,0.0003f}, {0.9384f,0.4412f,0.0002f}, + {0.8544f,0.3810f,0.0002f}, {0.7514f,0.3210f,0.0001f}, {0.6424f,0.2650f,0.0000f}, + {0.5419f,0.2170f,0.0000f}, {0.4479f,0.1750f,0.0000f}, {0.3608f,0.1382f,0.0000f}, + {0.2835f,0.1070f,0.0000f}, {0.2187f,0.0816f,0.0000f}, {0.1649f,0.0610f,0.0000f}, + {0.1212f,0.0446f,0.0000f}, {0.0874f,0.0320f,0.0000f}, {0.0636f,0.0232f,0.0000f}, + {0.0468f,0.0170f,0.0000f}, {0.0329f,0.0119f,0.0000f}, {0.0227f,0.0082f,0.0000f}, + {0.0158f,0.0057f,0.0000f}, {0.0114f,0.0041f,0.0000f}, {0.0081f,0.0029f,0.0000f}, + {0.0058f,0.0021f,0.0000f}, {0.0041f,0.0015f,0.0000f}, {0.0029f,0.0010f,0.0000f}, + {0.0020f,0.0007f,0.0000f}, {0.0014f,0.0005f,0.0000f}, {0.0010f,0.0004f,0.0000f}, + {0.0007f,0.0002f,0.0000f}, {0.0005f,0.0002f,0.0000f}, {0.0003f,0.0001f,0.0000f}, + {0.0002f,0.0001f,0.0000f}, {0.0002f,0.0001f,0.0000f}, {0.0001f,0.0000f,0.0000f}, + {0.0001f,0.0000f,0.0000f}, {0.0001f,0.0000f,0.0000f}, {0.0000f,0.0000f,0.0000f} +}; + ccl_device void svm_node_wavelength(ShaderData *sd, float *stack, uint wavelength, uint color_out) { - // CIE colour matching functions xBar, yBar, and zBar for - // wavelengths from 380 through 780 nanometers, every 5 - // nanometers. For a wavelength lambda in this range: - // cie_colour_match[(lambda - 380) / 5][0] = xBar - // cie_colour_match[(lambda - 380) / 5][1] = yBar - // cie_colour_match[(lambda - 380) / 5][2] = zBar - const float cie_colour_match[81][3] = { - {0.0014f,0.0000f,0.0065f}, {0.0022f,0.0001f,0.0105f}, {0.0042f,0.0001f,0.0201f}, - {0.0076f,0.0002f,0.0362f}, {0.0143f,0.0004f,0.0679f}, {0.0232f,0.0006f,0.1102f}, - {0.0435f,0.0012f,0.2074f}, {0.0776f,0.0022f,0.3713f}, {0.1344f,0.0040f,0.6456f}, - {0.2148f,0.0073f,1.0391f}, {0.2839f,0.0116f,1.3856f}, {0.3285f,0.0168f,1.6230f}, - {0.3483f,0.0230f,1.7471f}, {0.3481f,0.0298f,1.7826f}, {0.3362f,0.0380f,1.7721f}, - {0.3187f,0.0480f,1.7441f}, {0.2908f,0.0600f,1.6692f}, {0.2511f,0.0739f,1.5281f}, - {0.1954f,0.0910f,1.2876f}, {0.1421f,0.1126f,1.0419f}, {0.0956f,0.1390f,0.8130f}, - {0.0580f,0.1693f,0.6162f}, {0.0320f,0.2080f,0.4652f}, {0.0147f,0.2586f,0.3533f}, - {0.0049f,0.3230f,0.2720f}, {0.0024f,0.4073f,0.2123f}, {0.0093f,0.5030f,0.1582f}, - {0.0291f,0.6082f,0.1117f}, {0.0633f,0.7100f,0.0782f}, {0.1096f,0.7932f,0.0573f}, - {0.1655f,0.8620f,0.0422f}, {0.2257f,0.9149f,0.0298f}, {0.2904f,0.9540f,0.0203f}, - {0.3597f,0.9803f,0.0134f}, {0.4334f,0.9950f,0.0087f}, {0.5121f,1.0000f,0.0057f}, - {0.5945f,0.9950f,0.0039f}, {0.6784f,0.9786f,0.0027f}, {0.7621f,0.9520f,0.0021f}, - {0.8425f,0.9154f,0.0018f}, {0.9163f,0.8700f,0.0017f}, {0.9786f,0.8163f,0.0014f}, - {1.0263f,0.7570f,0.0011f}, {1.0567f,0.6949f,0.0010f}, {1.0622f,0.6310f,0.0008f}, - {1.0456f,0.5668f,0.0006f}, {1.0026f,0.5030f,0.0003f}, {0.9384f,0.4412f,0.0002f}, - {0.8544f,0.3810f,0.0002f}, {0.7514f,0.3210f,0.0001f}, {0.6424f,0.2650f,0.0000f}, - {0.5419f,0.2170f,0.0000f}, {0.4479f,0.1750f,0.0000f}, {0.3608f,0.1382f,0.0000f}, - {0.2835f,0.1070f,0.0000f}, {0.2187f,0.0816f,0.0000f}, {0.1649f,0.0610f,0.0000f}, - {0.1212f,0.0446f,0.0000f}, {0.0874f,0.0320f,0.0000f}, {0.0636f,0.0232f,0.0000f}, - {0.0468f,0.0170f,0.0000f}, {0.0329f,0.0119f,0.0000f}, {0.0227f,0.0082f,0.0000f}, - {0.0158f,0.0057f,0.0000f}, {0.0114f,0.0041f,0.0000f}, {0.0081f,0.0029f,0.0000f}, - {0.0058f,0.0021f,0.0000f}, {0.0041f,0.0015f,0.0000f}, {0.0029f,0.0010f,0.0000f}, - {0.0020f,0.0007f,0.0000f}, {0.0014f,0.0005f,0.0000f}, {0.0010f,0.0004f,0.0000f}, - {0.0007f,0.0002f,0.0000f}, {0.0005f,0.0002f,0.0000f}, {0.0003f,0.0001f,0.0000f}, - {0.0002f,0.0001f,0.0000f}, {0.0002f,0.0001f,0.0000f}, {0.0001f,0.0000f,0.0000f}, - {0.0001f,0.0000f,0.0000f}, {0.0001f,0.0000f,0.0000f}, {0.0000f,0.0000f,0.0000f} - }; - float lambda_nm = stack_load_float(stack, wavelength); float ii = (lambda_nm-380.0f) * (1.0f/5.0f); // scaled 0..80 int i = float_to_int(ii); @@ -82,7 +82,7 @@ ccl_device void svm_node_wavelength(ShaderData *sd, float *stack, uint wavelengt } else { ii -= i; - const float *c = cie_colour_match[i]; + ccl_constant float *c = cie_colour_match[i]; color = interp(make_float3(c[0], c[1], c[2]), make_float3(c[3], c[4], c[5]), ii); } @@ -92,8 +92,7 @@ ccl_device void svm_node_wavelength(ShaderData *sd, float *stack, uint wavelengt /* Clamp to zero if values are smaller */ color = max(color, make_float3(0.0f, 0.0f, 0.0f)); - if(stack_valid(color_out)) - stack_store_float3(stack, color_out, color); + stack_store_float3(stack, color_out, color); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_wireframe.h b/intern/cycles/kernel/svm/svm_wireframe.h index 87e40791333..3c6353c8001 100644 --- a/intern/cycles/kernel/svm/svm_wireframe.h +++ b/intern/cycles/kernel/svm/svm_wireframe.h @@ -41,9 +41,9 @@ ccl_device_inline float wireframe(KernelGlobals *kg, float3 *P) { #ifdef __HAIR__ - if(ccl_fetch(sd, prim) != PRIM_NONE && ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) + if(sd->prim != PRIM_NONE && sd->type & PRIMITIVE_ALL_TRIANGLE) #else - if(ccl_fetch(sd, prim) != PRIM_NONE) + if(sd->prim != PRIM_NONE) #endif { float3 Co[3]; @@ -52,12 +52,12 @@ ccl_device_inline float wireframe(KernelGlobals *kg, /* Triangles */ int np = 3; - if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) - triangle_vertices(kg, ccl_fetch(sd, prim), Co); + if(sd->type & PRIMITIVE_TRIANGLE) + triangle_vertices(kg, sd->prim, Co); else - motion_triangle_vertices(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), Co); + motion_triangle_vertices(kg, sd->object, sd->prim, sd->time, Co); - if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) { + if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { object_position_transform(kg, sd, &Co[0]); object_position_transform(kg, sd, &Co[1]); object_position_transform(kg, sd, &Co[2]); @@ -66,8 +66,8 @@ ccl_device_inline float wireframe(KernelGlobals *kg, if(pixel_size) { // Project the derivatives of P to the viewing plane defined // by I so we have a measure of how big is a pixel at this point - float pixelwidth_x = len(ccl_fetch(sd, dP).dx - dot(ccl_fetch(sd, dP).dx, ccl_fetch(sd, I)) * ccl_fetch(sd, I)); - float pixelwidth_y = len(ccl_fetch(sd, dP).dy - dot(ccl_fetch(sd, dP).dy, ccl_fetch(sd, I)) * ccl_fetch(sd, I)); + float pixelwidth_x = len(sd->dP.dx - dot(sd->dP.dx, sd->I) * sd->I); + float pixelwidth_y = len(sd->dP.dy - dot(sd->dP.dy, sd->I) * sd->I); // Take the average of both axis' length pixelwidth = (pixelwidth_x + pixelwidth_y) * 0.5f; } @@ -113,20 +113,20 @@ ccl_device void svm_node_wireframe(KernelGlobals *kg, * With OpenCL 2.0 it's possible to avoid this change, but for until * then we'll be living with such an exception. */ - float3 P = ccl_fetch(sd, P); + float3 P = sd->P; float f = wireframe(kg, sd, size, pixel_size, &P); #else - float f = wireframe(kg, sd, size, pixel_size, &ccl_fetch(sd, P)); + float f = wireframe(kg, sd, size, pixel_size, &sd->P); #endif /* TODO(sergey): Think of faster way to calculate derivatives. */ if(bump_offset == NODE_BUMP_OFFSET_DX) { - float3 Px = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dx; - f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(ccl_fetch(sd, dP).dx); + float3 Px = sd->P - sd->dP.dx; + f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(sd->dP.dx); } else if(bump_offset == NODE_BUMP_OFFSET_DY) { - float3 Py = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dy; - f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(ccl_fetch(sd, dP).dy); + float3 Py = sd->P - sd->dP.dy; + f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(sd->dP.dy); } if(stack_valid(out_fac)) diff --git a/intern/cycles/render/CMakeLists.txt b/intern/cycles/render/CMakeLists.txt index 8eaa9de3874..b7248354abd 100644 --- a/intern/cycles/render/CMakeLists.txt +++ b/intern/cycles/render/CMakeLists.txt @@ -1,14 +1,6 @@ set(INC - . - ../device - ../graph - ../kernel - ../kernel/svm - ../kernel/osl - ../bvh - ../subd - ../util + .. ../../glew-mx ) @@ -31,6 +23,7 @@ set(SRC mesh.cpp mesh_displace.cpp mesh_subdivision.cpp + mesh_volume.cpp nodes.cpp object.cpp osl.cpp @@ -79,4 +72,4 @@ include_directories(SYSTEM ${INC_SYS}) add_definitions(${GL_DEFINITIONS}) -add_library(cycles_render ${SRC} ${SRC_HEADERS}) +cycles_add_library(cycles_render ${SRC} ${SRC_HEADERS}) diff --git a/intern/cycles/render/attribute.cpp b/intern/cycles/render/attribute.cpp index c0d429a583c..8c77687d9cc 100644 --- a/intern/cycles/render/attribute.cpp +++ b/intern/cycles/render/attribute.cpp @@ -14,13 +14,12 @@ * limitations under the License. */ -#include "image.h" -#include "mesh.h" -#include "attribute.h" +#include "render/image.h" +#include "render/mesh.h" +#include "render/attribute.h" -#include "util_debug.h" -#include "util_foreach.h" -#include "util_transform.h" +#include "util/util_foreach.h" +#include "util/util_transform.h" CCL_NAMESPACE_BEGIN @@ -268,6 +267,8 @@ const char *Attribute::standard_name(AttributeStandard std) return "particle"; case ATTR_STD_CURVE_INTERCEPT: return "curve_intercept"; + case ATTR_STD_CURVE_RANDOM: + return "curve_random"; case ATTR_STD_PTEX_FACE_ID: return "ptex_face_id"; case ATTR_STD_PTEX_UV: @@ -280,6 +281,8 @@ const char *Attribute::standard_name(AttributeStandard std) return "flame"; case ATTR_STD_VOLUME_HEAT: return "heat"; + case ATTR_STD_VOLUME_TEMPERATURE: + return "temperature"; case ATTR_STD_VOLUME_VELOCITY: return "velocity"; case ATTR_STD_POINTINESS: @@ -295,9 +298,13 @@ const char *Attribute::standard_name(AttributeStandard std) AttributeStandard Attribute::name_standard(const char *name) { - for(int std = ATTR_STD_NONE; std < ATTR_STD_NUM; std++) - if(strcmp(name, Attribute::standard_name((AttributeStandard)std)) == 0) - return (AttributeStandard)std; + if(name) { + for(int std = ATTR_STD_NONE; std < ATTR_STD_NUM; std++) { + if(strcmp(name, Attribute::standard_name((AttributeStandard)std)) == 0) { + return (AttributeStandard)std; + } + } + } return ATTR_STD_NONE; } @@ -424,6 +431,7 @@ Attribute *AttributeSet::add(AttributeStandard std, ustring name) case ATTR_STD_VOLUME_DENSITY: case ATTR_STD_VOLUME_FLAME: case ATTR_STD_VOLUME_HEAT: + case ATTR_STD_VOLUME_TEMPERATURE: attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_VOXEL); break; case ATTR_STD_VOLUME_COLOR: @@ -452,6 +460,9 @@ Attribute *AttributeSet::add(AttributeStandard std, ustring name) case ATTR_STD_CURVE_INTERCEPT: attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_CURVE_KEY); break; + case ATTR_STD_CURVE_RANDOM: + attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_CURVE); + break; case ATTR_STD_GENERATED_TRANSFORM: attr = add(name, TypeDesc::TypeMatrix, ATTR_ELEMENT_MESH); break; @@ -502,6 +513,16 @@ Attribute *AttributeSet::find(AttributeRequest& req) return find(req.std); } +void AttributeSet::remove(Attribute *attribute) +{ + if(attribute->std == ATTR_STD_NONE) { + remove(attribute->name); + } + else { + remove(attribute->std); + } +} + void AttributeSet::resize(bool reserve_only) { foreach(Attribute& attr, attributes) { @@ -514,9 +535,23 @@ void AttributeSet::resize(bool reserve_only) } } -void AttributeSet::clear() +void AttributeSet::clear(bool preserve_voxel_data) { - attributes.clear(); + if(preserve_voxel_data) { + list<Attribute>::iterator it; + + for(it = attributes.begin(); it != attributes.end();) { + if(it->element == ATTR_ELEMENT_VOXEL || it->std == ATTR_STD_GENERATED_TRANSFORM) { + it++; + } + else { + attributes.erase(it++); + } + } + } + else { + attributes.clear(); + } } /* AttributeRequest */ @@ -598,9 +633,11 @@ bool AttributeRequestSet::modified(const AttributeRequestSet& other) void AttributeRequestSet::add(ustring name) { - foreach(AttributeRequest& req, requests) - if(req.name == name) + foreach(AttributeRequest& req, requests) { + if(req.name == name) { return; + } + } requests.push_back(AttributeRequest(name)); } @@ -624,6 +661,22 @@ void AttributeRequestSet::add(AttributeRequestSet& reqs) } } +void AttributeRequestSet::add_standard(ustring name) +{ + if(!name) { + return; + } + + AttributeStandard std = Attribute::name_standard(name.c_str()); + + if(std) { + add(std); + } + else { + add(name); + } +} + bool AttributeRequestSet::find(ustring name) { foreach(AttributeRequest& req, requests) diff --git a/intern/cycles/render/attribute.h b/intern/cycles/render/attribute.h index f4538c76369..5cb6c75aab2 100644 --- a/intern/cycles/render/attribute.h +++ b/intern/cycles/render/attribute.h @@ -17,12 +17,12 @@ #ifndef __ATTRIBUTE_H__ #define __ATTRIBUTE_H__ -#include "kernel_types.h" +#include "kernel/kernel_types.h" -#include "util_list.h" -#include "util_param.h" -#include "util_types.h" -#include "util_vector.h" +#include "util/util_list.h" +#include "util/util_param.h" +#include "util/util_types.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -120,8 +120,10 @@ public: Attribute *find(AttributeRequest& req); + void remove(Attribute *attribute); + void resize(bool reserve_only = false); - void clear(); + void clear(bool preserve_voxel_data = false); }; /* AttributeRequest @@ -157,6 +159,7 @@ public: void add(ustring name); void add(AttributeStandard std); void add(AttributeRequestSet& reqs); + void add_standard(ustring name); bool find(ustring name); bool find(AttributeStandard std); diff --git a/intern/cycles/render/background.cpp b/intern/cycles/render/background.cpp index 8d7d7b847fd..df3b65be110 100644 --- a/intern/cycles/render/background.cpp +++ b/intern/cycles/render/background.cpp @@ -14,17 +14,17 @@ * limitations under the License. */ -#include "background.h" -#include "device.h" -#include "integrator.h" -#include "graph.h" -#include "nodes.h" -#include "scene.h" -#include "shader.h" - -#include "util_foreach.h" -#include "util_math.h" -#include "util_types.h" +#include "render/background.h" +#include "device/device.h" +#include "render/integrator.h" +#include "render/graph.h" +#include "render/nodes.h" +#include "render/scene.h" +#include "render/shader.h" + +#include "util/util_foreach.h" +#include "util/util_math.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN @@ -38,7 +38,10 @@ NODE_DEFINE(Background) SOCKET_BOOLEAN(use_shader, "Use Shader", true); SOCKET_BOOLEAN(use_ao, "Use AO", false); SOCKET_UINT(visibility, "Visibility", PATH_RAY_ALL_VISIBILITY); + SOCKET_BOOLEAN(transparent, "Transparent", false); + SOCKET_BOOLEAN(transparent_glass, "Transparent Glass", false); + SOCKET_FLOAT(transparent_roughness_threshold, "Transparent Roughness Threshold", 0.0f); SOCKET_NODE(shader, "Shader", &Shader::node_type); @@ -74,18 +77,22 @@ void Background::device_update(Device *device, DeviceScene *dscene, Scene *scene /* set shader index and transparent option */ KernelBackground *kbackground = &dscene->data.background; - if(use_ao) { - kbackground->ao_factor = ao_factor; - kbackground->ao_distance = ao_distance; - } - else { - kbackground->ao_factor = 0.0f; - kbackground->ao_distance = FLT_MAX; - } + kbackground->ao_factor = (use_ao)? ao_factor: 0.0f; + kbackground->ao_bounces_factor = ao_factor; + kbackground->ao_distance = ao_distance; kbackground->transparent = transparent; kbackground->surface_shader = scene->shader_manager->get_shader_id(bg_shader); + if(transparent && transparent_glass) { + /* Square twice, once for principled BSDF convention, and once for + * faster comparison in kernel with anisotropic roughness. */ + kbackground->transparent_roughness_squared_threshold = sqr(sqr(transparent_roughness_threshold)); + } + else { + kbackground->transparent_roughness_squared_threshold = -1.0f; + } + if(bg_shader->has_volume) kbackground->volume_shader = kbackground->surface_shader; else diff --git a/intern/cycles/render/background.h b/intern/cycles/render/background.h index 8029c6a9e80..145c05f1c18 100644 --- a/intern/cycles/render/background.h +++ b/intern/cycles/render/background.h @@ -17,9 +17,9 @@ #ifndef __BACKGROUND_H__ #define __BACKGROUND_H__ -#include "node.h" +#include "graph/node.h" -#include "util_types.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN @@ -30,7 +30,7 @@ class Shader; class Background : public Node { public: - NODE_DECLARE; + NODE_DECLARE float ao_factor; float ao_distance; @@ -42,6 +42,9 @@ public: Shader *shader; bool transparent; + bool transparent_glass; + float transparent_roughness_threshold; + bool need_update; Background(); diff --git a/intern/cycles/render/bake.cpp b/intern/cycles/render/bake.cpp index d9a297002c6..927e04abc7f 100644 --- a/intern/cycles/render/bake.cpp +++ b/intern/cycles/render/bake.cpp @@ -14,8 +14,13 @@ * limitations under the License. */ -#include "bake.h" -#include "integrator.h" +#include "render/bake.h" +#include "render/mesh.h" +#include "render/object.h" +#include "render/shader.h" +#include "render/integrator.h" + +#include "util/util_foreach.h" CCL_NAMESPACE_BEGIN @@ -135,7 +140,7 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre { size_t num_pixels = bake_data->size(); - int num_samples = is_aa_pass(shader_type)? scene->integrator->aa_samples : 1; + int num_samples = aa_samples(scene, bake_data, shader_type); /* calculate the total pixel samples for the progress bar */ total_pixel_samples = 0; @@ -146,12 +151,16 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre progress.reset_sample(); progress.set_total_pixel_samples(total_pixel_samples); + /* needs to be up to date for baking specific AA samples */ + dscene->data.integrator.aa_samples = num_samples; + device->const_copy_to("__data", &dscene->data, sizeof(dscene->data)); + for(size_t shader_offset = 0; shader_offset < num_pixels; shader_offset += m_shader_limit) { size_t shader_size = (size_t)fminf(num_pixels - shader_offset, m_shader_limit); /* setup input for device task */ - device_vector<uint4> d_input; - uint4 *d_input_data = d_input.resize(shader_size * 2); + device_vector<uint4> d_input(device, "bake_input", MEM_READ_ONLY); + uint4 *d_input_data = d_input.alloc(shader_size * 2); size_t d_input_size = 0; for(size_t i = shader_offset; i < (shader_offset + shader_size); i++) { @@ -165,15 +174,10 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre } /* run device task */ - device_vector<float4> d_output; - d_output.resize(shader_size); - - /* needs to be up to data for attribute access */ - device->const_copy_to("__data", &dscene->data, sizeof(dscene->data)); - - device->mem_alloc(d_input, MEM_READ_ONLY); - device->mem_copy_to(d_input); - device->mem_alloc(d_output, MEM_READ_WRITE); + device_vector<float4> d_output(device, "bake_output", MEM_READ_WRITE); + d_output.alloc(shader_size); + d_output.zero_to_device(); + d_input.copy_to_device(); DeviceTask task(DeviceTask::SHADER); task.shader_input = d_input.device_pointer; @@ -191,20 +195,19 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre device->task_wait(); if(progress.get_cancel()) { - device->mem_free(d_input); - device->mem_free(d_output); + d_input.free(); + d_output.free(); m_is_baking = false; return false; } - device->mem_copy_from(d_output, 0, 1, d_output.size(), sizeof(float4)); - device->mem_free(d_input); - device->mem_free(d_output); + d_output.copy_from_device(0, 1, d_output.size()); + d_input.free(); /* read result */ int k = 0; - float4 *offset = (float4*)d_output.data_pointer; + float4 *offset = d_output.data(); size_t depth = 4; for(size_t i=shader_offset; i < (shader_offset + shader_size); i++) { @@ -217,6 +220,8 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre } } } + + d_output.free(); } m_is_baking = false; @@ -240,14 +245,27 @@ void BakeManager::device_free(Device * /*device*/, DeviceScene * /*dscene*/) { } -bool BakeManager::is_aa_pass(ShaderEvalType type) +int BakeManager::aa_samples(Scene *scene, BakeData *bake_data, ShaderEvalType type) { - switch(type) { - case SHADER_EVAL_UV: - case SHADER_EVAL_NORMAL: - return false; - default: - return true; + if(type == SHADER_EVAL_UV || type == SHADER_EVAL_ROUGHNESS) { + return 1; + } + else if(type == SHADER_EVAL_NORMAL) { + /* Only antialias normal if mesh has bump mapping. */ + Object *object = scene->objects[bake_data->object()]; + + if(object->mesh) { + foreach(Shader *shader, object->mesh->used_shaders) { + if(shader->has_bump) { + return scene->integrator->aa_samples; + } + } + } + + return 1; + } + else { + return scene->integrator->aa_samples; } } diff --git a/intern/cycles/render/bake.h b/intern/cycles/render/bake.h index aed9c5a8e75..fbb8686b8f6 100644 --- a/intern/cycles/render/bake.h +++ b/intern/cycles/render/bake.h @@ -17,11 +17,11 @@ #ifndef __BAKE_H__ #define __BAKE_H__ -#include "device.h" -#include "scene.h" +#include "device/device.h" +#include "render/scene.h" -#include "util_progress.h" -#include "util_vector.h" +#include "util/util_progress.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -69,7 +69,7 @@ public: void device_free(Device *device, DeviceScene *dscene); static int shader_type_to_pass_filter(ShaderEvalType type, const int pass_filter); - static bool is_aa_pass(ShaderEvalType type); + static int aa_samples(Scene *scene, BakeData *bake_data, ShaderEvalType type); bool need_update; diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp index f1692712d61..6f560380b40 100644 --- a/intern/cycles/render/buffers.cpp +++ b/intern/cycles/render/buffers.cpp @@ -16,17 +16,15 @@ #include <stdlib.h> -#include "buffers.h" -#include "device.h" - -#include "util_debug.h" -#include "util_foreach.h" -#include "util_hash.h" -#include "util_image.h" -#include "util_math.h" -#include "util_opengl.h" -#include "util_time.h" -#include "util_types.h" +#include "render/buffers.h" +#include "device/device.h" + +#include "util/util_foreach.h" +#include "util/util_hash.h" +#include "util/util_math.h" +#include "util/util_opengl.h" +#include "util/util_time.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN @@ -42,6 +40,9 @@ BufferParams::BufferParams() full_width = 0; full_height = 0; + denoising_data_pass = false; + denoising_clean_pass = false; + Pass::add(PASS_COMBINED, passes); } @@ -68,10 +69,25 @@ int BufferParams::get_passes_size() for(size_t i = 0; i < passes.size(); i++) size += passes[i].components; - + + if(denoising_data_pass) { + size += DENOISING_PASS_SIZE_BASE; + if(denoising_clean_pass) size += DENOISING_PASS_SIZE_CLEAN; + } + return align_up(size, 4); } +int BufferParams::get_denoising_offset() +{ + int offset = 0; + + for(size_t i = 0; i < passes.size(); i++) + offset += passes[i].components; + + return offset; +} + /* Render Buffer Task */ RenderTile::RenderTile() @@ -90,66 +106,124 @@ RenderTile::RenderTile() stride = 0; buffer = 0; - rng_state = 0; buffers = NULL; } /* Render Buffers */ -RenderBuffers::RenderBuffers(Device *device_) +RenderBuffers::RenderBuffers(Device *device) +: buffer(device, "RenderBuffers", MEM_READ_WRITE), + map_neighbor_copied(false), render_time(0.0f) { - device = device_; } RenderBuffers::~RenderBuffers() { - device_free(); + buffer.free(); } -void RenderBuffers::device_free() +void RenderBuffers::reset(BufferParams& params_) { - if(buffer.device_pointer) { - device->mem_free(buffer); - buffer.clear(); - } + params = params_; - if(rng_state.device_pointer) { - device->mem_free(rng_state); - rng_state.clear(); - } + /* re-allocate buffer */ + buffer.alloc(params.width*params.height*params.get_passes_size()); + buffer.zero_to_device(); } -void RenderBuffers::reset(Device *device, BufferParams& params_) +void RenderBuffers::zero() { - params = params_; + buffer.zero_to_device(); +} - /* free existing buffers */ - device_free(); - - /* allocate buffer */ - buffer.resize(params.width*params.height*params.get_passes_size()); - device->mem_alloc(buffer, MEM_READ_WRITE); - device->mem_zero(buffer); +bool RenderBuffers::copy_from_device() +{ + if(!buffer.device_pointer) + return false; - /* allocate rng state */ - rng_state.resize(params.width, params.height); + buffer.copy_from_device(0, params.width * params.get_passes_size(), params.height); - device->mem_alloc(rng_state, MEM_READ_WRITE); + return true; } -bool RenderBuffers::copy_from_device() +bool RenderBuffers::get_denoising_pass_rect(int offset, float exposure, int sample, int components, float *pixels) { - if(!buffer.device_pointer) + if(buffer.data() == NULL) { return false; + } + + float invsample = 1.0f/sample; + float scale = invsample; + bool variance = (offset == DENOISING_PASS_NORMAL_VAR) || + (offset == DENOISING_PASS_ALBEDO_VAR) || + (offset == DENOISING_PASS_DEPTH_VAR) || + (offset == DENOISING_PASS_COLOR_VAR); + + if(offset == DENOISING_PASS_COLOR) { + scale *= exposure; + } + else if(offset == DENOISING_PASS_COLOR_VAR) { + scale *= exposure*exposure; + } + + offset += params.get_denoising_offset(); + int pass_stride = params.get_passes_size(); + int size = params.width*params.height; - device->mem_copy_from(buffer, 0, params.width, params.height, params.get_passes_size()*sizeof(float)); + if(variance) { + /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance + * update does not work efficiently with atomics in the kernel. */ + int mean_offset = offset - components; + float *mean = buffer.data() + mean_offset; + float *var = buffer.data() + offset; + assert(mean_offset >= 0); + + if(components == 1) { + for(int i = 0; i < size; i++, mean += pass_stride, var += pass_stride, pixels++) { + pixels[0] = max(0.0f, var[0] - mean[0]*mean[0]*invsample)*scale; + } + } + else if(components == 3) { + for(int i = 0; i < size; i++, mean += pass_stride, var += pass_stride, pixels += 3) { + pixels[0] = max(0.0f, var[0] - mean[0]*mean[0]*invsample)*scale; + pixels[1] = max(0.0f, var[1] - mean[1]*mean[1]*invsample)*scale; + pixels[2] = max(0.0f, var[2] - mean[2]*mean[2]*invsample)*scale; + } + } + else { + return false; + } + } + else { + float *in = buffer.data() + offset; + + if(components == 1) { + for(int i = 0; i < size; i++, in += pass_stride, pixels++) { + pixels[0] = in[0]*scale; + } + } + else if(components == 3) { + for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) { + pixels[0] = in[0]*scale; + pixels[1] = in[1]*scale; + pixels[2] = in[2]*scale; + } + } + else { + return false; + } + } return true; } bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int components, float *pixels) { + if(buffer.data() == NULL) { + return false; + } + int pass_offset = 0; for(size_t j = 0; j < params.passes.size(); j++) { @@ -160,7 +234,7 @@ bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int continue; } - float *in = (float*)buffer.data_pointer + pass_offset; + float *in = buffer.data() + pass_offset; int pass_stride = params.get_passes_size(); float scale = (pass.filter)? 1.0f/(float)sample: 1.0f; @@ -168,10 +242,17 @@ bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int int size = params.width*params.height; - if(components == 1) { + if(components == 1 && type == PASS_RENDER_TIME) { + /* Render time is not stored by kernel, but measured per tile. */ + float val = (float) (1000.0 * render_time/(params.width * params.height * sample)); + for(int i = 0; i < size; i++, pixels++) { + pixels[0] = val; + } + } + else if(components == 1) { assert(pass.components == components); - /* scalar */ + /* Scalar */ if(type == PASS_DEPTH) { for(int i = 0; i < size; i++, in += pass_stride, pixels++) { float f = *in; @@ -227,7 +308,7 @@ bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int pass_offset += color_pass.components; } - float *in_divide = (float*)buffer.data_pointer + pass_offset; + float *in_divide = buffer.data() + pass_offset; for(int i = 0; i < size; i++, in += pass_stride, in_divide += pass_stride, pixels += 3) { float3 f = make_float3(in[0], in[1], in[2]); @@ -276,7 +357,7 @@ bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int pass_offset += color_pass.components; } - float *in_weight = (float*)buffer.data_pointer + pass_offset; + float *in_weight = buffer.data() + pass_offset; for(int i = 0; i < size; i++, in += pass_stride, in_weight += pass_stride, pixels += 4) { float4 f = make_float4(in[0], in[1], in[2], in[3]); @@ -311,50 +392,35 @@ bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int /* Display Buffer */ -DisplayBuffer::DisplayBuffer(Device *device_, bool linear) +DisplayBuffer::DisplayBuffer(Device *device, bool linear) +: draw_width(0), + draw_height(0), + transparent(true), /* todo: determine from background */ + half_float(linear), + rgba_byte(device, "display buffer byte"), + rgba_half(device, "display buffer half") { - device = device_; - draw_width = 0; - draw_height = 0; - transparent = true; /* todo: determine from background */ - half_float = linear; } DisplayBuffer::~DisplayBuffer() { - device_free(); + rgba_byte.free(); + rgba_half.free(); } -void DisplayBuffer::device_free() -{ - if(rgba_byte.device_pointer) { - device->pixels_free(rgba_byte); - rgba_byte.clear(); - } - if(rgba_half.device_pointer) { - device->pixels_free(rgba_half); - rgba_half.clear(); - } -} - -void DisplayBuffer::reset(Device *device, BufferParams& params_) +void DisplayBuffer::reset(BufferParams& params_) { draw_width = 0; draw_height = 0; params = params_; - /* free existing buffers */ - device_free(); - /* allocate display pixels */ if(half_float) { - rgba_half.resize(params.width, params.height); - device->pixels_alloc(rgba_half); + rgba_half.alloc_to_device(params.width, params.height); } else { - rgba_byte.resize(params.width, params.height); - device->pixels_alloc(rgba_byte); + rgba_byte.alloc_to_device(params.width, params.height); } } @@ -369,7 +435,8 @@ void DisplayBuffer::draw_set(int width, int height) void DisplayBuffer::draw(Device *device, const DeviceDrawParams& draw_params) { if(draw_width != 0 && draw_height != 0) { - device_memory& rgba = rgba_data(); + device_memory& rgba = (half_float)? (device_memory&)rgba_half: + (device_memory&)rgba_byte; device->draw_pixels(rgba, 0, draw_width, draw_height, params.full_x, params.full_y, params.width, params.height, transparent, draw_params); } @@ -380,47 +447,5 @@ bool DisplayBuffer::draw_ready() return (draw_width != 0 && draw_height != 0); } -void DisplayBuffer::write(Device *device, const string& filename) -{ - int w = draw_width; - int h = draw_height; - - if(w == 0 || h == 0) - return; - - if(half_float) - return; - - /* read buffer from device */ - device_memory& rgba = rgba_data(); - device->pixels_copy_from(rgba, 0, w, h); - - /* write image */ - ImageOutput *out = ImageOutput::create(filename); - ImageSpec spec(w, h, 4, TypeDesc::UINT8); - int scanlinesize = w*4*sizeof(uchar); - - out->open(filename, spec); - - /* conversion for different top/bottom convention */ - out->write_image(TypeDesc::UINT8, - (uchar*)rgba.data_pointer + (h-1)*scanlinesize, - AutoStride, - -scanlinesize, - AutoStride); - - out->close(); - - delete out; -} - -device_memory& DisplayBuffer::rgba_data() -{ - if(half_float) - return rgba_half; - else - return rgba_byte; -} - CCL_NAMESPACE_END diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h index c9c2a21079a..dfc98fe2061 100644 --- a/intern/cycles/render/buffers.h +++ b/intern/cycles/render/buffers.h @@ -17,16 +17,16 @@ #ifndef __BUFFERS_H__ #define __BUFFERS_H__ -#include "device_memory.h" +#include "device/device_memory.h" -#include "film.h" +#include "render/film.h" -#include "kernel_types.h" +#include "kernel/kernel_types.h" -#include "util_half.h" -#include "util_string.h" -#include "util_thread.h" -#include "util_types.h" +#include "util/util_half.h" +#include "util/util_string.h" +#include "util/util_thread.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN @@ -51,6 +51,9 @@ public: /* passes */ array<Pass> passes; + bool denoising_data_pass; + /* If only some light path types should be denoised, an additional pass is needed. */ + bool denoising_clean_pass; /* functions */ BufferParams(); @@ -59,6 +62,7 @@ public: bool modified(const BufferParams& params); void add_pass(PassType type); int get_passes_size(); + int get_denoising_offset(); }; /* Render Buffers */ @@ -70,21 +74,18 @@ public: /* float buffer */ device_vector<float> buffer; - /* random number generator state */ - device_vector<uint> rng_state; + bool map_neighbor_copied; + double render_time; explicit RenderBuffers(Device *device); ~RenderBuffers(); - void reset(Device *device, BufferParams& params); + void reset(BufferParams& params); + void zero(); bool copy_from_device(); bool get_pass_rect(PassType type, float exposure, int sample, int components, float *pixels); - -protected: - void device_free(); - - Device *device; + bool get_denoising_pass_rect(int offset, float exposure, int sample, int components, float *pixels); }; /* Display Buffer @@ -105,25 +106,17 @@ public: /* use half float? */ bool half_float; /* byte buffer for converted result */ - device_vector<uchar4> rgba_byte; - device_vector<half4> rgba_half; + device_pixels<uchar4> rgba_byte; + device_pixels<half4> rgba_half; DisplayBuffer(Device *device, bool linear = false); ~DisplayBuffer(); - void reset(Device *device, BufferParams& params); - void write(Device *device, const string& filename); + void reset(BufferParams& params); void draw_set(int width, int height); void draw(Device *device, const DeviceDrawParams& draw_params); bool draw_ready(); - - device_memory& rgba_data(); - -protected: - void device_free(); - - Device *device; }; /* Render Tile @@ -131,6 +124,9 @@ protected: class RenderTile { public: + typedef enum { PATH_TRACE, DENOISE } Task; + + Task task; int x, y, w, h; int start_sample; int num_samples; @@ -138,9 +134,9 @@ public: int resolution; int offset; int stride; + int tile_index; device_ptr buffer; - device_ptr rng_state; RenderBuffers *buffers; diff --git a/intern/cycles/render/camera.cpp b/intern/cycles/render/camera.cpp index c8c51ec96d2..38936ffc094 100644 --- a/intern/cycles/render/camera.cpp +++ b/intern/cycles/render/camera.cpp @@ -14,18 +14,27 @@ * limitations under the License. */ -#include "camera.h" -#include "mesh.h" -#include "object.h" -#include "scene.h" -#include "tables.h" - -#include "device.h" - -#include "util_foreach.h" -#include "util_function.h" -#include "util_math_cdf.h" -#include "util_vector.h" +#include "render/camera.h" +#include "render/mesh.h" +#include "render/object.h" +#include "render/scene.h" +#include "render/tables.h" + +#include "device/device.h" + +#include "util/util_foreach.h" +#include "util/util_function.h" +#include "util/util_math_cdf.h" +#include "util/util_vector.h" + +/* needed for calculating differentials */ +#include "kernel/kernel_compat_cpu.h" +#include "kernel/split/kernel_split_data.h" +#include "kernel/kernel_globals.h" +#include "kernel/kernel_projection.h" +#include "kernel/kernel_differential.h" +#include "kernel/kernel_montecarlo.h" +#include "kernel/kernel_camera.h" CCL_NAMESPACE_BEGIN @@ -73,6 +82,7 @@ NODE_DEFINE(Camera) SOCKET_FLOAT(bladesrotation, "Blades Rotation", 0.0f); SOCKET_TRANSFORM(matrix, "Matrix", transform_identity()); + SOCKET_TRANSFORM_ARRAY(motion, "Motion", array<Transform>()); SOCKET_FLOAT(aperture_ratio, "Aperture Ratio", 1.0f); @@ -128,6 +138,8 @@ NODE_DEFINE(Camera) SOCKET_FLOAT(border.bottom, "Border Bottom", 0); SOCKET_FLOAT(border.top, "Border Top", 0); + SOCKET_FLOAT(offscreen_dicing_scale, "Offscreen Dicing Scale", 1.0f); + return type; } @@ -140,9 +152,6 @@ Camera::Camera() height = 512; resolution = 1; - motion.pre = transform_identity(); - motion.post = transform_identity(); - use_motion = false; use_perspective_motion = false; shutter_curve.resize(RAMP_TABLE_SIZE); @@ -152,12 +161,12 @@ Camera::Camera() compute_auto_viewplane(); - screentoworld = transform_identity(); - rastertoworld = transform_identity(); - ndctoworld = transform_identity(); - rastertocamera = transform_identity(); + screentoworld = projection_identity(); + rastertoworld = projection_identity(); + ndctoworld = projection_identity(); + rastertocamera = projection_identity(); cameratoworld = transform_identity(); - worldtoraster = transform_identity(); + worldtoraster = projection_identity(); dx = make_float3(0.0f, 0.0f, 0.0f); dy = make_float3(0.0f, 0.0f, 0.0f); @@ -166,6 +175,8 @@ Camera::Camera() need_device_update = true; need_flags_update = true; previous_need_motion = -1; + + memset(&kernel_camera, 0, sizeof(kernel_camera)); } Camera::~Camera() @@ -197,8 +208,17 @@ void Camera::compute_auto_viewplane() } } -void Camera::update() +void Camera::update(Scene *scene) { + Scene::MotionType need_motion = scene->need_motion(); + + if(previous_need_motion != need_motion) { + /* scene's motion model could have been changed since previous device + * camera update this could happen for example in case when one render + * layer has got motion pass and another not */ + need_device_update = true; + } + if(!need_update) return; @@ -219,18 +239,18 @@ void Camera::update() Transform full_rastertoscreen = transform_inverse(full_screentoraster); /* screen to camera */ - Transform cameratoscreen; + ProjectionTransform cameratoscreen; if(type == CAMERA_PERSPECTIVE) - cameratoscreen = transform_perspective(fov, nearclip, farclip); + cameratoscreen = projection_perspective(fov, nearclip, farclip); else if(type == CAMERA_ORTHOGRAPHIC) - cameratoscreen = transform_orthographic(nearclip, farclip); + cameratoscreen = projection_orthographic(nearclip, farclip); else - cameratoscreen = transform_identity(); + cameratoscreen = projection_identity(); - Transform screentocamera = transform_inverse(cameratoscreen); + ProjectionTransform screentocamera = projection_inverse(cameratoscreen); rastertocamera = screentocamera * rastertoscreen; - Transform full_rastertocamera = screentocamera * full_rastertoscreen; + ProjectionTransform full_rastertocamera = screentocamera * full_rastertoscreen; cameratoraster = screentoraster * cameratoscreen; cameratoworld = matrix; @@ -248,10 +268,10 @@ void Camera::update() /* differentials */ if(type == CAMERA_ORTHOGRAPHIC) { - dx = transform_direction(&rastertocamera, make_float3(1, 0, 0)); - dy = transform_direction(&rastertocamera, make_float3(0, 1, 0)); - full_dx = transform_direction(&full_rastertocamera, make_float3(1, 0, 0)); - full_dy = transform_direction(&full_rastertocamera, make_float3(0, 1, 0)); + dx = transform_perspective_direction(&rastertocamera, make_float3(1, 0, 0)); + dy = transform_perspective_direction(&rastertocamera, make_float3(0, 1, 0)); + full_dx = transform_perspective_direction(&full_rastertocamera, make_float3(1, 0, 0)); + full_dy = transform_perspective_direction(&full_rastertocamera, make_float3(0, 1, 0)); } else if(type == CAMERA_PERSPECTIVE) { dx = transform_perspective(&rastertocamera, make_float3(1, 0, 0)) - @@ -273,45 +293,15 @@ void Camera::update() full_dx = transform_direction(&cameratoworld, full_dx); full_dy = transform_direction(&cameratoworld, full_dy); - /* TODO(sergey): Support other types of camera. */ if(type == CAMERA_PERSPECTIVE) { - /* TODO(sergey): Move to an utility function and de-duplicate with - * calculation above. - */ - Transform screentocamera_pre = - transform_inverse(transform_perspective(fov_pre, - nearclip, - farclip)); - Transform screentocamera_post = - transform_inverse(transform_perspective(fov_post, - nearclip, - farclip)); - perspective_motion.pre = screentocamera_pre * rastertoscreen; - perspective_motion.post = screentocamera_post * rastertoscreen; - } + float3 v = transform_perspective(&full_rastertocamera, make_float3(full_width, full_height, 1.0f)); - need_update = false; - need_device_update = true; - need_flags_update = true; -} - -void Camera::device_update(Device *device, DeviceScene *dscene, Scene *scene) -{ - Scene::MotionType need_motion = scene->need_motion(device->info.advanced_shading); - - update(); - - if(previous_need_motion != need_motion) { - /* scene's motion model could have been changed since previous device - * camera update this could happen for example in case when one render - * layer has got motion pass and another not */ - need_device_update = true; + frustum_right_normal = normalize(make_float3(v.z, 0.0f, -v.x)); + frustum_top_normal = normalize(make_float3(0.0f, v.z, -v.y)); } - if(!need_device_update) - return; - - KernelCamera *kcam = &dscene->data.cam; + /* Compute kernel camera data. */ + KernelCamera *kcam = &kernel_camera; /* store matrices */ kcam->screentoworld = screentoworld; @@ -322,46 +312,68 @@ void Camera::device_update(Device *device, DeviceScene *dscene, Scene *scene) kcam->worldtoscreen = worldtoscreen; kcam->worldtoraster = worldtoraster; kcam->worldtondc = worldtondc; + kcam->ndctoworld = ndctoworld; /* camera motion */ - kcam->have_motion = 0; + kcam->num_motion_steps = 0; kcam->have_perspective_motion = 0; + kernel_camera_motion.clear(); + + /* Test if any of the transforms are actually different. */ + bool have_motion = false; + for(size_t i = 0; i < motion.size(); i++) { + have_motion = have_motion || motion[i] != matrix; + } if(need_motion == Scene::MOTION_PASS) { /* TODO(sergey): Support perspective (zoom, fov) motion. */ if(type == CAMERA_PANORAMA) { - if(use_motion) { - kcam->motion.pre = transform_inverse(motion.pre); - kcam->motion.post = transform_inverse(motion.post); + if(have_motion) { + kcam->motion_pass_pre = transform_inverse(motion[0]); + kcam->motion_pass_post = transform_inverse(motion[motion.size()-1]); } else { - kcam->motion.pre = kcam->worldtocamera; - kcam->motion.post = kcam->worldtocamera; + kcam->motion_pass_pre = kcam->worldtocamera; + kcam->motion_pass_post = kcam->worldtocamera; } } else { - if(use_motion) { - kcam->motion.pre = cameratoraster * transform_inverse(motion.pre); - kcam->motion.post = cameratoraster * transform_inverse(motion.post); + if(have_motion) { + kcam->perspective_pre = cameratoraster * transform_inverse(motion[0]); + kcam->perspective_post = cameratoraster * transform_inverse(motion[motion.size()-1]); } else { - kcam->motion.pre = worldtoraster; - kcam->motion.post = worldtoraster; + kcam->perspective_pre = worldtoraster; + kcam->perspective_post = worldtoraster; } } } -#ifdef __CAMERA_MOTION__ else if(need_motion == Scene::MOTION_BLUR) { - if(use_motion) { - transform_motion_decompose((DecompMotionTransform*)&kcam->motion, &motion, &matrix); - kcam->have_motion = 1; + if(have_motion) { + kernel_camera_motion.resize(motion.size()); + transform_motion_decompose(kernel_camera_motion.data(), motion.data(), motion.size()); + kcam->num_motion_steps = motion.size(); } - if(use_perspective_motion) { - kcam->perspective_motion = perspective_motion; + + /* TODO(sergey): Support other types of camera. */ + if(use_perspective_motion && type == CAMERA_PERSPECTIVE) { + /* TODO(sergey): Move to an utility function and de-duplicate with + * calculation above. + */ + ProjectionTransform screentocamera_pre = + projection_inverse(projection_perspective(fov_pre, + nearclip, + farclip)); + ProjectionTransform screentocamera_post = + projection_inverse(projection_perspective(fov_post, + nearclip, + farclip)); + + kcam->perspective_pre = screentocamera_pre * rastertoscreen; + kcam->perspective_post = screentocamera_post * rastertoscreen; kcam->have_perspective_motion = 1; } } -#endif /* depth of field */ kcam->aperturesize = aperturesize; @@ -370,26 +382,8 @@ void Camera::device_update(Device *device, DeviceScene *dscene, Scene *scene) kcam->bladesrotation = bladesrotation; /* motion blur */ -#ifdef __CAMERA_MOTION__ kcam->shuttertime = (need_motion == Scene::MOTION_BLUR) ? shuttertime: -1.0f; - scene->lookup_tables->remove_table(&shutter_table_offset); - if(need_motion == Scene::MOTION_BLUR) { - vector<float> shutter_table; - util_cdf_inverted(SHUTTER_TABLE_SIZE, - 0.0f, - 1.0f, - function_bind(shutter_curve_eval, _1, shutter_curve), - false, - shutter_table); - shutter_table_offset = scene->lookup_tables->add_table(dscene, - shutter_table); - kcam->shutter_table_offset = (int)shutter_table_offset; - } -#else - kcam->shuttertime = -1.0f; -#endif - /* type */ kcam->type = type; @@ -450,9 +444,49 @@ void Camera::device_update(Device *device, DeviceScene *dscene, Scene *scene) kcam->rolling_shutter_type = rolling_shutter_type; kcam->rolling_shutter_duration = rolling_shutter_duration; + /* Set further update flags */ + need_update = false; + need_device_update = true; + need_flags_update = true; previous_need_motion = need_motion; } +void Camera::device_update(Device * /* device */, + DeviceScene *dscene, + Scene *scene) +{ + update(scene); + + if(!need_device_update) + return; + + scene->lookup_tables->remove_table(&shutter_table_offset); + if(kernel_camera.shuttertime != -1.0f) { + vector<float> shutter_table; + util_cdf_inverted(SHUTTER_TABLE_SIZE, + 0.0f, + 1.0f, + function_bind(shutter_curve_eval, _1, shutter_curve), + false, + shutter_table); + shutter_table_offset = scene->lookup_tables->add_table(dscene, + shutter_table); + kernel_camera.shutter_table_offset = (int)shutter_table_offset; + } + + dscene->data.cam = kernel_camera; + + size_t num_motion_steps = kernel_camera_motion.size(); + if(num_motion_steps) { + DecomposedTransform *camera_motion = dscene->camera_motion.alloc(num_motion_steps); + memcpy(camera_motion, kernel_camera_motion.data(), sizeof(*camera_motion) * num_motion_steps); + dscene->camera_motion.copy_to_device(); + } + else { + dscene->camera_motion.free(); + } +} + void Camera::device_update_volume(Device * /*device*/, DeviceScene *dscene, Scene *scene) @@ -477,10 +511,11 @@ void Camera::device_update_volume(Device * /*device*/, } void Camera::device_free(Device * /*device*/, - DeviceScene * /*dscene*/, + DeviceScene *dscene, Scene *scene) { scene->lookup_tables->remove_table(&shutter_table_offset); + dscene->camera_motion.free(); } bool Camera::modified(const Camera& cam) @@ -491,7 +526,6 @@ bool Camera::modified(const Camera& cam) bool Camera::motion_modified(const Camera& cam) { return !((motion == cam.motion) && - (use_motion == cam.use_motion) && (use_perspective_motion == cam.use_perspective_motion)); } @@ -581,8 +615,27 @@ BoundBox Camera::viewplane_bounds_get() float Camera::world_to_raster_size(float3 P) { + float res = 1.0f; + if(type == CAMERA_ORTHOGRAPHIC) { - return min(len(full_dx), len(full_dy)); + res = min(len(full_dx), len(full_dy)); + + if(offscreen_dicing_scale > 1.0f) { + float3 p = transform_point(&worldtocamera, P); + float3 v = transform_perspective(&rastertocamera, make_float3(width, height, 0.0f)); + + /* Create point clamped to frustum */ + float3 c; + c.x = max(-v.x, min(v.x, p.x)); + c.y = max(-v.y, min(v.y, p.y)); + c.z = max(0.0f, p.z); + + float f_dist = len(p - c) / sqrtf((v.x*v.x+v.y*v.y)*0.5f); + + if(f_dist > 0.0f) { + res += res * f_dist * (offscreen_dicing_scale - 1.0f); + } + } } else if(type == CAMERA_PERSPECTIVE) { /* Calculate as if point is directly ahead of the camera. */ @@ -597,14 +650,121 @@ float Camera::world_to_raster_size(float3 P) /* dPdx */ float dist = len(transform_point(&worldtocamera, P)); float3 D = normalize(Ddiff); - return len(dist*dDdx - dot(dist*dDdx, D)*D); + res = len(dist*dDdx - dot(dist*dDdx, D)*D); + + /* Decent approx distance to frustum (doesn't handle corners correctly, but not that big of a deal) */ + float f_dist = 0.0f; + + if(offscreen_dicing_scale > 1.0f) { + float3 p = transform_point(&worldtocamera, P); + + /* Distance from the four planes */ + float r = dot(p, frustum_right_normal); + float t = dot(p, frustum_top_normal); + p = make_float3(-p.x, -p.y, p.z); + float l = dot(p, frustum_right_normal); + float b = dot(p, frustum_top_normal); + p = make_float3(-p.x, -p.y, p.z); + + if(r <= 0.0f && l <= 0.0f && t <= 0.0f && b <= 0.0f) { + /* Point is inside frustum */ + f_dist = 0.0f; + } + else if(r > 0.0f && l > 0.0f && t > 0.0f && b > 0.0f) { + /* Point is behind frustum */ + f_dist = len(p); + } + else { + /* Point may be behind or off to the side, need to check */ + float3 along_right = make_float3(-frustum_right_normal.z, 0.0f, frustum_right_normal.x); + float3 along_left = make_float3(frustum_right_normal.z, 0.0f, frustum_right_normal.x); + float3 along_top = make_float3(0.0f, -frustum_top_normal.z, frustum_top_normal.y); + float3 along_bottom = make_float3(0.0f, frustum_top_normal.z, frustum_top_normal.y); + + float dist[] = {r, l, t, b}; + float3 along[] = {along_right, along_left, along_top, along_bottom}; + + bool test_o = false; + + float *d = dist; + float3 *a = along; + for(int i = 0; i < 4; i++, d++, a++) { + /* Test if we should check this side at all */ + if(*d > 0.0f) { + if(dot(p, *a) >= 0.0f) { + /* We are in front of the back edge of this side of the frustum */ + f_dist = max(f_dist, *d); + } + else { + /* Possibly far enough behind the frustum to use distance to origin instead of edge */ + test_o = true; + } + } + } + + if(test_o) { + f_dist = (f_dist > 0) ? min(f_dist, len(p)) : len(p); + } + } + + if(f_dist > 0.0f) { + res += len(dDdx - dot(dDdx, D)*D) * f_dist * (offscreen_dicing_scale - 1.0f); + } + } } - else { - // TODO(mai): implement for CAMERA_PANORAMA - assert(!"pixel width calculation for panoramic projection not implemented yet"); + else if(type == CAMERA_PANORAMA) { + float3 D = transform_point(&worldtocamera, P); + float dist = len(D); + + Ray ray; + + /* Distortion can become so great that the results become meaningless, there + * may be a better way to do this, but calculating differentials from the + * point directly ahead seems to produce good enough results. */ +#if 0 + float2 dir = direction_to_panorama(&kernel_camera, kernel_camera_motion.data(), normalize(D)); + float3 raster = transform_perspective(&cameratoraster, make_float3(dir.x, dir.y, 0.0f)); + + ray.t = 1.0f; + camera_sample_panorama(&kernel_camera, kernel_camera_motion.data(), raster.x, raster.y, 0.0f, 0.0f, &ray); + if(ray.t == 0.0f) { + /* No differentials, just use from directly ahead. */ + camera_sample_panorama(&kernel_camera, kernel_camera_motion.data(), 0.5f*width, 0.5f*height, 0.0f, 0.0f, &ray); + } +#else + camera_sample_panorama(&kernel_camera, kernel_camera_motion.data(), 0.5f*width, 0.5f*height, 0.0f, 0.0f, &ray); +#endif + + differential_transfer(&ray.dP, ray.dP, ray.D, ray.dD, ray.D, dist); + + return max(len(ray.dP.dx) * (float(width)/float(full_width)), + len(ray.dP.dy) * (float(height)/float(full_height))); + } + + return res; +} + +bool Camera::use_motion() const +{ + return motion.size() > 1; +} + +float Camera::motion_time(int step) const +{ + return (use_motion()) ? 2.0f * step / (motion.size() - 1) - 1.0f : 0.0f; +} + +int Camera::motion_step(float time) const +{ + if(use_motion()) { + for(int step = 0; step < motion.size(); step++) { + if(time == motion_time(step)) { + return step; + } + } } - return 1.0f; + return -1; } CCL_NAMESPACE_END diff --git a/intern/cycles/render/camera.h b/intern/cycles/render/camera.h index 141ef9cccef..37d05c01bd9 100644 --- a/intern/cycles/render/camera.h +++ b/intern/cycles/render/camera.h @@ -17,13 +17,14 @@ #ifndef __CAMERA_H__ #define __CAMERA_H__ -#include "kernel_types.h" +#include "kernel/kernel_types.h" -#include "node.h" +#include "graph/node.h" -#include "util_boundbox.h" -#include "util_transform.h" -#include "util_types.h" +#include "util/util_boundbox.h" +#include "util/util_projection.h" +#include "util/util_transform.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN @@ -39,7 +40,7 @@ class Scene; class Camera : public Node { public: - NODE_DECLARE; + NODE_DECLARE /* Specifies an offset for the shutter's time interval. */ enum MotionPosition { @@ -129,6 +130,8 @@ public: BoundBox2D viewplane; /* width and height change during preview, so we need these for calculating dice rates. */ int full_width, full_height; + /* controls how fast the dicing rate falls off for geometry out side of view */ + float offscreen_dicing_scale; /* border */ BoundBox2D border; @@ -138,24 +141,23 @@ public: Transform matrix; /* motion */ - MotionTransform motion; - bool use_motion, use_perspective_motion; + array<Transform> motion; + bool use_perspective_motion; float fov_pre, fov_post; - PerspectiveMotionTransform perspective_motion; /* computed camera parameters */ - Transform screentoworld; - Transform rastertoworld; - Transform ndctoworld; + ProjectionTransform screentoworld; + ProjectionTransform rastertoworld; + ProjectionTransform ndctoworld; Transform cameratoworld; - Transform worldtoraster; - Transform worldtoscreen; - Transform worldtondc; + ProjectionTransform worldtoraster; + ProjectionTransform worldtoscreen; + ProjectionTransform worldtondc; Transform worldtocamera; - Transform rastertocamera; - Transform cameratoraster; + ProjectionTransform rastertocamera; + ProjectionTransform cameratoraster; float3 dx; float3 dy; @@ -163,19 +165,26 @@ public: float3 full_dx; float3 full_dy; + float3 frustum_right_normal; + float3 frustum_top_normal; + /* update */ bool need_update; bool need_device_update; bool need_flags_update; int previous_need_motion; + /* Kernel camera data, copied here for dicing. */ + KernelCamera kernel_camera; + array<DecomposedTransform> kernel_camera_motion; + /* functions */ Camera(); ~Camera(); void compute_auto_viewplane(); - void update(); + void update(Scene *scene); void device_update(Device *device, DeviceScene *dscene, Scene *scene); void device_update_volume(Device *device, DeviceScene *dscene, Scene *scene); @@ -191,6 +200,11 @@ public: /* Calculates the width of a pixel at point in world space. */ float world_to_raster_size(float3 P); + /* Motion blur. */ + float motion_time(int step) const; + int motion_step(float time) const; + bool use_motion() const; + private: /* Private utility functions. */ float3 transform_raster_to_world(float raster_x, float raster_y); diff --git a/intern/cycles/render/constant_fold.cpp b/intern/cycles/render/constant_fold.cpp index b7f25663bc3..943b218f0e4 100644 --- a/intern/cycles/render/constant_fold.cpp +++ b/intern/cycles/render/constant_fold.cpp @@ -14,11 +14,11 @@ * limitations under the License. */ -#include "constant_fold.h" -#include "graph.h" +#include "render/constant_fold.h" +#include "render/graph.h" -#include "util_foreach.h" -#include "util_logging.h" +#include "util/util_foreach.h" +#include "util/util_logging.h" CCL_NAMESPACE_BEGIN @@ -160,6 +160,14 @@ bool ConstantFolder::try_bypass_or_make_constant(ShaderInput *input, bool clamp) bypass(input->link); return true; } + else { + /* disconnect other inputs if we can't fully bypass due to clamp */ + foreach(ShaderInput *other, node->inputs) { + if(other != input && other->link) { + graph->disconnect(other); + } + } + } return false; } diff --git a/intern/cycles/render/constant_fold.h b/intern/cycles/render/constant_fold.h index 7962698319f..33f93b8c0ab 100644 --- a/intern/cycles/render/constant_fold.h +++ b/intern/cycles/render/constant_fold.h @@ -17,8 +17,8 @@ #ifndef __CONSTANT_FOLD_H__ #define __CONSTANT_FOLD_H__ -#include "util_types.h" -#include "svm_types.h" +#include "util/util_types.h" +#include "kernel/svm/svm_types.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/curves.cpp b/intern/cycles/render/curves.cpp index f671eb19cae..4c085b928fb 100644 --- a/intern/cycles/render/curves.cpp +++ b/intern/cycles/render/curves.cpp @@ -14,16 +14,16 @@ * limitations under the License. */ -#include "device.h" -#include "curves.h" -#include "mesh.h" -#include "object.h" -#include "scene.h" - -#include "util_foreach.h" -#include "util_map.h" -#include "util_progress.h" -#include "util_vector.h" +#include "device/device.h" +#include "render/curves.h" +#include "render/mesh.h" +#include "render/object.h" +#include "render/scene.h" + +#include "util/util_foreach.h" +#include "util/util_map.h" +#include "util/util_progress.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/curves.h b/intern/cycles/render/curves.h index e41967eebf5..8834764bd63 100644 --- a/intern/cycles/render/curves.h +++ b/intern/cycles/render/curves.h @@ -17,8 +17,8 @@ #ifndef __CURVES_H__ #define __CURVES_H__ -#include "util_types.h" -#include "util_vector.h" +#include "util/util_types.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp index 923252bb375..69828cc78da 100644 --- a/intern/cycles/render/film.cpp +++ b/intern/cycles/render/film.cpp @@ -14,19 +14,18 @@ * limitations under the License. */ -#include "camera.h" -#include "device.h" -#include "film.h" -#include "integrator.h" -#include "mesh.h" -#include "scene.h" -#include "tables.h" - -#include "util_algorithm.h" -#include "util_debug.h" -#include "util_foreach.h" -#include "util_math.h" -#include "util_math_cdf.h" +#include "render/camera.h" +#include "device/device.h" +#include "render/film.h" +#include "render/integrator.h" +#include "render/mesh.h" +#include "render/scene.h" +#include "render/tables.h" + +#include "util/util_algorithm.h" +#include "util/util_foreach.h" +#include "util/util_math.h" +#include "util/util_math_cdf.h" CCL_NAMESPACE_BEGIN @@ -85,83 +84,81 @@ void Pass::add(PassType type, array<Pass>& passes) pass.components = 1; pass.filter = false; break; - case PASS_DIFFUSE_COLOR: - case PASS_GLOSSY_COLOR: - case PASS_TRANSMISSION_COLOR: - case PASS_SUBSURFACE_COLOR: - pass.components = 4; - break; - case PASS_DIFFUSE_INDIRECT: + + case PASS_EMISSION: + case PASS_BACKGROUND: pass.components = 4; pass.exposure = true; - pass.divide_type = PASS_DIFFUSE_COLOR; break; - case PASS_GLOSSY_INDIRECT: + case PASS_AO: pass.components = 4; - pass.exposure = true; - pass.divide_type = PASS_GLOSSY_COLOR; break; - case PASS_TRANSMISSION_INDIRECT: + case PASS_SHADOW: pass.components = 4; - pass.exposure = true; - pass.divide_type = PASS_TRANSMISSION_COLOR; + pass.exposure = false; break; - case PASS_SUBSURFACE_INDIRECT: + case PASS_LIGHT: + /* This isn't a real pass, used by baking to see whether + * light data is needed or not. + * + * Set components to 0 so pass sort below happens in a + * determined way. + */ + pass.components = 0; + break; +#ifdef WITH_CYCLES_DEBUG + case PASS_BVH_TRAVERSED_NODES: + case PASS_BVH_TRAVERSED_INSTANCES: + case PASS_BVH_INTERSECTIONS: + case PASS_RAY_BOUNCES: + pass.components = 1; + pass.exposure = false; + break; +#endif + case PASS_RENDER_TIME: + /* This pass is handled entirely on the host side. */ + pass.components = 0; + break; + + case PASS_DIFFUSE_COLOR: + case PASS_GLOSSY_COLOR: + case PASS_TRANSMISSION_COLOR: + case PASS_SUBSURFACE_COLOR: pass.components = 4; - pass.exposure = true; - pass.divide_type = PASS_SUBSURFACE_COLOR; break; case PASS_DIFFUSE_DIRECT: + case PASS_DIFFUSE_INDIRECT: pass.components = 4; pass.exposure = true; pass.divide_type = PASS_DIFFUSE_COLOR; break; case PASS_GLOSSY_DIRECT: + case PASS_GLOSSY_INDIRECT: pass.components = 4; pass.exposure = true; pass.divide_type = PASS_GLOSSY_COLOR; break; case PASS_TRANSMISSION_DIRECT: + case PASS_TRANSMISSION_INDIRECT: pass.components = 4; pass.exposure = true; pass.divide_type = PASS_TRANSMISSION_COLOR; break; case PASS_SUBSURFACE_DIRECT: + case PASS_SUBSURFACE_INDIRECT: pass.components = 4; pass.exposure = true; pass.divide_type = PASS_SUBSURFACE_COLOR; break; - - case PASS_EMISSION: - case PASS_BACKGROUND: + case PASS_VOLUME_DIRECT: + case PASS_VOLUME_INDIRECT: pass.components = 4; pass.exposure = true; break; - case PASS_AO: - pass.components = 4; - break; - case PASS_SHADOW: - pass.components = 4; - pass.exposure = false; - break; - case PASS_LIGHT: - /* This isn't a real pass, used by baking to see whether - * light data is needed or not. - * - * Set components to 0 so pass sort below happens in a - * determined way. - */ - pass.components = 0; - break; -#ifdef WITH_CYCLES_DEBUG - case PASS_BVH_TRAVERSED_NODES: - case PASS_BVH_TRAVERSED_INSTANCES: - case PASS_BVH_INTERSECTIONS: - case PASS_RAY_BOUNCES: - pass.components = 1; - pass.exposure = false; + + default: + assert(false); break; -#endif } passes.push_back_slow(pass); @@ -279,6 +276,10 @@ NODE_DEFINE(Film) SOCKET_BOOLEAN(use_sample_clamp, "Use Sample Clamp", false); + SOCKET_BOOLEAN(denoising_data_pass, "Generate Denoising Data Pass", false); + SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false); + SOCKET_INT(denoising_flags, "Denoising Flags", 0); + return type; } @@ -309,12 +310,25 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) /* update __data */ kfilm->exposure = exposure; kfilm->pass_flag = 0; + kfilm->light_pass_flag = 0; kfilm->pass_stride = 0; kfilm->use_light_pass = use_light_visibility || use_sample_clamp; for(size_t i = 0; i < passes.size(); i++) { Pass& pass = passes[i]; - kfilm->pass_flag |= pass.type; + + if(pass.type == PASS_NONE) + continue; + + int pass_flag = (1 << (pass.type % 32)); + if(pass.type <= PASS_CATEGORY_MAIN_END) { + kfilm->pass_flag |= pass_flag; + } + else { + assert(pass.type <= PASS_CATEGORY_LIGHT_END); + kfilm->use_light_pass = 1; + kfilm->light_pass_flag |= pass_flag; + } switch(pass.type) { case PASS_COMBINED: @@ -323,10 +337,6 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) case PASS_DEPTH: kfilm->pass_depth = kfilm->pass_stride; break; - case PASS_MIST: - kfilm->pass_mist = kfilm->pass_stride; - kfilm->use_light_pass = 1; - break; case PASS_NORMAL: kfilm->pass_normal = kfilm->pass_stride; break; @@ -345,74 +355,67 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) case PASS_MATERIAL_ID: kfilm->pass_material_id = kfilm->pass_stride; break; + + case PASS_MIST: + kfilm->pass_mist = kfilm->pass_stride; + break; + case PASS_EMISSION: + kfilm->pass_emission = kfilm->pass_stride; + break; + case PASS_BACKGROUND: + kfilm->pass_background = kfilm->pass_stride; + break; + case PASS_AO: + kfilm->pass_ao = kfilm->pass_stride; + break; + case PASS_SHADOW: + kfilm->pass_shadow = kfilm->pass_stride; + break; + + case PASS_LIGHT: + break; + case PASS_DIFFUSE_COLOR: kfilm->pass_diffuse_color = kfilm->pass_stride; - kfilm->use_light_pass = 1; break; case PASS_GLOSSY_COLOR: kfilm->pass_glossy_color = kfilm->pass_stride; - kfilm->use_light_pass = 1; break; case PASS_TRANSMISSION_COLOR: kfilm->pass_transmission_color = kfilm->pass_stride; - kfilm->use_light_pass = 1; break; case PASS_SUBSURFACE_COLOR: kfilm->pass_subsurface_color = kfilm->pass_stride; - kfilm->use_light_pass = 1; break; case PASS_DIFFUSE_INDIRECT: kfilm->pass_diffuse_indirect = kfilm->pass_stride; - kfilm->use_light_pass = 1; break; case PASS_GLOSSY_INDIRECT: kfilm->pass_glossy_indirect = kfilm->pass_stride; - kfilm->use_light_pass = 1; break; case PASS_TRANSMISSION_INDIRECT: kfilm->pass_transmission_indirect = kfilm->pass_stride; - kfilm->use_light_pass = 1; break; case PASS_SUBSURFACE_INDIRECT: kfilm->pass_subsurface_indirect = kfilm->pass_stride; - kfilm->use_light_pass = 1; + break; + case PASS_VOLUME_INDIRECT: + kfilm->pass_volume_indirect = kfilm->pass_stride; break; case PASS_DIFFUSE_DIRECT: kfilm->pass_diffuse_direct = kfilm->pass_stride; - kfilm->use_light_pass = 1; break; case PASS_GLOSSY_DIRECT: kfilm->pass_glossy_direct = kfilm->pass_stride; - kfilm->use_light_pass = 1; break; case PASS_TRANSMISSION_DIRECT: kfilm->pass_transmission_direct = kfilm->pass_stride; - kfilm->use_light_pass = 1; break; case PASS_SUBSURFACE_DIRECT: kfilm->pass_subsurface_direct = kfilm->pass_stride; - kfilm->use_light_pass = 1; - break; - - case PASS_EMISSION: - kfilm->pass_emission = kfilm->pass_stride; - kfilm->use_light_pass = 1; break; - case PASS_BACKGROUND: - kfilm->pass_background = kfilm->pass_stride; - kfilm->use_light_pass = 1; - break; - case PASS_AO: - kfilm->pass_ao = kfilm->pass_stride; - kfilm->use_light_pass = 1; - break; - case PASS_SHADOW: - kfilm->pass_shadow = kfilm->pass_stride; - kfilm->use_light_pass = 1; - break; - - case PASS_LIGHT: - kfilm->use_light_pass = 1; + case PASS_VOLUME_DIRECT: + kfilm->pass_volume_direct = kfilm->pass_stride; break; #ifdef WITH_CYCLES_DEBUG @@ -429,14 +432,31 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) kfilm->pass_ray_bounces = kfilm->pass_stride; break; #endif + case PASS_RENDER_TIME: + break; - case PASS_NONE: + default: + assert(false); break; } kfilm->pass_stride += pass.components; } + kfilm->pass_denoising_data = 0; + kfilm->pass_denoising_clean = 0; + kfilm->denoising_flags = 0; + if(denoising_data_pass) { + kfilm->pass_denoising_data = kfilm->pass_stride; + kfilm->pass_stride += DENOISING_PASS_SIZE_BASE; + kfilm->denoising_flags = denoising_flags; + if(denoising_clean_pass) { + kfilm->pass_denoising_clean = kfilm->pass_stride; + kfilm->pass_stride += DENOISING_PASS_SIZE_CLEAN; + kfilm->use_light_pass = 1; + } + } + kfilm->pass_stride = align_up(kfilm->pass_stride, 4); kfilm->pass_alpha_threshold = pass_alpha_threshold; @@ -451,6 +471,10 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene) kfilm->mist_inv_depth = (mist_depth > 0.0f)? 1.0f/mist_depth: 0.0f; kfilm->mist_falloff = mist_falloff; + pass_stride = kfilm->pass_stride; + denoising_data_offset = kfilm->pass_denoising_data; + denoising_clean_offset = kfilm->pass_denoising_clean; + need_update = false; } @@ -472,7 +496,7 @@ void Film::tag_passes_update(Scene *scene, const array<Pass>& passes_) scene->mesh_manager->tag_update(scene); foreach(Shader *shader, scene->shaders) - shader->need_update_attributes = true; + shader->need_update_mesh = true; } else if(Pass::contains(passes, PASS_MOTION) != Pass::contains(passes_, PASS_MOTION)) scene->mesh_manager->tag_update(scene); diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h index 9fa51c51f52..29b1e7e9157 100644 --- a/intern/cycles/render/film.h +++ b/intern/cycles/render/film.h @@ -17,12 +17,12 @@ #ifndef __FILM_H__ #define __FILM_H__ -#include "util_string.h" -#include "util_vector.h" +#include "util/util_string.h" +#include "util/util_vector.h" -#include "kernel_types.h" +#include "kernel/kernel_types.h" -#include "node.h" +#include "graph/node.h" CCL_NAMESPACE_BEGIN @@ -53,12 +53,19 @@ public: class Film : public Node { public: - NODE_DECLARE; + NODE_DECLARE float exposure; array<Pass> passes; + bool denoising_data_pass; + bool denoising_clean_pass; + int denoising_flags; float pass_alpha_threshold; + int pass_stride; + int denoising_data_offset; + int denoising_clean_offset; + FilterType filter_type; float filter_width; size_t filter_table_offset; diff --git a/intern/cycles/render/graph.cpp b/intern/cycles/render/graph.cpp index f6c83fb5c7e..096de878e51 100644 --- a/intern/cycles/render/graph.cpp +++ b/intern/cycles/render/graph.cpp @@ -14,17 +14,18 @@ * limitations under the License. */ -#include "attribute.h" -#include "graph.h" -#include "nodes.h" -#include "shader.h" -#include "constant_fold.h" - -#include "util_algorithm.h" -#include "util_debug.h" -#include "util_foreach.h" -#include "util_queue.h" -#include "util_logging.h" +#include "render/attribute.h" +#include "render/graph.h" +#include "render/nodes.h" +#include "render/scene.h" +#include "render/shader.h" +#include "render/constant_fold.h" + +#include "util/util_algorithm.h" +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_md5.h" +#include "util/util_queue.h" CCL_NAMESPACE_BEGIN @@ -195,6 +196,7 @@ bool ShaderNode::equals(const ShaderNode& other) ShaderGraph::ShaderGraph() { finalized = false; + simplified = false; num_node_ids = 0; add(new OutputNode()); } @@ -207,6 +209,8 @@ ShaderGraph::~ShaderGraph() ShaderNode *ShaderGraph::add(ShaderNode *node) { assert(!finalized); + simplified = false; + node->id = num_node_ids++; nodes.push_back(node); return node; @@ -217,26 +221,6 @@ OutputNode *ShaderGraph::output() return (OutputNode*)nodes.front(); } -ShaderGraph *ShaderGraph::copy() -{ - ShaderGraph *newgraph = new ShaderGraph(); - - /* copy nodes */ - ShaderNodeSet nodes_all; - foreach(ShaderNode *node, nodes) - nodes_all.insert(node); - - ShaderNodeMap nodes_copy; - copy_nodes(nodes_all, nodes_copy); - - /* add nodes (in same order, so output is still first) */ - newgraph->clear_nodes(); - foreach(ShaderNode *node, nodes) - newgraph->add(nodes_copy[node]); - - return newgraph; -} - void ShaderGraph::connect(ShaderOutput *from, ShaderInput *to) { assert(!finalized); @@ -273,6 +257,7 @@ void ShaderGraph::connect(ShaderOutput *from, ShaderInput *to) void ShaderGraph::disconnect(ShaderOutput *from) { assert(!finalized); + simplified = false; foreach(ShaderInput *sock, from->links) { sock->link = NULL; @@ -285,6 +270,7 @@ void ShaderGraph::disconnect(ShaderInput *to) { assert(!finalized); assert(to->link); + simplified = false; ShaderOutput *from = to->link; @@ -294,6 +280,8 @@ void ShaderGraph::disconnect(ShaderInput *to) void ShaderGraph::relink(ShaderNode *node, ShaderOutput *from, ShaderOutput *to) { + simplified = false; + /* Copy because disconnect modifies this list */ vector<ShaderInput*> outputs = from->links; @@ -310,9 +298,19 @@ void ShaderGraph::relink(ShaderNode *node, ShaderOutput *from, ShaderOutput *to) } } +void ShaderGraph::simplify(Scene *scene) +{ + if(!simplified) { + default_inputs(scene->shader_manager->use_osl()); + clean(scene); + refine_bump_nodes(); + + simplified = true; + } +} + void ShaderGraph::finalize(Scene *scene, bool do_bump, - bool do_osl, bool do_simplify, bool bump_in_object_space) { @@ -322,9 +320,7 @@ void ShaderGraph::finalize(Scene *scene, * modified afterwards. */ if(!finalized) { - default_inputs(do_osl); - clean(scene); - refine_bump_nodes(); + simplify(scene); if(do_bump) bump_from_displacement(bump_in_object_space); @@ -405,7 +401,8 @@ void ShaderGraph::copy_nodes(ShaderNodeSet& nodes, ShaderNodeMap& nnodemap) /* Graph simplification */ /* ******************** */ -/* Step 1: Remove proxy nodes. +/* Remove proxy nodes. + * * These only exists temporarily when exporting groups, and we must remove them * early so that node->attributes() and default links do not see them. */ @@ -475,7 +472,8 @@ void ShaderGraph::remove_proxy_nodes() } } -/* Step 2: Constant folding. +/* Constant folding. + * * Try to constant fold some nodes, and pipe result directly to * the input socket of connected nodes. */ @@ -529,14 +527,14 @@ void ShaderGraph::constant_fold() * that happens to ensure there is still a valid graph for displacement. */ if(has_displacement && !output()->input("Displacement")->link) { - ValueNode *value = (ValueNode*)add(new ValueNode()); + ColorNode *value = (ColorNode*)add(new ColorNode()); value->value = output()->displacement; - connect(value->output("Value"), output()->input("Displacement")); + connect(value->output("Color"), output()->input("Displacement")); } } -/* Step 3: Simplification. */ +/* Simplification. */ void ShaderGraph::simplify_settings(Scene *scene) { foreach(ShaderNode *node, nodes) { @@ -544,7 +542,7 @@ void ShaderGraph::simplify_settings(Scene *scene) } } -/* Step 4: Deduplicate nodes with same settings. */ +/* Deduplicate nodes with same settings. */ void ShaderGraph::deduplicate_nodes() { /* NOTES: @@ -620,6 +618,48 @@ void ShaderGraph::deduplicate_nodes() } } +/* Check whether volume output has meaningful nodes, otherwise + * disconnect the output. + */ +void ShaderGraph::verify_volume_output() +{ + /* Check whether we can optimize the whole volume graph out. */ + ShaderInput *volume_in = output()->input("Volume"); + if(volume_in->link == NULL) { + return; + } + bool has_valid_volume = false; + ShaderNodeSet scheduled; + queue<ShaderNode*> traverse_queue; + /* Schedule volume output. */ + traverse_queue.push(volume_in->link->parent); + scheduled.insert(volume_in->link->parent); + /* Traverse down the tree. */ + while(!traverse_queue.empty()) { + ShaderNode *node = traverse_queue.front(); + traverse_queue.pop(); + /* Node is fully valid for volume, can't optimize anything out. */ + if(node->has_volume_support()) { + has_valid_volume = true; + break; + } + foreach(ShaderInput *input, node->inputs) { + if(input->link == NULL) { + continue; + } + if(scheduled.find(input->link->parent) != scheduled.end()) { + continue; + } + traverse_queue.push(input->link->parent); + scheduled.insert(input->link->parent); + } + } + if(!has_valid_volume) { + VLOG(1) << "Disconnect meaningless volume output."; + disconnect(volume_in->link); + } +} + void ShaderGraph::break_cycles(ShaderNode *node, vector<bool>& visited, vector<bool>& on_stack) { visited[node->id] = true; @@ -644,20 +684,41 @@ void ShaderGraph::break_cycles(ShaderNode *node, vector<bool>& visited, vector<b on_stack[node->id] = false; } +void ShaderGraph::compute_displacement_hash() +{ + /* Compute hash of all nodes linked to displacement, to detect if we need + * to recompute displacement when shader nodes change. */ + ShaderInput *displacement_in = output()->input("Displacement"); + + if(!displacement_in->link) { + displacement_hash = ""; + return; + } + + ShaderNodeSet nodes_displace; + find_dependencies(nodes_displace, displacement_in); + + MD5Hash md5; + foreach(ShaderNode *node, nodes_displace) { + node->hash(md5); + foreach(ShaderInput *input, node->inputs) { + int link_id = (input->link) ? input->link->parent->id : 0; + md5.append((uint8_t*)&link_id, sizeof(link_id)); + } + } + + displacement_hash = md5.get_hex(); +} + void ShaderGraph::clean(Scene *scene) { /* Graph simplification */ - /* 1: Remove proxy nodes was already done. */ - - /* 2: Constant folding. */ + /* NOTE: Remove proxy nodes was already done. */ constant_fold(); - - /* 3: Simplification. */ simplify_settings(scene); - - /* 4: De-duplication. */ deduplicate_nodes(); + verify_volume_output(); /* we do two things here: find cycles and break them, and remove unused * nodes that don't feed into the output. how cycles are broken is @@ -827,7 +888,7 @@ void ShaderGraph::bump_from_displacement(bool use_object_space) if(!displacement_in->link) return; - + /* find dependencies for the given input */ ShaderNodeSet nodes_displace; find_dependencies(nodes_displace, displacement_in); @@ -859,15 +920,34 @@ void ShaderGraph::bump_from_displacement(bool use_object_space) /* add bump node and connect copied graphs to it */ BumpNode *bump = (BumpNode*)add(new BumpNode()); bump->use_object_space = use_object_space; + bump->distance = 1.0f; ShaderOutput *out = displacement_in->link; ShaderOutput *out_center = nodes_center[out->parent]->output(out->name()); ShaderOutput *out_dx = nodes_dx[out->parent]->output(out->name()); ShaderOutput *out_dy = nodes_dy[out->parent]->output(out->name()); - connect(out_center, bump->input("SampleCenter")); - connect(out_dx, bump->input("SampleX")); - connect(out_dy, bump->input("SampleY")); + /* convert displacement vector to height */ + VectorMathNode *dot_center = (VectorMathNode*)add(new VectorMathNode()); + VectorMathNode *dot_dx = (VectorMathNode*)add(new VectorMathNode()); + VectorMathNode *dot_dy = (VectorMathNode*)add(new VectorMathNode()); + + dot_center->type = NODE_VECTOR_MATH_DOT_PRODUCT; + dot_dx->type = NODE_VECTOR_MATH_DOT_PRODUCT; + dot_dy->type = NODE_VECTOR_MATH_DOT_PRODUCT; + + GeometryNode *geom = (GeometryNode*)add(new GeometryNode()); + connect(geom->output("Normal"), dot_center->input("Vector2")); + connect(geom->output("Normal"), dot_dx->input("Vector2")); + connect(geom->output("Normal"), dot_dy->input("Vector2")); + + connect(out_center, dot_center->input("Vector1")); + connect(out_dx, dot_dx->input("Vector1")); + connect(out_dy, dot_dy->input("Vector1")); + + connect(dot_center->output("Value"), bump->input("SampleCenter")); + connect(dot_dx->output("Value"), bump->input("SampleX")); + connect(dot_dy->output("Value"), bump->input("SampleY")); /* connect the bump out to the set normal in: */ connect(bump->output("Normal"), set_normal->input("Direction")); @@ -980,6 +1060,12 @@ int ShaderGraph::get_num_closures() else if(CLOSURE_IS_BSDF_MULTISCATTER(closure_type)) { num_closures += 2; } + else if(CLOSURE_IS_PRINCIPLED(closure_type)) { + num_closures += 8; + } + else if(CLOSURE_IS_VOLUME(closure_type)) { + num_closures += VOLUME_STACK_SIZE; + } else { ++num_closures; } diff --git a/intern/cycles/render/graph.h b/intern/cycles/render/graph.h index 780fdf49ca4..2c134932b3c 100644 --- a/intern/cycles/render/graph.h +++ b/intern/cycles/render/graph.h @@ -17,17 +17,17 @@ #ifndef __GRAPH_H__ #define __GRAPH_H__ -#include "node.h" -#include "node_type.h" +#include "graph/node.h" +#include "graph/node_type.h" -#include "kernel_types.h" +#include "kernel/kernel_types.h" -#include "util_list.h" -#include "util_map.h" -#include "util_param.h" -#include "util_set.h" -#include "util_types.h" -#include "util_vector.h" +#include "util/util_list.h" +#include "util/util_map.h" +#include "util/util_param.h" +#include "util/util_set.h" +#include "util/util_types.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -42,6 +42,7 @@ class SVMCompiler; class OSLCompiler; class OutputNode; class ConstantFolder; +class MD5Hash; /* Bump * @@ -151,11 +152,14 @@ public: virtual bool has_surface_emission() { return false; } virtual bool has_surface_transparent() { return false; } virtual bool has_surface_bssrdf() { return false; } + virtual bool has_bump() { return false; } virtual bool has_bssrdf_bump() { return false; } virtual bool has_spatial_varying() { return false; } virtual bool has_object_dependency() { return false; } + virtual bool has_attribute_dependency() { return false; } virtual bool has_integrator_dependency() { return false; } - + virtual bool has_volume_support() { return false; } + virtual bool has_raytrace() { return false; } vector<ShaderInput*> inputs; vector<ShaderOutput*> outputs; @@ -201,14 +205,14 @@ public: /* Node definition utility macros */ #define SHADER_NODE_CLASS(type) \ - NODE_DECLARE; \ + NODE_DECLARE \ type(); \ virtual ShaderNode *clone() const { return new type(*this); } \ virtual void compile(SVMCompiler& compiler); \ virtual void compile(OSLCompiler& compiler); \ #define SHADER_NODE_NO_CLONE_CLASS(type) \ - NODE_DECLARE; \ + NODE_DECLARE \ type(); \ virtual void compile(SVMCompiler& compiler); \ virtual void compile(OSLCompiler& compiler); \ @@ -240,12 +244,12 @@ public: list<ShaderNode*> nodes; size_t num_node_ids; bool finalized; + bool simplified; + string displacement_hash; ShaderGraph(); ~ShaderGraph(); - ShaderGraph *copy(); - ShaderNode *add(ShaderNode *node); OutputNode *output(); @@ -255,9 +259,10 @@ public: void relink(ShaderNode *node, ShaderOutput *from, ShaderOutput *to); void remove_proxy_nodes(); + void compute_displacement_hash(); + void simplify(Scene *scene); void finalize(Scene *scene, bool do_bump = false, - bool do_osl = false, bool do_simplify = false, bool bump_in_object_space = false); @@ -283,6 +288,7 @@ protected: void constant_fold(); void simplify_settings(Scene *scene); void deduplicate_nodes(); + void verify_volume_output(); }; CCL_NAMESPACE_END diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp index fd8a1262208..9c5e32e8219 100644 --- a/intern/cycles/render/image.cpp +++ b/intern/cycles/render/image.cpp @@ -14,15 +14,15 @@ * limitations under the License. */ -#include "device.h" -#include "image.h" -#include "scene.h" +#include "device/device.h" +#include "render/image.h" +#include "render/scene.h" -#include "util_foreach.h" -#include "util_logging.h" -#include "util_path.h" -#include "util_progress.h" -#include "util_texture.h" +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_path.h" +#include "util/util_progress.h" +#include "util/util_texture.h" #ifdef WITH_OSL #include <OSL/oslexec.h> @@ -30,73 +30,29 @@ CCL_NAMESPACE_BEGIN +/* Some helpers to silence warning in templated function. */ +static bool isfinite(uchar /*value*/) +{ + return false; +} +static bool isfinite(half /*value*/) +{ + return false; +} + ImageManager::ImageManager(const DeviceInfo& info) { need_update = true; - pack_images = false; osl_texture_system = NULL; animation_frame = 0; - /* In case of multiple devices used we need to know type of an actual - * compute device. - * - * NOTE: We assume that all the devices are same type, otherwise we'll - * be screwed on so many levels.. - */ - DeviceType device_type = info.type; - if(device_type == DEVICE_MULTI) { - device_type = info.multi_devices[0].type; - } - /* Set image limits */ -#define SET_TEX_IMAGES_LIMITS(ARCH) \ - { \ - tex_num_images[IMAGE_DATA_TYPE_FLOAT4] = TEX_NUM_FLOAT4_ ## ARCH; \ - tex_num_images[IMAGE_DATA_TYPE_BYTE4] = TEX_NUM_BYTE4_ ## ARCH; \ - tex_num_images[IMAGE_DATA_TYPE_HALF4] = TEX_NUM_HALF4_ ## ARCH; \ - tex_num_images[IMAGE_DATA_TYPE_FLOAT] = TEX_NUM_FLOAT_ ## ARCH; \ - tex_num_images[IMAGE_DATA_TYPE_BYTE] = TEX_NUM_BYTE_ ## ARCH; \ - tex_num_images[IMAGE_DATA_TYPE_HALF] = TEX_NUM_HALF_ ## ARCH; \ - tex_start_images[IMAGE_DATA_TYPE_FLOAT4] = TEX_START_FLOAT4_ ## ARCH; \ - tex_start_images[IMAGE_DATA_TYPE_BYTE4] = TEX_START_BYTE4_ ## ARCH; \ - tex_start_images[IMAGE_DATA_TYPE_HALF4] = TEX_START_HALF4_ ## ARCH; \ - tex_start_images[IMAGE_DATA_TYPE_FLOAT] = TEX_START_FLOAT_ ## ARCH; \ - tex_start_images[IMAGE_DATA_TYPE_BYTE] = TEX_START_BYTE_ ## ARCH; \ - tex_start_images[IMAGE_DATA_TYPE_HALF] = TEX_START_HALF_ ## ARCH; \ - } + max_num_images = TEX_NUM_MAX; + has_half_images = info.has_half_images; - if(device_type == DEVICE_CPU) { - SET_TEX_IMAGES_LIMITS(CPU); - } - else if(device_type == DEVICE_CUDA) { - if(info.has_bindless_textures) { - SET_TEX_IMAGES_LIMITS(CUDA_KEPLER); - } - else { - SET_TEX_IMAGES_LIMITS(CUDA); - } - } - else if(device_type == DEVICE_OPENCL) { - SET_TEX_IMAGES_LIMITS(OPENCL); - } - else { - /* Should not happen. */ - tex_num_images[IMAGE_DATA_TYPE_FLOAT4] = 0; - tex_num_images[IMAGE_DATA_TYPE_BYTE4] = 0; - tex_num_images[IMAGE_DATA_TYPE_HALF4] = 0; - tex_num_images[IMAGE_DATA_TYPE_FLOAT] = 0; - tex_num_images[IMAGE_DATA_TYPE_BYTE] = 0; - tex_num_images[IMAGE_DATA_TYPE_HALF] = 0; - tex_start_images[IMAGE_DATA_TYPE_FLOAT4] = 0; - tex_start_images[IMAGE_DATA_TYPE_BYTE4] = 0; - tex_start_images[IMAGE_DATA_TYPE_HALF4] = 0; - tex_start_images[IMAGE_DATA_TYPE_FLOAT] = 0; - tex_start_images[IMAGE_DATA_TYPE_BYTE] = 0; - tex_start_images[IMAGE_DATA_TYPE_HALF] = 0; - assert(0); + for(size_t type = 0; type < IMAGE_DATA_NUM_TYPES; type++) { + tex_num_images[type] = 0; } - -#undef SET_TEX_IMAGES_LIMITS } ImageManager::~ImageManager() @@ -107,11 +63,6 @@ ImageManager::~ImageManager() } } -void ImageManager::set_pack_images(bool pack_images_) -{ - pack_images = pack_images_; -} - void ImageManager::set_osl_texture_system(void *texture_system) { osl_texture_system = texture_system; @@ -133,109 +84,143 @@ bool ImageManager::set_animation_frame_update(int frame) return false; } -ImageManager::ImageDataType ImageManager::get_image_metadata(const string& filename, - void *builtin_data, - bool& is_linear) +device_memory *ImageManager::image_memory(int flat_slot) +{ + ImageDataType type; + int slot = flattened_slot_to_type_index(flat_slot, &type); + + Image *img = images[type][slot]; + + return img->mem; +} + +bool ImageManager::get_image_metadata(const string& filename, + void *builtin_data, + ImageMetaData& metadata) { - bool is_float = false, is_half = false; - is_linear = false; - int channels = 4; + memset(&metadata, 0, sizeof(metadata)); if(builtin_data) { if(builtin_image_info_cb) { - int width, height, depth; - builtin_image_info_cb(filename, builtin_data, is_float, width, height, depth, channels); + builtin_image_info_cb(filename, builtin_data, metadata); + } + else { + return false; } - if(is_float) { - is_linear = true; - return (channels > 1) ? IMAGE_DATA_TYPE_FLOAT4 : IMAGE_DATA_TYPE_FLOAT; + if(metadata.is_float) { + metadata.is_linear = true; + metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_FLOAT4 : IMAGE_DATA_TYPE_FLOAT; } else { - return (channels > 1) ? IMAGE_DATA_TYPE_BYTE4 : IMAGE_DATA_TYPE_BYTE; + metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_BYTE4 : IMAGE_DATA_TYPE_BYTE; } + + return true; + } + + /* Perform preliminary checks, with meaningful logging. */ + if(!path_exists(filename)) { + VLOG(1) << "File '" << filename << "' does not exist."; + return false; + } + if(path_is_directory(filename)) { + VLOG(1) << "File '" << filename << "' is a directory, can't use as image."; + return false; } ImageInput *in = ImageInput::create(filename); - if(in) { - ImageSpec spec; - - if(in->open(filename, spec)) { - /* check the main format, and channel formats; - * if any take up more than one byte, we'll need a float texture slot */ - if(spec.format.basesize() > 1) { - is_float = true; - is_linear = true; - } + if(!in) { + return false; + } - for(size_t channel = 0; channel < spec.channelformats.size(); channel++) { - if(spec.channelformats[channel].basesize() > 1) { - is_float = true; - is_linear = true; - } - } + ImageSpec spec; + if(!in->open(filename, spec)) { + delete in; + return false; + } - /* check if it's half float */ - if(spec.format == TypeDesc::HALF) - is_half = true; + metadata.width = spec.width; + metadata.height = spec.height; + metadata.depth = spec.depth; - channels = spec.nchannels; + /* check the main format, and channel formats; + * if any take up more than one byte, we'll need a float texture slot */ + if(spec.format.basesize() > 1) { + metadata.is_float = true; + metadata.is_linear = true; + } - /* basic color space detection, not great but better than nothing - * before we do OpenColorIO integration */ - if(is_float) { - string colorspace = spec.get_string_attribute("oiio:ColorSpace"); + for(size_t channel = 0; channel < spec.channelformats.size(); channel++) { + if(spec.channelformats[channel].basesize() > 1) { + metadata.is_float = true; + metadata.is_linear = true; + } + } - is_linear = !(colorspace == "sRGB" || - colorspace == "GammaCorrected" || - (colorspace == "" && - (strcmp(in->format_name(), "png") == 0 || - strcmp(in->format_name(), "tiff") == 0 || - strcmp(in->format_name(), "dpx") == 0 || - strcmp(in->format_name(), "jpeg2000") == 0))); - } - else { - is_linear = false; - } + /* check if it's half float */ + if(spec.format == TypeDesc::HALF) + metadata.is_half = true; - in->close(); - } + /* basic color space detection, not great but better than nothing + * before we do OpenColorIO integration */ + if(metadata.is_float) { + string colorspace = spec.get_string_attribute("oiio:ColorSpace"); - delete in; + metadata.is_linear = !(colorspace == "sRGB" || + colorspace == "GammaCorrected" || + (colorspace == "" && + (strcmp(in->format_name(), "png") == 0 || + strcmp(in->format_name(), "tiff") == 0 || + strcmp(in->format_name(), "dpx") == 0 || + strcmp(in->format_name(), "jpeg2000") == 0))); + } + else { + metadata.is_linear = false; } - if(is_half) { - return (channels > 1) ? IMAGE_DATA_TYPE_HALF4 : IMAGE_DATA_TYPE_HALF; + /* set type and channels */ + metadata.channels = spec.nchannels; + + if(metadata.is_half) { + metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_HALF4 : IMAGE_DATA_TYPE_HALF; } - else if(is_float) { - return (channels > 1) ? IMAGE_DATA_TYPE_FLOAT4 : IMAGE_DATA_TYPE_FLOAT; + else if(metadata.is_float) { + metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_FLOAT4 : IMAGE_DATA_TYPE_FLOAT; } else { - return (channels > 1) ? IMAGE_DATA_TYPE_BYTE4 : IMAGE_DATA_TYPE_BYTE; + metadata.type = (metadata.channels > 1) ? IMAGE_DATA_TYPE_BYTE4 : IMAGE_DATA_TYPE_BYTE; + } + + in->close(); + delete in; + + return true; +} + +int ImageManager::max_flattened_slot(ImageDataType type) +{ + if(tex_num_images[type] == 0) { + /* No textures for the type, no slots needs allocation. */ + return 0; } + return type_index_to_flattened_slot(tex_num_images[type], type); } -/* We use a consecutive slot counting scheme on the devices, in order - * float4, byte4, half4, float, byte, half. +/* The lower three bits of a device texture slot number indicate its type. * These functions convert the slot ids from ImageManager "images" ones - * to device ones and vice versa. */ + * to device ones and vice verse. + */ int ImageManager::type_index_to_flattened_slot(int slot, ImageDataType type) { - return slot + tex_start_images[type]; + return (slot << IMAGE_DATA_TYPE_SHIFT) | (type); } int ImageManager::flattened_slot_to_type_index(int flat_slot, ImageDataType *type) { - for(int i = IMAGE_DATA_NUM_TYPES - 1; i >= 0; i--) { - if(flat_slot >= tex_start_images[i]) { - *type = (ImageDataType)i; - return flat_slot - tex_start_images[i]; - } - } - - /* Should not happen. */ - return flat_slot; + *type = (ImageDataType)(flat_slot & IMAGE_DATA_TYPE_MASK); + return flat_slot >> IMAGE_DATA_TYPE_SHIFT; } string ImageManager::name_from_type(int type) @@ -272,32 +257,27 @@ int ImageManager::add_image(const string& filename, void *builtin_data, bool animated, float frame, - bool& is_float, - bool& is_linear, InterpolationType interpolation, ExtensionType extension, - bool use_alpha) + bool use_alpha, + ImageMetaData& metadata) { Image *img; size_t slot; - ImageDataType type = get_image_metadata(filename, builtin_data, is_linear); + get_image_metadata(filename, builtin_data, metadata); + ImageDataType type = metadata.type; thread_scoped_lock device_lock(device_mutex); - /* Do we have a float? */ - if(type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4) - is_float = true; - - /* No single channel and half textures on CUDA (Fermi) and no half on OpenCL, use available slots */ - if((type == IMAGE_DATA_TYPE_FLOAT || - type == IMAGE_DATA_TYPE_HALF4 || - type == IMAGE_DATA_TYPE_HALF) && - tex_num_images[type] == 0) { - type = IMAGE_DATA_TYPE_FLOAT4; - } - if(type == IMAGE_DATA_TYPE_BYTE && tex_num_images[type] == 0) { - type = IMAGE_DATA_TYPE_BYTE4; + /* No half textures on OpenCL, use full float instead. */ + if(!has_half_images) { + if(type == IMAGE_DATA_TYPE_HALF4) { + type = IMAGE_DATA_TYPE_FLOAT4; + } + else if(type == IMAGE_DATA_TYPE_HALF) { + type = IMAGE_DATA_TYPE_FLOAT; + } } /* Fnd existing image. */ @@ -329,14 +309,19 @@ int ImageManager::add_image(const string& filename, break; } - if(slot == images[type].size()) { - /* Max images limit reached. */ - if(images[type].size() == tex_num_images[type]) { - printf("ImageManager::add_image: Reached %s image limit (%d), skipping '%s'\n", - name_from_type(type).c_str(), tex_num_images[type], filename.c_str()); - return -1; - } + /* Count if we're over the limit. + * Very unlikely, since max_num_images is insanely big. But better safe than sorry. */ + int tex_count = 0; + for(int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) { + tex_count += tex_num_images[type]; + } + if(tex_count > max_num_images) { + printf("ImageManager::add_image: Reached image limit (%d), skipping '%s'\n", + max_num_images, filename.c_str()); + return -1; + } + if(slot == images[type].size()) { images[type].resize(images[type].size() + 1); } @@ -344,6 +329,7 @@ int ImageManager::add_image(const string& filename, img = new Image(); img->filename = filename; img->builtin_data = builtin_data; + img->builtin_free_cache = metadata.builtin_free_cache; img->need_load = true; img->animated = animated; img->frame = frame; @@ -351,9 +337,12 @@ int ImageManager::add_image(const string& filename, img->extension = extension; img->users = 1; img->use_alpha = use_alpha; + img->mem = NULL; images[type][slot] = img; + ++tex_num_images[type]; + need_update = true; return type_index_to_flattened_slot(slot, type); @@ -427,12 +416,22 @@ void ImageManager::tag_reload_image(const string& filename, } } -bool ImageManager::file_load_image_generic(Image *img, ImageInput **in, int &width, int &height, int &depth, int &components) +bool ImageManager::file_load_image_generic(Image *img, + ImageInput **in, + int &width, + int &height, + int &depth, + int &components) { if(img->filename == "") return false; if(!img->builtin_data) { + /* NOTE: Error logging is done in meta data acquisition. */ + if(!path_exists(img->filename) || path_is_directory(img->filename)) { + return false; + } + /* load image from file through OIIO */ *in = ImageInput::create(img->filename); @@ -461,8 +460,13 @@ bool ImageManager::file_load_image_generic(Image *img, ImageInput **in, int &wid if(!builtin_image_info_cb || !builtin_image_pixels_cb) return false; - bool is_float; - builtin_image_info_cb(img->filename, img->builtin_data, is_float, width, height, depth, components); + ImageMetaData metadata; + builtin_image_info_cb(img->filename, img->builtin_data, metadata); + + width = metadata.width; + height = metadata.height; + depth = metadata.depth; + components = metadata.channels; } /* we only handle certain number of components */ @@ -497,12 +501,21 @@ bool ImageManager::file_load_image(Image *img, vector<StorageType> pixels_storage; StorageType *pixels; const size_t max_size = max(max(width, height), depth); + if(max_size == 0) { + /* Don't bother with invalid images. */ + return false; + } if(texture_limit > 0 && max_size > texture_limit) { pixels_storage.resize(((size_t)width)*height*depth*4); pixels = &pixels_storage[0]; } else { - pixels = (StorageType*)tex_img.resize(width, height, depth); + thread_scoped_lock device_lock(device_mutex); + pixels = (StorageType*)tex_img.alloc(width, height, depth); + } + if(pixels == NULL) { + /* Could be that we've run out of memory. */ + return false; } bool cmyk = false; const size_t num_pixels = ((size_t)width) * height * depth; @@ -543,13 +556,15 @@ bool ImageManager::file_load_image(Image *img, builtin_image_float_pixels_cb(img->filename, img->builtin_data, (float*)&pixels[0], - num_pixels * components); + num_pixels * components, + img->builtin_free_cache); } else if(FileFormat == TypeDesc::UINT8) { builtin_image_pixels_cb(img->filename, img->builtin_data, (uchar*)&pixels[0], - num_pixels * components); + num_pixels * components, + img->builtin_free_cache); } else { /* TODO(dingto): Support half for ImBuf. */ @@ -604,6 +619,37 @@ bool ImageManager::file_load_image(Image *img, } } } + /* Make sure we don't have buggy values. */ + if(FileFormat == TypeDesc::FLOAT) { + /* For RGBA buffers we put all channels to 0 if either of them is not + * finite. This way we avoid possible artifacts caused by fully changed + * hue. + */ + if(is_rgba) { + for(size_t i = 0; i < num_pixels; i += 4) { + StorageType *pixel = &pixels[i*4]; + if(!isfinite(pixel[0]) || + !isfinite(pixel[1]) || + !isfinite(pixel[2]) || + !isfinite(pixel[3])) + { + pixel[0] = 0; + pixel[1] = 0; + pixel[2] = 0; + pixel[3] = 0; + } + } + } + else { + for(size_t i = 0; i < num_pixels; ++i) { + StorageType *pixel = &pixels[i]; + if(!isfinite(pixel[0])) { + pixel[0] = 0; + } + } + } + } + /* Scale image down if needed. */ if(pixels_storage.size() > 0) { float scale_factor = 1.0f; while(max_size * scale_factor > texture_limit) { @@ -619,9 +665,16 @@ bool ImageManager::file_load_image(Image *img, scale_factor, &scaled_pixels, &scaled_width, &scaled_height, &scaled_depth); - StorageType *texture_pixels = (StorageType*)tex_img.resize(scaled_width, - scaled_height, - scaled_depth); + + StorageType *texture_pixels; + + { + thread_scoped_lock device_lock(device_mutex); + texture_pixels = (StorageType*)tex_img.alloc(scaled_width, + scaled_height, + scaled_depth); + } + memcpy(texture_pixels, &scaled_pixels[0], scaled_pixels.size() * sizeof(StorageType)); @@ -630,7 +683,6 @@ bool ImageManager::file_load_image(Image *img, } void ImageManager::device_load_image(Device *device, - DeviceScene *dscene, Scene *scene, ImageDataType type, int slot, @@ -651,30 +703,28 @@ void ImageManager::device_load_image(Device *device, /* Slot assignment */ int flat_slot = type_index_to_flattened_slot(slot, type); + img->mem_name = string_printf("__tex_image_%s_%03d", name_from_type(type).c_str(), flat_slot); - string name; - if(flat_slot >= 100) - name = string_printf("__tex_image_%s_%d", name_from_type(type).c_str(), flat_slot); - else if(flat_slot >= 10) - name = string_printf("__tex_image_%s_0%d", name_from_type(type).c_str(), flat_slot); - else - name = string_printf("__tex_image_%s_00%d", name_from_type(type).c_str(), flat_slot); + /* Free previous texture in slot. */ + if(img->mem) { + thread_scoped_lock device_lock(device_mutex); + delete img->mem; + img->mem = NULL; + } + /* Create new texture. */ if(type == IMAGE_DATA_TYPE_FLOAT4) { - device_vector<float4>& tex_img = dscene->tex_float4_image[slot]; - - if(tex_img.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(tex_img); - } + device_vector<float4> *tex_img + = new device_vector<float4>(device, img->mem_name.c_str(), MEM_TEXTURE); if(!file_load_image<TypeDesc::FLOAT, float>(img, type, texture_limit, - tex_img)) + *tex_img)) { /* on failure to load, we set a 1x1 pixels pink image */ - float *pixels = (float*)tex_img.resize(1, 1); + thread_scoped_lock device_lock(device_mutex); + float *pixels = (float*)tex_img->alloc(1, 1); pixels[0] = TEX_IMAGE_MISSING_R; pixels[1] = TEX_IMAGE_MISSING_G; @@ -682,56 +732,48 @@ void ImageManager::device_load_image(Device *device, pixels[3] = TEX_IMAGE_MISSING_A; } - if(!pack_images) { - thread_scoped_lock device_lock(device_mutex); - device->tex_alloc(name.c_str(), - tex_img, - img->interpolation, - img->extension); - } + img->mem = tex_img; + img->mem->interpolation = img->interpolation; + img->mem->extension = img->extension; + + thread_scoped_lock device_lock(device_mutex); + tex_img->copy_to_device(); } else if(type == IMAGE_DATA_TYPE_FLOAT) { - device_vector<float>& tex_img = dscene->tex_float_image[slot]; - - if(tex_img.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(tex_img); - } + device_vector<float> *tex_img + = new device_vector<float>(device, img->mem_name.c_str(), MEM_TEXTURE); if(!file_load_image<TypeDesc::FLOAT, float>(img, type, texture_limit, - tex_img)) + *tex_img)) { /* on failure to load, we set a 1x1 pixels pink image */ - float *pixels = (float*)tex_img.resize(1, 1); + thread_scoped_lock device_lock(device_mutex); + float *pixels = (float*)tex_img->alloc(1, 1); pixels[0] = TEX_IMAGE_MISSING_R; } - if(!pack_images) { - thread_scoped_lock device_lock(device_mutex); - device->tex_alloc(name.c_str(), - tex_img, - img->interpolation, - img->extension); - } + img->mem = tex_img; + img->mem->interpolation = img->interpolation; + img->mem->extension = img->extension; + + thread_scoped_lock device_lock(device_mutex); + tex_img->copy_to_device(); } else if(type == IMAGE_DATA_TYPE_BYTE4) { - device_vector<uchar4>& tex_img = dscene->tex_byte4_image[slot]; - - if(tex_img.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(tex_img); - } + device_vector<uchar4> *tex_img + = new device_vector<uchar4>(device, img->mem_name.c_str(), MEM_TEXTURE); if(!file_load_image<TypeDesc::UINT8, uchar>(img, type, texture_limit, - tex_img)) + *tex_img)) { /* on failure to load, we set a 1x1 pixels pink image */ - uchar *pixels = (uchar*)tex_img.resize(1, 1); + thread_scoped_lock device_lock(device_mutex); + uchar *pixels = (uchar*)tex_img->alloc(1, 1); pixels[0] = (TEX_IMAGE_MISSING_R * 255); pixels[1] = (TEX_IMAGE_MISSING_G * 255); @@ -739,54 +781,46 @@ void ImageManager::device_load_image(Device *device, pixels[3] = (TEX_IMAGE_MISSING_A * 255); } - if(!pack_images) { - thread_scoped_lock device_lock(device_mutex); - device->tex_alloc(name.c_str(), - tex_img, - img->interpolation, - img->extension); - } - } - else if(type == IMAGE_DATA_TYPE_BYTE){ - device_vector<uchar>& tex_img = dscene->tex_byte_image[slot]; + img->mem = tex_img; + img->mem->interpolation = img->interpolation; + img->mem->extension = img->extension; - if(tex_img.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(tex_img); - } + thread_scoped_lock device_lock(device_mutex); + tex_img->copy_to_device(); + } + else if(type == IMAGE_DATA_TYPE_BYTE) { + device_vector<uchar> *tex_img + = new device_vector<uchar>(device, img->mem_name.c_str(), MEM_TEXTURE); if(!file_load_image<TypeDesc::UINT8, uchar>(img, type, texture_limit, - tex_img)) { + *tex_img)) { /* on failure to load, we set a 1x1 pixels pink image */ - uchar *pixels = (uchar*)tex_img.resize(1, 1); + thread_scoped_lock device_lock(device_mutex); + uchar *pixels = (uchar*)tex_img->alloc(1, 1); pixels[0] = (TEX_IMAGE_MISSING_R * 255); } - if(!pack_images) { - thread_scoped_lock device_lock(device_mutex); - device->tex_alloc(name.c_str(), - tex_img, - img->interpolation, - img->extension); - } - } - else if(type == IMAGE_DATA_TYPE_HALF4){ - device_vector<half4>& tex_img = dscene->tex_half4_image[slot]; + img->mem = tex_img; + img->mem->interpolation = img->interpolation; + img->mem->extension = img->extension; - if(tex_img.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(tex_img); - } + thread_scoped_lock device_lock(device_mutex); + tex_img->copy_to_device(); + } + else if(type == IMAGE_DATA_TYPE_HALF4) { + device_vector<half4> *tex_img + = new device_vector<half4>(device, img->mem_name.c_str(), MEM_TEXTURE); if(!file_load_image<TypeDesc::HALF, half>(img, type, texture_limit, - tex_img)) { + *tex_img)) { /* on failure to load, we set a 1x1 pixels pink image */ - half *pixels = (half*)tex_img.resize(1, 1); + thread_scoped_lock device_lock(device_mutex); + half *pixels = (half*)tex_img->alloc(1, 1); pixels[0] = TEX_IMAGE_MISSING_R; pixels[1] = TEX_IMAGE_MISSING_G; @@ -794,45 +828,40 @@ void ImageManager::device_load_image(Device *device, pixels[3] = TEX_IMAGE_MISSING_A; } - if(!pack_images) { - thread_scoped_lock device_lock(device_mutex); - device->tex_alloc(name.c_str(), - tex_img, - img->interpolation, - img->extension); - } - } - else if(type == IMAGE_DATA_TYPE_HALF){ - device_vector<half>& tex_img = dscene->tex_half_image[slot]; + img->mem = tex_img; + img->mem->interpolation = img->interpolation; + img->mem->extension = img->extension; - if(tex_img.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(tex_img); - } + thread_scoped_lock device_lock(device_mutex); + tex_img->copy_to_device(); + } + else if(type == IMAGE_DATA_TYPE_HALF) { + device_vector<half> *tex_img + = new device_vector<half>(device, img->mem_name.c_str(), MEM_TEXTURE); if(!file_load_image<TypeDesc::HALF, half>(img, type, texture_limit, - tex_img)) { + *tex_img)) { /* on failure to load, we set a 1x1 pixels pink image */ - half *pixels = (half*)tex_img.resize(1, 1); + thread_scoped_lock device_lock(device_mutex); + half *pixels = (half*)tex_img->alloc(1, 1); pixels[0] = TEX_IMAGE_MISSING_R; } - if(!pack_images) { - thread_scoped_lock device_lock(device_mutex); - device->tex_alloc(name.c_str(), - tex_img, - img->interpolation, - img->extension); - } + img->mem = tex_img; + img->mem->interpolation = img->interpolation; + img->mem->extension = img->extension; + + thread_scoped_lock device_lock(device_mutex); + tex_img->copy_to_device(); } img->need_load = false; } -void ImageManager::device_free_image(Device *device, DeviceScene *dscene, ImageDataType type, int slot) +void ImageManager::device_free_image(Device *, ImageDataType type, int slot) { Image *img = images[type][slot]; @@ -843,96 +872,40 @@ void ImageManager::device_free_image(Device *device, DeviceScene *dscene, ImageD ((OSL::TextureSystem*)osl_texture_system)->invalidate(filename); #endif } - else if(type == IMAGE_DATA_TYPE_FLOAT4) { - device_vector<float4>& tex_img = dscene->tex_float4_image[slot]; - - if(tex_img.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(tex_img); - } - - tex_img.clear(); - } - else if(type == IMAGE_DATA_TYPE_FLOAT) { - device_vector<float>& tex_img = dscene->tex_float_image[slot]; - - if(tex_img.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(tex_img); - } - - tex_img.clear(); - } - else if(type == IMAGE_DATA_TYPE_BYTE4) { - device_vector<uchar4>& tex_img = dscene->tex_byte4_image[slot]; - - if(tex_img.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(tex_img); - } - - tex_img.clear(); - } - else if(type == IMAGE_DATA_TYPE_BYTE){ - device_vector<uchar>& tex_img = dscene->tex_byte_image[slot]; - if(tex_img.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(tex_img); - } - - tex_img.clear(); - } - else if(type == IMAGE_DATA_TYPE_HALF4){ - device_vector<half4>& tex_img = dscene->tex_half4_image[slot]; - - if(tex_img.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(tex_img); - } - - tex_img.clear(); - } - else if(type == IMAGE_DATA_TYPE_HALF){ - device_vector<half>& tex_img = dscene->tex_half_image[slot]; - - if(tex_img.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(tex_img); - } - - tex_img.clear(); + if(img->mem) { + thread_scoped_lock device_lock(device_mutex); + delete img->mem; } - delete images[type][slot]; + delete img; images[type][slot] = NULL; + --tex_num_images[type]; } } void ImageManager::device_update(Device *device, - DeviceScene *dscene, Scene *scene, Progress& progress) { - if(!need_update) + if(!need_update) { return; + } TaskPool pool; - for(int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) { for(size_t slot = 0; slot < images[type].size(); slot++) { if(!images[type][slot]) continue; if(images[type][slot]->users == 0) { - device_free_image(device, dscene, (ImageDataType)type, slot); + device_free_image(device, (ImageDataType)type, slot); } else if(images[type][slot]->need_load) { if(!osl_texture_system || images[type][slot]->builtin_data) pool.push(function_bind(&ImageManager::device_load_image, this, device, - dscene, scene, (ImageDataType)type, slot, @@ -943,14 +916,10 @@ void ImageManager::device_update(Device *device, pool.wait_work(); - if(pack_images) - device_pack_images(device, dscene, progress); - need_update = false; } void ImageManager::device_update_slot(Device *device, - DeviceScene *dscene, Scene *scene, int flat_slot, Progress *progress) @@ -962,12 +931,11 @@ void ImageManager::device_update_slot(Device *device, assert(image != NULL); if(image->users == 0) { - device_free_image(device, dscene, type, slot); + device_free_image(device, type, slot); } else if(image->need_load) { if(!osl_texture_system || image->builtin_data) device_load_image(device, - dscene, scene, type, slot, @@ -975,229 +943,24 @@ void ImageManager::device_update_slot(Device *device, } } -uint8_t ImageManager::pack_image_options(ImageDataType type, size_t slot) -{ - uint8_t options = 0; - - /* Image Options are packed into one uint: - * bit 0 -> Interpolation - * bit 1 + 2 + 3-> Extension */ - if(images[type][slot]->interpolation == INTERPOLATION_CLOSEST) - options |= (1 << 0); - - if(images[type][slot]->extension == EXTENSION_REPEAT) - options |= (1 << 1); - else if(images[type][slot]->extension == EXTENSION_EXTEND) - options |= (1 << 2); - else /* EXTENSION_CLIP */ - options |= (1 << 3); - - return options; -} - -void ImageManager::device_pack_images(Device *device, - DeviceScene *dscene, - Progress& /*progess*/) -{ - /* For OpenCL, we pack all image textures into a single large texture, and - * do our own interpolation in the kernel. */ - size_t size = 0, offset = 0; - ImageDataType type; - - int info_size = tex_num_images[IMAGE_DATA_TYPE_FLOAT4] + tex_num_images[IMAGE_DATA_TYPE_BYTE4] - + tex_num_images[IMAGE_DATA_TYPE_FLOAT] + tex_num_images[IMAGE_DATA_TYPE_BYTE]; - uint4 *info = dscene->tex_image_packed_info.resize(info_size*2); - - /* Byte4 Textures*/ - type = IMAGE_DATA_TYPE_BYTE4; - - for(size_t slot = 0; slot < images[type].size(); slot++) { - if(!images[type][slot]) - continue; - - device_vector<uchar4>& tex_img = dscene->tex_byte4_image[slot]; - size += tex_img.size(); - } - - uchar4 *pixels_byte4 = dscene->tex_image_byte4_packed.resize(size); - - for(size_t slot = 0; slot < images[type].size(); slot++) { - if(!images[type][slot]) - continue; - - device_vector<uchar4>& tex_img = dscene->tex_byte4_image[slot]; - - uint8_t options = pack_image_options(type, slot); - - int index = type_index_to_flattened_slot(slot, type) * 2; - info[index] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options); - info[index+1] = make_uint4(tex_img.data_depth, 0, 0, 0); - - memcpy(pixels_byte4+offset, (void*)tex_img.data_pointer, tex_img.memory_size()); - offset += tex_img.size(); - } - - /* Float4 Textures*/ - type = IMAGE_DATA_TYPE_FLOAT4; - size = 0, offset = 0; - - for(size_t slot = 0; slot < images[type].size(); slot++) { - if(!images[type][slot]) - continue; - - device_vector<float4>& tex_img = dscene->tex_float4_image[slot]; - size += tex_img.size(); - } - - float4 *pixels_float4 = dscene->tex_image_float4_packed.resize(size); - - for(size_t slot = 0; slot < images[type].size(); slot++) { - if(!images[type][slot]) - continue; - - device_vector<float4>& tex_img = dscene->tex_float4_image[slot]; - - /* todo: support 3D textures, only CPU for now */ - - uint8_t options = pack_image_options(type, slot); - - int index = type_index_to_flattened_slot(slot, type) * 2; - info[index] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options); - info[index+1] = make_uint4(tex_img.data_depth, 0, 0, 0); - - memcpy(pixels_float4+offset, (void*)tex_img.data_pointer, tex_img.memory_size()); - offset += tex_img.size(); - } - - /* Byte Textures*/ - type = IMAGE_DATA_TYPE_BYTE; - size = 0, offset = 0; - - for(size_t slot = 0; slot < images[type].size(); slot++) { - if(!images[type][slot]) - continue; - - device_vector<uchar>& tex_img = dscene->tex_byte_image[slot]; - size += tex_img.size(); - } - - uchar *pixels_byte = dscene->tex_image_byte_packed.resize(size); - - for(size_t slot = 0; slot < images[type].size(); slot++) { - if(!images[type][slot]) - continue; - - device_vector<uchar>& tex_img = dscene->tex_byte_image[slot]; - - uint8_t options = pack_image_options(type, slot); - - int index = type_index_to_flattened_slot(slot, type) * 2; - info[index] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options); - info[index+1] = make_uint4(tex_img.data_depth, 0, 0, 0); - - memcpy(pixels_byte+offset, (void*)tex_img.data_pointer, tex_img.memory_size()); - offset += tex_img.size(); - } - - /* Float Textures*/ - type = IMAGE_DATA_TYPE_FLOAT; - size = 0, offset = 0; - - for(size_t slot = 0; slot < images[type].size(); slot++) { - if(!images[type][slot]) - continue; - - device_vector<float>& tex_img = dscene->tex_float_image[slot]; - size += tex_img.size(); - } - - float *pixels_float = dscene->tex_image_float_packed.resize(size); - - for(size_t slot = 0; slot < images[type].size(); slot++) { - if(!images[type][slot]) - continue; - - device_vector<float>& tex_img = dscene->tex_float_image[slot]; - - /* todo: support 3D textures, only CPU for now */ - - uint8_t options = pack_image_options(type, slot); - - int index = type_index_to_flattened_slot(slot, type) * 2; - info[index] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options); - info[index+1] = make_uint4(tex_img.data_depth, 0, 0, 0); - - memcpy(pixels_float+offset, (void*)tex_img.data_pointer, tex_img.memory_size()); - offset += tex_img.size(); - } - - if(dscene->tex_image_byte4_packed.size()) { - if(dscene->tex_image_byte4_packed.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(dscene->tex_image_byte4_packed); - } - device->tex_alloc("__tex_image_byte4_packed", dscene->tex_image_byte4_packed); - } - if(dscene->tex_image_float4_packed.size()) { - if(dscene->tex_image_float4_packed.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(dscene->tex_image_float4_packed); - } - device->tex_alloc("__tex_image_float4_packed", dscene->tex_image_float4_packed); - } - if(dscene->tex_image_byte_packed.size()) { - if(dscene->tex_image_byte_packed.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(dscene->tex_image_byte_packed); - } - device->tex_alloc("__tex_image_byte_packed", dscene->tex_image_byte_packed); - } - if(dscene->tex_image_float_packed.size()) { - if(dscene->tex_image_float_packed.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(dscene->tex_image_float_packed); - } - device->tex_alloc("__tex_image_float_packed", dscene->tex_image_float_packed); - } - if(dscene->tex_image_packed_info.size()) { - if(dscene->tex_image_packed_info.device_pointer) { - thread_scoped_lock device_lock(device_mutex); - device->tex_free(dscene->tex_image_packed_info); - } - device->tex_alloc("__tex_image_packed_info", dscene->tex_image_packed_info); - } -} - -void ImageManager::device_free_builtin(Device *device, DeviceScene *dscene) +void ImageManager::device_free_builtin(Device *device) { for(int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) { for(size_t slot = 0; slot < images[type].size(); slot++) { if(images[type][slot] && images[type][slot]->builtin_data) - device_free_image(device, dscene, (ImageDataType)type, slot); + device_free_image(device, (ImageDataType)type, slot); } } } -void ImageManager::device_free(Device *device, DeviceScene *dscene) +void ImageManager::device_free(Device *device) { for(int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) { for(size_t slot = 0; slot < images[type].size(); slot++) { - device_free_image(device, dscene, (ImageDataType)type, slot); + device_free_image(device, (ImageDataType)type, slot); } images[type].clear(); } - - device->tex_free(dscene->tex_image_byte4_packed); - device->tex_free(dscene->tex_image_float4_packed); - device->tex_free(dscene->tex_image_byte_packed); - device->tex_free(dscene->tex_image_float_packed); - device->tex_free(dscene->tex_image_packed_info); - - dscene->tex_image_byte4_packed.clear(); - dscene->tex_image_float4_packed.clear(); - dscene->tex_image_byte_packed.clear(); - dscene->tex_image_float_packed.clear(); - dscene->tex_image_packed_info.clear(); } CCL_NAMESPACE_END diff --git a/intern/cycles/render/image.h b/intern/cycles/render/image.h index 494c74f0cdd..5391490d993 100644 --- a/intern/cycles/render/image.h +++ b/intern/cycles/render/image.h @@ -17,46 +17,46 @@ #ifndef __IMAGE_H__ #define __IMAGE_H__ -#include "device.h" -#include "device_memory.h" +#include "device/device.h" +#include "device/device_memory.h" -#include "util_image.h" -#include "util_string.h" -#include "util_thread.h" -#include "util_vector.h" +#include "util/util_image.h" +#include "util/util_string.h" +#include "util/util_thread.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN class Device; -class DeviceScene; class Progress; class Scene; +class ImageMetaData { +public: + /* Must be set by image file or builtin callback. */ + bool is_float, is_half; + int channels; + size_t width, height, depth; + bool builtin_free_cache; + + /* Automatically set. */ + ImageDataType type; + bool is_linear; +}; + class ImageManager { public: explicit ImageManager(const DeviceInfo& info); ~ImageManager(); - enum ImageDataType { - IMAGE_DATA_TYPE_FLOAT4 = 0, - IMAGE_DATA_TYPE_BYTE4 = 1, - IMAGE_DATA_TYPE_HALF4 = 2, - IMAGE_DATA_TYPE_FLOAT = 3, - IMAGE_DATA_TYPE_BYTE = 4, - IMAGE_DATA_TYPE_HALF = 5, - - IMAGE_DATA_NUM_TYPES - }; - int add_image(const string& filename, void *builtin_data, bool animated, float frame, - bool& is_float, - bool& is_linear, InterpolationType interpolation, ExtensionType extension, - bool use_alpha); + bool use_alpha, + ImageMetaData& metadata); void remove_image(int flat_slot); void remove_image(const string& filename, void *builtin_data, @@ -68,24 +68,25 @@ public: InterpolationType interpolation, ExtensionType extension, bool use_alpha); - ImageDataType get_image_metadata(const string& filename, void *builtin_data, bool& is_linear); + bool get_image_metadata(const string& filename, + void *builtin_data, + ImageMetaData& metadata); void device_update(Device *device, - DeviceScene *dscene, Scene *scene, Progress& progress); void device_update_slot(Device *device, - DeviceScene *dscene, Scene *scene, int flat_slot, Progress *progress); - void device_free(Device *device, DeviceScene *dscene); - void device_free_builtin(Device *device, DeviceScene *dscene); + void device_free(Device *device); + void device_free_builtin(Device *device); void set_osl_texture_system(void *texture_system); - void set_pack_images(bool pack_images_); bool set_animation_frame_update(int frame); + device_memory *image_memory(int flat_slot); + bool need_update; /* NOTE: Here pixels_size is a size of storage, which equals to @@ -94,23 +95,22 @@ public: */ function<void(const string &filename, void *data, - bool &is_float, - int &width, - int &height, - int &depth, - int &channels)> builtin_image_info_cb; + ImageMetaData& metadata)> builtin_image_info_cb; function<bool(const string &filename, void *data, unsigned char *pixels, - const size_t pixels_size)> builtin_image_pixels_cb; + const size_t pixels_size, + const bool free_cache)> builtin_image_pixels_cb; function<bool(const string &filename, void *data, float *pixels, - const size_t pixels_size)> builtin_image_float_pixels_cb; + const size_t pixels_size, + const bool free_cache)> builtin_image_float_pixels_cb; struct Image { string filename; void *builtin_data; + bool builtin_free_cache; bool use_alpha; bool need_load; @@ -119,21 +119,29 @@ public: InterpolationType interpolation; ExtensionType extension; + string mem_name; + device_memory *mem; + int users; }; private: int tex_num_images[IMAGE_DATA_NUM_TYPES]; - int tex_start_images[IMAGE_DATA_NUM_TYPES]; + int max_num_images; + bool has_half_images; thread_mutex device_mutex; int animation_frame; vector<Image*> images[IMAGE_DATA_NUM_TYPES]; void *osl_texture_system; - bool pack_images; - bool file_load_image_generic(Image *img, ImageInput **in, int &width, int &height, int &depth, int &components); + bool file_load_image_generic(Image *img, + ImageInput **in, + int &width, + int &height, + int &depth, + int &components); template<TypeDesc::BASETYPE FileFormat, typename StorageType, @@ -143,26 +151,19 @@ private: int texture_limit, device_vector<DeviceType>& tex_img); + int max_flattened_slot(ImageDataType type); int type_index_to_flattened_slot(int slot, ImageDataType type); int flattened_slot_to_type_index(int flat_slot, ImageDataType *type); string name_from_type(int type); - uint8_t pack_image_options(ImageDataType type, size_t slot); - void device_load_image(Device *device, - DeviceScene *dscene, Scene *scene, ImageDataType type, int slot, Progress *progess); void device_free_image(Device *device, - DeviceScene *dscene, ImageDataType type, int slot); - - void device_pack_images(Device *device, - DeviceScene *dscene, - Progress& progess); }; CCL_NAMESPACE_END diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp index 1ab0f9874f2..9c276bcab31 100644 --- a/intern/cycles/render/integrator.cpp +++ b/intern/cycles/render/integrator.cpp @@ -14,16 +14,17 @@ * limitations under the License. */ -#include "device.h" -#include "integrator.h" -#include "film.h" -#include "light.h" -#include "scene.h" -#include "shader.h" -#include "sobol.h" - -#include "util_foreach.h" -#include "util_hash.h" +#include "device/device.h" +#include "render/background.h" +#include "render/integrator.h" +#include "render/film.h" +#include "render/light.h" +#include "render/scene.h" +#include "render/shader.h" +#include "render/sobol.h" + +#include "util/util_foreach.h" +#include "util/util_hash.h" CCL_NAMESPACE_BEGIN @@ -31,7 +32,6 @@ NODE_DEFINE(Integrator) { NodeType *type = NodeType::add("integrator", create); - SOCKET_INT(min_bounce, "Min Bounce", 2); SOCKET_INT(max_bounce, "Max Bounce", 7); SOCKET_INT(max_diffuse_bounce, "Max Diffuse Bounce", 7); @@ -39,9 +39,7 @@ NODE_DEFINE(Integrator) SOCKET_INT(max_transmission_bounce, "Max Transmission Bounce", 7); SOCKET_INT(max_volume_bounce, "Max Volume Bounce", 7); - SOCKET_INT(transparent_min_bounce, "Transparent Min Bounce", 2); SOCKET_INT(transparent_max_bounce, "Transparent Max Bounce", 7); - SOCKET_BOOLEAN(transparent_shadows, "Transparent Shadows", false); SOCKET_INT(ao_bounces, "AO Bounces", 0); @@ -104,7 +102,6 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene /* integrator parameters */ kintegrator->max_bounce = max_bounce + 1; - kintegrator->min_bounce = min_bounce + 1; kintegrator->max_diffuse_bounce = max_diffuse_bounce + 1; kintegrator->max_glossy_bounce = max_glossy_bounce + 1; @@ -112,7 +109,6 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene kintegrator->max_volume_bounce = max_volume_bounce + 1; kintegrator->transparent_max_bounce = transparent_max_bounce + 1; - kintegrator->transparent_min_bounce = transparent_min_bounce + 1; if(ao_bounces == 0) { kintegrator->ao_bounces = INT_MAX; @@ -125,19 +121,14 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene * We only need to enable transparent shadows, if we actually have * transparent shaders in the scene. Otherwise we can disable it * to improve performance a bit. */ - if(transparent_shadows) { - kintegrator->transparent_shadows = false; - foreach(Shader *shader, scene->shaders) { - /* keep this in sync with SD_HAS_TRANSPARENT_SHADOW in shader.cpp */ - if((shader->has_surface_transparent && shader->use_transparent_shadow) || shader->has_volume) { - kintegrator->transparent_shadows = true; - break; - } + kintegrator->transparent_shadows = false; + foreach(Shader *shader, scene->shaders) { + /* keep this in sync with SD_HAS_TRANSPARENT_SHADOW in shader.cpp */ + if((shader->has_surface_transparent && shader->use_transparent_shadow) || shader->has_volume) { + kintegrator->transparent_shadows = true; + break; } } - else { - kintegrator->transparent_shadows = false; - } kintegrator->volume_max_steps = volume_max_steps; kintegrator->volume_step_size = volume_step_size; @@ -155,6 +146,7 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene kintegrator->sample_clamp_indirect = (sample_clamp_indirect == 0.0f)? FLT_MAX: sample_clamp_indirect*3.0f; kintegrator->branched = (method == BRANCHED_PATH); + kintegrator->volume_decoupled = device->info.has_volume_decoupled; kintegrator->diffuse_samples = diffuse_samples; kintegrator->glossy_samples = glossy_samples; kintegrator->transmission_samples = transmission_samples; @@ -195,16 +187,21 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene max_samples = max(max_samples, volume_samples); } - max_samples *= (max_bounce + transparent_max_bounce + 3 + BSSRDF_MAX_HITS); + uint total_bounces = max_bounce + + transparent_max_bounce + 3 + + VOLUME_BOUNDS_MAX + + max(BSSRDF_MAX_HITS, BSSRDF_MAX_BOUNCES); + + max_samples *= total_bounces; int dimensions = PRNG_BASE_NUM + max_samples*PRNG_BOUNCE_NUM; dimensions = min(dimensions, SOBOL_MAX_DIMENSIONS); - uint *directions = dscene->sobol_directions.resize(SOBOL_BITS*dimensions); + uint *directions = dscene->sobol_directions.alloc(SOBOL_BITS*dimensions); sobol_generate_direction_vectors((uint(*)[SOBOL_BITS])directions, dimensions); - device->tex_alloc("__sobol_directions", dscene->sobol_directions); + dscene->sobol_directions.copy_to_device(); /* Clamping. */ bool use_sample_clamp = (sample_clamp_direct != 0.0f || @@ -217,10 +214,9 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene need_update = false; } -void Integrator::device_free(Device *device, DeviceScene *dscene) +void Integrator::device_free(Device *, DeviceScene *dscene) { - device->tex_free(dscene->sobol_directions); - dscene->sobol_directions.clear(); + dscene->sobol_directions.free(); } bool Integrator::modified(const Integrator& integrator) diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h index 27fff4831e5..3cb430d72b4 100644 --- a/intern/cycles/render/integrator.h +++ b/intern/cycles/render/integrator.h @@ -17,9 +17,9 @@ #ifndef __INTEGRATOR_H__ #define __INTEGRATOR_H__ -#include "kernel_types.h" +#include "kernel/kernel_types.h" -#include "node.h" +#include "graph/node.h" CCL_NAMESPACE_BEGIN @@ -29,9 +29,8 @@ class Scene; class Integrator : public Node { public: - NODE_DECLARE; + NODE_DECLARE - int min_bounce; int max_bounce; int max_diffuse_bounce; @@ -39,9 +38,7 @@ public: int max_transmission_bounce; int max_volume_bounce; - int transparent_min_bounce; int transparent_max_bounce; - bool transparent_shadows; int ao_bounces; diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp index 6a4557506c3..8dec7e4ea64 100644 --- a/intern/cycles/render/light.cpp +++ b/intern/cycles/render/light.cpp @@ -14,19 +14,19 @@ * limitations under the License. */ -#include "background.h" -#include "device.h" -#include "integrator.h" -#include "film.h" -#include "light.h" -#include "mesh.h" -#include "object.h" -#include "scene.h" -#include "shader.h" - -#include "util_foreach.h" -#include "util_progress.h" -#include "util_logging.h" +#include "render/background.h" +#include "device/device.h" +#include "render/integrator.h" +#include "render/film.h" +#include "render/light.h" +#include "render/mesh.h" +#include "render/object.h" +#include "render/scene.h" +#include "render/shader.h" + +#include "util/util_foreach.h" +#include "util/util_progress.h" +#include "util/util_logging.h" CCL_NAMESPACE_BEGIN @@ -36,10 +36,10 @@ static void shade_background_pixels(Device *device, DeviceScene *dscene, int res int width = res; int height = res; - device_vector<uint4> d_input; - device_vector<float4> d_output; + device_vector<uint4> d_input(device, "background_input", MEM_READ_ONLY); + device_vector<float4> d_output(device, "background_output", MEM_READ_WRITE); - uint4 *d_input_data = d_input.resize(width*height); + uint4 *d_input_data = d_input.alloc(width*height); for(int y = 0; y < height; y++) { for(int x = 0; x < width; x++) { @@ -52,15 +52,12 @@ static void shade_background_pixels(Device *device, DeviceScene *dscene, int res } /* compute on device */ - d_output.resize(width*height); - memset((void*)d_output.data_pointer, 0, d_output.memory_size()); + d_output.alloc(width*height); + d_output.zero_to_device(); + d_input.copy_to_device(); device->const_copy_to("__data", &dscene->data, sizeof(dscene->data)); - device->mem_alloc(d_input, MEM_READ_ONLY); - device->mem_copy_to(d_input); - device->mem_alloc(d_output, MEM_WRITE_ONLY); - DeviceTask main_task(DeviceTask::SHADER); main_task.shader_input = d_input.device_pointer; main_task.shader_output = d_output.device_pointer; @@ -77,15 +74,12 @@ static void shade_background_pixels(Device *device, DeviceScene *dscene, int res foreach(DeviceTask& task, split_tasks) { device->task_add(task); device->task_wait(); - device->mem_copy_from(d_output, task.shader_x, 1, task.shader_w, sizeof(float4)); + d_output.copy_from_device(task.shader_x, 1, task.shader_w); } - device->mem_free(d_input); - device->mem_free(d_output); - - d_input.clear(); + d_input.free(); - float4 *d_output_data = reinterpret_cast<float4*>(d_output.data_pointer); + float4 *d_output_data = d_output.data(); pixels.resize(width*height); @@ -96,6 +90,8 @@ static void shade_background_pixels(Device *device, DeviceScene *dscene, int res pixels[y*width + x].z = d_output_data[y*width + x].z; } } + + d_output.free(); } /* Light */ @@ -138,6 +134,7 @@ NODE_DEFINE(Light) SOCKET_INT(samples, "Samples", 1); SOCKET_INT(max_bounces, "Max Bounces", 1024); + SOCKET_UINT(random_id, "Random ID", 0); SOCKET_BOOLEAN(is_portal, "Is Portal", false); SOCKET_BOOLEAN(is_enabled, "Is Enabled", true); @@ -224,12 +221,12 @@ void LightManager::disable_ineffective_light(Device *device, Scene *scene) bool LightManager::object_usable_as_light(Object *object) { Mesh *mesh = object->mesh; - /* Skip if we are not visible for BSDFs. */ - if(!(object->visibility & (PATH_RAY_DIFFUSE|PATH_RAY_GLOSSY|PATH_RAY_TRANSMIT))) { + /* Skip objects with NaNs */ + if(!object->bounds.valid()) { return false; } - /* Skip motion blurred deforming meshes, not supported yet. */ - if(mesh->has_motion_blur()) { + /* Skip if we are not visible for BSDFs. */ + if(!(object->visibility & (PATH_RAY_DIFFUSE|PATH_RAY_GLOSSY|PATH_RAY_TRANSMIT))) { return false; } /* Skip if we have no emission shaders. */ @@ -245,7 +242,7 @@ bool LightManager::object_usable_as_light(Object *object) { return false; } -void LightManager::device_update_distribution(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress) +void LightManager::device_update_distribution(Device *, DeviceScene *dscene, Scene *scene, Progress& progress) { progress.set_status("Updating Lights", "Computing distribution"); @@ -291,7 +288,7 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen VLOG(1) << "Total " << num_distribution << " of light distribution primitives."; /* emission area */ - float4 *distribution = dscene->light_distribution.resize(num_distribution + 1); + KernelLightDistribution *distribution = dscene->light_distribution.alloc(num_distribution + 1); float totarea = 0.0f; /* triangles */ @@ -337,13 +334,16 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen : scene->default_surface; if(shader->use_mis && shader->has_surface_emission) { - distribution[offset].x = totarea; - distribution[offset].y = __int_as_float(i + mesh->tri_offset); - distribution[offset].z = __int_as_float(shader_flag); - distribution[offset].w = __int_as_float(object_id); + distribution[offset].totarea = totarea; + distribution[offset].prim = i + mesh->tri_offset; + distribution[offset].mesh_light.shader_flag = shader_flag; + distribution[offset].mesh_light.object_id = object_id; offset++; Mesh::Triangle t = mesh->get_triangle(i); + if(!t.valid(&mesh->verts[0])) { + continue; + } float3 p1 = mesh->verts[t.v[0]]; float3 p2 = mesh->verts[t.v[1]]; float3 p3 = mesh->verts[t.v[2]]; @@ -372,10 +372,10 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen if(!light->is_enabled) continue; - distribution[offset].x = totarea; - distribution[offset].y = __int_as_float(~light_index); - distribution[offset].z = 1.0f; - distribution[offset].w = light->size; + distribution[offset].totarea = totarea; + distribution[offset].prim = ~light_index; + distribution[offset].lamp.pad = 1.0f; + distribution[offset].lamp.size = light->size; totarea += lightarea; if(light->size > 0.0f && light->use_mis) @@ -390,15 +390,15 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen } /* normalize cumulative distribution functions */ - distribution[num_distribution].x = totarea; - distribution[num_distribution].y = 0.0f; - distribution[num_distribution].z = 0.0f; - distribution[num_distribution].w = 0.0f; + distribution[num_distribution].totarea = totarea; + distribution[num_distribution].prim = 0.0f; + distribution[num_distribution].lamp.pad = 0.0f; + distribution[num_distribution].lamp.size = 0.0f; if(totarea > 0.0f) { for(size_t i = 0; i < num_distribution; i++) - distribution[i].x /= totarea; - distribution[num_distribution].x = 1.0f; + distribution[i].totarea /= totarea; + distribution[num_distribution].totarea = 1.0f; } if(progress.get_cancel()) return; @@ -415,7 +415,6 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen /* precompute pdfs */ kintegrator->pdf_triangles = 0.0f; kintegrator->pdf_lights = 0.0f; - kintegrator->inv_pdf_lights = 0.0f; /* sample one, with 0.5 probability of light or triangle */ kintegrator->num_all_lights = num_lights; @@ -430,8 +429,6 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen kintegrator->pdf_lights = 1.0f/num_lights; if(trianglearea > 0.0f) kintegrator->pdf_lights *= 0.5f; - - kintegrator->inv_pdf_lights = 1.0f/kintegrator->pdf_lights; } kintegrator->use_lamp_mis = use_lamp_mis; @@ -447,7 +444,7 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen kfilm->pass_shadow_scale *= (float)(num_lights - num_background_lights)/(float)num_lights; /* CDF */ - device->tex_alloc("__light_distribution", dscene->light_distribution); + dscene->light_distribution.copy_to_device(); /* Portals */ if(num_portals > 0) { @@ -462,13 +459,12 @@ void LightManager::device_update_distribution(Device *device, DeviceScene *dscen } } else { - dscene->light_distribution.clear(); + dscene->light_distribution.free(); kintegrator->num_distribution = 0; kintegrator->num_all_lights = 0; kintegrator->pdf_triangles = 0.0f; kintegrator->pdf_lights = 0.0f; - kintegrator->inv_pdf_lights = 0.0f; kintegrator->use_lamp_mis = false; kintegrator->num_portals = 0; kintegrator->portal_offset = 0; @@ -486,18 +482,10 @@ static void background_cdf(int start, float2 *cond_cdf) { /* Conditional CDFs (rows, U direction). */ - /* NOTE: It is possible to have some NaN pixels on background - * which will ruin CDF causing wrong shading. We replace such - * pixels with black. - */ for(int i = start; i < end; i++) { float sin_theta = sinf(M_PI_F * (i + 0.5f) / res); float3 env_color = (*pixels)[i * res]; float ave_luminance = average(env_color); - /* TODO(sergey): Consider adding average_safe(). */ - if(!isfinite(ave_luminance)) { - ave_luminance = 0.0f; - } cond_cdf[i * cdf_count].x = ave_luminance * sin_theta; cond_cdf[i * cdf_count].y = 0.0f; @@ -505,9 +493,6 @@ static void background_cdf(int start, for(int j = 1; j < res; j++) { env_color = (*pixels)[i * res + j]; ave_luminance = average(env_color); - if(!isfinite(ave_luminance)) { - ave_luminance = 0.0f; - } cond_cdf[i * cdf_count + j].x = ave_luminance * sin_theta; cond_cdf[i * cdf_count + j].y = cond_cdf[i * cdf_count + j - 1].y + cond_cdf[i * cdf_count + j - 1].x / res; @@ -568,8 +553,8 @@ void LightManager::device_update_background(Device *device, /* build row distributions and column distribution for the infinite area environment light */ int cdf_count = res + 1; - float2 *marg_cdf = dscene->light_background_marginal_cdf.resize(cdf_count); - float2 *cond_cdf = dscene->light_background_conditional_cdf.resize(cdf_count * cdf_count); + float2 *marg_cdf = dscene->light_background_marginal_cdf.alloc(cdf_count); + float2 *cond_cdf = dscene->light_background_conditional_cdf.alloc(cdf_count * cdf_count); double time_start = time_dt(); if(res < 512) { @@ -618,11 +603,11 @@ void LightManager::device_update_background(Device *device, VLOG(2) << "Background MIS build time " << time_dt() - time_start << "\n"; /* update device */ - device->tex_alloc("__light_background_marginal_cdf", dscene->light_background_marginal_cdf); - device->tex_alloc("__light_background_conditional_cdf", dscene->light_background_conditional_cdf); + dscene->light_background_marginal_cdf.copy_to_device(); + dscene->light_background_conditional_cdf.copy_to_device(); } -void LightManager::device_update_points(Device *device, +void LightManager::device_update_points(Device *, DeviceScene *dscene, Scene *scene) { @@ -635,7 +620,7 @@ void LightManager::device_update_points(Device *device, } } - float4 *light_data = dscene->light_data.resize(num_lights*LIGHT_SIZE); + KernelLight *klights = dscene->lights.alloc(num_lights); if(num_lights == 0) { VLOG(1) << "No effective light, ignoring points update."; @@ -652,8 +637,9 @@ void LightManager::device_update_points(Device *device, float3 co = light->co; Shader *shader = (light->shader) ? light->shader : scene->default_light; int shader_id = scene->shader_manager->get_shader_id(shader); - float samples = __int_as_float(light->samples); - float max_bounces = __int_as_float(light->max_bounces); + int samples = light->samples; + int max_bounces = light->max_bounces; + float random = (float)light->random_id * (1.0f/(float)0xFFFFFFFF); if(!light->cast_shadow) shader_id &= ~SHADER_CAST_SHADOW; @@ -675,6 +661,9 @@ void LightManager::device_update_points(Device *device, use_light_visibility = true; } + klights[light_index].type = light->type; + klights[light_index].samples = samples; + if(light->type == LIGHT_POINT) { shader_id &= ~SHADER_AREA_LIGHT; @@ -684,10 +673,12 @@ void LightManager::device_update_points(Device *device, if(light->use_mis && radius > 0.0f) shader_id |= SHADER_USE_MIS; - light_data[light_index*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), co.x, co.y, co.z); - light_data[light_index*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), radius, invarea, 0.0f); - light_data[light_index*LIGHT_SIZE + 2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - light_data[light_index*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f); + klights[light_index].co[0] = co.x; + klights[light_index].co[1] = co.y; + klights[light_index].co[2] = co.z; + + klights[light_index].spot.radius = radius; + klights[light_index].spot.invarea = invarea; } else if(light->type == LIGHT_DISTANT) { shader_id &= ~SHADER_AREA_LIGHT; @@ -704,10 +695,13 @@ void LightManager::device_update_points(Device *device, if(light->use_mis && area > 0.0f) shader_id |= SHADER_USE_MIS; - light_data[light_index*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), dir.x, dir.y, dir.z); - light_data[light_index*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), radius, cosangle, invarea); - light_data[light_index*LIGHT_SIZE + 2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - light_data[light_index*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f); + klights[light_index].co[0] = dir.x; + klights[light_index].co[1] = dir.y; + klights[light_index].co[2] = dir.z; + + klights[light_index].distant.invarea = invarea; + klights[light_index].distant.radius = radius; + klights[light_index].distant.cosangle = cosangle; } else if(light->type == LIGHT_BACKGROUND) { uint visibility = scene->background->visibility; @@ -731,11 +725,6 @@ void LightManager::device_update_points(Device *device, shader_id |= SHADER_EXCLUDE_SCATTER; use_light_visibility = true; } - - light_data[light_index*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), 0.0f, 0.0f, 0.0f); - light_data[light_index*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), 0.0f, 0.0f, 0.0f); - light_data[light_index*LIGHT_SIZE + 2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - light_data[light_index*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f); } else if(light->type == LIGHT_AREA) { float3 axisu = light->axisu*(light->sizeu*light->size); @@ -749,10 +738,20 @@ void LightManager::device_update_points(Device *device, if(light->use_mis && area > 0.0f) shader_id |= SHADER_USE_MIS; - light_data[light_index*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), co.x, co.y, co.z); - light_data[light_index*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), axisu.x, axisu.y, axisu.z); - light_data[light_index*LIGHT_SIZE + 2] = make_float4(invarea, axisv.x, axisv.y, axisv.z); - light_data[light_index*LIGHT_SIZE + 3] = make_float4(samples, dir.x, dir.y, dir.z); + klights[light_index].co[0] = co.x; + klights[light_index].co[1] = co.y; + klights[light_index].co[2] = co.z; + + klights[light_index].area.axisu[0] = axisu.x; + klights[light_index].area.axisu[1] = axisu.y; + klights[light_index].area.axisu[2] = axisu.z; + klights[light_index].area.axisv[0] = axisv.x; + klights[light_index].area.axisv[1] = axisv.y; + klights[light_index].area.axisv[2] = axisv.z; + klights[light_index].area.invarea = invarea; + klights[light_index].area.dir[0] = dir.x; + klights[light_index].area.dir[1] = dir.y; + klights[light_index].area.dir[2] = dir.z; } else if(light->type == LIGHT_SPOT) { shader_id &= ~SHADER_AREA_LIGHT; @@ -768,18 +767,26 @@ void LightManager::device_update_points(Device *device, if(light->use_mis && radius > 0.0f) shader_id |= SHADER_USE_MIS; - light_data[light_index*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), co.x, co.y, co.z); - light_data[light_index*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), radius, invarea, spot_angle); - light_data[light_index*LIGHT_SIZE + 2] = make_float4(spot_smooth, dir.x, dir.y, dir.z); - light_data[light_index*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f); + klights[light_index].co[0] = co.x; + klights[light_index].co[1] = co.y; + klights[light_index].co[2] = co.z; + + klights[light_index].spot.radius = radius; + klights[light_index].spot.invarea = invarea; + klights[light_index].spot.spot_angle = spot_angle; + klights[light_index].spot.spot_smooth = spot_smooth; + klights[light_index].spot.dir[0] = dir.x; + klights[light_index].spot.dir[1] = dir.y; + klights[light_index].spot.dir[2] = dir.z; } - light_data[light_index*LIGHT_SIZE + 4] = make_float4(max_bounces, 0.0f, 0.0f, 0.0f); + klights[light_index].shader_id = shader_id; - Transform tfm = light->tfm; - Transform itfm = transform_inverse(tfm); - memcpy(&light_data[light_index*LIGHT_SIZE + 5], &tfm, sizeof(float4)*3); - memcpy(&light_data[light_index*LIGHT_SIZE + 8], &itfm, sizeof(float4)*3); + klights[light_index].max_bounces = max_bounces; + klights[light_index].random = random; + + klights[light_index].tfm = light->tfm; + klights[light_index].itfm = transform_inverse(light->tfm); light_index++; } @@ -796,21 +803,27 @@ void LightManager::device_update_points(Device *device, float3 axisu = light->axisu*(light->sizeu*light->size); float3 axisv = light->axisv*(light->sizev*light->size); float area = len(axisu)*len(axisv); - float invarea = (area > 0.0f) ? 1.0f / area : 1.0f; + float invarea = (area > 0.0f)? 1.0f/area: 1.0f; float3 dir = light->dir; dir = safe_normalize(dir); - light_data[light_index*LIGHT_SIZE + 0] = make_float4(__int_as_float(light->type), co.x, co.y, co.z); - light_data[light_index*LIGHT_SIZE + 1] = make_float4(area, axisu.x, axisu.y, axisu.z); - light_data[light_index*LIGHT_SIZE + 2] = make_float4(invarea, axisv.x, axisv.y, axisv.z); - light_data[light_index*LIGHT_SIZE + 3] = make_float4(-1, dir.x, dir.y, dir.z); - light_data[light_index*LIGHT_SIZE + 4] = make_float4(-1, 0.0f, 0.0f, 0.0f); - - Transform tfm = light->tfm; - Transform itfm = transform_inverse(tfm); - memcpy(&light_data[light_index*LIGHT_SIZE + 5], &tfm, sizeof(float4)*3); - memcpy(&light_data[light_index*LIGHT_SIZE + 8], &itfm, sizeof(float4)*3); + klights[light_index].co[0] = co.x; + klights[light_index].co[1] = co.y; + klights[light_index].co[2] = co.z; + + klights[light_index].area.axisu[0] = axisu.x; + klights[light_index].area.axisu[1] = axisu.y; + klights[light_index].area.axisu[2] = axisu.z; + klights[light_index].area.axisv[0] = axisv.x; + klights[light_index].area.axisv[1] = axisv.y; + klights[light_index].area.axisv[2] = axisv.z; + klights[light_index].area.invarea = invarea; + klights[light_index].area.dir[0] = dir.x; + klights[light_index].area.dir[1] = dir.y; + klights[light_index].area.dir[2] = dir.z; + klights[light_index].tfm = light->tfm; + klights[light_index].itfm = transform_inverse(light->tfm); light_index++; } @@ -820,7 +833,7 @@ void LightManager::device_update_points(Device *device, VLOG(1) << "Number of lights without contribution: " << num_scene_lights - light_index; - device->tex_alloc("__light_data", dscene->light_data); + dscene->lights.copy_to_device(); } void LightManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress) @@ -853,17 +866,12 @@ void LightManager::device_update(Device *device, DeviceScene *dscene, Scene *sce need_update = false; } -void LightManager::device_free(Device *device, DeviceScene *dscene) +void LightManager::device_free(Device *, DeviceScene *dscene) { - device->tex_free(dscene->light_distribution); - device->tex_free(dscene->light_data); - device->tex_free(dscene->light_background_marginal_cdf); - device->tex_free(dscene->light_background_conditional_cdf); - - dscene->light_distribution.clear(); - dscene->light_data.clear(); - dscene->light_background_marginal_cdf.clear(); - dscene->light_background_conditional_cdf.clear(); + dscene->light_distribution.free(); + dscene->lights.free(); + dscene->light_background_marginal_cdf.free(); + dscene->light_background_conditional_cdf.free(); } void LightManager::tag_update(Scene * /*scene*/) diff --git a/intern/cycles/render/light.h b/intern/cycles/render/light.h index f56530b6490..97b7b971c73 100644 --- a/intern/cycles/render/light.h +++ b/intern/cycles/render/light.h @@ -17,12 +17,12 @@ #ifndef __LIGHT_H__ #define __LIGHT_H__ -#include "kernel_types.h" +#include "kernel/kernel_types.h" -#include "node.h" +#include "graph/node.h" -#include "util_types.h" -#include "util_vector.h" +#include "util/util_types.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -70,6 +70,7 @@ public: Shader *shader; int samples; int max_bounces; + uint random_id; void tag_update(Scene *scene); diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp index b7660297f3e..7cfbb7b7c7d 100644 --- a/intern/cycles/render/mesh.cpp +++ b/intern/cycles/render/mesh.cpp @@ -14,29 +14,29 @@ * limitations under the License. */ -#include "bvh.h" -#include "bvh_build.h" - -#include "camera.h" -#include "curves.h" -#include "device.h" -#include "graph.h" -#include "shader.h" -#include "light.h" -#include "mesh.h" -#include "nodes.h" -#include "object.h" -#include "scene.h" - -#include "osl_globals.h" - -#include "subd_split.h" -#include "subd_patch_table.h" - -#include "util_foreach.h" -#include "util_logging.h" -#include "util_progress.h" -#include "util_set.h" +#include "bvh/bvh.h" +#include "bvh/bvh_build.h" + +#include "render/camera.h" +#include "render/curves.h" +#include "device/device.h" +#include "render/graph.h" +#include "render/shader.h" +#include "render/light.h" +#include "render/mesh.h" +#include "render/nodes.h" +#include "render/object.h" +#include "render/scene.h" + +#include "kernel/osl/osl_globals.h" + +#include "subd/subd_split.h" +#include "subd/subd_patch_table.h" + +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_progress.h" +#include "util/util_set.h" CCL_NAMESPACE_BEGIN @@ -107,6 +107,26 @@ void Mesh::Triangle::verts_for_step(const float3 *verts, } } +float3 Mesh::Triangle::compute_normal(const float3 *verts) const +{ + const float3& v0 = verts[v[0]]; + const float3& v1 = verts[v[1]]; + const float3& v2 = verts[v[2]]; + const float3 norm = cross(v1 - v0, v2 - v0); + const float normlen = len(norm); + if(normlen == 0.0f) { + return make_float3(1.0f, 0.0f, 0.0f); + } + return norm / normlen; +} + +bool Mesh::Triangle::valid(const float3 *verts) const +{ + return isfinite3_safe(verts[v[0]]) && + isfinite3_safe(verts[v[1]]) && + isfinite3_safe(verts[v[2]]); +} + /* Curve */ void Mesh::Curve::bounds_grow(const int k, const float3 *curve_keys, const float *curve_radius, BoundBox& bounds) const @@ -416,6 +436,8 @@ Mesh::Mesh() face_offset = 0; corner_offset = 0; + attr_map_offset = 0; + num_subd_verts = 0; attributes.triangle_mesh = this; @@ -424,6 +446,7 @@ Mesh::Mesh() geometry_flags = GEOMETRY_NONE; + volume_isovalue = 0.001f; has_volume = false; has_surface_bssrdf = false; @@ -511,7 +534,7 @@ void Mesh::reserve_subd_faces(int numfaces, int num_ngons_, int numcorners) subd_attributes.resize(true); } -void Mesh::clear() +void Mesh::clear(bool preserve_voxel_data) { /* clear all verts and triangles */ verts.clear(); @@ -534,15 +557,19 @@ void Mesh::clear() subd_creases.clear(); - attributes.clear(); curve_attributes.clear(); subd_attributes.clear(); + attributes.clear(preserve_voxel_data); + used_shaders.clear(); + if(!preserve_voxel_data) { + geometry_flags = GEOMETRY_NONE; + } + transform_applied = false; transform_negative_scaled = false; transform_normal = transform_identity(); - geometry_flags = GEOMETRY_NONE; delete patch_table; patch_table = NULL; @@ -701,21 +728,6 @@ void Mesh::compute_bounds() bounds = bnds; } -static float3 compute_face_normal(const Mesh::Triangle& t, float3 *verts) -{ - float3 v0 = verts[t.v[0]]; - float3 v1 = verts[t.v[1]]; - float3 v2 = verts[t.v[2]]; - - float3 norm = cross(v1 - v0, v2 - v0); - float normlen = len(norm); - - if(normlen == 0.0f) - return make_float3(1.0f, 0.0f, 0.0f); - - return norm / normlen; -} - void Mesh::add_face_normals() { /* don't compute if already there */ @@ -733,7 +745,7 @@ void Mesh::add_face_normals() float3 *verts_ptr = verts.data(); for(size_t i = 0; i < triangles_size; i++) { - fN[i] = compute_face_normal(get_triangle(i), verts_ptr); + fN[i] = get_triangle(i).compute_normal(verts_ptr); } } @@ -795,7 +807,7 @@ void Mesh::add_vertex_normals() for(size_t i = 0; i < triangles_size; i++) { for(size_t j = 0; j < 3; j++) { - float3 fN = compute_face_normal(get_triangle(i), mP); + float3 fN = get_triangle(i).compute_normal(mP); mN[get_triangle(i).v[j]] += fN; } } @@ -865,15 +877,8 @@ void Mesh::add_undisplaced() } } -void Mesh::pack_normals(Scene *scene, uint *tri_shader, float4 *vnormal) +void Mesh::pack_shaders(Scene *scene, uint *tri_shader) { - Attribute *attr_vN = attributes.find(ATTR_STD_VERTEX_NORMAL); - if(attr_vN == NULL) { - /* Happens on objects with just hair. */ - return; - } - - float3 *vN = attr_vN->data_float3(); uint shader_id = 0; uint last_shader = -1; bool last_smooth = false; @@ -881,10 +886,6 @@ void Mesh::pack_normals(Scene *scene, uint *tri_shader, float4 *vnormal) size_t triangles_size = num_triangles(); int *shader_ptr = shader.data(); - bool do_transform = transform_applied; - Transform ntfm = transform_normal; - - /* save shader */ for(size_t i = 0; i < triangles_size; i++) { if(shader_ptr[i] != last_shader || last_smooth != smooth[i]) { last_shader = shader_ptr[i]; @@ -896,14 +897,27 @@ void Mesh::pack_normals(Scene *scene, uint *tri_shader, float4 *vnormal) tri_shader[i] = shader_id; } +} + +void Mesh::pack_normals(float4 *vnormal) +{ + Attribute *attr_vN = attributes.find(ATTR_STD_VERTEX_NORMAL); + if(attr_vN == NULL) { + /* Happens on objects with just hair. */ + return; + } + bool do_transform = transform_applied; + Transform ntfm = transform_normal; + + float3 *vN = attr_vN->data_float3(); size_t verts_size = verts.size(); for(size_t i = 0; i < verts_size; i++) { float3 vNi = vN[i]; if(do_transform) - vNi = normalize(transform_direction(&ntfm, vNi)); + vNi = safe_normalize(transform_direction(&ntfm, vNi)); vnormal[i] = make_float4(vNi.x, vNi.y, vNi.z, 0.0f); } @@ -1016,7 +1030,8 @@ void Mesh::pack_patches(uint *patch_data, uint vert_offset, uint face_offset, ui } } -void Mesh::compute_bvh(DeviceScene *dscene, +void Mesh::compute_bvh(Device *device, + DeviceScene *dscene, SceneParams *params, Progress *progress, int n, @@ -1050,7 +1065,9 @@ void Mesh::compute_bvh(DeviceScene *dscene, BVHParams bparams; bparams.use_spatial_split = params->use_bvh_spatial_split; - bparams.use_qbvh = params->use_qbvh; + bparams.bvh_layout = BVHParams::best_bvh_layout( + params->bvh_layout, + device->info.bvh_layout_mask); bparams.use_unaligned_nodes = dscene->data.bvh.have_curves && params->use_bvh_unaligned_nodes; bparams.num_motion_triangle_steps = params->num_bvh_time_steps; @@ -1102,6 +1119,32 @@ bool Mesh::has_true_displacement() const return false; } +float Mesh::motion_time(int step) const +{ + return (motion_steps > 1) ? 2.0f * step / (motion_steps - 1) - 1.0f : 0.0f; +} + +int Mesh::motion_step(float time) const +{ + if(motion_steps > 1) { + int attr_step = 0; + + for(int step = 0; step < motion_steps; step++) { + float step_time = motion_time(step); + if(step_time == time) { + return attr_step; + } + + /* Center step is stored in a separate attribute. */ + if(step != motion_steps / 2) { + attr_step++; + } + } + } + + return -1; +} + bool Mesh::need_build_bvh() const { return !transform_applied || has_surface_bssrdf; @@ -1121,14 +1164,12 @@ bool Mesh::is_instanced() const MeshManager::MeshManager() { - bvh = NULL; need_update = true; need_flags_update = true; } MeshManager::~MeshManager() { - delete bvh; } void MeshManager::update_osl_attributes(Device *device, Scene *scene, vector<AttributeRequestSet>& mesh_attributes) @@ -1248,39 +1289,33 @@ void MeshManager::update_osl_attributes(Device *device, Scene *scene, vector<Att #endif } -void MeshManager::update_svm_attributes(Device *device, DeviceScene *dscene, Scene *scene, vector<AttributeRequestSet>& mesh_attributes) +void MeshManager::update_svm_attributes(Device *, DeviceScene *dscene, Scene *scene, vector<AttributeRequestSet>& mesh_attributes) { /* for SVM, the attributes_map table is used to lookup the offset of an * attribute, based on a unique shader attribute id. */ /* compute array stride */ - int attr_map_stride = 0; + int attr_map_size = 0; - for(size_t i = 0; i < scene->meshes.size(); i++) - attr_map_stride = max(attr_map_stride, (mesh_attributes[i].size() + 1)*ATTR_PRIM_TYPES); + for(size_t i = 0; i < scene->meshes.size(); i++) { + Mesh *mesh = scene->meshes[i]; + mesh->attr_map_offset = attr_map_size; + attr_map_size += (mesh_attributes[i].size() + 1)*ATTR_PRIM_TYPES; + } - if(attr_map_stride == 0) + if(attr_map_size == 0) return; /* create attribute map */ - uint4 *attr_map = dscene->attributes_map.resize(attr_map_stride*scene->objects.size()); + uint4 *attr_map = dscene->attributes_map.alloc(attr_map_size*scene->meshes.size()); memset(attr_map, 0, dscene->attributes_map.size()*sizeof(uint)); - for(size_t i = 0; i < scene->objects.size(); i++) { - Object *object = scene->objects[i]; - Mesh *mesh = object->mesh; - - /* find mesh attributes */ - size_t j; - - for(j = 0; j < scene->meshes.size(); j++) - if(scene->meshes[j] == mesh) - break; - - AttributeRequestSet& attributes = mesh_attributes[j]; + for(size_t i = 0; i < scene->meshes.size(); i++) { + Mesh *mesh = scene->meshes[i]; + AttributeRequestSet& attributes = mesh_attributes[i]; /* set object attributes */ - int index = i*attr_map_stride; + int index = mesh->attr_map_offset; foreach(AttributeRequest& req, attributes.requests) { uint id; @@ -1354,8 +1389,7 @@ void MeshManager::update_svm_attributes(Device *device, DeviceScene *dscene, Sce } /* copy to device */ - dscene->data.bvh.attributes_map_stride = attr_map_stride; - device->tex_alloc("__attributes_map", dscene->attributes_map); + dscene->attributes_map.copy_to_device(); } static void update_attribute_element_size(Mesh *mesh, @@ -1387,11 +1421,11 @@ static void update_attribute_element_size(Mesh *mesh, } static void update_attribute_element_offset(Mesh *mesh, - vector<float>& attr_float, + device_vector<float>& attr_float, size_t& attr_float_offset, - vector<float4>& attr_float3, + device_vector<float4>& attr_float3, size_t& attr_float3_offset, - vector<uchar4>& attr_uchar4, + device_vector<uchar4>& attr_uchar4, size_t& attr_uchar4_offset, Attribute *mattr, AttributePrimitive prim, @@ -1419,7 +1453,7 @@ static void update_attribute_element_offset(Mesh *mesh, uchar4 *data = mattr->data_uchar4(); offset = attr_uchar4_offset; - assert(attr_uchar4.capacity() >= offset + size); + assert(attr_uchar4.size() >= offset + size); for(size_t k = 0; k < size; k++) { attr_uchar4[offset+k] = data[k]; } @@ -1429,7 +1463,7 @@ static void update_attribute_element_offset(Mesh *mesh, float *data = mattr->data_float(); offset = attr_float_offset; - assert(attr_float.capacity() >= offset + size); + assert(attr_float.size() >= offset + size); for(size_t k = 0; k < size; k++) { attr_float[offset+k] = data[k]; } @@ -1439,17 +1473,17 @@ static void update_attribute_element_offset(Mesh *mesh, Transform *tfm = mattr->data_transform(); offset = attr_float3_offset; - assert(attr_float3.capacity() >= offset + size * 4); - for(size_t k = 0; k < size*4; k++) { + assert(attr_float3.size() >= offset + size * 3); + for(size_t k = 0; k < size*3; k++) { attr_float3[offset+k] = (&tfm->x)[k]; } - attr_float3_offset += size * 4; + attr_float3_offset += size * 3; } else { float4 *data = mattr->data_float4(); offset = attr_float3_offset; - assert(attr_float3.capacity() >= offset + size); + assert(attr_float3.size() >= offset + size); for(size_t k = 0; k < size; k++) { attr_float3[offset+k] = data[k]; } @@ -1550,9 +1584,9 @@ void MeshManager::device_update_attributes(Device *device, DeviceScene *dscene, } } - vector<float> attr_float(attr_float_size); - vector<float4> attr_float3(attr_float3_size); - vector<uchar4> attr_uchar4(attr_uchar4_size); + dscene->attributes_float.alloc(attr_float_size); + dscene->attributes_float3.alloc(attr_float3_size); + dscene->attributes_uchar4.alloc(attr_uchar4_size); size_t attr_float_offset = 0; size_t attr_float3_offset = 0; @@ -1571,27 +1605,27 @@ void MeshManager::device_update_attributes(Device *device, DeviceScene *dscene, Attribute *subd_mattr = mesh->subd_attributes.find(req); update_attribute_element_offset(mesh, - attr_float, attr_float_offset, - attr_float3, attr_float3_offset, - attr_uchar4, attr_uchar4_offset, + dscene->attributes_float, attr_float_offset, + dscene->attributes_float3, attr_float3_offset, + dscene->attributes_uchar4, attr_uchar4_offset, triangle_mattr, ATTR_PRIM_TRIANGLE, req.triangle_type, req.triangle_desc); update_attribute_element_offset(mesh, - attr_float, attr_float_offset, - attr_float3, attr_float3_offset, - attr_uchar4, attr_uchar4_offset, + dscene->attributes_float, attr_float_offset, + dscene->attributes_float3, attr_float3_offset, + dscene->attributes_uchar4, attr_uchar4_offset, curve_mattr, ATTR_PRIM_CURVE, req.curve_type, req.curve_desc); update_attribute_element_offset(mesh, - attr_float, attr_float_offset, - attr_float3, attr_float3_offset, - attr_uchar4, attr_uchar4_offset, + dscene->attributes_float, attr_float_offset, + dscene->attributes_float3, attr_float3_offset, + dscene->attributes_uchar4, attr_uchar4_offset, subd_mattr, ATTR_PRIM_SUBD, req.subd_type, @@ -1612,18 +1646,21 @@ void MeshManager::device_update_attributes(Device *device, DeviceScene *dscene, /* copy to device */ progress.set_status("Updating Mesh", "Copying Attributes to device"); - if(attr_float.size()) { - dscene->attributes_float.copy(&attr_float[0], attr_float.size()); - device->tex_alloc("__attributes_float", dscene->attributes_float); + if(dscene->attributes_float.size()) { + dscene->attributes_float.copy_to_device(); } - if(attr_float3.size()) { - dscene->attributes_float3.copy(&attr_float3[0], attr_float3.size()); - device->tex_alloc("__attributes_float3", dscene->attributes_float3); + if(dscene->attributes_float3.size()) { + dscene->attributes_float3.copy_to_device(); } - if(attr_uchar4.size()) { - dscene->attributes_uchar4.copy(&attr_uchar4[0], attr_uchar4.size()); - device->tex_alloc("__attributes_uchar4", dscene->attributes_uchar4); + if(dscene->attributes_uchar4.size()) { + dscene->attributes_uchar4.copy_to_device(); } + + if(progress.get_cancel()) return; + + /* After mesh attributes and patch tables have been copied to device memory, + * we need to update offsets in the objects. */ + scene->object_manager->device_update_mesh_offsets(device, dscene, scene); } void MeshManager::mesh_calc_offset(Scene *scene) @@ -1670,7 +1707,7 @@ void MeshManager::mesh_calc_offset(Scene *scene) } } -void MeshManager::device_update_mesh(Device *device, +void MeshManager::device_update_mesh(Device *, DeviceScene *dscene, Scene *scene, bool for_displacement, @@ -1719,10 +1756,9 @@ void MeshManager::device_update_mesh(Device *device, } } else { - PackedBVH& pack = bvh->pack; - for(size_t i = 0; i < pack.prim_index.size(); ++i) { - if((pack.prim_type[i] & PRIMITIVE_ALL_TRIANGLE) != 0) { - tri_prim_index[pack.prim_index[i]] = pack.prim_tri_index[i]; + for(size_t i = 0; i < dscene->prim_index.size(); ++i) { + if((dscene->prim_type[i] & PRIMITIVE_ALL_TRIANGLE) != 0) { + tri_prim_index[dscene->prim_index[i]] = dscene->prim_tri_index[i]; } } } @@ -1732,16 +1768,16 @@ void MeshManager::device_update_mesh(Device *device, /* normals */ progress.set_status("Updating Mesh", "Computing normals"); - uint *tri_shader = dscene->tri_shader.resize(tri_size); - float4 *vnormal = dscene->tri_vnormal.resize(vert_size); - uint4 *tri_vindex = dscene->tri_vindex.resize(tri_size); - uint *tri_patch = dscene->tri_patch.resize(tri_size); - float2 *tri_patch_uv = dscene->tri_patch_uv.resize(vert_size); + uint *tri_shader = dscene->tri_shader.alloc(tri_size); + float4 *vnormal = dscene->tri_vnormal.alloc(vert_size); + uint4 *tri_vindex = dscene->tri_vindex.alloc(tri_size); + uint *tri_patch = dscene->tri_patch.alloc(tri_size); + float2 *tri_patch_uv = dscene->tri_patch_uv.alloc(vert_size); foreach(Mesh *mesh, scene->meshes) { - mesh->pack_normals(scene, - &tri_shader[mesh->tri_offset], - &vnormal[mesh->vert_offset]); + mesh->pack_shaders(scene, + &tri_shader[mesh->tri_offset]); + mesh->pack_normals(&vnormal[mesh->vert_offset]); mesh->pack_verts(tri_prim_index, &tri_vindex[mesh->tri_offset], &tri_patch[mesh->tri_offset], @@ -1754,32 +1790,32 @@ void MeshManager::device_update_mesh(Device *device, /* vertex coordinates */ progress.set_status("Updating Mesh", "Copying Mesh to device"); - device->tex_alloc("__tri_shader", dscene->tri_shader); - device->tex_alloc("__tri_vnormal", dscene->tri_vnormal); - device->tex_alloc("__tri_vindex", dscene->tri_vindex); - device->tex_alloc("__tri_patch", dscene->tri_patch); - device->tex_alloc("__tri_patch_uv", dscene->tri_patch_uv); + dscene->tri_shader.copy_to_device(); + dscene->tri_vnormal.copy_to_device(); + dscene->tri_vindex.copy_to_device(); + dscene->tri_patch.copy_to_device(); + dscene->tri_patch_uv.copy_to_device(); } if(curve_size != 0) { progress.set_status("Updating Mesh", "Copying Strands to device"); - float4 *curve_keys = dscene->curve_keys.resize(curve_key_size); - float4 *curves = dscene->curves.resize(curve_size); + float4 *curve_keys = dscene->curve_keys.alloc(curve_key_size); + float4 *curves = dscene->curves.alloc(curve_size); foreach(Mesh *mesh, scene->meshes) { mesh->pack_curves(scene, &curve_keys[mesh->curvekey_offset], &curves[mesh->curve_offset], mesh->curvekey_offset); if(progress.get_cancel()) return; } - device->tex_alloc("__curve_keys", dscene->curve_keys); - device->tex_alloc("__curves", dscene->curves); + dscene->curve_keys.copy_to_device(); + dscene->curves.copy_to_device(); } if(patch_size != 0) { progress.set_status("Updating Mesh", "Copying Patches to device"); - uint *patch_data = dscene->patches.resize(patch_size); + uint *patch_data = dscene->patches.alloc(patch_size); foreach(Mesh *mesh, scene->meshes) { mesh->pack_patches(&patch_data[mesh->patch_offset], mesh->vert_offset, mesh->face_offset, mesh->corner_offset); @@ -1791,11 +1827,11 @@ void MeshManager::device_update_mesh(Device *device, if(progress.get_cancel()) return; } - device->tex_alloc("__patches", dscene->patches); + dscene->patches.copy_to_device(); } if(for_displacement) { - float4 *prim_tri_verts = dscene->prim_tri_verts.resize(tri_size * 3); + float4 *prim_tri_verts = dscene->prim_tri_verts.alloc(tri_size * 3); foreach(Mesh *mesh, scene->meshes) { for(size_t i = 0; i < mesh->num_triangles(); ++i) { Mesh::Triangle t = mesh->get_triangle(i); @@ -1805,7 +1841,7 @@ void MeshManager::device_update_mesh(Device *device, prim_tri_verts[offset + 2] = float3_to_float4(mesh->verts[t.v[2]]); } } - device->tex_alloc("__prim_tri_verts", dscene->prim_tri_verts); + dscene->prim_tri_verts.copy_to_device(); } } @@ -1814,23 +1850,27 @@ void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene * /* bvh build */ progress.set_status("Updating Scene BVH", "Building"); - VLOG(1) << (scene->params.use_qbvh ? "Using QBVH optimization structure" - : "Using regular BVH optimization structure"); - BVHParams bparams; bparams.top_level = true; - bparams.use_qbvh = scene->params.use_qbvh; + bparams.bvh_layout = BVHParams::best_bvh_layout( + scene->params.bvh_layout, + device->info.bvh_layout_mask); bparams.use_spatial_split = scene->params.use_bvh_spatial_split; bparams.use_unaligned_nodes = dscene->data.bvh.have_curves && scene->params.use_bvh_unaligned_nodes; bparams.num_motion_triangle_steps = scene->params.num_bvh_time_steps; bparams.num_motion_curve_steps = scene->params.num_bvh_time_steps; - delete bvh; - bvh = BVH::create(bparams, scene->objects); + VLOG(1) << "Using " << bvh_layout_name(bparams.bvh_layout) + << " layout."; + + BVH *bvh = BVH::create(bparams, scene->objects); bvh->build(progress); - if(progress.get_cancel()) return; + if(progress.get_cancel()) { + delete bvh; + return; + } /* copy to device */ progress.set_status("Updating Scene BVH", "Copying BVH to device"); @@ -1838,62 +1878,69 @@ void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene * PackedBVH& pack = bvh->pack; if(pack.nodes.size()) { - dscene->bvh_nodes.reference((float4*)&pack.nodes[0], pack.nodes.size()); - device->tex_alloc("__bvh_nodes", dscene->bvh_nodes); + dscene->bvh_nodes.steal_data(pack.nodes); + dscene->bvh_nodes.copy_to_device(); } if(pack.leaf_nodes.size()) { - dscene->bvh_leaf_nodes.reference((float4*)&pack.leaf_nodes[0], pack.leaf_nodes.size()); - device->tex_alloc("__bvh_leaf_nodes", dscene->bvh_leaf_nodes); + dscene->bvh_leaf_nodes.steal_data(pack.leaf_nodes); + dscene->bvh_leaf_nodes.copy_to_device(); } if(pack.object_node.size()) { - dscene->object_node.reference((uint*)&pack.object_node[0], pack.object_node.size()); - device->tex_alloc("__object_node", dscene->object_node); + dscene->object_node.steal_data(pack.object_node); + dscene->object_node.copy_to_device(); } if(pack.prim_tri_index.size()) { - dscene->prim_tri_index.reference((uint*)&pack.prim_tri_index[0], pack.prim_tri_index.size()); - device->tex_alloc("__prim_tri_index", dscene->prim_tri_index); + dscene->prim_tri_index.steal_data(pack.prim_tri_index); + dscene->prim_tri_index.copy_to_device(); } if(pack.prim_tri_verts.size()) { - dscene->prim_tri_verts.reference((float4*)&pack.prim_tri_verts[0], pack.prim_tri_verts.size()); - device->tex_alloc("__prim_tri_verts", dscene->prim_tri_verts); + dscene->prim_tri_verts.steal_data(pack.prim_tri_verts); + dscene->prim_tri_verts.copy_to_device(); } if(pack.prim_type.size()) { - dscene->prim_type.reference((uint*)&pack.prim_type[0], pack.prim_type.size()); - device->tex_alloc("__prim_type", dscene->prim_type); + dscene->prim_type.steal_data(pack.prim_type); + dscene->prim_type.copy_to_device(); } if(pack.prim_visibility.size()) { - dscene->prim_visibility.reference((uint*)&pack.prim_visibility[0], pack.prim_visibility.size()); - device->tex_alloc("__prim_visibility", dscene->prim_visibility); + dscene->prim_visibility.steal_data(pack.prim_visibility); + dscene->prim_visibility.copy_to_device(); } if(pack.prim_index.size()) { - dscene->prim_index.reference((uint*)&pack.prim_index[0], pack.prim_index.size()); - device->tex_alloc("__prim_index", dscene->prim_index); + dscene->prim_index.steal_data(pack.prim_index); + dscene->prim_index.copy_to_device(); } if(pack.prim_object.size()) { - dscene->prim_object.reference((uint*)&pack.prim_object[0], pack.prim_object.size()); - device->tex_alloc("__prim_object", dscene->prim_object); + dscene->prim_object.steal_data(pack.prim_object); + dscene->prim_object.copy_to_device(); } if(pack.prim_time.size()) { - dscene->prim_time.reference((float2*)&pack.prim_time[0], pack.prim_time.size()); - device->tex_alloc("__prim_time", dscene->prim_time); + dscene->prim_time.steal_data(pack.prim_time); + dscene->prim_time.copy_to_device(); } dscene->data.bvh.root = pack.root_index; - dscene->data.bvh.use_qbvh = scene->params.use_qbvh; + dscene->data.bvh.bvh_layout = bparams.bvh_layout; dscene->data.bvh.use_bvh_steps = (scene->params.num_bvh_time_steps != 0); + + delete bvh; } -void MeshManager::device_update_flags(Device * /*device*/, - DeviceScene * /*dscene*/, - Scene * scene, - Progress& /*progress*/) +void MeshManager::device_update_preprocess(Device *device, + Scene *scene, + Progress& progress) { if(!need_update && !need_flags_update) { return; } - /* update flags */ + + progress.set_status("Updating Meshes Flags"); + + /* Update flags. */ + bool volume_images_updated = false; + foreach(Mesh *mesh, scene->meshes) { mesh->has_volume = false; + foreach(const Shader *shader, mesh->used_shaders) { if(shader->has_volume) { mesh->has_volume = true; @@ -1902,12 +1949,33 @@ void MeshManager::device_update_flags(Device * /*device*/, mesh->has_surface_bssrdf = true; } } + + if(need_update && mesh->has_volume) { + /* Create volume meshes if there is voxel data. */ + bool has_voxel_attributes = false; + + foreach(Attribute& attr, mesh->attributes.attributes) { + if(attr.element == ATTR_ELEMENT_VOXEL) { + has_voxel_attributes = true; + } + } + + if(has_voxel_attributes) { + if(!volume_images_updated) { + progress.set_status("Updating Meshes Volume Bounds"); + device_update_volume_images(device, scene, progress); + volume_images_updated = true; + } + + create_volume_mesh(scene, mesh, progress); + } + } } + need_flags_update = false; } void MeshManager::device_update_displacement_images(Device *device, - DeviceScene *dscene, Scene *scene, Progress& progress) { @@ -1925,16 +1993,7 @@ void MeshManager::device_update_displacement_images(Device *device, if(node->special_type != SHADER_SPECIAL_TYPE_IMAGE_SLOT) { continue; } - if(device->info.pack_images) { - /* If device requires packed images we need to update all - * images now, even if they're not used for displacement. - */ - image_manager->device_update(device, - dscene, - scene, - progress); - return; - } + ImageSlotTextureNode *image_node = static_cast<ImageSlotTextureNode*>(node); int slot = image_node->slot; if(slot != -1) { @@ -1948,7 +2007,6 @@ void MeshManager::device_update_displacement_images(Device *device, pool.push(function_bind(&ImageManager::device_update_slot, image_manager, device, - dscene, scene, slot, &progress)); @@ -1956,6 +2014,44 @@ void MeshManager::device_update_displacement_images(Device *device, pool.wait_work(); } +void MeshManager::device_update_volume_images(Device *device, + Scene *scene, + Progress& progress) +{ + progress.set_status("Updating Volume Images"); + TaskPool pool; + ImageManager *image_manager = scene->image_manager; + set<int> volume_images; + + foreach(Mesh *mesh, scene->meshes) { + if(!mesh->need_update) { + continue; + } + + foreach(Attribute& attr, mesh->attributes.attributes) { + if(attr.element != ATTR_ELEMENT_VOXEL) { + continue; + } + + VoxelAttribute *voxel = attr.data_voxel(); + + if(voxel->slot != -1) { + volume_images.insert(voxel->slot); + } + } + } + + foreach(int slot, volume_images) { + pool.push(function_bind(&ImageManager::device_update_slot, + image_manager, + device, + scene, + slot, + &progress)); + } + pool.wait_work(); +} + void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress) { if(!need_update) @@ -1963,14 +2059,17 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen VLOG(1) << "Total " << scene->meshes.size() << " meshes."; - /* Update normals. */ + bool true_displacement_used = false; + size_t total_tess_needed = 0; + foreach(Mesh *mesh, scene->meshes) { foreach(Shader *shader, mesh->used_shaders) { - if(shader->need_update_attributes) + if(shader->need_update_mesh) mesh->need_update = true; } if(mesh->need_update) { + /* Update normals. */ mesh->add_face_normals(); mesh->add_vertex_normals(); @@ -1978,60 +2077,56 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen mesh->add_undisplaced(); } + /* Test if we need tesselation. */ + if(mesh->subdivision_type != Mesh::SUBDIVISION_NONE && + mesh->num_subd_verts == 0 && + mesh->subd_params) + { + total_tess_needed++; + } + + /* Test if we need displacement. */ + if(mesh->has_true_displacement()) { + true_displacement_used = true; + } + if(progress.get_cancel()) return; } } /* Tessellate meshes that are using subdivision */ - size_t total_tess_needed = 0; - foreach(Mesh *mesh, scene->meshes) { - if(mesh->need_update && - mesh->subdivision_type != Mesh::SUBDIVISION_NONE && - mesh->num_subd_verts == 0 && - mesh->subd_params) - { - total_tess_needed++; - } - } + if(total_tess_needed) { + size_t i = 0; + foreach(Mesh *mesh, scene->meshes) { + if(mesh->need_update && + mesh->subdivision_type != Mesh::SUBDIVISION_NONE && + mesh->num_subd_verts == 0 && + mesh->subd_params) + { + string msg = "Tessellating "; + if(mesh->name == "") + msg += string_printf("%u/%u", (uint)(i+1), (uint)total_tess_needed); + else + msg += string_printf("%s %u/%u", mesh->name.c_str(), (uint)(i+1), (uint)total_tess_needed); - size_t i = 0; - foreach(Mesh *mesh, scene->meshes) { - if(mesh->need_update && - mesh->subdivision_type != Mesh::SUBDIVISION_NONE && - mesh->num_subd_verts == 0 && - mesh->subd_params) - { - string msg = "Tessellating "; - if(mesh->name == "") - msg += string_printf("%u/%u", (uint)(i+1), (uint)total_tess_needed); - else - msg += string_printf("%s %u/%u", mesh->name.c_str(), (uint)(i+1), (uint)total_tess_needed); + progress.set_status("Updating Mesh", msg); - progress.set_status("Updating Mesh", msg); + DiagSplit dsplit(*mesh->subd_params); + mesh->tessellate(&dsplit); - DiagSplit dsplit(*mesh->subd_params); - mesh->tessellate(&dsplit); + i++; - i++; + if(progress.get_cancel()) return; + } - if(progress.get_cancel()) return; } } /* Update images needed for true displacement. */ - bool true_displacement_used = false; bool old_need_object_flags_update = false; - foreach(Mesh *mesh, scene->meshes) { - if(mesh->need_update && - mesh->has_true_displacement()) - { - true_displacement_used = true; - break; - } - } if(true_displacement_used) { VLOG(1) << "Updating images used for true displacement."; - device_update_displacement_images(device, dscene, scene, progress); + device_update_displacement_images(device, scene, progress); old_need_object_flags_update = scene->object_manager->need_flags_update; scene->object_manager->device_update_flags(device, dscene, @@ -2049,20 +2144,22 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen } if(progress.get_cancel()) return; - /* after mesh data has been copied to device memory we need to update - * offsets for patch tables as this can't be known before hand */ - scene->object_manager->device_update_patch_map_offsets(device, dscene, scene); - device_update_attributes(device, dscene, scene, progress); if(progress.get_cancel()) return; /* Update displacement. */ bool displacement_done = false; + size_t num_bvh = 0; + foreach(Mesh *mesh, scene->meshes) { - if(mesh->need_update && - displace(device, dscene, scene, mesh, progress)) - { - displacement_done = true; + if(mesh->need_update) { + if(displace(device, dscene, scene, mesh, progress)) { + displacement_done = true; + } + + if(mesh->need_build_bvh()) { + num_bvh++; + } } } @@ -2077,21 +2174,14 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen if(progress.get_cancel()) return; } - /* Update bvh. */ - size_t num_bvh = 0; - foreach(Mesh *mesh, scene->meshes) { - if(mesh->need_update && mesh->need_build_bvh()) { - num_bvh++; - } - } - TaskPool pool; - i = 0; + size_t i = 0; foreach(Mesh *mesh, scene->meshes) { if(mesh->need_update) { pool.push(function_bind(&Mesh::compute_bvh, mesh, + device, dscene, &scene->params, &progress, @@ -2109,15 +2199,11 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen << summary.full_report(); foreach(Shader *shader, scene->shaders) { - shader->need_update_attributes = false; + shader->need_update_mesh = false; } -#ifdef __OBJECT_MOTION__ - Scene::MotionType need_motion = scene->need_motion(device->info.advanced_shading); + Scene::MotionType need_motion = scene->need_motion(); bool motion_blur = need_motion == Scene::MOTION_BLUR; -#else - bool motion_blur = false; -#endif /* Update objects. */ vector<Object *> volume_objects; @@ -2148,50 +2234,28 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen void MeshManager::device_free(Device *device, DeviceScene *dscene) { - device->tex_free(dscene->bvh_nodes); - device->tex_free(dscene->bvh_leaf_nodes); - device->tex_free(dscene->object_node); - device->tex_free(dscene->prim_tri_verts); - device->tex_free(dscene->prim_tri_index); - device->tex_free(dscene->prim_type); - device->tex_free(dscene->prim_visibility); - device->tex_free(dscene->prim_index); - device->tex_free(dscene->prim_object); - device->tex_free(dscene->prim_time); - device->tex_free(dscene->tri_shader); - device->tex_free(dscene->tri_vnormal); - device->tex_free(dscene->tri_vindex); - device->tex_free(dscene->tri_patch); - device->tex_free(dscene->tri_patch_uv); - device->tex_free(dscene->curves); - device->tex_free(dscene->curve_keys); - device->tex_free(dscene->patches); - device->tex_free(dscene->attributes_map); - device->tex_free(dscene->attributes_float); - device->tex_free(dscene->attributes_float3); - device->tex_free(dscene->attributes_uchar4); - - dscene->bvh_nodes.clear(); - dscene->object_node.clear(); - dscene->prim_tri_verts.clear(); - dscene->prim_tri_index.clear(); - dscene->prim_type.clear(); - dscene->prim_visibility.clear(); - dscene->prim_index.clear(); - dscene->prim_object.clear(); - dscene->prim_time.clear(); - dscene->tri_shader.clear(); - dscene->tri_vnormal.clear(); - dscene->tri_vindex.clear(); - dscene->tri_patch.clear(); - dscene->tri_patch_uv.clear(); - dscene->curves.clear(); - dscene->curve_keys.clear(); - dscene->patches.clear(); - dscene->attributes_map.clear(); - dscene->attributes_float.clear(); - dscene->attributes_float3.clear(); - dscene->attributes_uchar4.clear(); + dscene->bvh_nodes.free(); + dscene->bvh_leaf_nodes.free(); + dscene->object_node.free(); + dscene->prim_tri_verts.free(); + dscene->prim_tri_index.free(); + dscene->prim_type.free(); + dscene->prim_visibility.free(); + dscene->prim_index.free(); + dscene->prim_object.free(); + dscene->prim_time.free(); + dscene->tri_shader.free(); + dscene->tri_vnormal.free(); + dscene->tri_vindex.free(); + dscene->tri_patch.free(); + dscene->tri_patch_uv.free(); + dscene->curves.free(); + dscene->curve_keys.free(); + dscene->patches.free(); + dscene->attributes_map.free(); + dscene->attributes_float.free(); + dscene->attributes_float3.free(); + dscene->attributes_uchar4.free(); #ifdef WITH_OSL OSLGlobals *og = (OSLGlobals*)device->osl_memory(); @@ -2201,6 +2265,8 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene) og->attribute_map.clear(); og->object_names.clear(); } +#else + (void)device; #endif } diff --git a/intern/cycles/render/mesh.h b/intern/cycles/render/mesh.h index 5f33e30eac2..e370f8a2021 100644 --- a/intern/cycles/render/mesh.h +++ b/intern/cycles/render/mesh.h @@ -17,17 +17,18 @@ #ifndef __MESH_H__ #define __MESH_H__ -#include "attribute.h" -#include "node.h" -#include "shader.h" - -#include "util_boundbox.h" -#include "util_list.h" -#include "util_map.h" -#include "util_param.h" -#include "util_transform.h" -#include "util_types.h" -#include "util_vector.h" +#include "graph/node.h" + +#include "render/attribute.h" +#include "render/shader.h" + +#include "util/util_boundbox.h" +#include "util/util_list.h" +#include "util/util_map.h" +#include "util/util_param.h" +#include "util/util_transform.h" +#include "util/util_types.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -48,7 +49,7 @@ struct PackedPatchTable; class Mesh : public Node { public: - NODE_DECLARE; + NODE_DECLARE /* Mesh Triangle */ struct Triangle { @@ -69,6 +70,10 @@ public: size_t num_steps, size_t step, float3 r_verts[3]) const; + + float3 compute_normal(const float3 *verts) const; + + bool valid(const float3 *verts) const; }; Triangle get_triangle(size_t i) const @@ -197,7 +202,8 @@ public: array<int> triangle_patch; /* must be < 0 for non subd triangles */ array<float2> vert_patch_uv; - bool has_volume; /* Set in the device_update_flags(). */ + float volume_isovalue; + bool has_volume; /* Set in the device_update_flags(). */ bool has_surface_bssrdf; /* Set in the device_update_flags(). */ array<float3> curve_keys; @@ -245,6 +251,8 @@ public: size_t face_offset; size_t corner_offset; + size_t attr_map_offset; + size_t num_subd_verts; /* Functions */ @@ -257,7 +265,7 @@ public: void reserve_curves(int numcurves, int numkeys); void resize_subd_faces(int numfaces, int num_ngons, int numcorners); void reserve_subd_faces(int numfaces, int num_ngons, int numcorners); - void clear(); + void clear(bool preserve_voxel_data = false); void add_vertex(float3 P); void add_vertex_slow(float3 P); void add_triangle(int v0, int v1, int v2, int shader, bool smooth); @@ -271,7 +279,8 @@ public: void add_vertex_normals(); void add_undisplaced(); - void pack_normals(Scene *scene, uint *shader, float4 *vnormal); + void pack_shaders(Scene *scene, uint *shader); + void pack_normals(float4 *vnormal); void pack_verts(const vector<uint>& tri_prim_index, uint4 *tri_vindex, uint *tri_patch, @@ -281,7 +290,8 @@ public: void pack_curves(Scene *scene, float4 *curve_key_co, float4 *curve_data, size_t curvekey_offset); void pack_patches(uint *patch_data, uint vert_offset, uint face_offset, uint corner_offset); - void compute_bvh(DeviceScene *dscene, + void compute_bvh(Device *device, + DeviceScene *dscene, SceneParams *params, Progress *progress, int n, @@ -295,6 +305,11 @@ public: bool has_motion_blur() const; bool has_true_displacement() const; + /* Convert between normalized -1..1 motion time and index + * in the VERTEX_MOTION attribute. */ + float motion_time(int step) const; + int motion_step(float time) const; + /* Check whether the mesh should have own BVH built separately. Briefly, * own BVH is needed for mesh, if: * @@ -315,8 +330,6 @@ public: class MeshManager { public: - BVH *bvh; - bool need_update; bool need_flags_update; @@ -329,13 +342,15 @@ public: void update_osl_attributes(Device *device, Scene *scene, vector<AttributeRequestSet>& mesh_attributes); void update_svm_attributes(Device *device, DeviceScene *dscene, Scene *scene, vector<AttributeRequestSet>& mesh_attributes); + void device_update_preprocess(Device *device, Scene *scene, Progress& progress); void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress); - void device_update_flags(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress); void device_free(Device *device, DeviceScene *dscene); void tag_update(Scene *scene); + void create_volume_mesh(Scene *scene, Mesh *mesh, Progress &progress); + protected: /* Calculate verts/triangles/curves offsets in global arrays. */ void mesh_calc_offset(Scene *scene); @@ -362,9 +377,12 @@ protected: Progress& progress); void device_update_displacement_images(Device *device, - DeviceScene *dscene, Scene *scene, Progress& progress); + + void device_update_volume_images(Device *device, + Scene *scene, + Progress& progress); }; CCL_NAMESPACE_END diff --git a/intern/cycles/render/mesh_displace.cpp b/intern/cycles/render/mesh_displace.cpp index adc5b820298..ad2a5713bcb 100644 --- a/intern/cycles/render/mesh_displace.cpp +++ b/intern/cycles/render/mesh_displace.cpp @@ -14,15 +14,15 @@ * limitations under the License. */ -#include "device.h" +#include "device/device.h" -#include "mesh.h" -#include "object.h" -#include "scene.h" -#include "shader.h" +#include "render/mesh.h" +#include "render/object.h" +#include "render/scene.h" +#include "render/shader.h" -#include "util_foreach.h" -#include "util_progress.h" +#include "util/util_foreach.h" +#include "util/util_progress.h" CCL_NAMESPACE_BEGIN @@ -64,8 +64,8 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me /* setup input for device task */ const size_t num_verts = mesh->verts.size(); vector<bool> done(num_verts, false); - device_vector<uint4> d_input; - uint4 *d_input_data = d_input.resize(num_verts); + device_vector<uint4> d_input(device, "displace_input", MEM_READ_ONLY); + uint4 *d_input_data = d_input.alloc(num_verts); size_t d_input_size = 0; size_t num_triangles = mesh->num_triangles(); @@ -115,16 +115,14 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me return false; /* run device task */ - device_vector<float4> d_output; - d_output.resize(d_input_size); + device_vector<float4> d_output(device, "displace_output", MEM_READ_WRITE); + d_output.alloc(d_input_size); + d_output.zero_to_device(); + d_input.copy_to_device(); /* needs to be up to data for attribute access */ device->const_copy_to("__data", &dscene->data, sizeof(dscene->data)); - device->mem_alloc(d_input, MEM_READ_ONLY); - device->mem_copy_to(d_input); - device->mem_alloc(d_output, MEM_WRITE_ONLY); - DeviceTask task(DeviceTask::SHADER); task.shader_input = d_input.device_pointer; task.shader_output = d_output.device_pointer; @@ -138,21 +136,20 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me device->task_wait(); if(progress.get_cancel()) { - device->mem_free(d_input); - device->mem_free(d_output); + d_input.free(); + d_output.free(); return false; } - device->mem_copy_from(d_output, 0, 1, d_output.size(), sizeof(float4)); - device->mem_free(d_input); - device->mem_free(d_output); + d_output.copy_from_device(0, 1, d_output.size()); + d_input.free(); /* read result */ done.clear(); done.resize(num_verts, false); int k = 0; - float4 *offset = (float4*)d_output.data_pointer; + float4 *offset = d_output.data(); Attribute *attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); for(size_t i = 0; i < num_triangles; i++) { @@ -169,6 +166,8 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me if(!done[t.v[j]]) { done[t.v[j]] = true; float3 off = float4_to_float3(offset[k++]); + /* Avoid illegal vertex coordinates. */ + off = ensure_finite3(off); mesh->verts[t.v[j]] += off; if(attr_mP != NULL) { for(int step = 0; step < mesh->motion_steps - 1; step++) { @@ -180,6 +179,8 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me } } + d_output.free(); + /* for displacement method both, we only need to recompute the face * normals, as bump mapping in the shader will already alter the * vertex normal, so we start from the non-displaced vertex normals diff --git a/intern/cycles/render/mesh_subdivision.cpp b/intern/cycles/render/mesh_subdivision.cpp index 57c76a9f1c8..9dd81eb6700 100644 --- a/intern/cycles/render/mesh_subdivision.cpp +++ b/intern/cycles/render/mesh_subdivision.cpp @@ -14,16 +14,16 @@ * limitations under the License. */ -#include "mesh.h" -#include "attribute.h" -#include "camera.h" +#include "render/mesh.h" +#include "render/attribute.h" +#include "render/camera.h" -#include "subd_split.h" -#include "subd_patch.h" -#include "subd_patch_table.h" +#include "subd/subd_split.h" +#include "subd/subd_patch.h" +#include "subd/subd_patch_table.h" -#include "util_foreach.h" -#include "util_algorithm.h" +#include "util/util_foreach.h" +#include "util/util_algorithm.h" CCL_NAMESPACE_BEGIN @@ -204,7 +204,9 @@ public: src = dest; } - patch_table->ComputeLocalPointValues(&verts[0], &verts[num_refiner_verts]); + if(num_local_points) { + patch_table->ComputeLocalPointValues(&verts[0], &verts[num_refiner_verts]); + } /* create patch map */ patch_map = new Far::PatchMap(*patch_table); @@ -236,13 +238,15 @@ public: src = dest; } - if(attr.same_storage(attr.type, TypeDesc::TypeFloat)) { - patch_table->ComputeLocalPointValues((OsdValue<float>*)&attr.buffer[0], - (OsdValue<float>*)&attr.buffer[num_refiner_verts * attr.data_sizeof()]); - } - else { - patch_table->ComputeLocalPointValues((OsdValue<float4>*)&attr.buffer[0], - (OsdValue<float4>*)&attr.buffer[num_refiner_verts * attr.data_sizeof()]); + if(num_local_points) { + if(attr.same_storage(attr.type, TypeDesc::TypeFloat)) { + patch_table->ComputeLocalPointValues((OsdValue<float>*)&attr.buffer[0], + (OsdValue<float>*)&attr.buffer[num_refiner_verts * attr.data_sizeof()]); + } + else { + patch_table->ComputeLocalPointValues((OsdValue<float4>*)&attr.buffer[0], + (OsdValue<float4>*)&attr.buffer[num_refiner_verts * attr.data_sizeof()]); + } } } else if(attr.element == ATTR_ELEMENT_CORNER || attr.element == ATTR_ELEMENT_CORNER_BYTE) { diff --git a/intern/cycles/render/mesh_volume.cpp b/intern/cycles/render/mesh_volume.cpp new file mode 100644 index 00000000000..d1c49b456ff --- /dev/null +++ b/intern/cycles/render/mesh_volume.cpp @@ -0,0 +1,546 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "render/mesh.h" +#include "render/attribute.h" +#include "render/scene.h" + +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_progress.h" +#include "util/util_types.h" + +CCL_NAMESPACE_BEGIN + +static size_t compute_voxel_index(const int3 &resolution, size_t x, size_t y, size_t z) +{ + if(x == -1 || x >= resolution.x) { + return -1; + } + + if(y == -1 || y >= resolution.y) { + return -1; + } + + if(z == -1 || z >= resolution.z) { + return -1; + } + + return x + y*resolution.x + z*resolution.x*resolution.y; +} + +struct QuadData { + int v0, v1, v2, v3; + + float3 normal; +}; + +enum { + QUAD_X_MIN = 0, + QUAD_X_MAX = 1, + QUAD_Y_MIN = 2, + QUAD_Y_MAX = 3, + QUAD_Z_MIN = 4, + QUAD_Z_MAX = 5, +}; + +const int quads_indices[6][4] = { + /* QUAD_X_MIN */ + { 4, 0, 3, 7 }, + /* QUAD_X_MAX */ + { 1, 5, 6, 2 }, + /* QUAD_Y_MIN */ + { 4, 5, 1, 0 }, + /* QUAD_Y_MAX */ + { 3, 2, 6, 7 }, + /* QUAD_Z_MIN */ + { 0, 1, 2, 3 }, + /* QUAD_Z_MAX */ + { 5, 4, 7, 6 }, +}; + +const float3 quads_normals[6] = { + /* QUAD_X_MIN */ + make_float3(-1.0f, 0.0f, 0.0f), + /* QUAD_X_MAX */ + make_float3(1.0f, 0.0f, 0.0f), + /* QUAD_Y_MIN */ + make_float3(0.0f, -1.0f, 0.0f), + /* QUAD_Y_MAX */ + make_float3(0.0f, 1.0f, 0.0f), + /* QUAD_Z_MIN */ + make_float3(0.0f, 0.0f, -1.0f), + /* QUAD_Z_MAX */ + make_float3(0.0f, 0.0f, 1.0f), +}; + +static void create_quad(int3 corners[8], vector<int3> &vertices, vector<QuadData> &quads, int face_index) +{ + size_t vertex_offset = vertices.size(); + + QuadData quad; + quad.v0 = vertex_offset + 0; + quad.v1 = vertex_offset + 1; + quad.v2 = vertex_offset + 2; + quad.v3 = vertex_offset + 3; + quad.normal = quads_normals[face_index]; + + quads.push_back(quad); + + vertices.push_back(corners[quads_indices[face_index][0]]); + vertices.push_back(corners[quads_indices[face_index][1]]); + vertices.push_back(corners[quads_indices[face_index][2]]); + vertices.push_back(corners[quads_indices[face_index][3]]); +} + +struct VolumeParams { + int3 resolution; + float3 cell_size; + float3 start_point; + int pad_size; +}; + +static const int CUBE_SIZE = 8; + +/* Create a mesh from a volume. + * + * The way the algorithm works is as follows: + * + * - the coordinates of active voxels from a dense volume (or 3d image) are + * gathered inside an auxialliary volume. + * - each set of coordinates of an CUBE_SIZE cube are mapped to the same + * coordinate of the auxilliary volume. + * - quads are created between active and non-active voxels in the auxialliary + * volume to generate a tight mesh around the volume. + */ +class VolumeMeshBuilder { + /* Auxilliary volume that is used to check if a node already added. */ + vector<char> grid; + + /* The resolution of the auxilliary volume, set to be equal to 1/CUBE_SIZE + * of the original volume on each axis. */ + int3 res; + + size_t number_of_nodes; + + /* Offset due to padding in the original grid. Padding will transform the + * coordinates of the original grid from 0...res to -padding...res+padding, + * so some coordinates are negative, and we need to properly account for + * them. */ + int3 pad_offset; + + VolumeParams *params; + +public: + VolumeMeshBuilder(VolumeParams *volume_params); + + void add_node(int x, int y, int z); + + void add_node_with_padding(int x, int y, int z); + + void create_mesh(vector<float3> &vertices, + vector<int> &indices, + vector<float3> &face_normals); + +private: + void generate_vertices_and_quads(vector<int3> &vertices_is, + vector<QuadData> &quads); + + void deduplicate_vertices(vector<int3> &vertices, + vector<QuadData> &quads); + + void convert_object_space(const vector<int3> &vertices, + vector<float3> &out_vertices); + + void convert_quads_to_tris(const vector<QuadData> &quads, + vector<int> &tris, + vector<float3> &face_normals); +}; + +VolumeMeshBuilder::VolumeMeshBuilder(VolumeParams *volume_params) +{ + params = volume_params; + number_of_nodes = 0; + + const size_t x = divide_up(params->resolution.x, CUBE_SIZE); + const size_t y = divide_up(params->resolution.y, CUBE_SIZE); + const size_t z = divide_up(params->resolution.z, CUBE_SIZE); + + /* Adding 2*pad_size since we pad in both positive and negative directions + * along the axis. */ + const size_t px = divide_up(params->resolution.x + 2*params->pad_size, CUBE_SIZE); + const size_t py = divide_up(params->resolution.y + 2*params->pad_size, CUBE_SIZE); + const size_t pz = divide_up(params->resolution.z + 2*params->pad_size, CUBE_SIZE); + + res = make_int3(px, py, pz); + pad_offset = make_int3(px - x, py - y, pz - z); + + grid.resize(px*py*pz, 0); +} + +void VolumeMeshBuilder::add_node(int x, int y, int z) +{ + /* Map coordinates to index space. */ + const int index_x = (x/CUBE_SIZE) + pad_offset.x; + const int index_y = (y/CUBE_SIZE) + pad_offset.y; + const int index_z = (z/CUBE_SIZE) + pad_offset.z; + + assert((index_x >= 0) && (index_y >= 0) && (index_z >= 0)); + + const size_t index = compute_voxel_index(res, index_x, index_y, index_z); + + /* We already have a node here. */ + if(grid[index] == 1) { + return; + } + + ++number_of_nodes; + + grid[index] = 1; +} + +void VolumeMeshBuilder::add_node_with_padding(int x, int y, int z) +{ + for(int px = x - params->pad_size; px < x + params->pad_size; ++px) { + for(int py = y - params->pad_size; py < y + params->pad_size; ++py) { + for(int pz = z - params->pad_size; pz < z + params->pad_size; ++pz) { + add_node(px, py, pz); + } + } + } +} + +void VolumeMeshBuilder::create_mesh(vector<float3> &vertices, + vector<int> &indices, + vector<float3> &face_normals) +{ + /* We create vertices in index space (is), and only convert them to object + * space when done. */ + vector<int3> vertices_is; + vector<QuadData> quads; + + generate_vertices_and_quads(vertices_is, quads); + + deduplicate_vertices(vertices_is, quads); + + convert_object_space(vertices_is, vertices); + + convert_quads_to_tris(quads, indices, face_normals); +} + +void VolumeMeshBuilder::generate_vertices_and_quads( + vector<ccl::int3> &vertices_is, + vector<QuadData> &quads) +{ + /* Overallocation, we could count the number of quads and vertices to create + * in a pre-pass if memory becomes an issue. */ + vertices_is.reserve(number_of_nodes*8); + quads.reserve(number_of_nodes*6); + + for(int z = 0; z < res.z; ++z) { + for(int y = 0; y < res.y; ++y) { + for(int x = 0; x < res.x; ++x) { + size_t voxel_index = compute_voxel_index(res, x, y, z); + if(grid[voxel_index] == 0) { + continue; + } + + /* Compute min and max coords of the node in index space. */ + int3 min = make_int3((x - pad_offset.x)*CUBE_SIZE, + (y - pad_offset.y)*CUBE_SIZE, + (z - pad_offset.z)*CUBE_SIZE); + + /* Maximum is just CUBE_SIZE voxels away from minimum on each axis. */ + int3 max = make_int3(min.x + CUBE_SIZE, min.y + CUBE_SIZE, min.z + CUBE_SIZE); + + int3 corners[8] = { + make_int3(min[0], min[1], min[2]), + make_int3(max[0], min[1], min[2]), + make_int3(max[0], max[1], min[2]), + make_int3(min[0], max[1], min[2]), + make_int3(min[0], min[1], max[2]), + make_int3(max[0], min[1], max[2]), + make_int3(max[0], max[1], max[2]), + make_int3(min[0], max[1], max[2]), + }; + + /* Only create a quad if on the border between an active and + * an inactive node. + */ + + voxel_index = compute_voxel_index(res, x - 1, y, z); + if(voxel_index == -1 || grid[voxel_index] == 0) { + create_quad(corners, vertices_is, quads, QUAD_X_MIN); + } + + voxel_index = compute_voxel_index(res, x + 1, y, z); + if(voxel_index == -1 || grid[voxel_index] == 0) { + create_quad(corners, vertices_is, quads, QUAD_X_MAX); + } + + voxel_index = compute_voxel_index(res, x, y - 1, z); + if(voxel_index == -1 || grid[voxel_index] == 0) { + create_quad(corners, vertices_is, quads, QUAD_Y_MIN); + } + + voxel_index = compute_voxel_index(res, x, y + 1, z); + if(voxel_index == -1 || grid[voxel_index] == 0) { + create_quad(corners, vertices_is, quads, QUAD_Y_MAX); + } + + voxel_index = compute_voxel_index(res, x, y, z - 1); + if(voxel_index == -1 || grid[voxel_index] == 0) { + create_quad(corners, vertices_is, quads, QUAD_Z_MIN); + } + + voxel_index = compute_voxel_index(res, x, y, z + 1); + if(voxel_index == -1 || grid[voxel_index] == 0) { + create_quad(corners, vertices_is, quads, QUAD_Z_MAX); + } + } + } + } +} + +void VolumeMeshBuilder::deduplicate_vertices(vector<int3> &vertices, + vector<QuadData> &quads) +{ + vector<int3> sorted_vertices = vertices; + std::sort(sorted_vertices.begin(), sorted_vertices.end()); + vector<int3>::iterator it = std::unique(sorted_vertices.begin(), sorted_vertices.end()); + sorted_vertices.resize(std::distance(sorted_vertices.begin(), it)); + + vector<QuadData> new_quads = quads; + + for(size_t i = 0; i < vertices.size(); ++i) { + for(size_t j = 0; j < sorted_vertices.size(); ++j) { + if(vertices[i] != sorted_vertices[j]) { + continue; + } + + for(int k = 0; k < quads.size(); ++k) { + if(quads[k].v0 == i) { + new_quads[k].v0 = j; + } + else if(quads[k].v1 == i) { + new_quads[k].v1 = j; + } + else if(quads[k].v2 == i) { + new_quads[k].v2 = j; + } + else if(quads[k].v3 == i) { + new_quads[k].v3 = j; + } + } + + break; + } + } + + vertices = sorted_vertices; + quads = new_quads; +} + +void VolumeMeshBuilder::convert_object_space(const vector<int3> &vertices, + vector<float3> &out_vertices) +{ + out_vertices.reserve(vertices.size()); + + for(size_t i = 0; i < vertices.size(); ++i) { + float3 vertex = make_float3(vertices[i].x, vertices[i].y, vertices[i].z); + vertex *= params->cell_size; + vertex += params->start_point; + + out_vertices.push_back(vertex); + } +} + +void VolumeMeshBuilder::convert_quads_to_tris(const vector<QuadData> &quads, + vector<int> &tris, + vector<float3> &face_normals) +{ + int index_offset = 0; + tris.resize(quads.size()*6); + face_normals.reserve(quads.size()*2); + + for(size_t i = 0; i < quads.size(); ++i) { + tris[index_offset++] = quads[i].v0; + tris[index_offset++] = quads[i].v2; + tris[index_offset++] = quads[i].v1; + + face_normals.push_back(quads[i].normal); + + tris[index_offset++] = quads[i].v0; + tris[index_offset++] = quads[i].v3; + tris[index_offset++] = quads[i].v2; + + face_normals.push_back(quads[i].normal); + } +} + +/* ************************************************************************** */ + +struct VoxelAttributeGrid { + float *data; + int channels; +}; + +void MeshManager::create_volume_mesh(Scene *scene, + Mesh *mesh, + Progress& progress) +{ + string msg = string_printf("Computing Volume Mesh %s", mesh->name.c_str()); + progress.set_status("Updating Mesh", msg); + + vector<VoxelAttributeGrid> voxel_grids; + + /* Compute volume parameters. */ + VolumeParams volume_params; + volume_params.resolution = make_int3(0, 0, 0); + + foreach(Attribute& attr, mesh->attributes.attributes) { + if(attr.element != ATTR_ELEMENT_VOXEL) { + continue; + } + + VoxelAttribute *voxel = attr.data_voxel(); + device_memory *image_memory = scene->image_manager->image_memory(voxel->slot); + int3 resolution = make_int3(image_memory->data_width, + image_memory->data_height, + image_memory->data_depth); + + if(volume_params.resolution == make_int3(0, 0, 0)) { + volume_params.resolution = resolution; + } + else if(volume_params.resolution != resolution) { + VLOG(1) << "Can't create volume mesh, all voxel grid resolutions must be equal\n"; + return; + } + + VoxelAttributeGrid voxel_grid; + voxel_grid.data = static_cast<float*>(image_memory->host_pointer); + voxel_grid.channels = image_memory->data_elements; + voxel_grids.push_back(voxel_grid); + } + + if(voxel_grids.empty()) { + return; + } + + /* Compute padding. */ + Shader *volume_shader = NULL; + int pad_size = 0; + + foreach(Shader *shader, mesh->used_shaders) { + if(!shader->has_volume) { + continue; + } + + volume_shader = shader; + + if(shader->volume_interpolation_method == VOLUME_INTERPOLATION_LINEAR) { + pad_size = max(1, pad_size); + } + else if(shader->volume_interpolation_method == VOLUME_INTERPOLATION_CUBIC) { + pad_size = max(2, pad_size); + } + + break; + } + + if(!volume_shader) { + return; + } + + /* Compute start point and cell size from transform. */ + Attribute *attr = mesh->attributes.find(ATTR_STD_GENERATED_TRANSFORM); + const int3 resolution = volume_params.resolution; + float3 start_point = make_float3(0.0f, 0.0f, 0.0f); + float3 cell_size = make_float3(1.0f/resolution.x, + 1.0f/resolution.y, + 1.0f/resolution.z); + + if(attr) { + const Transform *tfm = attr->data_transform(); + const Transform itfm = transform_inverse(*tfm); + start_point = transform_point(&itfm, start_point); + cell_size = transform_direction(&itfm, cell_size); + } + + volume_params.start_point = start_point; + volume_params.cell_size = cell_size; + volume_params.pad_size = pad_size; + + /* Build bounding mesh around non-empty volume cells. */ + VolumeMeshBuilder builder(&volume_params); + const float isovalue = mesh->volume_isovalue; + + for(int z = 0; z < resolution.z; ++z) { + for(int y = 0; y < resolution.y; ++y) { + for(int x = 0; x < resolution.x; ++x) { + size_t voxel_index = compute_voxel_index(resolution, x, y, z); + + for(size_t i = 0; i < voxel_grids.size(); ++i) { + const VoxelAttributeGrid &voxel_grid = voxel_grids[i]; + const int channels = voxel_grid.channels; + + for(int c = 0; c < channels; c++) { + if(voxel_grid.data[voxel_index * channels + c] >= isovalue) { + builder.add_node_with_padding(x, y, z); + break; + } + } + } + } + } + } + + /* Create mesh. */ + vector<float3> vertices; + vector<int> indices; + vector<float3> face_normals; + builder.create_mesh(vertices, indices, face_normals); + + mesh->clear(true); + mesh->reserve_mesh(vertices.size(), indices.size()/3); + mesh->used_shaders.push_back(volume_shader); + + for(size_t i = 0; i < vertices.size(); ++i) { + mesh->add_vertex(vertices[i]); + } + + for(size_t i = 0; i < indices.size(); i += 3) { + mesh->add_triangle(indices[i], indices[i + 1], indices[i + 2], 0, false); + } + + Attribute *attr_fN = mesh->attributes.add(ATTR_STD_FACE_NORMAL); + float3 *fN = attr_fN->data_float3(); + + for(size_t i = 0; i < face_normals.size(); ++i) { + fN[i] = face_normals[i]; + } + + /* Print stats. */ + VLOG(1) << "Memory usage volume mesh: " + << ((vertices.size() + face_normals.size())*sizeof(float3) + indices.size()*sizeof(int))/(1024.0*1024.0) + << "Mb."; + + VLOG(1) << "Memory usage volume grid: " + << (resolution.x*resolution.y*resolution.z*sizeof(float))/(1024.0*1024.0) + << "Mb."; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp index 13b149eddfa..01399c85bc0 100644 --- a/intern/cycles/render/nodes.cpp +++ b/intern/cycles/render/nodes.cpp @@ -14,20 +14,21 @@ * limitations under the License. */ -#include "image.h" -#include "integrator.h" -#include "nodes.h" -#include "scene.h" -#include "svm.h" -#include "svm_color_util.h" -#include "svm_ramp_util.h" -#include "svm_math_util.h" -#include "osl.h" -#include "constant_fold.h" - -#include "util_sky_model.h" -#include "util_foreach.h" -#include "util_transform.h" +#include "render/image.h" +#include "render/integrator.h" +#include "render/nodes.h" +#include "render/scene.h" +#include "render/svm.h" +#include "kernel/svm/svm_color_util.h" +#include "kernel/svm/svm_ramp_util.h" +#include "kernel/svm/svm_math_util.h" +#include "render/osl.h" +#include "render/constant_fold.h" + +#include "util/util_sky_model.h" +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_transform.h" CCL_NAMESPACE_BEGIN @@ -116,8 +117,7 @@ Transform TextureMapping::compute_transform() case NORMAL: /* no translation for normals, and inverse transpose */ mat = rmat*smat; - mat = transform_inverse(mat); - mat = transform_transpose(mat); + mat = transform_transposed_inverse(mat); break; } @@ -152,7 +152,6 @@ void TextureMapping::compile(SVMCompiler& compiler, int offset_in, int offset_ou compiler.add_node(tfm.x); compiler.add_node(tfm.y); compiler.add_node(tfm.z); - compiler.add_node(tfm.w); if(use_minmax) { compiler.add_node(NODE_MIN_MAX, offset_out, offset_out); @@ -192,9 +191,7 @@ void TextureMapping::compile_end(SVMCompiler& compiler, ShaderInput *vector_in, void TextureMapping::compile(OSLCompiler &compiler) { if(!skip()) { - Transform tfm = transform_transpose(compute_transform()); - - compiler.parameter("mapping", tfm); + compiler.parameter("mapping", compute_transform()); compiler.parameter("use_mapping", 1); } } @@ -207,7 +204,7 @@ NODE_DEFINE(ImageTextureNode) TEXTURE_MAPPING_DEFINE(ImageTextureNode); - SOCKET_STRING(filename, "Filename", ustring("")); + SOCKET_STRING(filename, "Filename", ustring()); static NodeEnum color_space_enum; color_space_enum.insert("none", NODE_COLOR_SPACE_NONE); @@ -301,17 +298,17 @@ void ImageTextureNode::compile(SVMCompiler& compiler) image_manager = compiler.image_manager; if(is_float == -1) { - bool is_float_bool; + ImageMetaData metadata; slot = image_manager->add_image(filename.string(), builtin_data, animated, 0, - is_float_bool, - is_linear, interpolation, extension, - use_alpha); - is_float = (int)is_float_bool; + use_alpha, + metadata); + is_float = metadata.is_float; + is_linear = metadata.is_linear; } if(slot != -1) { @@ -362,25 +359,22 @@ void ImageTextureNode::compile(OSLCompiler& compiler) image_manager = compiler.image_manager; if(is_float == -1) { + ImageMetaData metadata; if(builtin_data == NULL) { - ImageManager::ImageDataType type; - type = image_manager->get_image_metadata(filename.string(), NULL, is_linear); - if(type == ImageManager::IMAGE_DATA_TYPE_FLOAT || type == ImageManager::IMAGE_DATA_TYPE_FLOAT4) - is_float = 1; + image_manager->get_image_metadata(filename.string(), NULL, metadata); } else { - bool is_float_bool; slot = image_manager->add_image(filename.string(), builtin_data, animated, 0, - is_float_bool, - is_linear, interpolation, extension, - use_alpha); - is_float = (int)is_float_bool; + use_alpha, + metadata); } + is_float = metadata.is_float; + is_linear = metadata.is_linear; } if(slot == -1) { @@ -417,7 +411,7 @@ NODE_DEFINE(EnvironmentTextureNode) TEXTURE_MAPPING_DEFINE(EnvironmentTextureNode); - SOCKET_STRING(filename, "Filename", ustring("")); + SOCKET_STRING(filename, "Filename", ustring()); static NodeEnum color_space_enum; color_space_enum.insert("none", NODE_COLOR_SPACE_NONE); @@ -499,17 +493,17 @@ void EnvironmentTextureNode::compile(SVMCompiler& compiler) image_manager = compiler.image_manager; if(slot == -1) { - bool is_float_bool; + ImageMetaData metadata; slot = image_manager->add_image(filename.string(), builtin_data, animated, 0, - is_float_bool, - is_linear, interpolation, EXTENSION_REPEAT, - use_alpha); - is_float = (int)is_float_bool; + use_alpha, + metadata); + is_float = metadata.is_float; + is_linear = metadata.is_linear; } if(slot != -1) { @@ -551,25 +545,22 @@ void EnvironmentTextureNode::compile(OSLCompiler& compiler) */ image_manager = compiler.image_manager; if(is_float == -1) { + ImageMetaData metadata; if(builtin_data == NULL) { - ImageManager::ImageDataType type; - type = image_manager->get_image_metadata(filename.string(), NULL, is_linear); - if(type == ImageManager::IMAGE_DATA_TYPE_FLOAT || type == ImageManager::IMAGE_DATA_TYPE_FLOAT4) - is_float = 1; + image_manager->get_image_metadata(filename.string(), NULL, metadata); } else { - bool is_float_bool; slot = image_manager->add_image(filename.string(), builtin_data, animated, 0, - is_float_bool, - is_linear, interpolation, EXTENSION_REPEAT, - use_alpha); - is_float = (int)is_float_bool; + use_alpha, + metadata); } + is_float = metadata.is_float; + is_linear = metadata.is_linear; } if(slot == -1) { @@ -1345,7 +1336,7 @@ NODE_DEFINE(PointDensityTextureNode) { NodeType* type = NodeType::add("point_density_texture", create, NodeType::SHADER); - SOCKET_STRING(filename, "Filename", ustring("")); + SOCKET_STRING(filename, "Filename", ustring()); static NodeEnum space_enum; space_enum.insert("object", NODE_TEX_VOXEL_SPACE_OBJECT); @@ -1418,13 +1409,13 @@ void PointDensityTextureNode::compile(SVMCompiler& compiler) if(use_density || use_color) { if(slot == -1) { - bool is_float, is_linear; + ImageMetaData metadata; slot = image_manager->add_image(filename.string(), builtin_data, false, 0, - is_float, is_linear, interpolation, EXTENSION_CLIP, - true); + true, + metadata); } if(slot != -1) { @@ -1439,7 +1430,6 @@ void PointDensityTextureNode::compile(SVMCompiler& compiler) compiler.add_node(tfm.x); compiler.add_node(tfm.y); compiler.add_node(tfm.z); - compiler.add_node(tfm.w); } } else { @@ -1470,20 +1460,20 @@ void PointDensityTextureNode::compile(OSLCompiler& compiler) if(use_density || use_color) { if(slot == -1) { - bool is_float, is_linear; + ImageMetaData metadata; slot = image_manager->add_image(filename.string(), builtin_data, false, 0, - is_float, is_linear, interpolation, EXTENSION_CLIP, - true); + true, + metadata); } if(slot != -1) { compiler.parameter("filename", string_printf("@%d", slot).c_str()); } if(space == NODE_TEX_VOXEL_SPACE_WORLD) { - compiler.parameter("mapping", transform_transpose(tfm)); + compiler.parameter("mapping", tfm); compiler.parameter("use_mapping", 1); } compiler.parameter(this, "interpolation"); @@ -1563,8 +1553,7 @@ void MappingNode::compile(SVMCompiler& compiler) void MappingNode::compile(OSLCompiler& compiler) { - Transform tfm = transform_transpose(tex_mapping.compute_transform()); - compiler.parameter("Matrix", tfm); + compiler.parameter("Matrix", tex_mapping.compute_transform()); compiler.parameter_point("mapping_min", tex_mapping.min); compiler.parameter_point("mapping_max", tex_mapping.max); compiler.parameter("use_minmax", tex_mapping.use_minmax); @@ -1790,12 +1779,27 @@ void ConvertNode::compile(OSLCompiler& compiler) assert(0); } +/* Base type for all closure-type nodes */ + +BsdfBaseNode::BsdfBaseNode(const NodeType *node_type) + : ShaderNode(node_type) +{ + special_type = SHADER_SPECIAL_TYPE_CLOSURE; +} + +bool BsdfBaseNode::has_bump() +{ + /* detect if anything is plugged into the normal input besides the default */ + ShaderInput *normal_in = input("Normal"); + return (normal_in && normal_in->link && + normal_in->link->parent->special_type != SHADER_SPECIAL_TYPE_GEOMETRY); +} + /* BSDF Closure */ BsdfNode::BsdfNode(const NodeType *node_type) -: ShaderNode(node_type) +: BsdfBaseNode(node_type) { - special_type = SHADER_SPECIAL_TYPE_CLOSURE; } void BsdfNode::compile(SVMCompiler& compiler, ShaderInput *param1, ShaderInput *param2, ShaderInput *param3, ShaderInput *param4) @@ -1854,7 +1858,7 @@ NODE_DEFINE(AnisotropicBsdfNode) SOCKET_IN_VECTOR(tangent, "Tangent", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_TANGENT); - SOCKET_IN_FLOAT(roughness, "Roughness", 0.2f); + SOCKET_IN_FLOAT(roughness, "Roughness", 0.5f); SOCKET_IN_FLOAT(anisotropy, "Anisotropy", 0.5f); SOCKET_IN_FLOAT(rotation, "Rotation", 0.0f); @@ -1914,7 +1918,7 @@ NODE_DEFINE(GlossyBsdfNode) distribution_enum.insert("ashikhmin_shirley", CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID); distribution_enum.insert("Multiscatter GGX", CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID); SOCKET_ENUM(distribution, "Distribution", distribution_enum, CLOSURE_BSDF_MICROFACET_GGX_ID); - SOCKET_IN_FLOAT(roughness, "Roughness", 0.2f); + SOCKET_IN_FLOAT(roughness, "Roughness", 0.5f); SOCKET_OUT_CLOSURE(BSDF, "BSDF"); @@ -1931,21 +1935,38 @@ GlossyBsdfNode::GlossyBsdfNode() void GlossyBsdfNode::simplify_settings(Scene *scene) { if(distribution_orig == NBUILTIN_CLOSURES) { + roughness_orig = roughness; distribution_orig = distribution; } + else { + /* By default we use original values, so we don't worry about restoring + * defaults later one and can only do override when needed. + */ + roughness = roughness_orig; + distribution = distribution_orig; + } Integrator *integrator = scene->integrator; + ShaderInput *roughness_input = input("Roughness"); if(integrator->filter_glossy == 0.0f) { /* Fallback to Sharp closure for Roughness close to 0. * Note: Keep the epsilon in sync with kernel! */ - ShaderInput *roughness_input = input("Roughness"); if(!roughness_input->link && roughness <= 1e-4f) { + VLOG(1) << "Using sharp glossy BSDF."; distribution = CLOSURE_BSDF_REFLECTION_ID; } } else { - /* Rollback to original distribution when filter glossy is used. */ - distribution = distribution_orig; + /* If filter glossy is used we replace Sharp glossy with GGX so we can + * benefit from closure blur to remove unwanted noise. + */ + if(roughness_input->link == NULL && + distribution == CLOSURE_BSDF_REFLECTION_ID) + { + VLOG(1) << "Using GGX glossy with filter glossy."; + distribution = CLOSURE_BSDF_MICROFACET_GGX_ID; + roughness = 0.0f; + } } closure = distribution; } @@ -1953,7 +1974,8 @@ void GlossyBsdfNode::simplify_settings(Scene *scene) bool GlossyBsdfNode::has_integrator_dependency() { ShaderInput *roughness_input = input("Roughness"); - return !roughness_input->link && roughness <= 1e-4f; + return !roughness_input->link && + (distribution == CLOSURE_BSDF_REFLECTION_ID || roughness <= 1e-4f); } void GlossyBsdfNode::compile(SVMCompiler& compiler) @@ -2008,21 +2030,38 @@ GlassBsdfNode::GlassBsdfNode() void GlassBsdfNode::simplify_settings(Scene *scene) { if(distribution_orig == NBUILTIN_CLOSURES) { + roughness_orig = roughness; distribution_orig = distribution; } + else { + /* By default we use original values, so we don't worry about restoring + * defaults later one and can only do override when needed. + */ + roughness = roughness_orig; + distribution = distribution_orig; + } Integrator *integrator = scene->integrator; + ShaderInput *roughness_input = input("Roughness"); if(integrator->filter_glossy == 0.0f) { /* Fallback to Sharp closure for Roughness close to 0. * Note: Keep the epsilon in sync with kernel! */ - ShaderInput *roughness_input = input("Roughness"); if(!roughness_input->link && roughness <= 1e-4f) { + VLOG(1) << "Using sharp glass BSDF."; distribution = CLOSURE_BSDF_SHARP_GLASS_ID; } } else { - /* Rollback to original distribution when filter glossy is used. */ - distribution = distribution_orig; + /* If filter glossy is used we replace Sharp glossy with GGX so we can + * benefit from closure blur to remove unwanted noise. + */ + if(roughness_input->link == NULL && + distribution == CLOSURE_BSDF_SHARP_GLASS_ID) + { + VLOG(1) << "Using GGX glass with filter glossy."; + distribution = CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID; + roughness = 0.0f; + } } closure = distribution; } @@ -2030,7 +2069,8 @@ void GlassBsdfNode::simplify_settings(Scene *scene) bool GlassBsdfNode::has_integrator_dependency() { ShaderInput *roughness_input = input("Roughness"); - return !roughness_input->link && roughness <= 1e-4f; + return !roughness_input->link && + (distribution == CLOSURE_BSDF_SHARP_GLASS_ID || roughness <= 1e-4f); } void GlassBsdfNode::compile(SVMCompiler& compiler) @@ -2085,21 +2125,38 @@ RefractionBsdfNode::RefractionBsdfNode() void RefractionBsdfNode::simplify_settings(Scene *scene) { if(distribution_orig == NBUILTIN_CLOSURES) { + roughness_orig = roughness; distribution_orig = distribution; } + else { + /* By default we use original values, so we don't worry about restoring + * defaults later one and can only do override when needed. + */ + roughness = roughness_orig; + distribution = distribution_orig; + } Integrator *integrator = scene->integrator; + ShaderInput *roughness_input = input("Roughness"); if(integrator->filter_glossy == 0.0f) { /* Fallback to Sharp closure for Roughness close to 0. * Note: Keep the epsilon in sync with kernel! */ - ShaderInput *roughness_input = input("Roughness"); if(!roughness_input->link && roughness <= 1e-4f) { + VLOG(1) << "Using sharp refraction BSDF."; distribution = CLOSURE_BSDF_REFRACTION_ID; } } else { - /* Rollback to original distribution when filter glossy is used. */ - distribution = distribution_orig; + /* If filter glossy is used we replace Sharp glossy with GGX so we can + * benefit from closure blur to remove unwanted noise. + */ + if(roughness_input->link == NULL && + distribution == CLOSURE_BSDF_REFRACTION_ID) + { + VLOG(1) << "Using GGX refraction with filter glossy."; + distribution = CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID; + roughness = 0.0f; + } } closure = distribution; } @@ -2107,7 +2164,8 @@ void RefractionBsdfNode::simplify_settings(Scene *scene) bool RefractionBsdfNode::has_integrator_dependency() { ShaderInput *roughness_input = input("Roughness"); - return !roughness_input->link && roughness <= 1e-4f; + return !roughness_input->link && + (distribution == CLOSURE_BSDF_REFRACTION_ID || roughness <= 1e-4f); } void RefractionBsdfNode::compile(SVMCompiler& compiler) @@ -2231,6 +2289,160 @@ void DiffuseBsdfNode::compile(OSLCompiler& compiler) compiler.add(this, "node_diffuse_bsdf"); } +/* Disney principled BSDF Closure */ +NODE_DEFINE(PrincipledBsdfNode) +{ + NodeType* type = NodeType::add("principled_bsdf", create, NodeType::SHADER); + + static NodeEnum distribution_enum; + distribution_enum.insert("GGX", CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID); + distribution_enum.insert("Multiscatter GGX", CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID); + SOCKET_ENUM(distribution, "Distribution", distribution_enum, CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID); + + static NodeEnum subsurface_method_enum; + subsurface_method_enum.insert("burley", CLOSURE_BSSRDF_PRINCIPLED_ID); + subsurface_method_enum.insert("random_walk", CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID); + SOCKET_ENUM(subsurface_method, "Subsurface Method", subsurface_method_enum, CLOSURE_BSSRDF_PRINCIPLED_ID); + + SOCKET_IN_COLOR(base_color, "Base Color", make_float3(0.8f, 0.8f, 0.8f)); + SOCKET_IN_COLOR(subsurface_color, "Subsurface Color", make_float3(0.8f, 0.8f, 0.8f)); + SOCKET_IN_FLOAT(metallic, "Metallic", 0.0f); + SOCKET_IN_FLOAT(subsurface, "Subsurface", 0.0f); + SOCKET_IN_VECTOR(subsurface_radius, "Subsurface Radius", make_float3(0.1f, 0.1f, 0.1f)); + SOCKET_IN_FLOAT(specular, "Specular", 0.0f); + SOCKET_IN_FLOAT(roughness, "Roughness", 0.5f); + SOCKET_IN_FLOAT(specular_tint, "Specular Tint", 0.0f); + SOCKET_IN_FLOAT(anisotropic, "Anisotropic", 0.0f); + SOCKET_IN_FLOAT(sheen, "Sheen", 0.0f); + SOCKET_IN_FLOAT(sheen_tint, "Sheen Tint", 0.0f); + SOCKET_IN_FLOAT(clearcoat, "Clearcoat", 0.0f); + SOCKET_IN_FLOAT(clearcoat_roughness, "Clearcoat Roughness", 0.03f); + SOCKET_IN_FLOAT(ior, "IOR", 0.0f); + SOCKET_IN_FLOAT(transmission, "Transmission", 0.0f); + SOCKET_IN_FLOAT(transmission_roughness, "Transmission Roughness", 0.0f); + SOCKET_IN_FLOAT(anisotropic_rotation, "Anisotropic Rotation", 0.0f); + SOCKET_IN_NORMAL(normal, "Normal", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_NORMAL); + SOCKET_IN_NORMAL(clearcoat_normal, "Clearcoat Normal", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_NORMAL); + SOCKET_IN_NORMAL(tangent, "Tangent", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_TANGENT); + SOCKET_IN_FLOAT(surface_mix_weight, "SurfaceMixWeight", 0.0f, SocketType::SVM_INTERNAL); + + SOCKET_OUT_CLOSURE(BSDF, "BSDF"); + + return type; +} + +PrincipledBsdfNode::PrincipledBsdfNode() + : BsdfBaseNode(node_type) +{ + closure = CLOSURE_BSDF_PRINCIPLED_ID; + distribution = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID; + distribution_orig = NBUILTIN_CLOSURES; +} + +bool PrincipledBsdfNode::has_surface_bssrdf() +{ + ShaderInput *subsurface_in = input("Subsurface"); + return (subsurface_in->link != NULL || subsurface > CLOSURE_WEIGHT_CUTOFF); +} + +void PrincipledBsdfNode::attributes(Shader *shader, AttributeRequestSet *attributes) +{ + if(shader->has_surface) { + ShaderInput *tangent_in = input("Tangent"); + + if(!tangent_in->link) + attributes->add(ATTR_STD_GENERATED); + } + + ShaderNode::attributes(shader, attributes); +} + +void PrincipledBsdfNode::compile(SVMCompiler& compiler, ShaderInput *p_metallic, ShaderInput *p_subsurface, ShaderInput *p_subsurface_radius, + ShaderInput *p_specular, ShaderInput *p_roughness, ShaderInput *p_specular_tint, ShaderInput *p_anisotropic, + ShaderInput *p_sheen, ShaderInput *p_sheen_tint, ShaderInput *p_clearcoat, ShaderInput *p_clearcoat_roughness, + ShaderInput *p_ior, ShaderInput *p_transmission, ShaderInput *p_anisotropic_rotation, ShaderInput *p_transmission_roughness) +{ + ShaderInput *base_color_in = input("Base Color"); + ShaderInput *subsurface_color_in = input("Subsurface Color"); + ShaderInput *normal_in = input("Normal"); + ShaderInput *clearcoat_normal_in = input("Clearcoat Normal"); + ShaderInput *tangent_in = input("Tangent"); + + float3 weight = make_float3(1.0f, 1.0f, 1.0f); + + compiler.add_node(NODE_CLOSURE_SET_WEIGHT, weight); + + int normal_offset = compiler.stack_assign_if_linked(normal_in); + int clearcoat_normal_offset = compiler.stack_assign_if_linked(clearcoat_normal_in); + int tangent_offset = compiler.stack_assign_if_linked(tangent_in); + int specular_offset = compiler.stack_assign(p_specular); + int roughness_offset = compiler.stack_assign(p_roughness); + int specular_tint_offset = compiler.stack_assign(p_specular_tint); + int anisotropic_offset = compiler.stack_assign(p_anisotropic); + int sheen_offset = compiler.stack_assign(p_sheen); + int sheen_tint_offset = compiler.stack_assign(p_sheen_tint); + int clearcoat_offset = compiler.stack_assign(p_clearcoat); + int clearcoat_roughness_offset = compiler.stack_assign(p_clearcoat_roughness); + int ior_offset = compiler.stack_assign(p_ior); + int transmission_offset = compiler.stack_assign(p_transmission); + int transmission_roughness_offset = compiler.stack_assign(p_transmission_roughness); + int anisotropic_rotation_offset = compiler.stack_assign(p_anisotropic_rotation); + int subsurface_radius_offset = compiler.stack_assign(p_subsurface_radius); + + compiler.add_node(NODE_CLOSURE_BSDF, + compiler.encode_uchar4(closure, + compiler.stack_assign(p_metallic), + compiler.stack_assign(p_subsurface), + compiler.closure_mix_weight_offset()), + __float_as_int((p_metallic) ? get_float(p_metallic->socket_type) : 0.0f), + __float_as_int((p_subsurface) ? get_float(p_subsurface->socket_type) : 0.0f)); + + compiler.add_node(normal_offset, tangent_offset, + compiler.encode_uchar4(specular_offset, roughness_offset, specular_tint_offset, anisotropic_offset), + compiler.encode_uchar4(sheen_offset, sheen_tint_offset, clearcoat_offset, clearcoat_roughness_offset)); + + compiler.add_node(compiler.encode_uchar4(ior_offset, transmission_offset, anisotropic_rotation_offset, transmission_roughness_offset), + distribution, subsurface_method, SVM_STACK_INVALID); + + float3 bc_default = get_float3(base_color_in->socket_type); + + compiler.add_node(((base_color_in->link) ? compiler.stack_assign(base_color_in) : SVM_STACK_INVALID), + __float_as_int(bc_default.x), __float_as_int(bc_default.y), __float_as_int(bc_default.z)); + + compiler.add_node(clearcoat_normal_offset, subsurface_radius_offset, SVM_STACK_INVALID, SVM_STACK_INVALID); + + float3 ss_default = get_float3(subsurface_color_in->socket_type); + + compiler.add_node(((subsurface_color_in->link) ? compiler.stack_assign(subsurface_color_in) : SVM_STACK_INVALID), + __float_as_int(ss_default.x), __float_as_int(ss_default.y), __float_as_int(ss_default.z)); +} + +bool PrincipledBsdfNode::has_integrator_dependency() +{ + ShaderInput *roughness_input = input("Roughness"); + return !roughness_input->link && roughness <= 1e-4f; +} + +void PrincipledBsdfNode::compile(SVMCompiler& compiler) +{ + compile(compiler, input("Metallic"), input("Subsurface"), input("Subsurface Radius"), input("Specular"), + input("Roughness"), input("Specular Tint"), input("Anisotropic"), input("Sheen"), input("Sheen Tint"), + input("Clearcoat"), input("Clearcoat Roughness"), input("IOR"), input("Transmission"), + input("Anisotropic Rotation"), input("Transmission Roughness")); +} + +void PrincipledBsdfNode::compile(OSLCompiler& compiler) +{ + compiler.parameter(this, "distribution"); + compiler.parameter(this, "subsurface_method"); + compiler.add(this, "node_principled_bsdf"); +} + +bool PrincipledBsdfNode::has_bssrdf_bump() +{ + return has_surface_bssrdf() && has_bump(); +} + /* Translucent BSDF Closure */ NODE_DEFINE(TranslucentBsdfNode) @@ -2306,6 +2518,7 @@ NODE_DEFINE(SubsurfaceScatteringNode) falloff_enum.insert("cubic", CLOSURE_BSSRDF_CUBIC_ID); falloff_enum.insert("gaussian", CLOSURE_BSSRDF_GAUSSIAN_ID); falloff_enum.insert("burley", CLOSURE_BSSRDF_BURLEY_ID); + falloff_enum.insert("random_walk", CLOSURE_BSSRDF_RANDOM_WALK_ID); SOCKET_ENUM(falloff, "Falloff", falloff_enum, CLOSURE_BSSRDF_BURLEY_ID); SOCKET_IN_FLOAT(scale, "Scale", 0.01f); SOCKET_IN_VECTOR(radius, "Radius", make_float3(0.1f, 0.1f, 0.1f)); @@ -2617,6 +2830,120 @@ void ScatterVolumeNode::compile(OSLCompiler& compiler) compiler.add(this, "node_scatter_volume"); } +/* Principled Volume Closure */ + +NODE_DEFINE(PrincipledVolumeNode) +{ + NodeType* type = NodeType::add("principled_volume", create, NodeType::SHADER); + + SOCKET_IN_STRING(density_attribute, "Density Attribute", ustring()); + SOCKET_IN_STRING(color_attribute, "Color Attribute", ustring()); + SOCKET_IN_STRING(temperature_attribute, "Temperature Attribute", ustring()); + + SOCKET_IN_COLOR(color, "Color", make_float3(0.5f, 0.5f, 0.5f)); + SOCKET_IN_FLOAT(density, "Density", 1.0f); + SOCKET_IN_FLOAT(anisotropy, "Anisotropy", 0.0f); + SOCKET_IN_COLOR(absorption_color, "Absorption Color", make_float3(0.0f, 0.0f, 0.0f)); + SOCKET_IN_FLOAT(emission_strength, "Emission Strength", 0.0f); + SOCKET_IN_COLOR(emission_color, "Emission Color", make_float3(1.0f, 1.0f, 1.0f)); + SOCKET_IN_FLOAT(blackbody_intensity, "Blackbody Intensity", 0.0f); + SOCKET_IN_COLOR(blackbody_tint, "Blackbody Tint", make_float3(1.0f, 1.0f, 1.0f)); + SOCKET_IN_FLOAT(temperature, "Temperature", 1500.0f); + SOCKET_IN_FLOAT(volume_mix_weight, "VolumeMixWeight", 0.0f, SocketType::SVM_INTERNAL); + + SOCKET_OUT_CLOSURE(volume, "Volume"); + + return type; +} + +PrincipledVolumeNode::PrincipledVolumeNode() +: VolumeNode(node_type) +{ + closure = CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID; +} + +void PrincipledVolumeNode::attributes(Shader *shader, AttributeRequestSet *attributes) +{ + if(shader->has_volume) { + ShaderInput *density_in = input("Density"); + ShaderInput *blackbody_in = input("Blackbody Intensity"); + + if(density_in->link || density > 0.0f) { + attributes->add_standard(density_attribute); + attributes->add_standard(color_attribute); + } + + if(blackbody_in->link || blackbody_intensity > 0.0f) { + attributes->add_standard(temperature_attribute); + } + + attributes->add(ATTR_STD_GENERATED_TRANSFORM); + } + + ShaderNode::attributes(shader, attributes); +} + +void PrincipledVolumeNode::compile(SVMCompiler& compiler) +{ + ShaderInput *color_in = input("Color"); + ShaderInput *density_in = input("Density"); + ShaderInput *anisotropy_in = input("Anisotropy"); + ShaderInput *absorption_color_in = input("Absorption Color"); + ShaderInput *emission_in = input("Emission Strength"); + ShaderInput *emission_color_in = input("Emission Color"); + ShaderInput *blackbody_in = input("Blackbody Intensity"); + ShaderInput *blackbody_tint_in = input("Blackbody Tint"); + ShaderInput *temperature_in = input("Temperature"); + + if(color_in->link) + compiler.add_node(NODE_CLOSURE_WEIGHT, compiler.stack_assign(color_in)); + else + compiler.add_node(NODE_CLOSURE_SET_WEIGHT, color); + + compiler.add_node(NODE_PRINCIPLED_VOLUME, + compiler.encode_uchar4( + compiler.stack_assign_if_linked(density_in), + compiler.stack_assign_if_linked(anisotropy_in), + compiler.stack_assign(absorption_color_in), + compiler.closure_mix_weight_offset()), + compiler.encode_uchar4( + compiler.stack_assign_if_linked(emission_in), + compiler.stack_assign(emission_color_in), + compiler.stack_assign_if_linked(blackbody_in), + compiler.stack_assign(temperature_in)), + compiler.stack_assign(blackbody_tint_in)); + + int attr_density = compiler.attribute_standard(density_attribute); + int attr_color = compiler.attribute_standard(color_attribute); + int attr_temperature = compiler.attribute_standard(temperature_attribute); + + compiler.add_node( + __float_as_int(density), + __float_as_int(anisotropy), + __float_as_int(emission_strength), + __float_as_int(blackbody_intensity)); + + compiler.add_node( + attr_density, + attr_color, + attr_temperature); +} + +void PrincipledVolumeNode::compile(OSLCompiler& compiler) +{ + if(Attribute::name_standard(density_attribute.c_str())) { + density_attribute = ustring("geom:" + density_attribute.string()); + } + if(Attribute::name_standard(color_attribute.c_str())) { + color_attribute = ustring("geom:" + color_attribute.string()); + } + if(Attribute::name_standard(temperature_attribute.c_str())) { + temperature_attribute = ustring("geom:" + temperature_attribute.string()); + } + + compiler.add(this, "node_principled_volume"); +} + /* Hair BSDF Closure */ NODE_DEFINE(HairBsdfNode) @@ -2887,7 +3214,6 @@ void TextureCoordinateNode::compile(SVMCompiler& compiler) compiler.add_node(ob_itfm.x); compiler.add_node(ob_itfm.y); compiler.add_node(ob_itfm.z); - compiler.add_node(ob_itfm.w); } } @@ -2926,7 +3252,7 @@ void TextureCoordinateNode::compile(OSLCompiler& compiler) if(compiler.output_type() == SHADER_TYPE_VOLUME) compiler.parameter("is_volume", true); compiler.parameter(this, "use_transform"); - Transform ob_itfm = transform_transpose(transform_inverse(ob_tfm)); + Transform ob_itfm = transform_transposed_inverse(ob_tfm); compiler.parameter("object_itfm", ob_itfm); compiler.parameter(this, "from_dupli"); @@ -2940,7 +3266,7 @@ NODE_DEFINE(UVMapNode) { NodeType* type = NodeType::add("uvmap", create, NodeType::SHADER); - SOCKET_IN_STRING(attribute, "attribute", ustring("")); + SOCKET_STRING(attribute, "attribute", ustring()); SOCKET_IN_BOOLEAN(from_dupli, "from dupli", false); SOCKET_OUT_POINT(UV, "UV"); @@ -3237,6 +3563,7 @@ NODE_DEFINE(ParticleInfoNode) NodeType* type = NodeType::add("particle_info", create, NodeType::SHADER); SOCKET_OUT_FLOAT(index, "Index"); + SOCKET_OUT_FLOAT(random, "Random"); SOCKET_OUT_FLOAT(age, "Age"); SOCKET_OUT_FLOAT(lifetime, "Lifetime"); SOCKET_OUT_POINT(location, "Location"); @@ -3259,6 +3586,8 @@ void ParticleInfoNode::attributes(Shader *shader, AttributeRequestSet *attribute { if(!output("Index")->links.empty()) attributes->add(ATTR_STD_PARTICLE); + if(!output("Random")->links.empty()) + attributes->add(ATTR_STD_PARTICLE); if(!output("Age")->links.empty()) attributes->add(ATTR_STD_PARTICLE); if(!output("Lifetime")->links.empty()) @@ -3287,6 +3616,11 @@ void ParticleInfoNode::compile(SVMCompiler& compiler) if(!out->links.empty()) { compiler.add_node(NODE_PARTICLE_INFO, NODE_INFO_PAR_INDEX, compiler.stack_assign(out)); } + + out = output("Random"); + if(!out->links.empty()) { + compiler.add_node(NODE_PARTICLE_INFO, NODE_INFO_PAR_RANDOM, compiler.stack_assign(out)); + } out = output("Age"); if(!out->links.empty()) { @@ -3341,10 +3675,11 @@ NODE_DEFINE(HairInfoNode) SOCKET_OUT_FLOAT(is_strand, "Is Strand"); SOCKET_OUT_FLOAT(intercept, "Intercept"); SOCKET_OUT_FLOAT(thickness, "Thickness"); - SOCKET_OUT_NORMAL(tangent Normal, "Tangent Normal"); + SOCKET_OUT_NORMAL(tangent_normal, "Tangent Normal"); #if 0 /*output for minimum hair width transparency - deactivated */ SOCKET_OUT_FLOAT(fade, "Fade"); #endif + SOCKET_OUT_FLOAT(index, "Random"); return type; } @@ -3361,6 +3696,9 @@ void HairInfoNode::attributes(Shader *shader, AttributeRequestSet *attributes) if(!intercept_out->links.empty()) attributes->add(ATTR_STD_CURVE_INTERCEPT); + + if(!output("Random")->links.empty()) + attributes->add(ATTR_STD_CURVE_RANDOM); } ShaderNode::attributes(shader, attributes); @@ -3396,6 +3734,11 @@ void HairInfoNode::compile(SVMCompiler& compiler) compiler.add_node(NODE_HAIR_INFO, NODE_INFO_CURVE_FADE, compiler.stack_assign(out)); }*/ + out = output("Random"); + if(!out->links.empty()) { + int attr = compiler.attribute(ATTR_STD_CURVE_RANDOM); + compiler.add_node(NODE_ATTR, attr, compiler.stack_assign(out), NODE_ATTR_FLOAT); + } } void HairInfoNode::compile(OSLCompiler& compiler) @@ -4221,7 +4564,7 @@ NODE_DEFINE(AttributeNode) { NodeType* type = NodeType::add("attribute", create, NodeType::SHADER); - SOCKET_STRING(attribute, "Attribute", ustring("")); + SOCKET_STRING(attribute, "Attribute", ustring()); SOCKET_OUT_COLOR(color, "Color"); SOCKET_OUT_VECTOR(vector, "Vector"); @@ -4242,16 +4585,12 @@ void AttributeNode::attributes(Shader *shader, AttributeRequestSet *attributes) ShaderOutput *fac_out = output("Fac"); if(!color_out->links.empty() || !vector_out->links.empty() || !fac_out->links.empty()) { - AttributeStandard std = Attribute::name_standard(attribute.c_str()); - - if(std != ATTR_STD_NONE) - attributes->add(std); - else - attributes->add(attribute); + attributes->add_standard(attribute); } - if(shader->has_volume) + if(shader->has_volume) { attributes->add(ATTR_STD_GENERATED_TRANSFORM); + } ShaderNode::attributes(shader, attributes); } @@ -4262,13 +4601,7 @@ void AttributeNode::compile(SVMCompiler& compiler) ShaderOutput *vector_out = output("Vector"); ShaderOutput *fac_out = output("Fac"); ShaderNodeType attr_node = NODE_ATTR; - AttributeStandard std = Attribute::name_standard(attribute.c_str()); - int attr; - - if(std != ATTR_STD_NONE) - attr = compiler.attribute(std); - else - attr = compiler.attribute(attribute); + int attr = compiler.attribute_standard(attribute); if(bump == SHADER_BUMP_DX) attr_node = NODE_ATTR_BUMP_DX; @@ -4561,7 +4894,7 @@ NODE_DEFINE(OutputNode) SOCKET_IN_CLOSURE(surface, "Surface"); SOCKET_IN_CLOSURE(volume, "Volume"); - SOCKET_IN_FLOAT(displacement, "Displacement", 0.0f); + SOCKET_IN_VECTOR(displacement, "Displacement", make_float3(0.0f, 0.0f, 0.0f)); SOCKET_IN_NORMAL(normal, "Normal", make_float3(0.0f, 0.0f, 0.0f)); return type; @@ -5226,7 +5559,7 @@ NODE_DEFINE(NormalMapNode) space_enum.insert("blender_world", NODE_NORMAL_MAP_BLENDER_WORLD); SOCKET_ENUM(space, "Space", space_enum, NODE_TANGENT_RADIAL); - SOCKET_STRING(attribute, "Attribute", ustring("")); + SOCKET_STRING(attribute, "Attribute", ustring()); SOCKET_IN_NORMAL(normal_osl, "NormalIn", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_NORMAL | SocketType::OSL_INTERNAL); SOCKET_IN_FLOAT(strength, "Strength", 1.0f); @@ -5245,7 +5578,7 @@ NormalMapNode::NormalMapNode() void NormalMapNode::attributes(Shader *shader, AttributeRequestSet *attributes) { if(shader->has_surface && space == NODE_NORMAL_MAP_TANGENT) { - if(attribute == ustring("")) { + if(attribute.empty()) { attributes->add(ATTR_STD_UV_TANGENT); attributes->add(ATTR_STD_UV_TANGENT_SIGN); } @@ -5268,7 +5601,7 @@ void NormalMapNode::compile(SVMCompiler& compiler) int attr = 0, attr_sign = 0; if(space == NODE_NORMAL_MAP_TANGENT) { - if(attribute == ustring("")) { + if(attribute.empty()) { attr = compiler.attribute(ATTR_STD_UV_TANGENT); attr_sign = compiler.attribute(ATTR_STD_UV_TANGENT_SIGN); } @@ -5290,7 +5623,7 @@ void NormalMapNode::compile(SVMCompiler& compiler) void NormalMapNode::compile(OSLCompiler& compiler) { if(space == NODE_NORMAL_MAP_TANGENT) { - if(attribute == ustring("")) { + if(attribute.empty()) { compiler.parameter("attr_name", ustring("geom:tangent")); compiler.parameter("attr_sign_name", ustring("geom:tangent_sign")); } @@ -5321,7 +5654,7 @@ NODE_DEFINE(TangentNode) axis_enum.insert("z", NODE_TANGENT_AXIS_Z); SOCKET_ENUM(axis, "Axis", axis_enum, NODE_TANGENT_AXIS_X); - SOCKET_STRING(attribute, "Attribute", ustring("")); + SOCKET_STRING(attribute, "Attribute", ustring()); SOCKET_IN_NORMAL(normal_osl, "NormalIn", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_NORMAL | SocketType::OSL_INTERNAL); SOCKET_OUT_NORMAL(tangent, "Tangent"); @@ -5338,7 +5671,7 @@ void TangentNode::attributes(Shader *shader, AttributeRequestSet *attributes) { if(shader->has_surface) { if(direction_type == NODE_TANGENT_UVMAP) { - if(attribute == ustring("")) + if(attribute.empty()) attributes->add(ATTR_STD_UV_TANGENT); else attributes->add(ustring((string(attribute.c_str()) + ".tangent").c_str())); @@ -5356,7 +5689,7 @@ void TangentNode::compile(SVMCompiler& compiler) int attr; if(direction_type == NODE_TANGENT_UVMAP) { - if(attribute == ustring("")) + if(attribute.empty()) attr = compiler.attribute(ATTR_STD_UV_TANGENT); else attr = compiler.attribute(ustring((string(attribute.c_str()) + ".tangent").c_str())); @@ -5374,7 +5707,7 @@ void TangentNode::compile(SVMCompiler& compiler) void TangentNode::compile(OSLCompiler& compiler) { if(direction_type == NODE_TANGENT_UVMAP) { - if(attribute == ustring("")) + if(attribute.empty()) compiler.parameter("attr_name", ustring("geom:tangent")); else compiler.parameter("attr_name", ustring((string(attribute.c_str()) + ".tangent").c_str())); @@ -5385,4 +5718,205 @@ void TangentNode::compile(OSLCompiler& compiler) compiler.add(this, "node_tangent"); } +/* Bevel */ + +NODE_DEFINE(BevelNode) +{ + NodeType* type = NodeType::add("bevel", create, NodeType::SHADER); + + SOCKET_INT(samples, "Samples", 4); + + SOCKET_IN_FLOAT(radius, "Radius", 0.05f); + SOCKET_IN_NORMAL(normal, "Normal", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_NORMAL); + + SOCKET_OUT_NORMAL(bevel, "Normal"); + + return type; +} + +BevelNode::BevelNode() +: ShaderNode(node_type) +{ +} + +void BevelNode::compile(SVMCompiler& compiler) +{ + ShaderInput *radius_in = input("Radius"); + ShaderInput *normal_in = input("Normal"); + ShaderOutput *normal_out = output("Normal"); + + compiler.add_node(NODE_BEVEL, + compiler.encode_uchar4(samples, + compiler.stack_assign(radius_in), + compiler.stack_assign_if_linked(normal_in), + compiler.stack_assign(normal_out))); +} + +void BevelNode::compile(OSLCompiler& compiler) +{ + compiler.parameter(this, "samples"); + compiler.add(this, "node_bevel"); +} + +/* Displacement */ + +NODE_DEFINE(DisplacementNode) +{ + NodeType* type = NodeType::add("displacement", create, NodeType::SHADER); + + static NodeEnum space_enum; + space_enum.insert("object", NODE_NORMAL_MAP_OBJECT); + space_enum.insert("world", NODE_NORMAL_MAP_WORLD); + + SOCKET_ENUM(space, "Space", space_enum, NODE_NORMAL_MAP_TANGENT); + + SOCKET_IN_FLOAT(height, "Height", 0.0f); + SOCKET_IN_FLOAT(midlevel, "Midlevel", 0.5f); + SOCKET_IN_FLOAT(scale, "Scale", 1.0f); + SOCKET_IN_NORMAL(normal, "Normal", make_float3(0.0f, 0.0f, 0.0f), SocketType::LINK_NORMAL); + + SOCKET_OUT_VECTOR(displacement, "Displacement"); + + return type; +} + +DisplacementNode::DisplacementNode() +: ShaderNode(node_type) +{ +} + +void DisplacementNode::constant_fold(const ConstantFolder& folder) +{ + if(folder.all_inputs_constant()) { + if((height - midlevel == 0.0f) || (scale == 0.0f)) { + folder.make_zero(); + } + } +} + +void DisplacementNode::compile(SVMCompiler& compiler) +{ + ShaderInput *height_in = input("Height"); + ShaderInput *midlevel_in = input("Midlevel"); + ShaderInput *scale_in = input("Scale"); + ShaderInput *normal_in = input("Normal"); + ShaderOutput *displacement_out = output("Displacement"); + + compiler.add_node(NODE_DISPLACEMENT, + compiler.encode_uchar4(compiler.stack_assign(height_in), + compiler.stack_assign(midlevel_in), + compiler.stack_assign(scale_in), + compiler.stack_assign_if_linked(normal_in)), + compiler.stack_assign(displacement_out), + space); +} + +void DisplacementNode::compile(OSLCompiler& compiler) +{ + compiler.parameter(this, "space"); + compiler.add(this, "node_displacement"); +} + +/* Vector Displacement */ + +NODE_DEFINE(VectorDisplacementNode) +{ + NodeType* type = NodeType::add("vector_displacement", create, NodeType::SHADER); + + static NodeEnum space_enum; + space_enum.insert("tangent", NODE_NORMAL_MAP_TANGENT); + space_enum.insert("object", NODE_NORMAL_MAP_OBJECT); + space_enum.insert("world", NODE_NORMAL_MAP_WORLD); + + SOCKET_ENUM(space, "Space", space_enum, NODE_NORMAL_MAP_TANGENT); + SOCKET_STRING(attribute, "Attribute", ustring()); + + SOCKET_IN_COLOR(vector, "Vector", make_float3(0.0f, 0.0f, 0.0f)); + SOCKET_IN_FLOAT(midlevel, "Midlevel", 0.0f); + SOCKET_IN_FLOAT(scale, "Scale", 1.0f); + + SOCKET_OUT_VECTOR(displacement, "Displacement"); + + return type; +} + +VectorDisplacementNode::VectorDisplacementNode() +: ShaderNode(node_type) +{ +} + +void VectorDisplacementNode::constant_fold(const ConstantFolder& folder) +{ + if(folder.all_inputs_constant()) { + if((vector == make_float3(0.0f, 0.0f, 0.0f) && midlevel == 0.0f) || + (scale == 0.0f)) { + folder.make_zero(); + } + } +} + +void VectorDisplacementNode::attributes(Shader *shader, AttributeRequestSet *attributes) +{ + if(shader->has_surface && space == NODE_NORMAL_MAP_TANGENT) { + if(attribute.empty()) { + attributes->add(ATTR_STD_UV_TANGENT); + attributes->add(ATTR_STD_UV_TANGENT_SIGN); + } + else { + attributes->add(ustring((string(attribute.c_str()) + ".tangent").c_str())); + attributes->add(ustring((string(attribute.c_str()) + ".tangent_sign").c_str())); + } + + attributes->add(ATTR_STD_VERTEX_NORMAL); + } + + ShaderNode::attributes(shader, attributes); +} + +void VectorDisplacementNode::compile(SVMCompiler& compiler) +{ + ShaderInput *vector_in = input("Vector"); + ShaderInput *midlevel_in = input("Midlevel"); + ShaderInput *scale_in = input("Scale"); + ShaderOutput *displacement_out = output("Displacement"); + int attr = 0, attr_sign = 0; + + if(space == NODE_NORMAL_MAP_TANGENT) { + if(attribute.empty()) { + attr = compiler.attribute(ATTR_STD_UV_TANGENT); + attr_sign = compiler.attribute(ATTR_STD_UV_TANGENT_SIGN); + } + else { + attr = compiler.attribute(ustring((string(attribute.c_str()) + ".tangent").c_str())); + attr_sign = compiler.attribute(ustring((string(attribute.c_str()) + ".tangent_sign").c_str())); + } + } + + compiler.add_node(NODE_VECTOR_DISPLACEMENT, + compiler.encode_uchar4(compiler.stack_assign(vector_in), + compiler.stack_assign(midlevel_in), + compiler.stack_assign(scale_in), + compiler.stack_assign(displacement_out)), + attr, attr_sign); + + compiler.add_node(space); +} + +void VectorDisplacementNode::compile(OSLCompiler& compiler) +{ + if(space == NODE_NORMAL_MAP_TANGENT) { + if(attribute.empty()) { + compiler.parameter("attr_name", ustring("geom:tangent")); + compiler.parameter("attr_sign_name", ustring("geom:tangent_sign")); + } + else { + compiler.parameter("attr_name", ustring((string(attribute.c_str()) + ".tangent").c_str())); + compiler.parameter("attr_sign_name", ustring((string(attribute.c_str()) + ".tangent_sign").c_str())); + } + } + + compiler.parameter(this, "space"); + compiler.add(this, "node_vector_displacement"); +} + CCL_NAMESPACE_END diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h index eb0f7977dd1..58c3d472cd3 100644 --- a/intern/cycles/render/nodes.h +++ b/intern/cycles/render/nodes.h @@ -17,10 +17,10 @@ #ifndef __NODES_H__ #define __NODES_H__ -#include "graph.h" -#include "node.h" +#include "render/graph.h" +#include "graph/node.h" -#include "util_string.h" +#include "util/util_string.h" CCL_NAMESPACE_BEGIN @@ -82,6 +82,7 @@ public: ~ImageTextureNode(); ShaderNode *clone() const; void attributes(Shader *shader, AttributeRequestSet *attributes); + bool has_attribute_dependency() { return true; } ImageManager *image_manager; int is_float; @@ -112,6 +113,7 @@ public: ~EnvironmentTextureNode(); ShaderNode *clone() const; void attributes(Shader *shader, AttributeRequestSet *attributes); + bool has_attribute_dependency() { return true; } virtual int get_group() { return NODE_GROUP_LEVEL_2; } ImageManager *image_manager; @@ -154,7 +156,7 @@ public: void *surface; void *volume; - float displacement; + float3 displacement; float3 normal; /* Don't allow output node de-duplication. */ @@ -252,10 +254,12 @@ public: class PointDensityTextureNode : public ShaderNode { public: SHADER_NODE_NO_CLONE_CLASS(PointDensityTextureNode) + virtual int get_group() { return NODE_GROUP_LEVEL_3; } ~PointDensityTextureNode(); ShaderNode *clone() const; void attributes(Shader *shader, AttributeRequestSet *attributes); + bool has_attribute_dependency() { return true; } bool has_spatial_varying() { return true; } bool has_object_dependency() { return true; } @@ -321,25 +325,33 @@ private: static bool initialized; }; -class BsdfNode : public ShaderNode { +class BsdfBaseNode : public ShaderNode { public: - explicit BsdfNode(const NodeType *node_type); - SHADER_NODE_BASE_CLASS(BsdfNode); + BsdfBaseNode(const NodeType *node_type); bool has_spatial_varying() { return true; } - void compile(SVMCompiler& compiler, ShaderInput *param1, ShaderInput *param2, ShaderInput *param3 = NULL, ShaderInput *param4 = NULL); virtual ClosureType get_closure_type() { return closure; } - - float3 color; - float3 normal; - float surface_mix_weight; - ClosureType closure; + virtual bool has_bump(); virtual bool equals(const ShaderNode& /*other*/) { /* TODO(sergey): With some care BSDF nodes can be de-duplicated. */ return false; } + + ClosureType closure; +}; + +class BsdfNode : public BsdfBaseNode { +public: + explicit BsdfNode(const NodeType *node_type); + SHADER_NODE_BASE_CLASS(BsdfNode) + + void compile(SVMCompiler& compiler, ShaderInput *param1, ShaderInput *param2, ShaderInput *param3 = NULL, ShaderInput *param4 = NULL); + + float3 color; + float3 normal; + float surface_mix_weight; }; class AnisotropicBsdfNode : public BsdfNode { @@ -352,6 +364,7 @@ public: ClosureType get_closure_type() { return distribution; } void attributes(Shader *shader, AttributeRequestSet *attributes); + bool has_attribute_dependency() { return true; } }; class DiffuseBsdfNode : public BsdfNode { @@ -361,6 +374,33 @@ public: float roughness; }; +/* Disney principled BRDF */ +class PrincipledBsdfNode : public BsdfBaseNode { +public: + SHADER_NODE_CLASS(PrincipledBsdfNode) + + bool has_surface_bssrdf(); + bool has_bssrdf_bump(); + void compile(SVMCompiler& compiler, ShaderInput *metallic, ShaderInput *subsurface, ShaderInput *subsurface_radius, + ShaderInput *specular, ShaderInput *roughness, ShaderInput *specular_tint, ShaderInput *anisotropic, + ShaderInput *sheen, ShaderInput *sheen_tint, ShaderInput *clearcoat, ShaderInput *clearcoat_roughness, + ShaderInput *ior, ShaderInput *transmission, ShaderInput *anisotropic_rotation, ShaderInput *transmission_roughness); + + float3 base_color; + float3 subsurface_color, subsurface_radius; + float metallic, subsurface, specular, roughness, specular_tint, anisotropic, + sheen, sheen_tint, clearcoat, clearcoat_roughness, ior, transmission, + anisotropic_rotation, transmission_roughness; + float3 normal, clearcoat_normal, tangent; + float surface_mix_weight; + ClosureType distribution, distribution_orig; + ClosureType subsurface_method; + + bool has_integrator_dependency(); + void attributes(Shader *shader, AttributeRequestSet *attributes); + bool has_attribute_dependency() { return true; } +}; + class TranslucentBsdfNode : public BsdfNode { public: SHADER_NODE_CLASS(TranslucentBsdfNode) @@ -388,7 +428,7 @@ public: bool has_integrator_dependency(); ClosureType get_closure_type() { return distribution; } - float roughness; + float roughness, roughness_orig; ClosureType distribution, distribution_orig; }; @@ -400,7 +440,7 @@ public: bool has_integrator_dependency(); ClosureType get_closure_type() { return distribution; } - float roughness, IOR; + float roughness, roughness_orig, IOR; ClosureType distribution, distribution_orig; }; @@ -412,7 +452,7 @@ public: bool has_integrator_dependency(); ClosureType get_closure_type() { return distribution; } - float roughness, IOR; + float roughness, roughness_orig, IOR; ClosureType distribution, distribution_orig; }; @@ -442,9 +482,9 @@ class EmissionNode : public ShaderNode { public: SHADER_NODE_CLASS(EmissionNode) void constant_fold(const ConstantFolder& folder); - virtual ClosureType get_closure_type() { return CLOSURE_EMISSION_ID; } bool has_surface_emission() { return true; } + bool has_volume_support() { return true; } float3 color; float strength; @@ -455,7 +495,6 @@ class BackgroundNode : public ShaderNode { public: SHADER_NODE_CLASS(BackgroundNode) void constant_fold(const ConstantFolder& folder); - virtual ClosureType get_closure_type() { return CLOSURE_BACKGROUND_ID; } float3 color; float strength; @@ -496,6 +535,7 @@ public: return ShaderNode::get_feature() | NODE_FEATURE_VOLUME; } virtual ClosureType get_closure_type() { return closure; } + virtual bool has_volume_support() { return true; } float3 color; float density; @@ -521,6 +561,25 @@ public: float anisotropy; }; +class PrincipledVolumeNode : public VolumeNode { +public: + SHADER_NODE_CLASS(PrincipledVolumeNode) + void attributes(Shader *shader, AttributeRequestSet *attributes); + bool has_attribute_dependency() { return true; } + + ustring density_attribute; + ustring color_attribute; + ustring temperature_attribute; + + float anisotropy; + float3 absorption_color; + float emission_strength; + float3 emission_color; + float blackbody_intensity; + float3 blackbody_tint; + float temperature; +}; + class HairBsdfNode : public BsdfNode { public: SHADER_NODE_CLASS(HairBsdfNode) @@ -537,6 +596,7 @@ class GeometryNode : public ShaderNode { public: SHADER_NODE_CLASS(GeometryNode) void attributes(Shader *shader, AttributeRequestSet *attributes); + bool has_attribute_dependency() { return true; } bool has_spatial_varying() { return true; } float3 normal_osl; @@ -546,6 +606,7 @@ class TextureCoordinateNode : public ShaderNode { public: SHADER_NODE_CLASS(TextureCoordinateNode) void attributes(Shader *shader, AttributeRequestSet *attributes); + bool has_attribute_dependency() { return true; } bool has_spatial_varying() { return true; } bool has_object_dependency() { return use_transform; } @@ -559,6 +620,7 @@ class UVMapNode : public ShaderNode { public: SHADER_NODE_CLASS(UVMapNode) void attributes(Shader *shader, AttributeRequestSet *attributes); + bool has_attribute_dependency() { return true; } bool has_spatial_varying() { return true; } virtual int get_group() { return NODE_GROUP_LEVEL_1; } @@ -592,6 +654,7 @@ class ParticleInfoNode : public ShaderNode { public: SHADER_NODE_CLASS(ParticleInfoNode) void attributes(Shader *shader, AttributeRequestSet *attributes); + bool has_attribute_dependency() { return true; } virtual int get_group() { return NODE_GROUP_LEVEL_1; } }; @@ -600,6 +663,7 @@ public: SHADER_NODE_CLASS(HairInfoNode) void attributes(Shader *shader, AttributeRequestSet *attributes); + bool has_attribute_dependency() { return true; } bool has_spatial_varying() { return true; } virtual int get_group() { return NODE_GROUP_LEVEL_1; } virtual int get_feature() { @@ -641,7 +705,7 @@ public: class MixClosureWeightNode : public ShaderNode { public: - SHADER_NODE_CLASS(MixClosureWeightNode); + SHADER_NODE_CLASS(MixClosureWeightNode) float weight; float fac; @@ -761,6 +825,7 @@ class AttributeNode : public ShaderNode { public: SHADER_NODE_CLASS(AttributeNode) void attributes(Shader *shader, AttributeRequestSet *attributes); + bool has_attribute_dependency() { return true; } bool has_spatial_varying() { return true; } ustring attribute; @@ -887,7 +952,7 @@ public: class CurvesNode : public ShaderNode { public: explicit CurvesNode(const NodeType *node_type); - SHADER_NODE_BASE_CLASS(CurvesNode); + SHADER_NODE_BASE_CLASS(CurvesNode) virtual int get_group() { return NODE_GROUP_LEVEL_3; } @@ -946,6 +1011,8 @@ public: /* ideally we could beter detect this, but we can't query this now */ bool has_spatial_varying() { return true; } + bool has_volume_support() { return true; } + virtual bool equals(const ShaderNode& /*other*/) { return false; } string filepath; @@ -956,6 +1023,7 @@ class NormalMapNode : public ShaderNode { public: SHADER_NODE_CLASS(NormalMapNode) void attributes(Shader *shader, AttributeRequestSet *attributes); + bool has_attribute_dependency() { return true; } bool has_spatial_varying() { return true; } virtual int get_group() { return NODE_GROUP_LEVEL_3; } @@ -970,6 +1038,7 @@ class TangentNode : public ShaderNode { public: SHADER_NODE_CLASS(TangentNode) void attributes(Shader *shader, AttributeRequestSet *attributes); + bool has_attribute_dependency() { return true; } bool has_spatial_varying() { return true; } virtual int get_group() { return NODE_GROUP_LEVEL_3; } @@ -979,6 +1048,50 @@ public: float3 normal_osl; }; +class BevelNode : public ShaderNode { +public: + SHADER_NODE_CLASS(BevelNode) + bool has_spatial_varying() { return true; } + virtual int get_group() { return NODE_GROUP_LEVEL_3; } + virtual bool has_raytrace() { return true; } + + float radius; + float3 normal; + int samples; +}; + +class DisplacementNode : public ShaderNode { +public: + SHADER_NODE_CLASS(DisplacementNode) + void constant_fold(const ConstantFolder& folder); + virtual int get_feature() { + return NODE_FEATURE_BUMP; + } + + NodeNormalMapSpace space; + float height; + float midlevel; + float scale; + float3 normal; +}; + +class VectorDisplacementNode : public ShaderNode { +public: + SHADER_NODE_CLASS(VectorDisplacementNode) + void attributes(Shader *shader, AttributeRequestSet *attributes); + bool has_attribute_dependency() { return true; } + void constant_fold(const ConstantFolder& folder); + virtual int get_feature() { + return NODE_FEATURE_BUMP; + } + + NodeNormalMapSpace space; + ustring attribute; + float3 vector; + float midlevel; + float scale; +}; + CCL_NAMESPACE_END #endif /* __NODES_H__ */ diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp index 8342f376836..4d64d841206 100644 --- a/intern/cycles/render/object.cpp +++ b/intern/cycles/render/object.cpp @@ -14,25 +14,71 @@ * limitations under the License. */ -#include "camera.h" -#include "device.h" -#include "light.h" -#include "mesh.h" -#include "curves.h" -#include "object.h" -#include "particles.h" -#include "scene.h" - -#include "util_foreach.h" -#include "util_logging.h" -#include "util_map.h" -#include "util_progress.h" -#include "util_vector.h" - -#include "subd_patch_table.h" +#include "render/camera.h" +#include "device/device.h" +#include "render/light.h" +#include "render/mesh.h" +#include "render/curves.h" +#include "render/object.h" +#include "render/particles.h" +#include "render/scene.h" + +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_map.h" +#include "util/util_progress.h" +#include "util/util_vector.h" + +#include "subd/subd_patch_table.h" CCL_NAMESPACE_BEGIN +/* Global state of object transform update. */ + +struct UpdateObjectTransformState { + /* Global state used by device_update_object_transform(). + * Common for both threaded and non-threaded update. + */ + + /* Type of the motion required by the scene settings. */ + Scene::MotionType need_motion; + + /* Mapping from particle system to a index in packed particle array. + * Only used for read. + */ + map<ParticleSystem*, int> particle_offset; + + /* Mesh area. + * Used to avoid calculation of mesh area multiple times. Used for both + * read and write. Acquire surface_area_lock to keep it all thread safe. + */ + map<Mesh*, float> surface_area_map; + + /* Motion offsets for each object. */ + array<uint> motion_offset; + + /* Packed object arrays. Those will be filled in. */ + uint *object_flag; + KernelObject *objects; + Transform *object_motion_pass; + DecomposedTransform *object_motion; + + /* Flags which will be synchronized to Integrator. */ + bool have_motion; + bool have_curves; + + /* ** Scheduling queue. ** */ + + Scene *scene; + + /* Some locks to keep everything thread-safe. */ + thread_spin_lock queue_lock; + thread_spin_lock surface_area_lock; + + /* First unused object index in the queue. */ + int queue_start_object; +}; + /* Object */ NODE_DEFINE(Object) @@ -48,6 +94,9 @@ NODE_DEFINE(Object) SOCKET_BOOLEAN(hide_on_missing_motion, "Hide on Missing Motion", false); SOCKET_POINT(dupli_generated, "Dupli Generated", make_float3(0.0f, 0.0f, 0.0f)); SOCKET_POINT2(dupli_uv, "Dupli UV", make_float2(0.0f, 0.0f)); + SOCKET_TRANSFORM_ARRAY(motion, "Motion", array<Transform>()); + + SOCKET_BOOLEAN(is_shadow_catcher, "Shadow Catcher", false); return type; } @@ -58,45 +107,54 @@ Object::Object() particle_system = NULL; particle_index = 0; bounds = BoundBox::empty; - motion.pre = transform_empty(); - motion.mid = transform_empty(); - motion.post = transform_empty(); - use_motion = false; } Object::~Object() { } -void Object::compute_bounds(bool motion_blur) +void Object::update_motion() { - BoundBox mbounds = mesh->bounds; - - if(motion_blur && use_motion) { - MotionTransform mtfm = motion; + if(!use_motion()) { + return; + } - if(hide_on_missing_motion) { - /* Hide objects that have no valid previous or next transform, for - * example particle that stop existing. TODO: add support for this - * case in the kernel so we don't get render artifacts. */ - if(mtfm.pre == transform_empty() || - mtfm.post == transform_empty()) { - bounds = BoundBox::empty; + bool have_motion = false; + + for(size_t i = 0; i < motion.size(); i++) { + if(motion[i] == transform_empty()) { + if(hide_on_missing_motion) { + /* Hide objects that have no valid previous or next + * transform, for example particle that stop existing. It + * would be better to handle this in the kernel and make + * objects invisible outside certain motion steps. */ + tfm = transform_empty(); + motion.clear(); return; } + else { + /* Otherwise just copy center motion. */ + motion[i] = tfm; + } } - /* In case of missing motion information for previous/next frame, - * assume there is no motion. */ - if(mtfm.pre == transform_empty()) { - mtfm.pre = tfm; - } - if(mtfm.post == transform_empty()) { - mtfm.post = tfm; - } + /* Test if any of the transforms are actually different. */ + have_motion = have_motion || motion[i] != tfm; + } + + /* Clear motion array if there is no actual motion. */ + if(!have_motion) { + motion.clear(); + } +} + +void Object::compute_bounds(bool motion_blur) +{ + BoundBox mbounds = mesh->bounds; - DecompMotionTransform decomp; - transform_motion_decompose(&decomp, &mtfm, &tfm); + if(motion_blur && use_motion()) { + array<DecomposedTransform> decomp(motion.size()); + transform_motion_decompose(decomp.data(), motion.data(), motion.size()); bounds = BoundBox::empty; @@ -106,11 +164,12 @@ void Object::compute_bounds(bool motion_blur) for(float t = 0.0f; t < 1.0f; t += (1.0f/128.0f)) { Transform ttfm; - transform_motion_interpolate(&ttfm, &decomp, t); + transform_motion_array_interpolate(&ttfm, decomp.data(), motion.size(), t); bounds.grow(mbounds.transformed(&ttfm)); } } else { + /* No motion blur case. */ if(mesh->transform_applied) { bounds = mbounds; } @@ -130,7 +189,7 @@ void Object::apply_transform(bool apply_to_motion) /* store matrix to transform later. when accessing these as attributes we * do not want the transform to be applied for consistency between static * and dynamic BVH, so we do it on packing. */ - mesh->transform_normal = transform_transpose(transform_inverse(tfm)); + mesh->transform_normal = transform_transposed_inverse(tfm); /* apply to mesh vertices */ for(size_t i = 0; i < mesh->verts.size(); i++) @@ -230,27 +289,30 @@ void Object::tag_update(Scene *scene) scene->object_manager->need_update = true; } -vector<float> Object::motion_times() +bool Object::use_motion() const { - /* compute times at which we sample motion for this object */ - vector<float> times; - - if(!mesh || mesh->motion_steps == 1) - return times; + return (motion.size() > 1); +} - int motion_steps = mesh->motion_steps; +float Object::motion_time(int step) const +{ + return (use_motion()) ? 2.0f * step / (motion.size() - 1) - 1.0f : 0.0f; +} - for(int step = 0; step < motion_steps; step++) { - if(step != motion_steps / 2) { - float time = 2.0f * step / (motion_steps - 1) - 1.0f; - times.push_back(time); +int Object::motion_step(float time) const +{ + if(use_motion()) { + for(size_t step = 0; step < motion.size(); step++) { + if(time == motion_time(step)) { + return step; + } } } - return times; + return -1; } -bool Object::is_traceable() +bool Object::is_traceable() const { /* Mesh itself can be empty,can skip all such objects. */ if(!bounds.valid() || bounds.size() == make_float3(0.0f, 0.0f, 0.0f)) { @@ -260,6 +322,17 @@ bool Object::is_traceable() return true; } +uint Object::visibility_for_tracing() const { + uint trace_visibility = visibility; + if(is_shadow_catcher) { + trace_visibility &= ~PATH_RAY_SHADOW_NON_CATCHER; + } + else { + trace_visibility &= ~PATH_RAY_SHADOW_CATCHER; + } + return trace_visibility; +} + /* Object Manager */ ObjectManager::ObjectManager() @@ -272,12 +345,12 @@ ObjectManager::~ObjectManager() { } -void ObjectManager::device_update_object_transform(UpdateObejctTransformState *state, +void ObjectManager::device_update_object_transform(UpdateObjectTransformState *state, Object *ob, int object_index) { - float4 *objects = state->objects; - float4 *objects_vector = state->objects_vector; + KernelObject& kobject = state->objects[object_index]; + Transform *object_motion_pass = state->object_motion_pass; Mesh *mesh = ob->mesh; uint flag = 0; @@ -344,69 +417,72 @@ void ObjectManager::device_update_object_transform(UpdateObejctTransformState *s } } - /* Pack in texture. */ - int offset = object_index*OBJECT_SIZE; + kobject.tfm = tfm; + kobject.itfm = itfm; + kobject.surface_area = surface_area; + kobject.pass_id = pass_id; + kobject.random_number = random_number; + kobject.particle_index = particle_index; + kobject.motion_offset = 0; - /* OBJECT_TRANSFORM */ - memcpy(&objects[offset], &tfm, sizeof(float4)*3); - /* OBJECT_INVERSE_TRANSFORM */ - memcpy(&objects[offset+4], &itfm, sizeof(float4)*3); - /* OBJECT_PROPERTIES */ - objects[offset+8] = make_float4(surface_area, pass_id, random_number, __int_as_float(particle_index)); + if(mesh->use_motion_blur) { + state->have_motion = true; + } + if(mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)) { + flag |= SD_OBJECT_HAS_VERTEX_MOTION; + } if(state->need_motion == Scene::MOTION_PASS) { - /* Motion transformations, is world/object space depending if mesh - * comes with deformed position in object space, or if we transform - * the shading point in world space. - */ - MotionTransform mtfm = ob->motion; - - /* In case of missing motion information for previous/next frame, - * assume there is no motion. */ - if(!ob->use_motion || mtfm.pre == transform_empty()) { - mtfm.pre = ob->tfm; + /* Clear motion array if there is no actual motion. */ + ob->update_motion(); + + /* Compute motion transforms. */ + Transform tfm_pre, tfm_post; + if(ob->use_motion()) { + tfm_pre = ob->motion[0]; + tfm_post = ob->motion[ob->motion.size() - 1]; } - if(!ob->use_motion || mtfm.post == transform_empty()) { - mtfm.post = ob->tfm; + else { + tfm_pre = tfm; + tfm_post = tfm; } + /* Motion transformations, is world/object space depending if mesh + * comes with deformed position in object space, or if we transform + * the shading point in world space. */ if(!mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)) { - mtfm.pre = mtfm.pre * itfm; - mtfm.post = mtfm.post * itfm; - } - else { - flag |= SD_OBJECT_HAS_VERTEX_MOTION; + tfm_pre = tfm_pre * itfm; + tfm_post = tfm_post * itfm; } - memcpy(&objects_vector[object_index*OBJECT_VECTOR_SIZE+0], &mtfm.pre, sizeof(float4)*3); - memcpy(&objects_vector[object_index*OBJECT_VECTOR_SIZE+3], &mtfm.post, sizeof(float4)*3); + int motion_pass_offset = object_index*OBJECT_MOTION_PASS_SIZE; + object_motion_pass[motion_pass_offset + 0] = tfm_pre; + object_motion_pass[motion_pass_offset + 1] = tfm_post; } -#ifdef __OBJECT_MOTION__ else if(state->need_motion == Scene::MOTION_BLUR) { - if(ob->use_motion) { - /* decompose transformations for interpolation. */ - DecompMotionTransform decomp; + if(ob->use_motion()) { + kobject.motion_offset = state->motion_offset[object_index]; - transform_motion_decompose(&decomp, &ob->motion, &ob->tfm); - memcpy(&objects[offset], &decomp, sizeof(float4)*8); + /* Decompose transforms for interpolation. */ + DecomposedTransform *decomp = state->object_motion + kobject.motion_offset; + transform_motion_decompose(decomp, ob->motion.data(), ob->motion.size()); flag |= SD_OBJECT_MOTION; state->have_motion = true; } } -#endif - - if(mesh->use_motion_blur) { - state->have_motion = true; - } /* Dupli object coords and motion info. */ + kobject.dupli_generated[0] = ob->dupli_generated[0]; + kobject.dupli_generated[1] = ob->dupli_generated[1]; + kobject.dupli_generated[2] = ob->dupli_generated[2]; + kobject.numkeys = mesh->curve_keys.size(); + kobject.dupli_uv[0] = ob->dupli_uv[0]; + kobject.dupli_uv[1] = ob->dupli_uv[1]; int totalsteps = mesh->motion_steps; - int numsteps = (totalsteps - 1)/2; - int numverts = mesh->verts.size(); - int numkeys = mesh->curve_keys.size(); - - objects[offset+9] = make_float4(ob->dupli_generated[0], ob->dupli_generated[1], ob->dupli_generated[2], __int_as_float(numkeys)); - objects[offset+10] = make_float4(ob->dupli_uv[0], ob->dupli_uv[1], __int_as_float(numsteps), __int_as_float(numverts)); + kobject.numsteps = (totalsteps - 1)/2; + kobject.numverts = mesh->verts.size(); + kobject.patch_map_offset = 0; + kobject.attribute_map_offset = 0; /* Object flag. */ if(ob->use_holdout) { @@ -421,7 +497,7 @@ void ObjectManager::device_update_object_transform(UpdateObejctTransformState *s } bool ObjectManager::device_update_object_transform_pop_work( - UpdateObejctTransformState *state, + UpdateObjectTransformState *state, int *start_index, int *num_objects) { @@ -446,7 +522,7 @@ bool ObjectManager::device_update_object_transform_pop_work( } void ObjectManager::device_update_object_transform_task( - UpdateObejctTransformState *state) + UpdateObjectTransformState *state) { int start_index, num_objects; while(device_update_object_transform_pop_work(state, @@ -461,26 +537,40 @@ void ObjectManager::device_update_object_transform_task( } } -void ObjectManager::device_update_transforms(Device *device, - DeviceScene *dscene, +void ObjectManager::device_update_transforms(DeviceScene *dscene, Scene *scene, - uint *object_flag, Progress& progress) { - UpdateObejctTransformState state; - state.need_motion = scene->need_motion(device->info.advanced_shading); + UpdateObjectTransformState state; + state.need_motion = scene->need_motion(); state.have_motion = false; state.have_curves = false; state.scene = scene; state.queue_start_object = 0; - state.object_flag = object_flag; - state.objects = dscene->objects.resize(OBJECT_SIZE*scene->objects.size()); + state.objects = dscene->objects.alloc(scene->objects.size()); + state.object_flag = dscene->object_flag.alloc(scene->objects.size()); + state.object_motion = NULL; + state.object_motion_pass = NULL; + if(state.need_motion == Scene::MOTION_PASS) { - state.objects_vector = dscene->objects_vector.resize(OBJECT_VECTOR_SIZE*scene->objects.size()); + state.object_motion_pass = dscene->object_motion_pass.alloc(OBJECT_MOTION_PASS_SIZE*scene->objects.size()); } - else { - state.objects_vector = NULL; + else if(state.need_motion == Scene::MOTION_BLUR) { + /* Set object offsets into global object motion array. */ + uint *motion_offsets = state.motion_offset.resize(scene->objects.size()); + uint motion_offset = 0; + + foreach(Object *ob, scene->objects) { + *motion_offsets = motion_offset; + motion_offsets++; + + /* Clear motion array if there is no actual motion. */ + ob->update_motion(); + motion_offset += ob->motion.size(); + } + + state.object_motion = dscene->object_motion.alloc(motion_offset); } /* Particle system device offsets @@ -521,9 +611,12 @@ void ObjectManager::device_update_transforms(Device *device, } } - device->tex_alloc("__objects", dscene->objects); + dscene->objects.copy_to_device(); if(state.need_motion == Scene::MOTION_PASS) { - device->tex_alloc("__objects_vector", dscene->objects_vector); + dscene->object_motion_pass.copy_to_device(); + } + else if(state.need_motion == Scene::MOTION_BLUR) { + dscene->object_motion.copy_to_device(); } dscene->data.bvh.have_motion = state.have_motion; @@ -543,12 +636,9 @@ void ObjectManager::device_update(Device *device, DeviceScene *dscene, Scene *sc if(scene->objects.size() == 0) return; - /* object info flag */ - uint *object_flag = dscene->object_flag.resize(scene->objects.size()); - /* set object transform matrices, before applying static transforms */ progress.set_status("Updating Objects", "Copying Transformations to device"); - device_update_transforms(device, dscene, scene, object_flag, progress); + device_update_transforms(dscene, scene, progress); if(progress.get_cancel()) return; @@ -556,11 +646,11 @@ void ObjectManager::device_update(Device *device, DeviceScene *dscene, Scene *sc /* todo: do before to support getting object level coords? */ if(scene->params.bvh_type == SceneParams::BVH_STATIC) { progress.set_status("Updating Objects", "Applying Static Transformations"); - apply_static_transforms(dscene, scene, object_flag, progress); + apply_static_transforms(dscene, scene, progress); } } -void ObjectManager::device_update_flags(Device *device, +void ObjectManager::device_update_flags(Device *, DeviceScene *dscene, Scene *scene, Progress& /*progress*/, @@ -575,9 +665,10 @@ void ObjectManager::device_update_flags(Device *device, if(scene->objects.size() == 0) return; - /* object info flag */ - uint *object_flag = dscene->object_flag.get_data(); + /* Object info flag. */ + uint *object_flag = dscene->object_flag.data(); + /* Object volume intersection. */ vector<Object *> volume_objects; bool has_volume_objects = false; foreach(Object *object, scene->objects) { @@ -593,9 +684,22 @@ void ObjectManager::device_update_flags(Device *device, foreach(Object *object, scene->objects) { if(object->mesh->has_volume) { object_flag[object_index] |= SD_OBJECT_HAS_VOLUME; + object_flag[object_index] &= ~SD_OBJECT_HAS_VOLUME_ATTRIBUTES; + + foreach(Attribute& attr, object->mesh->attributes.attributes) { + if(attr.element == ATTR_ELEMENT_VOXEL) { + object_flag[object_index] |= SD_OBJECT_HAS_VOLUME_ATTRIBUTES; + } + } } else { - object_flag[object_index] &= ~SD_OBJECT_HAS_VOLUME; + object_flag[object_index] &= ~(SD_OBJECT_HAS_VOLUME|SD_OBJECT_HAS_VOLUME_ATTRIBUTES); + } + if(object->is_shadow_catcher) { + object_flag[object_index] |= SD_OBJECT_SHADOW_CATCHER; + } + else { + object_flag[object_index] &= ~SD_OBJECT_SHADOW_CATCHER; } if(bounds_valid) { @@ -618,72 +722,65 @@ void ObjectManager::device_update_flags(Device *device, ++object_index; } - /* allocate object flag */ - device->tex_alloc("__object_flag", dscene->object_flag); + /* Copy object flag. */ + dscene->object_flag.copy_to_device(); } -void ObjectManager::device_update_patch_map_offsets(Device *device, DeviceScene *dscene, Scene *scene) +void ObjectManager::device_update_mesh_offsets(Device *, DeviceScene *dscene, Scene *scene) { - if(scene->objects.size() == 0) { + if(dscene->objects.size() == 0) { return; } - uint4* objects = (uint4*)dscene->objects.get_data(); + KernelObject *kobjects = dscene->objects.data(); bool update = false; - int object_index = 0; - foreach(Object *object, scene->objects) { - int offset = object_index*OBJECT_SIZE + 11; + foreach(Object *object, scene->objects) { Mesh* mesh = object->mesh; if(mesh->patch_table) { uint patch_map_offset = 2*(mesh->patch_table_offset + mesh->patch_table->total_size() - mesh->patch_table->num_nodes * PATCH_NODE_SIZE) - mesh->patch_offset; - if(objects[offset].x != patch_map_offset) { - objects[offset].x = patch_map_offset; + if(kobjects[object_index].patch_map_offset != patch_map_offset) { + kobjects[object_index].patch_map_offset = patch_map_offset; update = true; } } + if(kobjects[object_index].attribute_map_offset != mesh->attr_map_offset) { + kobjects[object_index].attribute_map_offset = mesh->attr_map_offset; + update = true; + } + object_index++; } if(update) { - device->tex_free(dscene->objects); - device->tex_alloc("__objects", dscene->objects); + dscene->objects.copy_to_device(); } } -void ObjectManager::device_free(Device *device, DeviceScene *dscene) +void ObjectManager::device_free(Device *, DeviceScene *dscene) { - device->tex_free(dscene->objects); - dscene->objects.clear(); - - device->tex_free(dscene->objects_vector); - dscene->objects_vector.clear(); - - device->tex_free(dscene->object_flag); - dscene->object_flag.clear(); + dscene->objects.free(); + dscene->object_motion_pass.free(); + dscene->object_motion.free(); + dscene->object_flag.free(); } -void ObjectManager::apply_static_transforms(DeviceScene *dscene, Scene *scene, uint *object_flag, Progress& progress) +void ObjectManager::apply_static_transforms(DeviceScene *dscene, Scene *scene, Progress& progress) { /* todo: normals and displacement should be done before applying transform! */ /* todo: create objects/meshes in right order! */ /* counter mesh users */ map<Mesh*, int> mesh_users; -#ifdef __OBJECT_MOTION__ Scene::MotionType need_motion = scene->need_motion(); bool motion_blur = need_motion == Scene::MOTION_BLUR; bool apply_to_motion = need_motion != Scene::MOTION_PASS; -#else - bool motion_blur = false; - bool apply_to_motion = false; -#endif int i = 0; bool have_instancing = false; @@ -698,6 +795,8 @@ void ObjectManager::apply_static_transforms(DeviceScene *dscene, Scene *scene, u if(progress.get_cancel()) return; + uint *object_flag = dscene->object_flag.data(); + /* apply transforms for objects with single user meshes */ foreach(Object *object, scene->objects) { /* Annoying feedback loop here: we can't use is_instanced() because @@ -708,7 +807,7 @@ void ObjectManager::apply_static_transforms(DeviceScene *dscene, Scene *scene, u if((mesh_users[object->mesh] == 1 && !object->mesh->has_surface_bssrdf) && !object->mesh->has_true_displacement() && object->mesh->subdivision_type == Mesh::SUBDIVISION_NONE) { - if(!(motion_blur && object->use_motion)) { + if(!(motion_blur && object->use_motion())) { if(!object->mesh->transform_applied) { object->apply_transform(apply_to_motion); object->mesh->transform_applied = true; diff --git a/intern/cycles/render/object.h b/intern/cycles/render/object.h index 7e306fab2a8..c7212ae25f9 100644 --- a/intern/cycles/render/object.h +++ b/intern/cycles/render/object.h @@ -17,14 +17,14 @@ #ifndef __OBJECT_H__ #define __OBJECT_H__ -#include "node.h" -#include "scene.h" +#include "graph/node.h" +#include "render/scene.h" -#include "util_boundbox.h" -#include "util_param.h" -#include "util_transform.h" -#include "util_thread.h" -#include "util_types.h" +#include "util/util_boundbox.h" +#include "util/util_param.h" +#include "util/util_transform.h" +#include "util/util_thread.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN @@ -35,12 +35,13 @@ class ParticleSystem; class Progress; class Scene; struct Transform; +struct UpdateObjectTransformState; /* Object */ class Object : public Node { public: - NODE_DECLARE; + NODE_DECLARE Mesh *mesh; Transform tfm; @@ -49,17 +50,17 @@ public: int pass_id; vector<ParamValue> attributes; uint visibility; - MotionTransform motion; - bool use_motion; + array<Transform> motion; bool hide_on_missing_motion; bool use_holdout; + bool is_shadow_catcher; float3 dupli_generated; float2 dupli_uv; ParticleSystem *particle_system; int particle_index; - + Object(); ~Object(); @@ -68,12 +69,22 @@ public: void compute_bounds(bool motion_blur); void apply_transform(bool apply_to_motion); - vector<float> motion_times(); + /* Convert between normalized -1..1 motion time and index + * in the motion array. */ + bool use_motion() const; + float motion_time(int step) const; + int motion_step(float time) const; + void update_motion(); /* Check whether object is traceable and it worth adding it to * kernel scene. */ - bool is_traceable(); + bool is_traceable() const; + + /* Combine object's visibility with all possible internal run-time + * determined flags which denotes trace-time visibility. + */ + uint visibility_for_tracing() const; }; /* Object Manager */ @@ -87,10 +98,8 @@ public: ~ObjectManager(); void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress); - void device_update_transforms(Device *device, - DeviceScene *dscene, + void device_update_transforms(DeviceScene *dscene, Scene *scene, - uint *object_flag, Progress& progress); void device_update_flags(Device *device, @@ -98,61 +107,21 @@ public: Scene *scene, Progress& progress, bool bounds_valid = true); - void device_update_patch_map_offsets(Device *device, DeviceScene *dscene, Scene *scene); + void device_update_mesh_offsets(Device *device, DeviceScene *dscene, Scene *scene); void device_free(Device *device, DeviceScene *dscene); void tag_update(Scene *scene); - void apply_static_transforms(DeviceScene *dscene, Scene *scene, uint *object_flag, Progress& progress); + void apply_static_transforms(DeviceScene *dscene, Scene *scene, Progress& progress); protected: - /* Global state of object transform update. */ - struct UpdateObejctTransformState { - /* Global state used by device_update_object_transform(). - * Common for both threaded and non-threaded update. - */ - - /* Type of the motion required by the scene settings. */ - Scene::MotionType need_motion; - - /* Mapping from particle system to a index in packed particle array. - * Only used for read. - */ - map<ParticleSystem*, int> particle_offset; - - /* Mesh area. - * Used to avoid calculation of mesh area multiple times. Used for both - * read and write. Acquire surface_area_lock to keep it all thread safe. - */ - map<Mesh*, float> surface_area_map; - - /* Packed object arrays. Those will be filled in. */ - uint *object_flag; - float4 *objects; - float4 *objects_vector; - - /* Flags which will be synchronized to Integrator. */ - bool have_motion; - bool have_curves; - - /* ** Scheduling queue. ** */ - - Scene *scene; - - /* Some locks to keep everything thread-safe. */ - thread_spin_lock queue_lock; - thread_spin_lock surface_area_lock; - - /* First unused object index in the queue. */ - int queue_start_object; - }; - void device_update_object_transform(UpdateObejctTransformState *state, + void device_update_object_transform(UpdateObjectTransformState *state, Object *ob, const int object_index); - void device_update_object_transform_task(UpdateObejctTransformState *state); + void device_update_object_transform_task(UpdateObjectTransformState *state); bool device_update_object_transform_pop_work( - UpdateObejctTransformState *state, + UpdateObjectTransformState *state, int *start_index, int *num_objects); }; diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp index 67b68e63cb2..f1a22350060 100644 --- a/intern/cycles/render/osl.cpp +++ b/intern/cycles/render/osl.cpp @@ -14,26 +14,27 @@ * limitations under the License. */ -#include "device.h" +#include "device/device.h" -#include "graph.h" -#include "light.h" -#include "osl.h" -#include "scene.h" -#include "shader.h" -#include "nodes.h" +#include "render/graph.h" +#include "render/light.h" +#include "render/osl.h" +#include "render/scene.h" +#include "render/shader.h" +#include "render/nodes.h" #ifdef WITH_OSL -#include "osl_globals.h" -#include "osl_services.h" -#include "osl_shader.h" +#include "kernel/osl/osl_globals.h" +#include "kernel/osl/osl_services.h" +#include "kernel/osl/osl_shader.h" -#include "util_foreach.h" -#include "util_logging.h" -#include "util_md5.h" -#include "util_path.h" -#include "util_progress.h" +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_md5.h" +#include "util/util_path.h" +#include "util/util_progress.h" +#include "util/util_projection.h" #endif @@ -156,6 +157,7 @@ void OSLShaderManager::device_free(Device *device, DeviceScene *dscene, Scene *s og->surface_state.clear(); og->volume_state.clear(); og->displacement_state.clear(); + og->bump_state.clear(); og->background_state.reset(); } @@ -232,16 +234,25 @@ void OSLShaderManager::shading_system_init() "glossy", /* PATH_RAY_GLOSSY */ "singular", /* PATH_RAY_SINGULAR */ "transparent", /* PATH_RAY_TRANSPARENT */ - "shadow", /* PATH_RAY_SHADOW_OPAQUE */ - "shadow", /* PATH_RAY_SHADOW_TRANSPARENT */ + + "shadow", /* PATH_RAY_SHADOW_OPAQUE_NON_CATCHER */ + "shadow", /* PATH_RAY_SHADOW_OPAQUE_CATCHER */ + "shadow", /* PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER */ + "shadow", /* PATH_RAY_SHADOW_TRANSPARENT_CATCHER */ "__unused__", + "volume_scatter", /* PATH_RAY_VOLUME_SCATTER */ + "__unused__", + "__unused__", "diffuse_ancestor", /* PATH_RAY_DIFFUSE_ANCESTOR */ "__unused__", "__unused__", - "__unused__", /* PATH_RAY_SINGLE_PASS_DONE */ - "volume_scatter", /* PATH_RAY_VOLUME_SCATTER */ + "__unused__", + "__unused__", + "__unused__", + "__unused__", + "__unused__", }; const int nraytypes = sizeof(raytypes)/sizeof(raytypes[0]); @@ -718,6 +729,7 @@ void OSLCompiler::add(ShaderNode *node, const char *name, bool isfilepath) current_shader->has_surface_bssrdf = true; current_shader->has_bssrdf_bump = true; /* can't detect yet */ } + current_shader->has_bump = true; /* can't detect yet */ } if(node->has_spatial_varying()) { @@ -733,6 +745,10 @@ void OSLCompiler::add(ShaderNode *node, const char *name, bool isfilepath) current_shader->has_object_dependency = true; } + if(node->has_attribute_dependency()) { + current_shader->has_attribute_dependency = true; + } + if(node->has_integrator_dependency()) { current_shader->has_integrator_dependency = true; } @@ -817,7 +833,9 @@ void OSLCompiler::parameter(ShaderNode* node, const char *name) case SocketType::TRANSFORM: { Transform value = node->get_transform(socket); - ss->Parameter(uname, TypeDesc::TypeMatrix, &value); + ProjectionTransform projection(value); + projection = projection_transpose(projection); + ss->Parameter(uname, TypeDesc::TypeMatrix, &projection); break; } case SocketType::BOOLEAN_ARRAY: @@ -885,7 +903,11 @@ void OSLCompiler::parameter(ShaderNode* node, const char *name) case SocketType::TRANSFORM_ARRAY: { const array<Transform>& value = node->get_transform_array(socket); - ss->Parameter(uname, array_typedesc(TypeDesc::TypeMatrix, value.size()), value.data()); + array<ProjectionTransform> fvalue(value.size()); + for(size_t i = 0; i < value.size(); i++) { + fvalue[i] = projection_transpose(ProjectionTransform(value[i])); + } + ss->Parameter(uname, array_typedesc(TypeDesc::TypeMatrix, fvalue.size()), fvalue.data()); break; } case SocketType::CLOSURE: @@ -952,7 +974,9 @@ void OSLCompiler::parameter(const char *name, ustring s) void OSLCompiler::parameter(const char *name, const Transform& tfm) { OSL::ShadingSystem *ss = (OSL::ShadingSystem*)shadingsys; - ss->Parameter(name, TypeDesc::TypeMatrix, (float*)&tfm); + ProjectionTransform projection(tfm); + projection = projection_transpose(projection); + ss->Parameter(name, TypeDesc::TypeMatrix, (float*)&projection); } void OSLCompiler::parameter_array(const char *name, const float f[], int arraylen) @@ -980,6 +1004,14 @@ void OSLCompiler::parameter_color_array(const char *name, const array<float3>& f ss->Parameter(name, type, table.data()); } +void OSLCompiler::parameter_attribute(const char *name, ustring s) +{ + if(Attribute::name_standard(s.c_str())) + parameter(name, (string("geom:") + s.c_str()).c_str()); + else + parameter(name, s.c_str()); +} + void OSLCompiler::find_dependencies(ShaderNodeSet& dependencies, ShaderInput *input) { ShaderNode *node = (input->link)? input->link->parent: NULL; @@ -1026,6 +1058,9 @@ void OSLCompiler::generate_nodes(const ShaderNodeSet& nodes) if(node->has_bssrdf_bump()) current_shader->has_bssrdf_bump = true; } + if(node->has_bump()) { + current_shader->has_bump = true; + } } else if(current_type == SHADER_TYPE_VOLUME) { if(node->has_spatial_varying()) @@ -1088,23 +1123,14 @@ void OSLCompiler::compile(Scene *scene, OSLGlobals *og, Shader *shader) ShaderGraph *graph = shader->graph; ShaderNode *output = (graph)? graph->output(): NULL; - /* copy graph for shader with bump mapping */ - if(output->input("Surface")->link && output->input("Displacement")->link) - if(!shader->graph_bump) - shader->graph_bump = shader->graph->copy(); + bool has_bump = (shader->displacement_method != DISPLACE_TRUE) && + output->input("Surface")->link && output->input("Displacement")->link; /* finalize */ shader->graph->finalize(scene, - false, - true, - shader->has_integrator_dependency); - if(shader->graph_bump) { - shader->graph_bump->finalize(scene, - true, - true, - shader->has_integrator_dependency, - shader->displacement_method == DISPLACE_BOTH); - } + has_bump, + shader->has_integrator_dependency, + shader->displacement_method == DISPLACE_BOTH); current_shader = shader; @@ -1112,20 +1138,22 @@ void OSLCompiler::compile(Scene *scene, OSLGlobals *og, Shader *shader) shader->has_surface_emission = false; shader->has_surface_transparent = false; shader->has_surface_bssrdf = false; - shader->has_bssrdf_bump = false; + shader->has_bump = has_bump; + shader->has_bssrdf_bump = has_bump; shader->has_volume = false; shader->has_displacement = false; shader->has_surface_spatial_varying = false; shader->has_volume_spatial_varying = false; shader->has_object_dependency = false; + shader->has_attribute_dependency = false; shader->has_integrator_dependency = false; /* generate surface shader */ if(shader->used && graph && output->input("Surface")->link) { shader->osl_surface_ref = compile_type(shader, shader->graph, SHADER_TYPE_SURFACE); - if(shader->graph_bump && shader->displacement_method != DISPLACE_TRUE) - shader->osl_surface_bump_ref = compile_type(shader, shader->graph_bump, SHADER_TYPE_BUMP); + if(has_bump) + shader->osl_surface_bump_ref = compile_type(shader, shader->graph, SHADER_TYPE_BUMP); else shader->osl_surface_bump_ref = OSL::ShaderGroupRef(); diff --git a/intern/cycles/render/osl.h b/intern/cycles/render/osl.h index b131b672b8c..95e35dd857b 100644 --- a/intern/cycles/render/osl.h +++ b/intern/cycles/render/osl.h @@ -17,13 +17,13 @@ #ifndef __OSL_H__ #define __OSL_H__ -#include "util_set.h" -#include "util_string.h" -#include "util_thread.h" +#include "util/util_set.h" +#include "util/util_string.h" +#include "util/util_thread.h" -#include "graph.h" -#include "nodes.h" -#include "shader.h" +#include "render/graph.h" +#include "render/nodes.h" +#include "render/shader.h" #ifdef WITH_OSL #include <OSL/oslcomp.h> @@ -140,6 +140,8 @@ public: void parameter_array(const char *name, const float f[], int arraylen); void parameter_color_array(const char *name, const array<float3>& f); + void parameter_attribute(const char *name, ustring s); + ShaderType output_type() { return current_type; } bool background; diff --git a/intern/cycles/render/particles.cpp b/intern/cycles/render/particles.cpp index 1a35d60fb4b..e4be3306d7e 100644 --- a/intern/cycles/render/particles.cpp +++ b/intern/cycles/render/particles.cpp @@ -14,15 +14,16 @@ * limitations under the License. */ -#include "device.h" -#include "particles.h" -#include "scene.h" +#include "device/device.h" +#include "render/particles.h" +#include "render/scene.h" -#include "util_foreach.h" -#include "util_logging.h" -#include "util_map.h" -#include "util_progress.h" -#include "util_vector.h" +#include "util/util_foreach.h" +#include "util/util_hash.h" +#include "util/util_logging.h" +#include "util/util_map.h" +#include "util/util_progress.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -52,7 +53,7 @@ ParticleSystemManager::~ParticleSystemManager() { } -void ParticleSystemManager::device_update_particles(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress) +void ParticleSystemManager::device_update_particles(Device *, DeviceScene *dscene, Scene *scene, Progress& progress) { /* count particles. * adds one dummy particle at the beginning to avoid invalid lookups, @@ -61,14 +62,10 @@ void ParticleSystemManager::device_update_particles(Device *device, DeviceScene for(size_t j = 0; j < scene->particle_systems.size(); j++) num_particles += scene->particle_systems[j]->particles.size(); - float4 *particles = dscene->particles.resize(PARTICLE_SIZE*num_particles); + KernelParticle *kparticles = dscene->particles.alloc(num_particles); /* dummy particle */ - particles[0] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - particles[1] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - particles[2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - particles[3] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - particles[4] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + memset(kparticles, 0, sizeof(KernelParticle)); int i = 1; for(size_t j = 0; j < scene->particle_systems.size(); j++) { @@ -77,13 +74,15 @@ void ParticleSystemManager::device_update_particles(Device *device, DeviceScene for(size_t k = 0; k < psys->particles.size(); k++) { /* pack in texture */ Particle& pa = psys->particles[k]; - int offset = i*PARTICLE_SIZE; - particles[offset] = make_float4(pa.index, pa.age, pa.lifetime, pa.size); - particles[offset+1] = pa.rotation; - particles[offset+2] = make_float4(pa.location.x, pa.location.y, pa.location.z, pa.velocity.x); - particles[offset+3] = make_float4(pa.velocity.y, pa.velocity.z, pa.angular_velocity.x, pa.angular_velocity.y); - particles[offset+4] = make_float4(pa.angular_velocity.z, 0.0f, 0.0f, 0.0f); + kparticles[i].index = pa.index; + kparticles[i].age = pa.age; + kparticles[i].lifetime = pa.lifetime; + kparticles[i].size = pa.size; + kparticles[i].rotation = pa.rotation; + kparticles[i].location = float3_to_float4(pa.location); + kparticles[i].velocity = float3_to_float4(pa.velocity); + kparticles[i].angular_velocity = float3_to_float4(pa.angular_velocity); i++; @@ -91,7 +90,7 @@ void ParticleSystemManager::device_update_particles(Device *device, DeviceScene } } - device->tex_alloc("__particles", dscene->particles); + dscene->particles.copy_to_device(); } void ParticleSystemManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress) @@ -112,10 +111,9 @@ void ParticleSystemManager::device_update(Device *device, DeviceScene *dscene, S need_update = false; } -void ParticleSystemManager::device_free(Device *device, DeviceScene *dscene) +void ParticleSystemManager::device_free(Device *, DeviceScene *dscene) { - device->tex_free(dscene->particles); - dscene->particles.clear(); + dscene->particles.free(); } void ParticleSystemManager::tag_update(Scene * /*scene*/) diff --git a/intern/cycles/render/particles.h b/intern/cycles/render/particles.h index 2509e27b44b..66d46114b3e 100644 --- a/intern/cycles/render/particles.h +++ b/intern/cycles/render/particles.h @@ -17,8 +17,8 @@ #ifndef __PARTICLES_H__ #define __PARTICLES_H__ -#include "util_types.h" -#include "util_vector.h" +#include "util/util_types.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp index 68124e78cb5..ba47e3ab6f8 100644 --- a/intern/cycles/render/scene.cpp +++ b/intern/cycles/render/scene.cpp @@ -16,37 +16,78 @@ #include <stdlib.h> -#include "background.h" -#include "bake.h" -#include "camera.h" -#include "curves.h" -#include "device.h" -#include "film.h" -#include "integrator.h" -#include "light.h" -#include "mesh.h" -#include "object.h" -#include "osl.h" -#include "particles.h" -#include "scene.h" -#include "shader.h" -#include "svm.h" -#include "tables.h" - -#include "util_foreach.h" -#include "util_guarded_allocator.h" -#include "util_logging.h" -#include "util_progress.h" +#include "render/background.h" +#include "render/bake.h" +#include "render/camera.h" +#include "render/curves.h" +#include "device/device.h" +#include "render/film.h" +#include "render/integrator.h" +#include "render/light.h" +#include "render/mesh.h" +#include "render/object.h" +#include "render/osl.h" +#include "render/particles.h" +#include "render/scene.h" +#include "render/shader.h" +#include "render/svm.h" +#include "render/tables.h" + +#include "util/util_foreach.h" +#include "util/util_guarded_allocator.h" +#include "util/util_logging.h" +#include "util/util_progress.h" CCL_NAMESPACE_BEGIN -Scene::Scene(const SceneParams& params_, const DeviceInfo& device_info_) -: params(params_) +DeviceScene::DeviceScene(Device *device) +: bvh_nodes(device, "__bvh_nodes", MEM_TEXTURE), + bvh_leaf_nodes(device, "__bvh_leaf_nodes", MEM_TEXTURE), + object_node(device, "__object_node", MEM_TEXTURE), + prim_tri_index(device, "__prim_tri_index", MEM_TEXTURE), + prim_tri_verts(device, "__prim_tri_verts", MEM_TEXTURE), + prim_type(device, "__prim_type", MEM_TEXTURE), + prim_visibility(device, "__prim_visibility", MEM_TEXTURE), + prim_index(device, "__prim_index", MEM_TEXTURE), + prim_object(device, "__prim_object", MEM_TEXTURE), + prim_time(device, "__prim_time", MEM_TEXTURE), + tri_shader(device, "__tri_shader", MEM_TEXTURE), + tri_vnormal(device, "__tri_vnormal", MEM_TEXTURE), + tri_vindex(device, "__tri_vindex", MEM_TEXTURE), + tri_patch(device, "__tri_patch", MEM_TEXTURE), + tri_patch_uv(device, "__tri_patch_uv", MEM_TEXTURE), + curves(device, "__curves", MEM_TEXTURE), + curve_keys(device, "__curve_keys", MEM_TEXTURE), + patches(device, "__patches", MEM_TEXTURE), + objects(device, "__objects", MEM_TEXTURE), + object_motion_pass(device, "__object_motion_pass", MEM_TEXTURE), + object_motion(device, "__object_motion", MEM_TEXTURE), + object_flag(device, "__object_flag", MEM_TEXTURE), + camera_motion(device, "__camera_motion", MEM_TEXTURE), + attributes_map(device, "__attributes_map", MEM_TEXTURE), + attributes_float(device, "__attributes_float", MEM_TEXTURE), + attributes_float3(device, "__attributes_float3", MEM_TEXTURE), + attributes_uchar4(device, "__attributes_uchar4", MEM_TEXTURE), + light_distribution(device, "__light_distribution", MEM_TEXTURE), + lights(device, "__lights", MEM_TEXTURE), + light_background_marginal_cdf(device, "__light_background_marginal_cdf", MEM_TEXTURE), + light_background_conditional_cdf(device, "__light_background_conditional_cdf", MEM_TEXTURE), + particles(device, "__particles", MEM_TEXTURE), + svm_nodes(device, "__svm_nodes", MEM_TEXTURE), + shaders(device, "__shaders", MEM_TEXTURE), + lookup_table(device, "__lookup_table", MEM_TEXTURE), + sobol_directions(device, "__sobol_directions", MEM_TEXTURE) +{ + memset(&data, 0, sizeof(data)); +} + +Scene::Scene(const SceneParams& params_, Device *device) +: device(device), dscene(device), params(params_) { - device = NULL; memset(&dscene.data, 0, sizeof(dscene.data)); camera = new Camera(); + dicing_camera = new Camera(); lookup_tables = new LookupTables(); film = new Film(); background = new Background(); @@ -54,13 +95,13 @@ Scene::Scene(const SceneParams& params_, const DeviceInfo& device_info_) mesh_manager = new MeshManager(); object_manager = new ObjectManager(); integrator = new Integrator(); - image_manager = new ImageManager(device_info_); + image_manager = new ImageManager(device->info); particle_system_manager = new ParticleSystemManager(); curve_system_manager = new CurveSystemManager(); bake_manager = new BakeManager(); /* OSL only works on the CPU */ - if(device_info_.type == DEVICE_CPU) + if(device->info.has_osl) shader_manager = ShaderManager::create(this, params.shadingsystem); else shader_manager = ShaderManager::create(this, SHADINGSYSTEM_SVM); @@ -107,9 +148,9 @@ void Scene::free_memory(bool final) bake_manager->device_free(device, &dscene); if(!params.persistent_data || final) - image_manager->device_free(device, &dscene); + image_manager->device_free(device); else - image_manager->device_free_builtin(device, &dscene); + image_manager->device_free_builtin(device); lookup_tables->device_free(device, &dscene); } @@ -117,6 +158,7 @@ void Scene::free_memory(bool final) if(final) { delete lookup_tables; delete camera; + delete dicing_camera; delete film; delete background; delete integrator; @@ -148,8 +190,6 @@ void Scene::device_update(Device *device_, Progress& progress) * - Film needs light manager to run for use_light_visibility * - Lookup tables are done a second time to handle film tables */ - - image_manager->set_pack_images(device->info.pack_images); progress.set_status("Updating Shaders"); shader_manager->device_update(device, &dscene, this, progress); @@ -166,8 +206,7 @@ void Scene::device_update(Device *device_, Progress& progress) if(progress.get_cancel() || device->have_error()) return; - progress.set_status("Updating Meshes Flags"); - mesh_manager->device_update_flags(device, &dscene, this, progress); + mesh_manager->device_update_preprocess(device, this, progress); if(progress.get_cancel() || device->have_error()) return; @@ -176,6 +215,11 @@ void Scene::device_update(Device *device_, Progress& progress) if(progress.get_cancel() || device->have_error()) return; + progress.set_status("Updating Particle Systems"); + particle_system_manager->device_update(device, &dscene, this, progress); + + if(progress.get_cancel() || device->have_error()) return; + progress.set_status("Updating Meshes"); mesh_manager->device_update(device, &dscene, this, progress); @@ -187,7 +231,7 @@ void Scene::device_update(Device *device_, Progress& progress) if(progress.get_cancel() || device->have_error()) return; progress.set_status("Updating Images"); - image_manager->device_update(device, &dscene, this, progress); + image_manager->device_update(device, this, progress); if(progress.get_cancel() || device->have_error()) return; @@ -211,11 +255,6 @@ void Scene::device_update(Device *device_, Progress& progress) if(progress.get_cancel() || device->have_error()) return; - progress.set_status("Updating Particle Systems"); - particle_system_manager->device_update(device, &dscene, this, progress); - - if(progress.get_cancel() || device->have_error()) return; - progress.set_status("Updating Integrator"); integrator->device_update(device, &dscene, this); @@ -253,10 +292,10 @@ void Scene::device_update(Device *device_, Progress& progress) } } -Scene::MotionType Scene::need_motion(bool advanced_shading) +Scene::MotionType Scene::need_motion() { if(integrator->motion_blur) - return (advanced_shading)? MOTION_BLUR: MOTION_NONE; + return MOTION_BLUR; else if(Pass::contains(film->passes, PASS_MOTION)) return MOTION_PASS; else @@ -323,6 +362,7 @@ void Scene::reset() /* ensure all objects are updated */ camera->tag_update(); + dicing_camera->tag_update(); film->tag_update(this); background->tag_update(this); integrator->tag_update(this); diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h index 9f398c444f4..04bd4735a86 100644 --- a/intern/cycles/render/scene.h +++ b/intern/cycles/render/scene.h @@ -17,18 +17,20 @@ #ifndef __SCENE_H__ #define __SCENE_H__ -#include "image.h" -#include "shader.h" +#include "bvh/bvh_params.h" -#include "device_memory.h" +#include "render/image.h" +#include "render/shader.h" -#include "util_param.h" -#include "util_string.h" -#include "util_system.h" -#include "util_texture.h" -#include "util_thread.h" -#include "util_types.h" -#include "util_vector.h" +#include "device/device_memory.h" + +#include "util/util_param.h" +#include "util/util_string.h" +#include "util/util_system.h" +#include "util/util_texture.h" +#include "util/util_thread.h" +#include "util/util_types.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -60,15 +62,15 @@ class BakeData; class DeviceScene { public: /* BVH */ - device_vector<float4> bvh_nodes; - device_vector<float4> bvh_leaf_nodes; - device_vector<uint> object_node; + device_vector<int4> bvh_nodes; + device_vector<int4> bvh_leaf_nodes; + device_vector<int> object_node; device_vector<uint> prim_tri_index; device_vector<float4> prim_tri_verts; - device_vector<uint> prim_type; + device_vector<int> prim_type; device_vector<uint> prim_visibility; - device_vector<uint> prim_index; - device_vector<uint> prim_object; + device_vector<int> prim_index; + device_vector<int> prim_object; device_vector<float2> prim_time; /* mesh */ @@ -84,8 +86,13 @@ public: device_vector<uint> patches; /* objects */ - device_vector<float4> objects; - device_vector<float4> objects_vector; + device_vector<KernelObject> objects; + device_vector<Transform> object_motion_pass; + device_vector<DecomposedTransform> object_motion; + device_vector<uint> object_flag; + + /* cameras */ + device_vector<DecomposedTransform> camera_motion; /* attributes */ device_vector<uint4> attributes_map; @@ -94,18 +101,17 @@ public: device_vector<uchar4> attributes_uchar4; /* lights */ - device_vector<float4> light_distribution; - device_vector<float4> light_data; + device_vector<KernelLightDistribution> light_distribution; + device_vector<KernelLight> lights; device_vector<float2> light_background_marginal_cdf; device_vector<float2> light_background_conditional_cdf; /* particles */ - device_vector<float4> particles; + device_vector<KernelParticle> particles; /* shaders */ - device_vector<uint4> svm_nodes; - device_vector<uint> shader_flag; - device_vector<uint> object_flag; + device_vector<int4> svm_nodes; + device_vector<KernelShader> shaders; /* lookup tables */ device_vector<float> lookup_table; @@ -113,61 +119,72 @@ public: /* integrator */ device_vector<uint> sobol_directions; - /* cpu images */ - device_vector<uchar4> tex_byte4_image[TEX_NUM_BYTE4_CPU]; - device_vector<float4> tex_float4_image[TEX_NUM_FLOAT4_CPU]; - device_vector<float> tex_float_image[TEX_NUM_FLOAT_CPU]; - device_vector<uchar> tex_byte_image[TEX_NUM_BYTE_CPU]; - device_vector<half4> tex_half4_image[TEX_NUM_HALF4_CPU]; - device_vector<half> tex_half_image[TEX_NUM_HALF_CPU]; - - /* opencl images */ - device_vector<uchar4> tex_image_byte4_packed; - device_vector<float4> tex_image_float4_packed; - device_vector<uchar> tex_image_byte_packed; - device_vector<float> tex_image_float_packed; - device_vector<uint4> tex_image_packed_info; - KernelData data; + + DeviceScene(Device *device); }; /* Scene Parameters */ class SceneParams { public: - ShadingSystem shadingsystem; + /* Type of BVH, in terms whether it is supported dynamic updates of meshes + * or whether modifying geometry requires full BVH rebuild. + */ enum BVHType { + /* BVH supports dynamic updates of geometry. + * + * Faster for updating BVH tree when doing modifications in viewport, + * but slower for rendering. + */ BVH_DYNAMIC = 0, + /* BVH tree is calculated for specific scene, updates in geometry + * requires full tree rebuild. + * + * Slower to update BVH tree when modifying objects in viewport, also + * slower to build final BVH tree but gives best possible render speed. + */ BVH_STATIC = 1, BVH_NUM_TYPES, - } bvh_type; + }; + + ShadingSystem shadingsystem; + + /* Requested BVH layout. + * + * If it's not supported by the device, the widest one from supported ones + * will be used, but BVH wider than this one will never be used. + */ + BVHLayout bvh_layout; + + BVHType bvh_type; bool use_bvh_spatial_split; bool use_bvh_unaligned_nodes; int num_bvh_time_steps; - bool use_qbvh; + bool persistent_data; int texture_limit; SceneParams() { shadingsystem = SHADINGSYSTEM_SVM; + bvh_layout = BVH_LAYOUT_BVH2; bvh_type = BVH_DYNAMIC; use_bvh_spatial_split = false; use_bvh_unaligned_nodes = true; num_bvh_time_steps = 0; - use_qbvh = false; persistent_data = false; texture_limit = 0; } bool modified(const SceneParams& params) { return !(shadingsystem == params.shadingsystem + && bvh_layout == params.bvh_layout && bvh_type == params.bvh_type && use_bvh_spatial_split == params.use_bvh_spatial_split && use_bvh_unaligned_nodes == params.use_bvh_unaligned_nodes && num_bvh_time_steps == params.num_bvh_time_steps - && use_qbvh == params.use_qbvh && persistent_data == params.persistent_data && texture_limit == params.texture_limit); } }; @@ -178,6 +195,7 @@ class Scene { public: /* data */ Camera *camera; + Camera *dicing_camera; LookupTables *lookup_tables; Film *film; Background *background; @@ -216,7 +234,7 @@ public: /* mutex must be locked manually by callers */ thread_mutex mutex; - Scene(const SceneParams& params, const DeviceInfo& device_info); + Scene(const SceneParams& params, Device *device); ~Scene(); void device_update(Device *device, Progress& progress); @@ -225,7 +243,7 @@ public: void need_global_attributes(AttributeRequestSet& attributes); enum MotionType { MOTION_NONE = 0, MOTION_PASS, MOTION_BLUR }; - MotionType need_motion(bool advanced_shading = true); + MotionType need_motion(); float motion_shutter_time(); bool need_update(); diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp index 420866c9436..bb636dd962a 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -17,24 +17,24 @@ #include <string.h> #include <limits.h> -#include "buffers.h" -#include "camera.h" -#include "device.h" -#include "graph.h" -#include "integrator.h" -#include "mesh.h" -#include "object.h" -#include "scene.h" -#include "session.h" -#include "bake.h" - -#include "util_foreach.h" -#include "util_function.h" -#include "util_logging.h" -#include "util_math.h" -#include "util_opengl.h" -#include "util_task.h" -#include "util_time.h" +#include "render/buffers.h" +#include "render/camera.h" +#include "device/device.h" +#include "render/graph.h" +#include "render/integrator.h" +#include "render/mesh.h" +#include "render/object.h" +#include "render/scene.h" +#include "render/session.h" +#include "render/bake.h" + +#include "util/util_foreach.h" +#include "util/util_function.h" +#include "util/util_logging.h" +#include "util/util_math.h" +#include "util/util_opengl.h" +#include "util/util_task.h" +#include "util/util_time.h" CCL_NAMESPACE_BEGIN @@ -46,7 +46,7 @@ Session::Session(const SessionParams& params_) : params(params_), tile_manager(params.progressive, params.samples, params.tile_size, params.start_resolution, params.background == false || params.progressive_refine, params.background, params.tile_order, - max(params.device.multi_devices.size(), 1)), + max(params.device.multi_devices.size(), 1), params.pixel_size), stats() { device_use_gl = ((params.device.type != DEVICE_CPU) && !params.background); @@ -55,7 +55,7 @@ Session::Session(const SessionParams& params_) device = Device::create(params.device, stats, params.background); - if(params.background && params.output_path.empty()) { + if(params.background && !params.write_render_cb) { buffers = NULL; display = NULL; } @@ -101,21 +101,22 @@ Session::~Session() wait(); } - if(!params.output_path.empty()) { + if(params.write_render_cb) { /* tonemap and write out image if requested */ delete display; display = new DisplayBuffer(device, false); - display->reset(device, buffers->params); + display->reset(buffers->params); tonemap(params.samples); - progress.set_status("Writing Image", params.output_path); - display->write(device, params.output_path); + int w = display->draw_width; + int h = display->draw_height; + uchar4 *pixels = display->rgba_byte.copy_from_device(0, w, h); + params.write_render_cb((uchar*)pixels, w, h, 4); } /* clean up */ - foreach(RenderBuffers *buffers, tile_buffers) - delete buffers; + tile_manager.device_free(); delete buffers; delete display; @@ -268,8 +269,8 @@ void Session::run_gpu() /* update status and timing */ update_status_time(); - /* path trace */ - path_trace(); + /* render */ + render(); device->task_wait(); @@ -358,30 +359,31 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile) thread_scoped_lock tile_lock(tile_mutex); /* get next tile from manager */ - Tile tile; + Tile *tile; int device_num = device->device_number(tile_device); if(!tile_manager.next_tile(tile, device_num)) return false; /* fill render tile */ - rtile.x = tile_manager.state.buffer.full_x + tile.x; - rtile.y = tile_manager.state.buffer.full_y + tile.y; - rtile.w = tile.w; - rtile.h = tile.h; + rtile.x = tile_manager.state.buffer.full_x + tile->x; + rtile.y = tile_manager.state.buffer.full_y + tile->y; + rtile.w = tile->w; + rtile.h = tile->h; rtile.start_sample = tile_manager.state.sample; rtile.num_samples = tile_manager.state.num_samples; rtile.resolution = tile_manager.state.resolution_divider; + rtile.tile_index = tile->index; + rtile.task = (tile->state == Tile::DENOISE)? RenderTile::DENOISE: RenderTile::PATH_TRACE; tile_lock.unlock(); /* in case of a permanent buffer, return it, otherwise we will allocate * a new temporary buffer */ - if(!(params.background && params.output_path.empty())) { + if(buffers) { tile_manager.state.buffer.get_offset_stride(rtile.offset, rtile.stride); rtile.buffer = buffers->buffer.device_pointer; - rtile.rng_state = buffers->rng_state.device_pointer; rtile.buffers = buffers; device->map_tile(tile_device, rtile); @@ -389,48 +391,24 @@ bool Session::acquire_tile(Device *tile_device, RenderTile& rtile) return true; } - /* fill buffer parameters */ - BufferParams buffer_params = tile_manager.params; - buffer_params.full_x = rtile.x; - buffer_params.full_y = rtile.y; - buffer_params.width = rtile.w; - buffer_params.height = rtile.h; - - buffer_params.get_offset_stride(rtile.offset, rtile.stride); - - RenderBuffers *tilebuffers; - - /* allocate buffers */ - if(params.progressive_refine) { - tile_lock.lock(); - - if(tile_buffers.size() == 0) - tile_buffers.resize(tile_manager.state.num_tiles, NULL); - - /* In certain circumstances number of tiles in the tile manager could - * be changed. This is not supported by the progressive refine feature. - */ - assert(tile_buffers.size() == tile_manager.state.num_tiles); - - tilebuffers = tile_buffers[tile.index]; - if(tilebuffers == NULL) { - tilebuffers = new RenderBuffers(tile_device); - tile_buffers[tile.index] = tilebuffers; - - tilebuffers->reset(tile_device, buffer_params); - } - - tile_lock.unlock(); + if(tile->buffers == NULL) { + /* fill buffer parameters */ + BufferParams buffer_params = tile_manager.params; + buffer_params.full_x = rtile.x; + buffer_params.full_y = rtile.y; + buffer_params.width = rtile.w; + buffer_params.height = rtile.h; + + /* allocate buffers */ + tile->buffers = new RenderBuffers(tile_device); + tile->buffers->reset(buffer_params); } - else { - tilebuffers = new RenderBuffers(tile_device); - tilebuffers->reset(tile_device, buffer_params); - } + tile->buffers->params.get_offset_stride(rtile.offset, rtile.stride); - rtile.buffer = tilebuffers->buffer.device_pointer; - rtile.rng_state = tilebuffers->rng_state.device_pointer; - rtile.buffers = tilebuffers; + rtile.buffer = tile->buffers->buffer.device_pointer; + rtile.buffers = tile->buffers; + rtile.sample = tile_manager.state.sample; /* this will tag tile as IN PROGRESS in blender-side render pipeline, * which is needed to highlight currently rendering tile before first @@ -449,7 +427,7 @@ void Session::update_tile_sample(RenderTile& rtile) if(params.progressive_refine == false) { /* todo: optimize this by making it thread safe and removing lock */ - update_render_tile_cb(rtile); + update_render_tile_cb(rtile, true); } } @@ -460,20 +438,78 @@ void Session::release_tile(RenderTile& rtile) { thread_scoped_lock tile_lock(tile_mutex); - progress.add_finished_tile(); + progress.add_finished_tile(rtile.task == RenderTile::DENOISE); - if(write_render_tile_cb) { - if(params.progressive_refine == false) { - /* todo: optimize this by making it thread safe and removing lock */ + bool delete_tile; + + if(tile_manager.finish_tile(rtile.tile_index, delete_tile)) { + if(write_render_tile_cb && params.progressive_refine == false) { write_render_tile_cb(rtile); + } + if(delete_tile) { delete rtile.buffers; + tile_manager.state.tiles[rtile.tile_index].buffers = NULL; + } + } + else { + if(update_render_tile_cb && params.progressive_refine == false) { + update_render_tile_cb(rtile, false); } } update_status_time(); } +void Session::map_neighbor_tiles(RenderTile *tiles, Device *tile_device) +{ + thread_scoped_lock tile_lock(tile_mutex); + + int center_idx = tiles[4].tile_index; + assert(tile_manager.state.tiles[center_idx].state == Tile::DENOISE); + BufferParams buffer_params = tile_manager.params; + int4 image_region = make_int4(buffer_params.full_x, buffer_params.full_y, + buffer_params.full_x + buffer_params.width, buffer_params.full_y + buffer_params.height); + + for(int dy = -1, i = 0; dy <= 1; dy++) { + for(int dx = -1; dx <= 1; dx++, i++) { + int px = tiles[4].x + dx*params.tile_size.x; + int py = tiles[4].y + dy*params.tile_size.y; + if(px >= image_region.x && py >= image_region.y && + px < image_region.z && py < image_region.w) { + int tile_index = center_idx + dy*tile_manager.state.tile_stride + dx; + Tile *tile = &tile_manager.state.tiles[tile_index]; + assert(tile->buffers); + + tiles[i].buffer = tile->buffers->buffer.device_pointer; + tiles[i].x = tile_manager.state.buffer.full_x + tile->x; + tiles[i].y = tile_manager.state.buffer.full_y + tile->y; + tiles[i].w = tile->w; + tiles[i].h = tile->h; + tiles[i].buffers = tile->buffers; + + tile->buffers->params.get_offset_stride(tiles[i].offset, tiles[i].stride); + } + else { + tiles[i].buffer = (device_ptr)NULL; + tiles[i].buffers = NULL; + tiles[i].x = clamp(px, image_region.x, image_region.z); + tiles[i].y = clamp(py, image_region.y, image_region.w); + tiles[i].w = tiles[i].h = 0; + } + } + } + + assert(tiles[4].buffers); + device->map_neighbor_tiles(tile_device, tiles); +} + +void Session::unmap_neighbor_tiles(RenderTile *tiles, Device *tile_device) +{ + thread_scoped_lock tile_lock(tile_mutex); + device->unmap_neighbor_tiles(tile_device, tiles); +} + void Session::run_cpu() { bool tiles_written = false; @@ -558,8 +594,8 @@ void Session::run_cpu() /* update status and timing */ update_status_time(); - /* path trace */ - path_trace(); + /* render */ + render(); /* update status and timing */ update_status_time(); @@ -608,13 +644,11 @@ DeviceRequestedFeatures Session::get_requested_device_features() DeviceRequestedFeatures requested_features; requested_features.experimental = params.experimental; - requested_features.max_closure = get_max_closure_count(); scene->shader_manager->get_requested_features( scene, &requested_features); if(!params.background) { /* Avoid too much re-compilations for viewport render. */ - requested_features.max_closure = 64; requested_features.max_nodes_group = NODE_GROUP_LEVEL_MAX; requested_features.nodes_features = NODE_FEATURE_ALL; } @@ -624,37 +658,46 @@ DeviceRequestedFeatures Session::get_requested_device_features() */ requested_features.use_hair = false; requested_features.use_object_motion = false; - requested_features.use_camera_motion = scene->camera->use_motion; + requested_features.use_camera_motion = scene->camera->use_motion(); foreach(Object *object, scene->objects) { Mesh *mesh = object->mesh; if(mesh->num_curves()) { requested_features.use_hair = true; } - requested_features.use_object_motion |= object->use_motion | mesh->use_motion_blur; + requested_features.use_object_motion |= object->use_motion() | mesh->use_motion_blur; requested_features.use_camera_motion |= mesh->use_motion_blur; #ifdef WITH_OPENSUBDIV if(mesh->subdivision_type != Mesh::SUBDIVISION_NONE) { requested_features.use_patch_evaluation = true; } #endif + if(object->is_shadow_catcher) { + requested_features.use_shadow_tricks = true; + } } BakeManager *bake_manager = scene->bake_manager; requested_features.use_baking = bake_manager->get_baking(); requested_features.use_integrator_branched = (scene->integrator->method == Integrator::BRANCHED_PATH); - requested_features.use_transparent &= scene->integrator->transparent_shadows; + requested_features.use_denoising = params.use_denoising; return requested_features; } -void Session::load_kernels() +void Session::load_kernels(bool lock_scene) { - thread_scoped_lock scene_lock(scene->mutex); + thread_scoped_lock scene_lock; + if(lock_scene) { + scene_lock = thread_scoped_lock(scene->mutex); + } - if(!kernels_loaded) { + DeviceRequestedFeatures requested_features = get_requested_device_features(); + + if(!kernels_loaded || loaded_kernel_features.modified(requested_features)) { progress.set_status("Loading render kernels (may take a few minutes the first time)"); - DeviceRequestedFeatures requested_features = get_requested_device_features(); + scoped_timer timer; + VLOG(2) << "Requested features:\n" << requested_features; if(!device->load_kernels(requested_features)) { string message = device->error_message(); @@ -667,7 +710,11 @@ void Session::load_kernels() return; } + progress.add_skip_time(timer, false); + VLOG(1) << "Total time spent loading kernels: " << time_dt() - timer.get_start(); + kernels_loaded = true; + loaded_kernel_features = requested_features; } } @@ -707,11 +754,11 @@ bool Session::draw(BufferParams& buffer_params, DeviceDrawParams &draw_params) void Session::reset_(BufferParams& buffer_params, int samples) { - if(buffers) { - if(buffer_params.modified(buffers->params)) { - gpu_draw_ready = false; - buffers->reset(device, buffer_params); - display->reset(device, buffer_params); + if(buffers && buffer_params.modified(tile_manager.params)) { + gpu_draw_ready = false; + buffers->reset(buffer_params); + if(display) { + display->reset(buffer_params); } } @@ -732,15 +779,6 @@ void Session::reset(BufferParams& buffer_params, int samples) reset_gpu(buffer_params, samples); else reset_cpu(buffer_params, samples); - - if(params.progressive_refine) { - thread_scoped_lock buffers_lock(buffers_mutex); - - foreach(RenderBuffers *buffers, tile_buffers) - delete buffers; - - tile_buffers.clear(); - } } void Session::set_samples(int samples) @@ -818,6 +856,18 @@ void Session::update_scene() /* update scene */ if(scene->need_update()) { + load_kernels(false); + + /* Update max_closures. */ + KernelIntegrator *kintegrator = &scene->dscene.data.integrator; + if(params.background) { + kintegrator->max_closures = get_max_closure_count(); + } + else { + /* Currently viewport render is faster with higher max_closures, needs investigating. */ + kintegrator->max_closures = 64; + } + progress.set_status("Updating Scene"); MEM_GUARDED_CALL(&progress, scene->device_update, device, progress); } @@ -828,7 +878,7 @@ void Session::update_status_time(bool show_pause, bool show_done) int progressive_sample = tile_manager.state.sample; int num_samples = tile_manager.get_num_effective_samples(); - int tile = tile_manager.state.num_rendered_tiles; + int tile = progress.get_rendered_tiles(); int num_tiles = tile_manager.state.num_tiles; /* update status */ @@ -836,11 +886,12 @@ void Session::update_status_time(bool show_pause, bool show_done) if(!params.progressive) { const bool is_cpu = params.device.type == DEVICE_CPU; - const bool is_last_tile = (progress.get_finished_tiles() + 1) == num_tiles; + const bool rendering_finished = (tile == num_tiles); + const bool is_last_tile = (tile + 1) == num_tiles; - substatus = string_printf("Path Tracing Tile %d/%d", tile, num_tiles); + substatus = string_printf("Rendered %d/%d Tiles", tile, num_tiles); - if(device->show_samples() || (is_cpu && is_last_tile)) { + if(!rendering_finished && (device->show_samples() || (is_cpu && is_last_tile))) { /* Some devices automatically support showing the sample number: * - CUDADevice * - OpenCLDevice when using the megakernel (the split kernel renders multiple @@ -852,6 +903,9 @@ void Session::update_status_time(bool show_pause, bool show_done) */ substatus += string_printf(", Sample %d/%d", progress.get_current_sample(), num_samples); } + if(params.use_denoising) { + substatus += string_printf(", Denoised %d tiles", progress.get_denoised_tiles()); + } } else if(tile_manager.num_samples == INT_MAX) substatus = string_printf("Path Tracing Sample %d", progressive_sample+1); @@ -865,6 +919,7 @@ void Session::update_status_time(bool show_pause, bool show_done) } else if(show_done) { status = "Done"; + progress.set_end_time(); /* Save end time so that further calls to get_time are accurate. */ } else { status = substatus; @@ -874,19 +929,39 @@ void Session::update_status_time(bool show_pause, bool show_done) progress.set_status(status, substatus); } -void Session::path_trace() +void Session::render() { - /* add path trace task */ - DeviceTask task(DeviceTask::PATH_TRACE); + /* Clear buffers. */ + if(buffers && tile_manager.state.sample == tile_manager.range_start_sample) { + buffers->zero(); + } + + /* Add path trace task. */ + DeviceTask task(DeviceTask::RENDER); task.acquire_tile = function_bind(&Session::acquire_tile, this, _1, _2); task.release_tile = function_bind(&Session::release_tile, this, _1); + task.map_neighbor_tiles = function_bind(&Session::map_neighbor_tiles, this, _1, _2); + task.unmap_neighbor_tiles = function_bind(&Session::unmap_neighbor_tiles, this, _1, _2); task.get_cancel = function_bind(&Progress::get_cancel, &this->progress); task.update_tile_sample = function_bind(&Session::update_tile_sample, this, _1); task.update_progress_sample = function_bind(&Progress::add_samples, &this->progress, _1, _2); task.need_finish_queue = params.progressive_refine; task.integrator_branched = scene->integrator->method == Integrator::BRANCHED_PATH; task.requested_tile_size = params.tile_size; + task.passes_size = tile_manager.params.get_passes_size(); + + if(params.use_denoising) { + task.denoising_radius = params.denoising_radius; + task.denoising_strength = params.denoising_strength; + task.denoising_feature_strength = params.denoising_feature_strength; + task.denoising_relative_pca = params.denoising_relative_pca; + + assert(!scene->film->need_update); + task.pass_stride = scene->film->pass_stride; + task.pass_denoising_data = scene->film->denoising_data_offset; + task.pass_denoising_clean = scene->film->denoising_clean_offset; + } device->task_add(task); } @@ -931,10 +1006,18 @@ bool Session::update_progressive_refine(bool cancel) } if(params.progressive_refine) { - foreach(RenderBuffers *buffers, tile_buffers) { + foreach(Tile& tile, tile_manager.state.tiles) { + if(!tile.buffers) { + continue; + } + RenderTile rtile; - rtile.buffers = buffers; + rtile.x = tile_manager.state.buffer.full_x + tile.x; + rtile.y = tile_manager.state.buffer.full_y + tile.y; + rtile.w = tile.w; + rtile.h = tile.h; rtile.sample = sample; + rtile.buffers = tile.buffers; if(write) { if(write_render_tile_cb) @@ -942,7 +1025,7 @@ bool Session::update_progressive_refine(bool cancel) } else { if(update_render_tile_cb) - update_render_tile_cb(rtile); + update_render_tile_cb(rtile, true); } } } @@ -956,10 +1039,7 @@ void Session::device_free() { scene->device_free(); - foreach(RenderBuffers *buffers, tile_buffers) - delete buffers; - - tile_buffers.clear(); + tile_manager.device_free(); /* used from background render only, so no need to * re-create render/display buffers here diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h index c7ff1446171..e63cad0d977 100644 --- a/intern/cycles/render/session.h +++ b/intern/cycles/render/session.h @@ -17,15 +17,15 @@ #ifndef __SESSION_H__ #define __SESSION_H__ -#include "buffers.h" -#include "device.h" -#include "shader.h" -#include "tile.h" +#include "render/buffers.h" +#include "device/device.h" +#include "render/shader.h" +#include "render/tile.h" -#include "util_progress.h" -#include "util_stats.h" -#include "util_thread.h" -#include "util_vector.h" +#include "util/util_progress.h" +#include "util/util_stats.h" +#include "util/util_thread.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -45,7 +45,6 @@ public: DeviceInfo device; bool background; bool progressive_refine; - string output_path; bool progressive; bool experimental; @@ -53,10 +52,17 @@ public: int2 tile_size; TileOrder tile_order; int start_resolution; + int pixel_size; int threads; bool display_buffer_linear; + bool use_denoising; + int denoising_radius; + float denoising_strength; + float denoising_feature_strength; + bool denoising_relative_pca; + double cancel_timeout; double reset_timeout; double text_timeout; @@ -64,19 +70,30 @@ public: ShadingSystem shadingsystem; + function<bool(const uchar *pixels, + int width, + int height, + int channels)> write_render_cb; + SessionParams() { background = false; progressive_refine = false; - output_path = ""; progressive = false; experimental = false; samples = INT_MAX; tile_size = make_int2(64, 64); start_resolution = INT_MAX; + pixel_size = 1; threads = 0; + use_denoising = false; + denoising_radius = 8; + denoising_strength = 0.0f; + denoising_feature_strength = 0.0f; + denoising_relative_pca = false; + display_buffer_linear = false; cancel_timeout = 0.1; @@ -92,12 +109,12 @@ public: { return !(device == params.device && background == params.background && progressive_refine == params.progressive_refine - && output_path == params.output_path /* && samples == params.samples */ && progressive == params.progressive && experimental == params.experimental && tile_size == params.tile_size && start_resolution == params.start_resolution + && pixel_size == params.pixel_size && threads == params.threads && display_buffer_linear == params.display_buffer_linear && cancel_timeout == params.cancel_timeout @@ -126,7 +143,7 @@ public: Stats stats; function<void(RenderTile&)> write_render_tile_cb; - function<void(RenderTile&)> update_render_tile_cb; + function<void(RenderTile&, bool)> update_render_tile_cb; explicit Session(const SessionParams& params); ~Session(); @@ -141,7 +158,7 @@ public: void set_pause(bool pause); void update_scene(); - void load_kernels(); + void load_kernels(bool lock_scene=true); void device_free(); @@ -162,7 +179,7 @@ protected: void update_status_time(bool show_pause = false, bool show_done = false); void tonemap(int sample); - void path_trace(); + void render(); void reset_(BufferParams& params, int samples); void run_cpu(); @@ -177,6 +194,9 @@ protected: void update_tile_sample(RenderTile& tile); void release_tile(RenderTile& tile); + void map_neighbor_tiles(RenderTile *tiles, Device *tile_device); + void unmap_neighbor_tiles(RenderTile *tiles, Device *tile_device); + bool device_use_gl; thread *session_thread; @@ -195,6 +215,7 @@ protected: thread_mutex display_mutex; bool kernels_loaded; + DeviceRequestedFeatures loaded_kernel_features; double reset_time; @@ -202,8 +223,6 @@ protected: double last_update_time; bool update_progressive_refine(bool cancel); - vector<RenderBuffers *> tile_buffers; - DeviceRequestedFeatures get_requested_device_features(); /* ** Split kernel routines ** */ diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp index 335edcbe609..ec52c51e337 100644 --- a/intern/cycles/render/shader.cpp +++ b/intern/cycles/render/shader.cpp @@ -14,26 +14,28 @@ * limitations under the License. */ -#include "background.h" -#include "camera.h" -#include "device.h" -#include "graph.h" -#include "integrator.h" -#include "light.h" -#include "mesh.h" -#include "nodes.h" -#include "object.h" -#include "osl.h" -#include "scene.h" -#include "shader.h" -#include "svm.h" -#include "tables.h" - -#include "util_foreach.h" +#include "render/background.h" +#include "render/camera.h" +#include "device/device.h" +#include "render/graph.h" +#include "render/integrator.h" +#include "render/light.h" +#include "render/mesh.h" +#include "render/nodes.h" +#include "render/object.h" +#include "render/osl.h" +#include "render/scene.h" +#include "render/shader.h" +#include "render/svm.h" +#include "render/tables.h" + +#include "util/util_foreach.h" CCL_NAMESPACE_BEGIN +thread_mutex ShaderManager::lookup_table_mutex; vector<float> ShaderManager::beckmann_table; +bool ShaderManager::beckmann_table_ready = false; /* Beckmann sampling precomputed table, see bsdf_microfacet.h */ @@ -49,6 +51,16 @@ static float beckmann_table_slope_max() return 6.0; } + +/* MSVC 2015 needs this ugly hack to prevent a codegen bug on x86 + * see T50176 for details + */ +#if defined(_MSC_VER) && (_MSC_VER == 1900) +# define MSVC_VOLATILE volatile +#else +# define MSVC_VOLATILE +#endif + /* Paper used: Importance Sampling Microfacet-Based BSDFs with the * Distribution of Visible Normals. Supplemental Material 2/2. * @@ -72,7 +84,7 @@ static void beckmann_table_rows(float *table, int row_from, int row_to) slope_x[0] = (double)-beckmann_table_slope_max(); CDF_P22_omega_i[0] = 0; - for(int index_slope_x = 1; index_slope_x < DATA_TMP_SIZE; ++index_slope_x) { + for(MSVC_VOLATILE int index_slope_x = 1; index_slope_x < DATA_TMP_SIZE; ++index_slope_x) { /* slope_x */ slope_x[index_slope_x] = (double)(-beckmann_table_slope_max() + 2.0f * beckmann_table_slope_max() * index_slope_x/(DATA_TMP_SIZE - 1.0f)); @@ -116,6 +128,8 @@ static void beckmann_table_rows(float *table, int row_from, int row_to) } } +#undef MSVC_VOLATILE + static void beckmann_table_build(vector<float>& table) { table.resize(BECKMANN_TABLE_SIZE*BECKMANN_TABLE_SIZE); @@ -165,7 +179,6 @@ Shader::Shader() pass_id = 0; graph = NULL; - graph_bump = NULL; has_surface = false; has_surface_transparent = false; @@ -173,11 +186,14 @@ Shader::Shader() has_surface_bssrdf = false; has_volume = false; has_displacement = false; + has_bump = false; has_bssrdf_bump = false; has_surface_spatial_varying = false; has_volume_spatial_varying = false; has_object_dependency = false; + has_attribute_dependency = false; has_integrator_dependency = false; + has_volume_connected = false; displacement_method = DISPLACE_BUMP; @@ -185,13 +201,12 @@ Shader::Shader() used = false; need_update = true; - need_update_attributes = true; + need_update_mesh = true; } Shader::~Shader() { delete graph; - delete graph_bump; } bool Shader::is_constant_emission(float3 *emission) @@ -221,14 +236,31 @@ void Shader::set_graph(ShaderGraph *graph_) /* do this here already so that we can detect if mesh or object attributes * are needed, since the node attribute callbacks check if their sockets * are connected but proxy nodes should not count */ - if(graph_) + if(graph_) { graph_->remove_proxy_nodes(); + if(displacement_method != DISPLACE_BUMP) { + graph_->compute_displacement_hash(); + } + } + + /* update geometry if displacement changed */ + if(displacement_method != DISPLACE_BUMP) { + const char *old_hash = (graph)? graph->displacement_hash.c_str() : ""; + const char *new_hash = (graph_)? graph_->displacement_hash.c_str() : ""; + + if(strcmp(old_hash, new_hash) != 0) { + need_update_mesh = true; + } + } + /* assign graph */ delete graph; - delete graph_bump; graph = graph_; - graph_bump = NULL; + + /* Store info here before graph optimization to make sure that + * nodes that get optimized away still count. */ + has_volume_connected = (graph->output()->input("Volume")->link != NULL); } void Shader::tag_update(Scene *scene) @@ -278,9 +310,9 @@ void Shader::tag_update(Scene *scene) } /* compare if the attributes changed, mesh manager will check - * need_update_attributes, update the relevant meshes and clear it. */ + * need_update_mesh, update the relevant meshes and clear it. */ if(attributes.modified(prev_attributes)) { - need_update_attributes = true; + need_update_mesh = true; scene->mesh_manager->need_update = true; } @@ -319,11 +351,14 @@ ShaderManager *ShaderManager::create(Scene *scene, int shadingsystem) (void)shadingsystem; /* Ignored when built without OSL. */ #ifdef WITH_OSL - if(shadingsystem == SHADINGSYSTEM_OSL) + if(shadingsystem == SHADINGSYSTEM_OSL) { manager = new OSLShaderManager(); + } else #endif + { manager = new SVMShaderManager(); + } add_default(scene); @@ -332,6 +367,8 @@ ShaderManager *ShaderManager::create(Scene *scene, int shadingsystem) uint ShaderManager::get_attribute_id(ustring name) { + thread_scoped_spin_lock lock(attribute_lock_); + /* get a unique id for each name, for SVM attribute lookup */ AttributeIDMap::iterator it = unique_attribute_id.find(name); @@ -395,15 +432,12 @@ void ShaderManager::device_update_common(Device *device, Scene *scene, Progress& /*progress*/) { - device->tex_free(dscene->shader_flag); - dscene->shader_flag.clear(); + dscene->shaders.free(); if(scene->shaders.size() == 0) return; - uint shader_flag_size = scene->shaders.size()*SHADER_SIZE; - uint *shader_flag = dscene->shader_flag.resize(shader_flag_size); - uint i = 0; + KernelShader *kshader = dscene->shaders.alloc(scene->shaders.size()); bool has_volumes = false; bool has_transparent_shadow = false; @@ -418,60 +452,61 @@ void ShaderManager::device_update_common(Device *device, flag |= SD_HAS_VOLUME; has_volumes = true; - /* in this case we can assume transparent surface */ - if(!shader->has_surface) - flag |= SD_HAS_ONLY_VOLUME; - /* todo: this could check more fine grained, to skip useless volumes * enclosed inside an opaque bsdf. */ flag |= SD_HAS_TRANSPARENT_SHADOW; } + /* in this case we can assume transparent surface */ + if(shader->has_volume_connected && !shader->has_surface) + flag |= SD_HAS_ONLY_VOLUME; if(shader->heterogeneous_volume && shader->has_volume_spatial_varying) flag |= SD_HETEROGENEOUS_VOLUME; + if(shader->has_attribute_dependency) + flag |= SD_NEED_ATTRIBUTES; if(shader->has_bssrdf_bump) flag |= SD_HAS_BSSRDF_BUMP; - if(shader->volume_sampling_method == VOLUME_SAMPLING_EQUIANGULAR) - flag |= SD_VOLUME_EQUIANGULAR; - if(shader->volume_sampling_method == VOLUME_SAMPLING_MULTIPLE_IMPORTANCE) - flag |= SD_VOLUME_MIS; + if(device->info.has_volume_decoupled) { + if(shader->volume_sampling_method == VOLUME_SAMPLING_EQUIANGULAR) + flag |= SD_VOLUME_EQUIANGULAR; + if(shader->volume_sampling_method == VOLUME_SAMPLING_MULTIPLE_IMPORTANCE) + flag |= SD_VOLUME_MIS; + } if(shader->volume_interpolation_method == VOLUME_INTERPOLATION_CUBIC) flag |= SD_VOLUME_CUBIC; - if(shader->graph_bump) + if(shader->has_bump) flag |= SD_HAS_BUMP; if(shader->displacement_method != DISPLACE_BUMP) flag |= SD_HAS_DISPLACEMENT; - /* shader with bump mapping */ - if(shader->displacement_method != DISPLACE_TRUE && shader->graph_bump) - flag |= SD_HAS_BSSRDF_BUMP; - /* constant emission check */ float3 constant_emission = make_float3(0.0f, 0.0f, 0.0f); if(shader->is_constant_emission(&constant_emission)) flag |= SD_HAS_CONSTANT_EMISSION; /* regular shader */ - shader_flag[i++] = flag; - shader_flag[i++] = shader->pass_id; - shader_flag[i++] = __float_as_int(constant_emission.x); - shader_flag[i++] = __float_as_int(constant_emission.y); - shader_flag[i++] = __float_as_int(constant_emission.z); + kshader->flags = flag; + kshader->pass_id = shader->pass_id; + kshader->constant_emission[0] = constant_emission.x; + kshader->constant_emission[1] = constant_emission.y; + kshader->constant_emission[2] = constant_emission.z; + kshader++; has_transparent_shadow |= (flag & SD_HAS_TRANSPARENT_SHADOW) != 0; } - device->tex_alloc("__shader_flag", dscene->shader_flag); + dscene->shaders.copy_to_device(); /* lookup tables */ KernelTables *ktables = &dscene->data.tables; /* beckmann lookup table */ if(beckmann_table_offset == TABLE_OFFSET_INVALID) { - if(beckmann_table.size() == 0) { + if(!beckmann_table_ready) { thread_scoped_lock lock(lookup_table_mutex); - if(beckmann_table.size() == 0) { + if(!beckmann_table_ready) { beckmann_table_build(beckmann_table); + beckmann_table_ready = true; } } beckmann_table_offset = scene->lookup_tables->add_table(dscene, beckmann_table); @@ -482,17 +517,14 @@ void ShaderManager::device_update_common(Device *device, KernelIntegrator *kintegrator = &dscene->data.integrator; kintegrator->use_volumes = has_volumes; /* TODO(sergey): De-duplicate with flags set in integrator.cpp. */ - if(scene->integrator->transparent_shadows) { - kintegrator->transparent_shadows = has_transparent_shadow; - } + kintegrator->transparent_shadows = has_transparent_shadow; } -void ShaderManager::device_free_common(Device *device, DeviceScene *dscene, Scene *scene) +void ShaderManager::device_free_common(Device *, DeviceScene *dscene, Scene *scene) { scene->lookup_tables->remove_table(&beckmann_table_offset); - device->tex_free(dscene->shader_flag); - dscene->shader_flag.clear(); + dscene->shaders.free(); } void ShaderManager::add_default(Scene *scene) @@ -567,6 +599,9 @@ void ShaderManager::get_requested_graph_features(ShaderGraph *graph, if(CLOSURE_IS_VOLUME(bsdf_node->closure)) { requested_features->nodes_features |= NODE_FEATURE_VOLUME; } + else if(CLOSURE_IS_PRINCIPLED(bsdf_node->closure)) { + requested_features->use_principled = true; + } } if(node->has_surface_bssrdf()) { requested_features->use_subsurface = true; @@ -574,6 +609,9 @@ void ShaderManager::get_requested_graph_features(ShaderGraph *graph, if(node->has_surface_transparent()) { requested_features->use_transparent = true; } + if(node->has_raytrace()) { + requested_features->use_shader_raytrace = true; + } } } @@ -586,15 +624,10 @@ void ShaderManager::get_requested_features(Scene *scene, Shader *shader = scene->shaders[i]; /* Gather requested features from all the nodes from the graph nodes. */ get_requested_graph_features(shader->graph, requested_features); - /* Gather requested features from the graph itself. */ - if(shader->graph_bump) { - get_requested_graph_features(shader->graph_bump, - requested_features); - } ShaderNode *output_node = shader->graph->output(); if(output_node->input("Displacement")->link != NULL) { requested_features->nodes_features |= NODE_FEATURE_BUMP; - if(shader->displacement_method == DISPLACE_BOTH && requested_features->experimental) { + if(shader->displacement_method == DISPLACE_BOTH) { requested_features->nodes_features |= NODE_FEATURE_BUMP_STATE; } } diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h index 7d896652196..abd483caabc 100644 --- a/intern/cycles/render/shader.h +++ b/intern/cycles/render/shader.h @@ -19,20 +19,20 @@ #ifdef WITH_OSL /* So no context pollution happens from indirectly included windows.h */ -# include "util_windows.h" +# include "util/util_windows.h" # include <OSL/oslexec.h> #endif -#include "attribute.h" -#include "kernel_types.h" +#include "render/attribute.h" +#include "kernel/kernel_types.h" -#include "node.h" +#include "graph/node.h" -#include "util_map.h" -#include "util_param.h" -#include "util_string.h" -#include "util_thread.h" -#include "util_types.h" +#include "util/util_map.h" +#include "util/util_param.h" +#include "util/util_string.h" +#include "util/util_thread.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN @@ -82,18 +82,13 @@ enum DisplacementMethod { class Shader : public Node { public: - NODE_DECLARE; + NODE_DECLARE int pass_id; /* shader graph */ ShaderGraph *graph; - /* shader graph with auto bump mapping included, we compile two shaders, - * with and without bump, because the displacement method is a mesh - * level setting, so we need to handle both */ - ShaderGraph *graph_bump; - /* sampling */ bool use_mis; bool use_transparent_shadow; @@ -103,7 +98,16 @@ public: /* synchronization */ bool need_update; - bool need_update_attributes; + bool need_update_mesh; + + /* If the shader has only volume components, the surface is assumed to + * be transparent. + * However, graph optimization might remove the volume subgraph, but + * since the user connected something to the volume output the surface + * should still be transparent. + * Therefore, has_volume_connected stores whether some volume subtree + * was connected before optimization. */ + bool has_volume_connected; /* information about shader after compiling */ bool has_surface; @@ -112,10 +116,12 @@ public: bool has_volume; bool has_displacement; bool has_surface_bssrdf; + bool has_bump; bool has_bssrdf_bump; bool has_surface_spatial_varying; bool has_volume_spatial_varying; bool has_object_dependency; + bool has_attribute_dependency; bool has_integrator_dependency; /* displacement */ @@ -195,13 +201,16 @@ protected: typedef unordered_map<ustring, uint, ustringHash> AttributeIDMap; AttributeIDMap unique_attribute_id; - thread_mutex lookup_table_mutex; + static thread_mutex lookup_table_mutex; static vector<float> beckmann_table; + static bool beckmann_table_ready; size_t beckmann_table_offset; void get_requested_graph_features(ShaderGraph *graph, DeviceRequestedFeatures *requested_features); + + thread_spin_lock attribute_lock_; }; CCL_NAMESPACE_END diff --git a/intern/cycles/render/sobol.cpp b/intern/cycles/render/sobol.cpp index e3c2e802067..6906667ac19 100644 --- a/intern/cycles/render/sobol.cpp +++ b/intern/cycles/render/sobol.cpp @@ -46,10 +46,9 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "util_debug.h" -#include "util_types.h" +#include "util/util_types.h" -#include "sobol.h" +#include "render/sobol.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/sobol.h b/intern/cycles/render/sobol.h index 574f148b9a2..9fbce4e14a5 100644 --- a/intern/cycles/render/sobol.h +++ b/intern/cycles/render/sobol.h @@ -17,7 +17,7 @@ #ifndef __SOBOL_H__ #define __SOBOL_H__ -#include "util_types.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/svm.cpp b/intern/cycles/render/svm.cpp index 955b892f4c3..c5b4060d5c3 100644 --- a/intern/cycles/render/svm.cpp +++ b/intern/cycles/render/svm.cpp @@ -14,20 +14,19 @@ * limitations under the License. */ -#include "device.h" -#include "graph.h" -#include "light.h" -#include "mesh.h" -#include "nodes.h" -#include "scene.h" -#include "shader.h" -#include "svm.h" - -#include "util_debug.h" -#include "util_logging.h" -#include "util_foreach.h" -#include "util_progress.h" -#include "util_task.h" +#include "device/device.h" +#include "render/graph.h" +#include "render/light.h" +#include "render/mesh.h" +#include "render/nodes.h" +#include "render/scene.h" +#include "render/shader.h" +#include "render/svm.h" + +#include "util/util_logging.h" +#include "util/util_foreach.h" +#include "util/util_progress.h" +#include "util/util_task.h" CCL_NAMESPACE_BEGIN @@ -48,15 +47,15 @@ void SVMShaderManager::reset(Scene * /*scene*/) void SVMShaderManager::device_update_shader(Scene *scene, Shader *shader, Progress *progress, - vector<int4> *global_svm_nodes) + array<int4> *global_svm_nodes) { if(progress->get_cancel()) { return; } assert(shader->graph); - vector<int4> svm_nodes; - svm_nodes.push_back(make_int4(NODE_SHADER_JUMP, 0, 0, 0)); + array<int4> svm_nodes; + svm_nodes.push_back_slow(make_int4(NODE_SHADER_JUMP, 0, 0, 0)); SVMCompiler::Summary summary; SVMCompiler compiler(scene->shader_manager, scene->image_manager); @@ -67,6 +66,7 @@ void SVMShaderManager::device_update_shader(Scene *scene, << "Shader name: " << shader->name << "\n" << summary.full_report(); + nodes_lock_.lock(); if(shader->use_mis && shader->has_surface_emission) { scene->light_manager->need_update = true; } @@ -74,17 +74,16 @@ void SVMShaderManager::device_update_shader(Scene *scene, /* The copy needs to be done inside the lock, if another thread resizes the array * while memcpy is running, it'll be copying into possibly invalid/freed ram. */ - nodes_lock_.lock(); size_t global_nodes_size = global_svm_nodes->size(); global_svm_nodes->resize(global_nodes_size + svm_nodes.size()); /* Offset local SVM nodes to a global address space. */ - int4& jump_node = global_svm_nodes->at(shader->id); + int4& jump_node = (*global_svm_nodes)[shader->id]; jump_node.y = svm_nodes[0].y + global_nodes_size - 1; jump_node.z = svm_nodes[0].z + global_nodes_size - 1; jump_node.w = svm_nodes[0].w + global_nodes_size - 1; /* Copy new nodes to global storage. */ - memcpy(&global_svm_nodes->at(global_nodes_size), + memcpy(&(*global_svm_nodes)[global_nodes_size], &svm_nodes[1], sizeof(int4) * (svm_nodes.size() - 1)); nodes_lock_.unlock(); @@ -106,11 +105,11 @@ void SVMShaderManager::device_update(Device *device, DeviceScene *dscene, Scene device_update_shaders_used(scene); /* svm_nodes */ - vector<int4> svm_nodes; + array<int4> svm_nodes; size_t i; for(i = 0; i < scene->shaders.size(); i++) { - svm_nodes.push_back(make_int4(NODE_SHADER_JUMP, 0, 0, 0)); + svm_nodes.push_back_slow(make_int4(NODE_SHADER_JUMP, 0, 0, 0)); } TaskPool task_pool; @@ -129,8 +128,8 @@ void SVMShaderManager::device_update(Device *device, DeviceScene *dscene, Scene return; } - dscene->svm_nodes.copy((uint4*)&svm_nodes[0], svm_nodes.size()); - device->tex_alloc("__svm_nodes", dscene->svm_nodes); + dscene->svm_nodes.steal_data(svm_nodes); + dscene->svm_nodes.copy_to_device(); for(i = 0; i < scene->shaders.size(); i++) { Shader *shader = scene->shaders[i]; @@ -150,8 +149,7 @@ void SVMShaderManager::device_free(Device *device, DeviceScene *dscene, Scene *s { device_free_common(device, dscene, scene); - device->tex_free(dscene->svm_nodes); - dscene->svm_nodes.clear(); + dscene->svm_nodes.free(); } /* Graph Compiler */ @@ -366,17 +364,17 @@ uint SVMCompiler::encode_uchar4(uint x, uint y, uint z, uint w) void SVMCompiler::add_node(int a, int b, int c, int d) { - current_svm_nodes.push_back(make_int4(a, b, c, d)); + current_svm_nodes.push_back_slow(make_int4(a, b, c, d)); } void SVMCompiler::add_node(ShaderNodeType type, int a, int b, int c) { - current_svm_nodes.push_back(make_int4(type, a, b, c)); + current_svm_nodes.push_back_slow(make_int4(type, a, b, c)); } void SVMCompiler::add_node(ShaderNodeType type, const float3& f) { - current_svm_nodes.push_back(make_int4(type, + current_svm_nodes.push_back_slow(make_int4(type, __float_as_int(f.x), __float_as_int(f.y), __float_as_int(f.z))); @@ -384,7 +382,7 @@ void SVMCompiler::add_node(ShaderNodeType type, const float3& f) void SVMCompiler::add_node(const float4& f) { - current_svm_nodes.push_back(make_int4( + current_svm_nodes.push_back_slow(make_int4( __float_as_int(f.x), __float_as_int(f.y), __float_as_int(f.z), @@ -401,6 +399,12 @@ uint SVMCompiler::attribute(AttributeStandard std) return shader_manager->get_attribute_id(std); } +uint SVMCompiler::attribute_standard(ustring name) +{ + AttributeStandard std = Attribute::name_standard(name.c_str()); + return (std)? attribute(std): attribute(name); +} + bool SVMCompiler::node_skip_input(ShaderNode * /*node*/, ShaderInput *input) { /* nasty exception .. */ @@ -449,6 +453,10 @@ void SVMCompiler::generate_node(ShaderNode *node, ShaderNodeSet& done) current_shader->has_object_dependency = true; } + if(node->has_attribute_dependency()) { + current_shader->has_attribute_dependency = true; + } + if(node->has_integrator_dependency()) { current_shader->has_integrator_dependency = true; } @@ -521,6 +529,9 @@ void SVMCompiler::generate_closure_node(ShaderNode *node, if(node->has_bssrdf_bump()) current_shader->has_bssrdf_bump = true; } + if(node->has_bump()) { + current_shader->has_bump = true; + } } } @@ -624,7 +635,7 @@ void SVMCompiler::generate_multi_closure(ShaderNode *root_node, /* Add instruction to skip closure and its dependencies if mix * weight is zero. */ - current_svm_nodes.push_back(make_int4(NODE_JUMP_IF_ONE, + current_svm_nodes.push_back_slow(make_int4(NODE_JUMP_IF_ONE, 0, stack_assign(facin), 0)); @@ -642,7 +653,7 @@ void SVMCompiler::generate_multi_closure(ShaderNode *root_node, /* Add instruction to skip closure and its dependencies if mix * weight is zero. */ - current_svm_nodes.push_back(make_int4(NODE_JUMP_IF_ZERO, + current_svm_nodes.push_back_slow(make_int4(NODE_JUMP_IF_ZERO, 0, stack_assign(facin), 0)); @@ -794,36 +805,26 @@ void SVMCompiler::compile_type(Shader *shader, ShaderGraph *graph, ShaderType ty void SVMCompiler::compile(Scene *scene, Shader *shader, - vector<int4>& svm_nodes, + array<int4>& svm_nodes, int index, Summary *summary) { /* copy graph for shader with bump mapping */ - ShaderNode *node = shader->graph->output(); + ShaderNode *output = shader->graph->output(); int start_num_svm_nodes = svm_nodes.size(); const double time_start = time_dt(); - if(node->input("Surface")->link && node->input("Displacement")->link) - if(!shader->graph_bump) - shader->graph_bump = shader->graph->copy(); + bool has_bump = (shader->displacement_method != DISPLACE_TRUE) && + output->input("Surface")->link && output->input("Displacement")->link; /* finalize */ { scoped_timer timer((summary != NULL)? &summary->time_finalize: NULL); shader->graph->finalize(scene, - false, - false, - shader->has_integrator_dependency); - } - - if(shader->graph_bump) { - scoped_timer timer((summary != NULL)? &summary->time_finalize_bump: NULL); - shader->graph_bump->finalize(scene, - true, - false, - shader->has_integrator_dependency, - shader->displacement_method == DISPLACE_BOTH); + has_bump, + shader->has_integrator_dependency, + shader->displacement_method == DISPLACE_BOTH); } current_shader = shader; @@ -832,22 +833,22 @@ void SVMCompiler::compile(Scene *scene, shader->has_surface_emission = false; shader->has_surface_transparent = false; shader->has_surface_bssrdf = false; - shader->has_bssrdf_bump = false; + shader->has_bump = has_bump; + shader->has_bssrdf_bump = has_bump; shader->has_volume = false; shader->has_displacement = false; shader->has_surface_spatial_varying = false; shader->has_volume_spatial_varying = false; shader->has_object_dependency = false; + shader->has_attribute_dependency = false; shader->has_integrator_dependency = false; /* generate bump shader */ - if(shader->displacement_method != DISPLACE_TRUE && shader->graph_bump) { + if(has_bump) { scoped_timer timer((summary != NULL)? &summary->time_generate_bump: NULL); - compile_type(shader, shader->graph_bump, SHADER_TYPE_BUMP); + compile_type(shader, shader->graph, SHADER_TYPE_BUMP); svm_nodes[index].y = svm_nodes.size(); - svm_nodes.insert(svm_nodes.end(), - current_svm_nodes.begin(), - current_svm_nodes.end()); + svm_nodes.append(current_svm_nodes); } /* generate surface shader */ @@ -855,12 +856,10 @@ void SVMCompiler::compile(Scene *scene, scoped_timer timer((summary != NULL)? &summary->time_generate_surface: NULL); compile_type(shader, shader->graph, SHADER_TYPE_SURFACE); /* only set jump offset if there's no bump shader, as the bump shader will fall thru to this one if it exists */ - if(shader->displacement_method == DISPLACE_TRUE || !shader->graph_bump) { + if(!has_bump) { svm_nodes[index].y = svm_nodes.size(); } - svm_nodes.insert(svm_nodes.end(), - current_svm_nodes.begin(), - current_svm_nodes.end()); + svm_nodes.append(current_svm_nodes); } /* generate volume shader */ @@ -868,9 +867,7 @@ void SVMCompiler::compile(Scene *scene, scoped_timer timer((summary != NULL)? &summary->time_generate_volume: NULL); compile_type(shader, shader->graph, SHADER_TYPE_VOLUME); svm_nodes[index].z = svm_nodes.size(); - svm_nodes.insert(svm_nodes.end(), - current_svm_nodes.begin(), - current_svm_nodes.end()); + svm_nodes.append(current_svm_nodes); } /* generate displacement shader */ @@ -878,9 +875,7 @@ void SVMCompiler::compile(Scene *scene, scoped_timer timer((summary != NULL)? &summary->time_generate_displacement: NULL); compile_type(shader, shader->graph, SHADER_TYPE_DISPLACEMENT); svm_nodes[index].w = svm_nodes.size(); - svm_nodes.insert(svm_nodes.end(), - current_svm_nodes.begin(), - current_svm_nodes.end()); + svm_nodes.append(current_svm_nodes); } /* Fill in summary information. */ @@ -897,7 +892,6 @@ SVMCompiler::Summary::Summary() : num_svm_nodes(0), peak_stack_usage(0), time_finalize(0.0), - time_finalize_bump(0.0), time_generate_surface(0.0), time_generate_bump(0.0), time_generate_volume(0.0), @@ -913,10 +907,7 @@ string SVMCompiler::Summary::full_report() const report += string_printf("Peak stack usage: %d\n", peak_stack_usage); report += string_printf("Time (in seconds):\n"); - report += string_printf(" Finalize: %f\n", time_finalize); - report += string_printf(" Bump finalize: %f\n", time_finalize_bump); - report += string_printf("Finalize: %f\n", time_finalize + - time_finalize_bump); + report += string_printf("Finalize: %f\n", time_finalize); report += string_printf(" Surface: %f\n", time_generate_surface); report += string_printf(" Bump: %f\n", time_generate_bump); report += string_printf(" Volume: %f\n", time_generate_volume); diff --git a/intern/cycles/render/svm.h b/intern/cycles/render/svm.h index a501b6bc8b1..18be0fa9a22 100644 --- a/intern/cycles/render/svm.h +++ b/intern/cycles/render/svm.h @@ -17,13 +17,13 @@ #ifndef __SVM_H__ #define __SVM_H__ -#include "attribute.h" -#include "graph.h" -#include "shader.h" +#include "render/attribute.h" +#include "render/graph.h" +#include "render/shader.h" -#include "util_set.h" -#include "util_string.h" -#include "util_thread.h" +#include "util/util_set.h" +#include "util/util_string.h" +#include "util/util_thread.h" CCL_NAMESPACE_BEGIN @@ -55,7 +55,7 @@ protected: void device_update_shader(Scene *scene, Shader *shader, Progress *progress, - vector<int4> *global_svm_nodes); + array<int4> *global_svm_nodes); }; /* Graph Compiler */ @@ -74,9 +74,6 @@ public: /* Time spent on surface graph finalization. */ double time_finalize; - /* Time spent on bump graph finalization. */ - double time_finalize_bump; - /* Time spent on generating SVM nodes for surface shader. */ double time_generate_surface; @@ -101,7 +98,7 @@ public: SVMCompiler(ShaderManager *shader_manager, ImageManager *image_manager); void compile(Scene *scene, Shader *shader, - vector<int4>& svm_nodes, + array<int4>& svm_nodes, int index, Summary *summary = NULL); @@ -120,6 +117,7 @@ public: void add_node(const float4& f); uint attribute(ustring name); uint attribute(AttributeStandard std); + uint attribute_standard(ustring name); uint encode_uchar4(uint x, uint y = 0, uint z = 0, uint w = 0); uint closure_mix_weight_offset() { return mix_weight_offset; } @@ -210,7 +208,7 @@ protected: /* compile */ void compile_type(Shader *shader, ShaderGraph *graph, ShaderType type); - vector<int4> current_svm_nodes; + array<int4> current_svm_nodes; ShaderType current_type; Shader *current_shader; ShaderGraph *current_graph; diff --git a/intern/cycles/render/tables.cpp b/intern/cycles/render/tables.cpp index dfafd99961b..536fdd0775e 100644 --- a/intern/cycles/render/tables.cpp +++ b/intern/cycles/render/tables.cpp @@ -14,12 +14,11 @@ * limitations under the License. */ -#include "device.h" -#include "scene.h" -#include "tables.h" +#include "device/device.h" +#include "render/scene.h" +#include "render/tables.h" -#include "util_debug.h" -#include "util_logging.h" +#include "util/util_logging.h" CCL_NAMESPACE_BEGIN @@ -35,25 +34,22 @@ LookupTables::~LookupTables() assert(lookup_tables.size() == 0); } -void LookupTables::device_update(Device *device, DeviceScene *dscene) +void LookupTables::device_update(Device *, DeviceScene *dscene) { if(!need_update) return; VLOG(1) << "Total " << lookup_tables.size() << " lookup tables."; - device->tex_free(dscene->lookup_table); - if(lookup_tables.size() > 0) - device->tex_alloc("__lookup_table", dscene->lookup_table); + dscene->lookup_table.copy_to_device(); need_update = false; } -void LookupTables::device_free(Device *device, DeviceScene *dscene) +void LookupTables::device_free(Device *, DeviceScene *dscene) { - device->tex_free(dscene->lookup_table); - dscene->lookup_table.clear(); + dscene->lookup_table.free(); } static size_t round_up_to_multiple(size_t size, size_t chunk) @@ -90,7 +86,9 @@ size_t LookupTables::add_table(DeviceScene *dscene, vector<float>& data) } /* copy table data and return offset */ - dscene->lookup_table.copy_at(&data[0], new_table.offset, data.size()); + float *dtable = dscene->lookup_table.data(); + memcpy(dtable + new_table.offset, &data[0], sizeof(float) * data.size()); + return new_table.offset; } diff --git a/intern/cycles/render/tables.h b/intern/cycles/render/tables.h index 1bb70b22762..bc261c2a74d 100644 --- a/intern/cycles/render/tables.h +++ b/intern/cycles/render/tables.h @@ -17,7 +17,7 @@ #ifndef __TABLES_H__ #define __TABLES_H__ -#include <util_list.h> +#include "util/util_list.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp index a493c3fa1cd..a388f5dfc8b 100644 --- a/intern/cycles/render/tile.cpp +++ b/intern/cycles/render/tile.cpp @@ -14,10 +14,11 @@ * limitations under the License. */ -#include "tile.h" +#include "render/tile.h" -#include "util_algorithm.h" -#include "util_types.h" +#include "util/util_algorithm.h" +#include "util/util_foreach.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN @@ -25,37 +26,39 @@ namespace { class TileComparator { public: - TileComparator(TileOrder order, int2 center) - : order_(order), - center_(center) + TileComparator(TileOrder order_, int2 center_, Tile *tiles_) + : order(order_), + center(center_), + tiles(tiles_) {} - bool operator()(Tile &a, Tile &b) + bool operator()(int a, int b) { - switch(order_) { + switch(order) { case TILE_CENTER: { - float2 dist_a = make_float2(center_.x - (a.x + a.w/2), - center_.y - (a.y + a.h/2)); - float2 dist_b = make_float2(center_.x - (b.x + b.w/2), - center_.y - (b.y + b.h/2)); + float2 dist_a = make_float2(center.x - (tiles[a].x + tiles[a].w/2), + center.y - (tiles[a].y + tiles[a].h/2)); + float2 dist_b = make_float2(center.x - (tiles[b].x + tiles[b].w/2), + center.y - (tiles[b].y + tiles[b].h/2)); return dot(dist_a, dist_a) < dot(dist_b, dist_b); } case TILE_LEFT_TO_RIGHT: - return (a.x == b.x)? (a.y < b.y): (a.x < b.x); + return (tiles[a].x == tiles[b].x)? (tiles[a].y < tiles[b].y): (tiles[a].x < tiles[b].x); case TILE_RIGHT_TO_LEFT: - return (a.x == b.x)? (a.y < b.y): (a.x > b.x); + return (tiles[a].x == tiles[b].x)? (tiles[a].y < tiles[b].y): (tiles[a].x > tiles[b].x); case TILE_TOP_TO_BOTTOM: - return (a.y == b.y)? (a.x < b.x): (a.y > b.y); + return (tiles[a].y == tiles[b].y)? (tiles[a].x < tiles[b].x): (tiles[a].y > tiles[b].y); case TILE_BOTTOM_TO_TOP: default: - return (a.y == b.y)? (a.x < b.x): (a.y < b.y); + return (tiles[a].y == tiles[b].y)? (tiles[a].x < tiles[b].x): (tiles[a].y < tiles[b].y); } } protected: - TileOrder order_; - int2 center_; + TileOrder order; + int2 center; + Tile *tiles; }; inline int2 hilbert_index_to_pos(int n, int d) @@ -86,16 +89,19 @@ enum SpiralDirection { } /* namespace */ TileManager::TileManager(bool progressive_, int num_samples_, int2 tile_size_, int start_resolution_, - bool preserve_tile_device_, bool background_, TileOrder tile_order_, int num_devices_) + bool preserve_tile_device_, bool background_, TileOrder tile_order_, + int num_devices_, int pixel_size_) { progressive = progressive_; tile_size = tile_size_; tile_order = tile_order_; start_resolution = start_resolution_; + pixel_size = pixel_size_; num_samples = num_samples_; num_devices = num_devices_; preserve_tile_device = preserve_tile_device_; background = background_; + schedule_denoising = false; range_start_sample = 0; range_num_samples = -1; @@ -108,6 +114,18 @@ TileManager::~TileManager() { } +void TileManager::device_free() +{ + if(schedule_denoising || progressive) { + for(int i = 0; i < state.tiles.size(); i++) { + delete state.tiles[i].buffers; + state.tiles[i].buffers = NULL; + } + } + + state.tiles.clear(); +} + static int get_divider(int w, int h, int start_resolution) { int divider = 1; @@ -131,10 +149,11 @@ void TileManager::reset(BufferParams& params_, int num_samples_) state.buffer = BufferParams(); state.sample = range_start_sample - 1; state.num_tiles = 0; - state.num_rendered_tiles = 0; state.num_samples = 0; state.resolution_divider = get_divider(params.width, params.height, start_resolution); - state.tiles.clear(); + state.render_tiles.clear(); + state.denoising_tiles.clear(); + device_free(); } void TileManager::set_samples(int num_samples_) @@ -149,15 +168,20 @@ void TileManager::set_samples(int num_samples_) uint64_t pixel_samples = 0; /* While rendering in the viewport, the initial preview resolution is increased to the native resolution * before the actual rendering begins. Therefore, additional pixel samples will be rendered. */ - int divider = get_divider(params.width, params.height, start_resolution) / 2; - while(divider > 1) { + int divider = max(get_divider(params.width, params.height, start_resolution) / 2, pixel_size); + while(divider > pixel_size) { int image_w = max(1, params.width/divider); int image_h = max(1, params.height/divider); pixel_samples += image_w * image_h; divider >>= 1; } - state.total_pixel_samples = pixel_samples + (uint64_t)get_num_effective_samples() * params.width*params.height; + int image_w = max(1, params.width/divider); + int image_h = max(1, params.height/divider); + state.total_pixel_samples = pixel_samples + (uint64_t)get_num_effective_samples() * image_w*image_h; + if(schedule_denoising) { + state.total_pixel_samples += params.width*params.height; + } } } @@ -170,32 +194,36 @@ int TileManager::gen_tiles(bool sliced) int image_h = max(1, params.height/resolution); int2 center = make_int2(image_w/2, image_h/2); - state.tiles.clear(); - int num_logical_devices = preserve_tile_device? num_devices: 1; int num = min(image_h, num_logical_devices); int slice_num = sliced? num: 1; - int tile_index = 0; + int tile_w = (tile_size.x >= image_w) ? 1 : divide_up(image_w, tile_size.x); - state.tiles.clear(); - state.tiles.resize(num); - vector<list<Tile> >::iterator tile_list = state.tiles.begin(); + device_free(); + state.render_tiles.clear(); + state.denoising_tiles.clear(); + state.render_tiles.resize(num); + state.denoising_tiles.resize(num); + state.tile_stride = tile_w; + vector<list<int> >::iterator tile_list; + tile_list = state.render_tiles.begin(); if(tile_order == TILE_HILBERT_SPIRAL) { assert(!sliced); + int tile_h = (tile_size.y >= image_h) ? 1 : divide_up(image_h, tile_size.y); + state.tiles.resize(tile_w*tile_h); + /* Size of blocks in tiles, must be a power of 2 */ const int hilbert_size = (max(tile_size.x, tile_size.y) <= 12)? 8: 4; - int tile_w = (tile_size.x >= image_w)? 1: (image_w + tile_size.x - 1)/tile_size.x; - int tile_h = (tile_size.y >= image_h)? 1: (image_h + tile_size.y - 1)/tile_size.y; - int tiles_per_device = (tile_w * tile_h + num - 1) / num; + int tiles_per_device = divide_up(tile_w * tile_h, num); int cur_device = 0, cur_tiles = 0; int2 block_size = tile_size * make_int2(hilbert_size, hilbert_size); /* Number of blocks to fill the image */ - int blocks_x = (block_size.x >= image_w)? 1: (image_w + block_size.x - 1)/block_size.x; - int blocks_y = (block_size.y >= image_h)? 1: (image_h + block_size.y - 1)/block_size.y; + int blocks_x = (block_size.x >= image_w)? 1: divide_up(image_w, block_size.x); + int blocks_y = (block_size.y >= image_h)? 1: divide_up(image_h, block_size.y); int n = max(blocks_x, blocks_y) | 0x1; /* Side length of the spiral (must be odd) */ /* Offset of spiral (to keep it centered) */ int2 offset = make_int2((image_w - n*block_size.x)/2, (image_h - n*block_size.y)/2); @@ -226,9 +254,11 @@ int TileManager::gen_tiles(bool sliced) if(pos.x >= 0 && pos.y >= 0 && pos.x < image_w && pos.y < image_h) { int w = min(tile_size.x, image_w - pos.x); int h = min(tile_size.y, image_h - pos.y); - tile_list->push_front(Tile(tile_index, pos.x, pos.y, w, h, cur_device)); + int2 ipos = pos / tile_size; + int idx = ipos.y*tile_w + ipos.x; + state.tiles[idx] = Tile(idx, pos.x, pos.y, w, h, cur_device, Tile::RENDER); + tile_list->push_front(idx); cur_tiles++; - tile_index++; if(cur_tiles == tiles_per_device) { tile_list++; @@ -272,27 +302,28 @@ int TileManager::gen_tiles(bool sliced) break; } } - return tile_index; + return tile_w*tile_h; } + int idx = 0; for(int slice = 0; slice < slice_num; slice++) { int slice_y = (image_h/slice_num)*slice; int slice_h = (slice == slice_num-1)? image_h - slice*(image_h/slice_num): image_h/slice_num; - int tile_w = (tile_size.x >= image_w)? 1: (image_w + tile_size.x - 1)/tile_size.x; - int tile_h = (tile_size.y >= slice_h)? 1: (slice_h + tile_size.y - 1)/tile_size.y; + int tile_h = (tile_size.y >= slice_h)? 1: divide_up(slice_h, tile_size.y); - int tiles_per_device = (tile_w * tile_h + num - 1) / num; + int tiles_per_device = divide_up(tile_w * tile_h, num); int cur_device = 0, cur_tiles = 0; for(int tile_y = 0; tile_y < tile_h; tile_y++) { - for(int tile_x = 0; tile_x < tile_w; tile_x++, tile_index++) { + for(int tile_x = 0; tile_x < tile_w; tile_x++, idx++) { int x = tile_x * tile_size.x; int y = tile_y * tile_size.y; int w = (tile_x == tile_w-1)? image_w - x: tile_size.x; int h = (tile_y == tile_h-1)? slice_h - y: tile_size.y; - tile_list->push_back(Tile(tile_index, x, y + slice_y, w, h, sliced? slice: cur_device)); + state.tiles.push_back(Tile(idx, x, y + slice_y, w, h, sliced? slice: cur_device, Tile::RENDER)); + tile_list->push_back(idx); if(!sliced) { cur_tiles++; @@ -300,7 +331,7 @@ int TileManager::gen_tiles(bool sliced) if(cur_tiles == tiles_per_device) { /* Tiles are already generated in Bottom-to-Top order, so no sort is necessary in that case. */ if(tile_order != TILE_BOTTOM_TO_TOP) { - tile_list->sort(TileComparator(tile_order, center)); + tile_list->sort(TileComparator(tile_order, center, &state.tiles[0])); } tile_list++; cur_tiles = 0; @@ -314,7 +345,15 @@ int TileManager::gen_tiles(bool sliced) } } - return tile_index; + return idx; +} + +void TileManager::gen_render_tiles() +{ + /* Regenerate just the render tiles for progressive render. */ + foreach(Tile& tile, state.tiles) { + state.render_tiles[tile.device].push_back(tile.index); + } } void TileManager::set_tiles() @@ -334,16 +373,115 @@ void TileManager::set_tiles() state.buffer.full_height = max(1, params.full_height/resolution); } -bool TileManager::next_tile(Tile& tile, int device) +int TileManager::get_neighbor_index(int index, int neighbor) +{ + static const int dx[] = {-1, 0, 1, -1, 1, -1, 0, 1, 0}, dy[] = {-1, -1, -1, 0, 0, 1, 1, 1, 0}; + + int resolution = state.resolution_divider; + int image_w = max(1, params.width/resolution); + int image_h = max(1, params.height/resolution); + int tile_w = (tile_size.x >= image_w)? 1: divide_up(image_w, tile_size.x); + int tile_h = (tile_size.y >= image_h)? 1: divide_up(image_h, tile_size.y); + + int nx = state.tiles[index].x/tile_size.x + dx[neighbor], ny = state.tiles[index].y/tile_size.y + dy[neighbor]; + if(nx < 0 || ny < 0 || nx >= tile_w || ny >= tile_h) + return -1; + + return ny*state.tile_stride + nx; +} + +/* Checks whether all neighbors of a tile (as well as the tile itself) are at least at state min_state. */ +bool TileManager::check_neighbor_state(int index, Tile::State min_state) +{ + if(index < 0 || state.tiles[index].state < min_state) { + return false; + } + for(int neighbor = 0; neighbor < 9; neighbor++) { + int nindex = get_neighbor_index(index, neighbor); + /* Out-of-bounds tiles don't matter. */ + if(nindex >= 0 && state.tiles[nindex].state < min_state) { + return false; + } + } + + return true; +} + +/* Returns whether the tile should be written (and freed if no denoising is used) instead of updating. */ +bool TileManager::finish_tile(int index, bool &delete_tile) +{ + delete_tile = false; + + if(progressive) { + return true; + } + + switch(state.tiles[index].state) { + case Tile::RENDER: + { + if(!schedule_denoising) { + state.tiles[index].state = Tile::DONE; + delete_tile = true; + return true; + } + state.tiles[index].state = Tile::RENDERED; + /* For each neighbor and the tile itself, check whether all of its neighbors have been rendered. If yes, it can be denoised. */ + for(int neighbor = 0; neighbor < 9; neighbor++) { + int nindex = get_neighbor_index(index, neighbor); + if(check_neighbor_state(nindex, Tile::RENDERED)) { + state.tiles[nindex].state = Tile::DENOISE; + state.denoising_tiles[state.tiles[nindex].device].push_back(nindex); + } + } + return false; + } + case Tile::DENOISE: + { + state.tiles[index].state = Tile::DENOISED; + /* For each neighbor and the tile itself, check whether all of its neighbors have been denoised. If yes, it can be freed. */ + for(int neighbor = 0; neighbor < 9; neighbor++) { + int nindex = get_neighbor_index(index, neighbor); + if(check_neighbor_state(nindex, Tile::DENOISED)) { + state.tiles[nindex].state = Tile::DONE; + /* It can happen that the tile just finished denoising and already can be freed here. + * However, in that case it still has to be written before deleting, so we can't delete it yet. */ + if(neighbor == 8) { + delete_tile = true; + } + else { + delete state.tiles[nindex].buffers; + state.tiles[nindex].buffers = NULL; + } + } + } + return true; + } + default: + assert(false); + return true; + } +} + +bool TileManager::next_tile(Tile* &tile, int device) { int logical_device = preserve_tile_device? device: 0; - if((logical_device >= state.tiles.size()) || state.tiles[logical_device].empty()) + if(logical_device >= state.render_tiles.size()) + return false; + + if(!state.denoising_tiles[logical_device].empty()) { + int idx = state.denoising_tiles[logical_device].front(); + state.denoising_tiles[logical_device].pop_front(); + tile = &state.tiles[idx]; + return true; + } + + if(state.render_tiles[logical_device].empty()) return false; - tile = Tile(state.tiles[logical_device].front()); - state.tiles[logical_device].pop_front(); - state.num_rendered_tiles++; + int idx = state.render_tiles[logical_device].front(); + state.render_tiles[logical_device].pop_front(); + tile = &state.tiles[idx]; return true; } @@ -352,7 +490,7 @@ bool TileManager::done() int end_sample = (range_num_samples == -1) ? num_samples : range_start_sample + range_num_samples; - return (state.resolution_divider == 1) && + return (state.resolution_divider == pixel_size) && (state.sample+state.num_samples >= end_sample); } @@ -361,9 +499,9 @@ bool TileManager::next() if(done()) return false; - if(progressive && state.resolution_divider > 1) { + if(progressive && state.resolution_divider > pixel_size) { state.sample = 0; - state.resolution_divider /= 2; + state.resolution_divider = max(state.resolution_divider/2, pixel_size); state.num_samples = 1; set_tiles(); } @@ -377,8 +515,14 @@ bool TileManager::next() else state.num_samples = range_num_samples; - state.resolution_divider = 1; - set_tiles(); + state.resolution_divider = pixel_size; + + if(state.sample == range_start_sample) { + set_tiles(); + } + else { + gen_render_tiles(); + } } return true; diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h index 5d92ebac355..2692c7cf9f0 100644 --- a/intern/cycles/render/tile.h +++ b/intern/cycles/render/tile.h @@ -19,8 +19,8 @@ #include <limits.h> -#include "buffers.h" -#include "util_list.h" +#include "render/buffers.h" +#include "util/util_list.h" CCL_NAMESPACE_BEGIN @@ -31,12 +31,20 @@ public: int index; int x, y, w, h; int device; + /* RENDER: The tile has to be rendered. + * RENDERED: The tile has been rendered, but can't be denoised yet (waiting for neighbors). + * DENOISE: The tile can be denoised now. + * DENOISED: The tile has been denoised, but can't be freed yet (waiting for neighbors). + * DONE: The tile is finished and has been freed. */ + typedef enum { RENDER = 0, RENDERED, DENOISE, DENOISED, DONE } State; + State state; + RenderBuffers *buffers; Tile() {} - Tile(int index_, int x_, int y_, int w_, int h_, int device_) - : index(index_), x(x_), y(y_), w(w_), h(h_), device(device_) {} + Tile(int index_, int x_, int y_, int w_, int h_, int device_, State state_ = RENDER) + : index(index_), x(x_), y(y_), w(w_), h(h_), device(device_), state(state_), buffers(NULL) {} }; /* Tile order */ @@ -58,31 +66,37 @@ public: BufferParams params; struct State { + vector<Tile> tiles; + int tile_stride; BufferParams buffer; int sample; int num_samples; int resolution_divider; int num_tiles; - int num_rendered_tiles; /* Total samples over all pixels: Generally num_samples*num_pixels, * but can be higher due to the initial resolution division for previews. */ uint64_t total_pixel_samples; - /* This vector contains a list of tiles for every logical device in the session. - * In each list, the tiles are sorted according to the tile order setting. */ - vector<list<Tile> > tiles; + + /* These lists contain the indices of the tiles to be rendered/denoised and are used + * when acquiring a new tile for the device. + * Each list in each vector is for one logical device. */ + vector<list<int> > render_tiles; + vector<list<int> > denoising_tiles; } state; int num_samples; TileManager(bool progressive, int num_samples, int2 tile_size, int start_resolution, - bool preserve_tile_device, bool background, TileOrder tile_order, int num_devices = 1); + bool preserve_tile_device, bool background, TileOrder tile_order, int num_devices = 1, int pixel_size = 1); ~TileManager(); + void device_free(); void reset(BufferParams& params, int num_samples); void set_samples(int num_samples); bool next(); - bool next_tile(Tile& tile, int device = 0); + bool next_tile(Tile* &tile, int device = 0); + bool finish_tile(int index, bool& delete_tile); bool done(); void set_tile_order(TileOrder tile_order_) { tile_order = tile_order_; } @@ -97,6 +111,9 @@ public: /* Get number of actual samples to render. */ int get_num_effective_samples(); + + /* Schedule tiles for denoising after they've been rendered. */ + bool schedule_denoising; protected: void set_tiles(); @@ -105,6 +122,7 @@ protected: int2 tile_size; TileOrder tile_order; int start_resolution; + int pixel_size; int num_devices; /* in some cases it is important that the same tile will be returned for the same @@ -128,6 +146,10 @@ protected: /* Generate tile list, return number of tiles. */ int gen_tiles(bool sliced); + void gen_render_tiles(); + + int get_neighbor_index(int index, int neighbor); + bool check_neighbor_state(int index, Tile::State state); }; CCL_NAMESPACE_END diff --git a/intern/cycles/subd/CMakeLists.txt b/intern/cycles/subd/CMakeLists.txt index dafb807bdf3..7f952dd43ce 100644 --- a/intern/cycles/subd/CMakeLists.txt +++ b/intern/cycles/subd/CMakeLists.txt @@ -1,11 +1,6 @@ set(INC - . - ../graph - ../kernel - ../kernel/svm - ../render - ../util + .. ) set(INC_SYS @@ -29,4 +24,4 @@ set(SRC_HEADERS include_directories(${INC}) include_directories(SYSTEM ${INC_SYS}) -add_library(cycles_subd ${SRC} ${SRC_HEADERS}) +cycles_add_library(cycles_subd ${SRC} ${SRC_HEADERS}) diff --git a/intern/cycles/subd/subd_dice.cpp b/intern/cycles/subd/subd_dice.cpp index a1bd349b167..8c426da4acf 100644 --- a/intern/cycles/subd/subd_dice.cpp +++ b/intern/cycles/subd/subd_dice.cpp @@ -14,13 +14,11 @@ * limitations under the License. */ -#include "camera.h" -#include "mesh.h" +#include "render/camera.h" +#include "render/mesh.h" -#include "subd_dice.h" -#include "subd_patch.h" - -#include "util_debug.h" +#include "subd/subd_dice.h" +#include "subd/subd_patch.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/subd/subd_dice.h b/intern/cycles/subd/subd_dice.h index 33d13a4ab3a..c0e32be18c4 100644 --- a/intern/cycles/subd/subd_dice.h +++ b/intern/cycles/subd/subd_dice.h @@ -22,8 +22,8 @@ * DiagSplit. For more algorithm details, see the DiagSplit paper or the * ARB_tessellation_shader OpenGL extension, Section 2.X.2. */ -#include "util_types.h" -#include "util_vector.h" +#include "util/util_types.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/subd/subd_patch.cpp b/intern/cycles/subd/subd_patch.cpp index d3319c5ccf5..fa2fe2bf113 100644 --- a/intern/cycles/subd/subd_patch.cpp +++ b/intern/cycles/subd/subd_patch.cpp @@ -16,12 +16,12 @@ /* Parts adapted from code in the public domain in NVidia Mesh Tools. */ -#include "mesh.h" +#include "render/mesh.h" -#include "subd_patch.h" +#include "subd/subd_patch.h" -#include "util_math.h" -#include "util_types.h" +#include "util/util_math.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/subd/subd_patch.h b/intern/cycles/subd/subd_patch.h index 360c1abf27b..1bb81588835 100644 --- a/intern/cycles/subd/subd_patch.h +++ b/intern/cycles/subd/subd_patch.h @@ -17,8 +17,8 @@ #ifndef __SUBD_PATCH_H__ #define __SUBD_PATCH_H__ -#include "util_boundbox.h" -#include "util_types.h" +#include "util/util_boundbox.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/subd/subd_patch_table.cpp b/intern/cycles/subd/subd_patch_table.cpp index d437b045c07..63bf673a90b 100644 --- a/intern/cycles/subd/subd_patch_table.cpp +++ b/intern/cycles/subd/subd_patch_table.cpp @@ -25,10 +25,10 @@ * */ -#include "subd_patch_table.h" -#include "kernel_types.h" +#include "subd/subd_patch_table.h" +#include "kernel/kernel_types.h" -#include "util_math.h" +#include "util/util_math.h" #ifdef WITH_OPENSUBDIV #include <opensubdiv/far/patchTable.h> diff --git a/intern/cycles/subd/subd_patch_table.h b/intern/cycles/subd/subd_patch_table.h index 3166a1691d8..907f2dd6c28 100644 --- a/intern/cycles/subd/subd_patch_table.h +++ b/intern/cycles/subd/subd_patch_table.h @@ -17,8 +17,8 @@ #ifndef __SUBD_PATCH_TABLE_H__ #define __SUBD_PATCH_TABLE_H__ -#include "util_types.h" -#include "util_vector.h" +#include "util/util_types.h" +#include "util/util_vector.h" #ifdef WITH_OPENSUBDIV #ifdef _MSC_VER diff --git a/intern/cycles/subd/subd_split.cpp b/intern/cycles/subd/subd_split.cpp index 3c91ad8ab0d..d3bed6a5c53 100644 --- a/intern/cycles/subd/subd_split.cpp +++ b/intern/cycles/subd/subd_split.cpp @@ -14,16 +14,15 @@ * limitations under the License. */ -#include "camera.h" -#include "mesh.h" +#include "render/camera.h" +#include "render/mesh.h" -#include "subd_dice.h" -#include "subd_patch.h" -#include "subd_split.h" +#include "subd/subd_dice.h" +#include "subd/subd_patch.h" +#include "subd/subd_split.h" -#include "util_debug.h" -#include "util_math.h" -#include "util_types.h" +#include "util/util_math.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/subd/subd_split.h b/intern/cycles/subd/subd_split.h index a2f76dd2e03..f869cc6a48e 100644 --- a/intern/cycles/subd/subd_split.h +++ b/intern/cycles/subd/subd_split.h @@ -22,10 +22,10 @@ * evaluation at arbitrary points is required for this to work. See the paper * for more details. */ -#include "subd_dice.h" +#include "subd/subd_dice.h" -#include "util_types.h" -#include "util_vector.h" +#include "util/util_types.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/test/CMakeLists.txt b/intern/cycles/test/CMakeLists.txt index 80564c33be6..f3e49dc0c4e 100644 --- a/intern/cycles/test/CMakeLists.txt +++ b/intern/cycles/test/CMakeLists.txt @@ -22,12 +22,17 @@ set(INC ) set(ALL_CYCLES_LIBRARIES - cycles_render cycles_device + cycles_kernel + cycles_render cycles_bvh cycles_graph cycles_subd cycles_util + extern_clew + ${BLENDER_GL_LIBRARIES} + ${BLENDER_GLEW_LIBRARIES} + ${CYCLES_APP_GLEW_LIBRARY} ${OPENIMAGEIO_LIBRARIES} ) if(WITH_CYCLES_OSL) @@ -37,10 +42,12 @@ if(WITH_CYCLES_OSL) ${LLVM_LIBRARIES} ) endif() -if(WITH_IMAGE_OPENJPEG AND NOT WITH_SYSTEM_OPENJPEG) - list(APPEND ALL_CYCLES_LIBRARIES - extern_openjpeg - ) +if(WITH_IMAGE_OPENJPEG) + if(WITH_SYSTEM_OPENJPEG) + list(APPEND ALL_CYCLES_LIBRARIES ${OPENJPEG_LIBRARIES}) + else() + list(APPEND ALL_CYCLES_LIBRARIES extern_openjpeg) + endif() endif() if(WITH_CYCLES_OPENSUBDIV) add_definitions(-DWITH_OPENSUBDIV) @@ -52,6 +59,15 @@ if(WITH_CYCLES_OPENSUBDIV) ${OPENSUBDIV_LIBRARIES} ) endif() +if(WITH_CUDA_DYNLOAD) + list(APPEND ALL_CYCLES_LIBRARIES extern_cuew) +else() + list(APPEND ALL_CYCLES_LIBRARIES ${CUDA_CUDA_LIBRARY}) +endif() +if(NOT CYCLES_STANDALONE_REPOSITORY) + list(APPEND ALL_CYCLES_LIBRARIES bf_intern_glew_mx bf_intern_guardedalloc ${GLEW_LIBRARY}) +endif() + list(APPEND ALL_CYCLES_LIBRARIES ${BOOST_LIBRARIES} ${PNG_LIBRARIES} diff --git a/intern/cycles/test/render_graph_finalize_test.cpp b/intern/cycles/test/render_graph_finalize_test.cpp index 3fc086cbc0c..b66a91adbda 100644 --- a/intern/cycles/test/render_graph_finalize_test.cpp +++ b/intern/cycles/test/render_graph_finalize_test.cpp @@ -74,7 +74,7 @@ protected: class ShaderGraphBuilder { public: - explicit ShaderGraphBuilder(ShaderGraph *graph) + ShaderGraphBuilder(ShaderGraph *graph) : graph_(graph) { node_map_["Output"] = graph->output(); @@ -155,15 +155,39 @@ protected: } // namespace -#define DEFINE_COMMON_VARIABLES(builder_name, mock_log_name) \ - util_logging_start(); \ - util_logging_verbosity_set(1); \ - ScopedMockLog mock_log_name; \ - DeviceInfo device_info; \ - SceneParams scene_params; \ - Scene scene(scene_params, device_info); \ - ShaderGraph graph; \ - ShaderGraphBuilder builder(&graph); \ +class RenderGraph : public testing::Test +{ +protected: + ScopedMockLog log; + Stats stats; + DeviceInfo device_info; + Device *device_cpu; + SceneParams scene_params; + Scene *scene; + ShaderGraph graph; + ShaderGraphBuilder builder; + + RenderGraph() + : testing::Test(), + builder(&graph) + { + } + + virtual void SetUp() + { + util_logging_start(); + util_logging_verbosity_set(1); + + device_cpu = Device::create(device_info, stats, true); + scene = new Scene(scene_params, device_cpu); + } + + virtual void TearDown() + { + delete scene; + delete device_cpu; + } +}; #define EXPECT_ANY_MESSAGE(log) \ EXPECT_CALL(log, Log(_, _, _)).Times(AnyNumber()); \ @@ -177,10 +201,8 @@ protected: /* * Test deduplication of nodes that have inputs, some of them folded. */ -TEST(render_graph, deduplicate_deep) +TEST_F(RenderGraph, deduplicate_deep) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding Value1::Value to constant (0.8)."); CORRECT_INFO_MESSAGE(log, "Folding Value2::Value to constant (0.8)."); @@ -206,7 +228,7 @@ TEST(render_graph, deduplicate_deep) .add_connection("Noise2::Color", "Mix::Color2") .output_color("Mix::Color"); - graph.finalize(&scene); + graph.finalize(scene); EXPECT_EQ(graph.nodes.size(), 5); } @@ -214,10 +236,8 @@ TEST(render_graph, deduplicate_deep) /* * Test RGB to BW node. */ -TEST(render_graph, constant_fold_rgb_to_bw) +TEST_F(RenderGraph, constant_fold_rgb_to_bw) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding RGBToBWNodeNode::Val to constant (0.8)."); CORRECT_INFO_MESSAGE(log, "Folding convert_float_to_color::value_color to constant (0.8, 0.8, 0.8)."); @@ -227,17 +247,15 @@ TEST(render_graph, constant_fold_rgb_to_bw) .set("Color", make_float3(0.8f, 0.8f, 0.8f))) .output_color("RGBToBWNodeNode::Val"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: * - folding of Emission nodes that don't emit to nothing. */ -TEST(render_graph, constant_fold_emission1) +TEST_F(RenderGraph, constant_fold_emission1) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Discarding closure Emission."); @@ -246,13 +264,11 @@ TEST(render_graph, constant_fold_emission1) .set("Color", make_float3(0.0f, 0.0f, 0.0f))) .output_closure("Emission::Emission"); - graph.finalize(&scene); + graph.finalize(scene); } -TEST(render_graph, constant_fold_emission2) +TEST_F(RenderGraph, constant_fold_emission2) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Discarding closure Emission."); @@ -261,17 +277,15 @@ TEST(render_graph, constant_fold_emission2) .set("Strength", 0.0f)) .output_closure("Emission::Emission"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: * - folding of Background nodes that don't emit to nothing. */ -TEST(render_graph, constant_fold_background1) +TEST_F(RenderGraph, constant_fold_background1) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Discarding closure Background."); @@ -280,13 +294,11 @@ TEST(render_graph, constant_fold_background1) .set("Color", make_float3(0.0f, 0.0f, 0.0f))) .output_closure("Background::Background"); - graph.finalize(&scene); + graph.finalize(scene); } -TEST(render_graph, constant_fold_background2) +TEST_F(RenderGraph, constant_fold_background2) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Discarding closure Background."); @@ -295,17 +307,15 @@ TEST(render_graph, constant_fold_background2) .set("Strength", 0.0f)) .output_closure("Background::Background"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: * - Folding of Add Closure with only one input. */ -TEST(render_graph, constant_fold_shader_add) +TEST_F(RenderGraph, constant_fold_shader_add) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding AddClosure1::Closure to socket Diffuse::BSDF."); CORRECT_INFO_MESSAGE(log, "Folding AddClosure2::Closure to socket Diffuse::BSDF."); @@ -322,7 +332,7 @@ TEST(render_graph, constant_fold_shader_add) .add_connection("AddClosure2::Closure", "AddClosure3::Closure2") .output_closure("AddClosure3::Closure"); - graph.finalize(&scene); + graph.finalize(scene); } /* @@ -330,10 +340,8 @@ TEST(render_graph, constant_fold_shader_add) * - Folding of Mix Closure with 0 or 1 fac. * - Folding of Mix Closure with both inputs folded to the same node. */ -TEST(render_graph, constant_fold_shader_mix) +TEST_F(RenderGraph, constant_fold_shader_mix) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding MixClosure1::Closure to socket Diffuse::BSDF."); CORRECT_INFO_MESSAGE(log, "Folding MixClosure2::Closure to socket Diffuse::BSDF."); @@ -357,17 +365,15 @@ TEST(render_graph, constant_fold_shader_mix) .add_connection("MixClosure2::Closure", "MixClosure3::Closure2") .output_closure("MixClosure3::Closure"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: * - Folding of Invert with all constant inputs. */ -TEST(render_graph, constant_fold_invert) +TEST_F(RenderGraph, constant_fold_invert) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding Invert::Color to constant (0.68, 0.5, 0.32)."); @@ -377,17 +383,15 @@ TEST(render_graph, constant_fold_invert) .set("Color", make_float3(0.2f, 0.5f, 0.8f))) .output_color("Invert::Color"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: * - Folding of Invert with zero Fac. */ -TEST(render_graph, constant_fold_invert_fac_0) +TEST_F(RenderGraph, constant_fold_invert_fac_0) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding Invert::Color to socket Attribute::Color."); @@ -398,17 +402,15 @@ TEST(render_graph, constant_fold_invert_fac_0) .add_connection("Attribute::Color", "Invert::Color") .output_color("Invert::Color"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: * - Folding of Invert with zero Fac and constant input. */ -TEST(render_graph, constant_fold_invert_fac_0_const) +TEST_F(RenderGraph, constant_fold_invert_fac_0_const) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding Invert::Color to constant (0.2, 0.5, 0.8)."); @@ -418,17 +420,15 @@ TEST(render_graph, constant_fold_invert_fac_0_const) .set("Color", make_float3(0.2f, 0.5f, 0.8f))) .output_color("Invert::Color"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: * - Folding of MixRGB Add with all constant inputs (clamp false). */ -TEST(render_graph, constant_fold_mix_add) +TEST_F(RenderGraph, constant_fold_mix_add) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding MixAdd::Color to constant (0.62, 1.14, 1.42)."); @@ -441,17 +441,15 @@ TEST(render_graph, constant_fold_mix_add) .set("Color2", make_float3(0.4, 0.8, 0.9))) .output_color("MixAdd::Color"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: * - Folding of MixRGB Add with all constant inputs (clamp true). */ -TEST(render_graph, constant_fold_mix_add_clamp) +TEST_F(RenderGraph, constant_fold_mix_add_clamp) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding MixAdd::Color to constant (0.62, 1, 1)."); @@ -464,17 +462,15 @@ TEST(render_graph, constant_fold_mix_add_clamp) .set("Color2", make_float3(0.4, 0.8, 0.9))) .output_color("MixAdd::Color"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: * - No folding on fac 0 for dodge. */ -TEST(render_graph, constant_fold_part_mix_dodge_no_fac_0) +TEST_F(RenderGraph, constant_fold_part_mix_dodge_no_fac_0) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); INVALID_INFO_MESSAGE(log, "Folding "); @@ -489,17 +485,15 @@ TEST(render_graph, constant_fold_part_mix_dodge_no_fac_0) .add_connection("Attribute2::Color", "Mix::Color2") .output_color("Mix::Color"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: * - No folding on fac 0 for light. */ -TEST(render_graph, constant_fold_part_mix_light_no_fac_0) +TEST_F(RenderGraph, constant_fold_part_mix_light_no_fac_0) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); INVALID_INFO_MESSAGE(log, "Folding "); @@ -514,17 +508,15 @@ TEST(render_graph, constant_fold_part_mix_light_no_fac_0) .add_connection("Attribute2::Color", "Mix::Color2") .output_color("Mix::Color"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: * - No folding on fac 0 for burn. */ -TEST(render_graph, constant_fold_part_mix_burn_no_fac_0) +TEST_F(RenderGraph, constant_fold_part_mix_burn_no_fac_0) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); INVALID_INFO_MESSAGE(log, "Folding "); @@ -539,17 +531,15 @@ TEST(render_graph, constant_fold_part_mix_burn_no_fac_0) .add_connection("Attribute2::Color", "Mix::Color2") .output_color("Mix::Color"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: * - No folding on fac 0 for clamped blend. */ -TEST(render_graph, constant_fold_part_mix_blend_clamped_no_fac_0) +TEST_F(RenderGraph, constant_fold_part_mix_blend_clamped_no_fac_0) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); INVALID_INFO_MESSAGE(log, "Folding "); @@ -564,7 +554,7 @@ TEST(render_graph, constant_fold_part_mix_blend_clamped_no_fac_0) .add_connection("Attribute2::Color", "Mix::Color2") .output_color("Mix::Color"); - graph.finalize(&scene); + graph.finalize(scene); } /* @@ -572,10 +562,8 @@ TEST(render_graph, constant_fold_part_mix_blend_clamped_no_fac_0) * - Folding of Mix with 0 or 1 Fac. * - Folding of Mix with both inputs folded to the same node. */ -TEST(render_graph, constant_fold_part_mix_blend) +TEST_F(RenderGraph, constant_fold_part_mix_blend) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding MixBlend1::Color to socket Attribute1::Color."); CORRECT_INFO_MESSAGE(log, "Folding MixBlend2::Color to socket Attribute1::Color."); @@ -607,17 +595,15 @@ TEST(render_graph, constant_fold_part_mix_blend) .add_connection("MixBlend2::Color", "MixBlend3::Color2") .output_color("MixBlend3::Color"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: * - NOT folding of MixRGB Sub with the same inputs and fac NOT 1. */ -TEST(render_graph, constant_fold_part_mix_sub_same_fac_bad) +TEST_F(RenderGraph, constant_fold_part_mix_sub_same_fac_bad) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); INVALID_INFO_MESSAGE(log, "Folding Mix::"); @@ -631,17 +617,15 @@ TEST(render_graph, constant_fold_part_mix_sub_same_fac_bad) .add_connection("Attribute::Color", "Mix::Color2") .output_color("Mix::Color"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: * - Folding of MixRGB Sub with the same inputs and fac 1. */ -TEST(render_graph, constant_fold_part_mix_sub_same_fac_1) +TEST_F(RenderGraph, constant_fold_part_mix_sub_same_fac_1) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding Mix::Color to constant (0, 0, 0)."); @@ -655,7 +639,7 @@ TEST(render_graph, constant_fold_part_mix_sub_same_fac_1) .add_connection("Attribute::Color", "Mix::Color2") .output_color("Mix::Color"); - graph.finalize(&scene); + graph.finalize(scene); } /* @@ -717,10 +701,8 @@ static void build_mix_partial_test_graph(ShaderGraphBuilder &builder, NodeMix ty /* * Tests: partial folding for RGB Add with known 0. */ -TEST(render_graph, constant_fold_part_mix_add_0) +TEST_F(RenderGraph, constant_fold_part_mix_add_0) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); /* 0 + X (fac 1) == X */ INVALID_INFO_MESSAGE(log, "Folding Mix_Cx_Fx::Color"); @@ -731,16 +713,14 @@ TEST(render_graph, constant_fold_part_mix_add_0) INVALID_INFO_MESSAGE(log, "Folding Out"); build_mix_partial_test_graph(builder, NODE_MIX_ADD, make_float3(0, 0, 0)); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: partial folding for RGB Sub with known 0. */ -TEST(render_graph, constant_fold_part_mix_sub_0) +TEST_F(RenderGraph, constant_fold_part_mix_sub_0) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); INVALID_INFO_MESSAGE(log, "Folding Mix_Cx_Fx::Color"); INVALID_INFO_MESSAGE(log, "Folding Mix_Cx_F1::Color"); @@ -750,16 +730,14 @@ TEST(render_graph, constant_fold_part_mix_sub_0) INVALID_INFO_MESSAGE(log, "Folding Out"); build_mix_partial_test_graph(builder, NODE_MIX_SUB, make_float3(0, 0, 0)); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: partial folding for RGB Mul with known 1. */ -TEST(render_graph, constant_fold_part_mix_mul_1) +TEST_F(RenderGraph, constant_fold_part_mix_mul_1) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); /* 1 * X (fac 1) == X */ INVALID_INFO_MESSAGE(log, "Folding Mix_Cx_Fx::Color"); @@ -770,16 +748,14 @@ TEST(render_graph, constant_fold_part_mix_mul_1) INVALID_INFO_MESSAGE(log, "Folding Out"); build_mix_partial_test_graph(builder, NODE_MIX_MUL, make_float3(1, 1, 1)); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: partial folding for RGB Div with known 1. */ -TEST(render_graph, constant_fold_part_mix_div_1) +TEST_F(RenderGraph, constant_fold_part_mix_div_1) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); INVALID_INFO_MESSAGE(log, "Folding Mix_Cx_Fx::Color"); INVALID_INFO_MESSAGE(log, "Folding Mix_Cx_F1::Color"); @@ -789,16 +765,14 @@ TEST(render_graph, constant_fold_part_mix_div_1) INVALID_INFO_MESSAGE(log, "Folding Out"); build_mix_partial_test_graph(builder, NODE_MIX_DIV, make_float3(1, 1, 1)); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: partial folding for RGB Mul with known 0. */ -TEST(render_graph, constant_fold_part_mix_mul_0) +TEST_F(RenderGraph, constant_fold_part_mix_mul_0) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); /* 0 * ? (fac ?) == 0 */ CORRECT_INFO_MESSAGE(log, "Folding Mix_Cx_Fx::Color to constant (0, 0, 0)."); @@ -811,16 +785,14 @@ TEST(render_graph, constant_fold_part_mix_mul_0) INVALID_INFO_MESSAGE(log, "Folding Out1234"); build_mix_partial_test_graph(builder, NODE_MIX_MUL, make_float3(0, 0, 0)); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: partial folding for RGB Div with known 0. */ -TEST(render_graph, constant_fold_part_mix_div_0) +TEST_F(RenderGraph, constant_fold_part_mix_div_0) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); /* 0 / ? (fac ?) == 0 */ CORRECT_INFO_MESSAGE(log, "Folding Mix_Cx_Fx::Color to constant (0, 0, 0)."); @@ -832,16 +804,14 @@ TEST(render_graph, constant_fold_part_mix_div_0) INVALID_INFO_MESSAGE(log, "Folding Out1234"); build_mix_partial_test_graph(builder, NODE_MIX_DIV, make_float3(0, 0, 0)); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: Separate/Combine RGB with all constant inputs. */ -TEST(render_graph, constant_fold_separate_combine_rgb) +TEST_F(RenderGraph, constant_fold_separate_combine_rgb) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding SeparateRGB::R to constant (0.3)."); CORRECT_INFO_MESSAGE(log, "Folding SeparateRGB::G to constant (0.5)."); @@ -857,16 +827,14 @@ TEST(render_graph, constant_fold_separate_combine_rgb) .add_connection("SeparateRGB::B", "CombineRGB::B") .output_color("CombineRGB::Image"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: Separate/Combine XYZ with all constant inputs. */ -TEST(render_graph, constant_fold_separate_combine_xyz) +TEST_F(RenderGraph, constant_fold_separate_combine_xyz) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding SeparateXYZ::X to constant (0.3)."); CORRECT_INFO_MESSAGE(log, "Folding SeparateXYZ::Y to constant (0.5)."); @@ -883,16 +851,14 @@ TEST(render_graph, constant_fold_separate_combine_xyz) .add_connection("SeparateXYZ::Z", "CombineXYZ::Z") .output_color("CombineXYZ::Vector"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: Separate/Combine HSV with all constant inputs. */ -TEST(render_graph, constant_fold_separate_combine_hsv) +TEST_F(RenderGraph, constant_fold_separate_combine_hsv) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding SeparateHSV::H to constant (0.583333)."); CORRECT_INFO_MESSAGE(log, "Folding SeparateHSV::S to constant (0.571429)."); @@ -908,16 +874,14 @@ TEST(render_graph, constant_fold_separate_combine_hsv) .add_connection("SeparateHSV::V", "CombineHSV::V") .output_color("CombineHSV::Color"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: Gamma with all constant inputs. */ -TEST(render_graph, constant_fold_gamma) +TEST_F(RenderGraph, constant_fold_gamma) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding Gamma::Color to constant (0.164317, 0.353553, 0.585662)."); @@ -927,16 +891,14 @@ TEST(render_graph, constant_fold_gamma) .set("Gamma", 1.5f)) .output_color("Gamma::Color"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: Gamma with one constant 0 input. */ -TEST(render_graph, constant_fold_gamma_part_0) +TEST_F(RenderGraph, constant_fold_gamma_part_0) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); INVALID_INFO_MESSAGE(log, "Folding Gamma_Cx::"); CORRECT_INFO_MESSAGE(log, "Folding Gamma_xC::Color to constant (1, 1, 1)."); @@ -960,16 +922,14 @@ TEST(render_graph, constant_fold_gamma_part_0) .add_connection("Gamma_xC::Color", "Out::Color2") .output_color("Out::Color"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: Gamma with one constant 1 input. */ -TEST(render_graph, constant_fold_gamma_part_1) +TEST_F(RenderGraph, constant_fold_gamma_part_1) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding Gamma_Cx::Color to constant (1, 1, 1)."); CORRECT_INFO_MESSAGE(log, "Folding Gamma_xC::Color to socket Attribute::Color."); @@ -993,16 +953,14 @@ TEST(render_graph, constant_fold_gamma_part_1) .add_connection("Gamma_xC::Color", "Out::Color2") .output_color("Out::Color"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: BrightnessContrast with all constant inputs. */ -TEST(render_graph, constant_fold_bright_contrast) +TEST_F(RenderGraph, constant_fold_bright_contrast) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding BrightContrast::Color to constant (0.16, 0.6, 1.04)."); @@ -1013,16 +971,14 @@ TEST(render_graph, constant_fold_bright_contrast) .set("Contrast", 1.2f)) .output_color("BrightContrast::Color"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: blackbody with all constant inputs. */ -TEST(render_graph, constant_fold_blackbody) +TEST_F(RenderGraph, constant_fold_blackbody) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding Blackbody::Color to constant (3.94163, 0.226523, 0)."); @@ -1031,16 +987,14 @@ TEST(render_graph, constant_fold_blackbody) .set("Temperature", 1200.0f)) .output_color("Blackbody::Color"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: Math with all constant inputs (clamp false). */ -TEST(render_graph, constant_fold_math) +TEST_F(RenderGraph, constant_fold_math) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding Math::Value to constant (1.6)."); @@ -1052,16 +1006,14 @@ TEST(render_graph, constant_fold_math) .set("Value2", 0.9f)) .output_value("Math::Value"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: Math with all constant inputs (clamp true). */ -TEST(render_graph, constant_fold_math_clamp) +TEST_F(RenderGraph, constant_fold_math_clamp) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding Math::Value to constant (1)."); @@ -1073,7 +1025,7 @@ TEST(render_graph, constant_fold_math_clamp) .set("Value2", 0.9f)) .output_value("Math::Value"); - graph.finalize(&scene); + graph.finalize(scene); } /* @@ -1108,10 +1060,8 @@ static void build_math_partial_test_graph(ShaderGraphBuilder &builder, NodeMath /* * Tests: partial folding for Math Add with known 0. */ -TEST(render_graph, constant_fold_part_math_add_0) +TEST_F(RenderGraph, constant_fold_part_math_add_0) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); /* X + 0 == 0 + X == X */ CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Value to socket Attribute::Fac."); @@ -1119,16 +1069,14 @@ TEST(render_graph, constant_fold_part_math_add_0) INVALID_INFO_MESSAGE(log, "Folding Out::"); build_math_partial_test_graph(builder, NODE_MATH_ADD, 0.0f); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: partial folding for Math Sub with known 0. */ -TEST(render_graph, constant_fold_part_math_sub_0) +TEST_F(RenderGraph, constant_fold_part_math_sub_0) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); /* X - 0 == X */ INVALID_INFO_MESSAGE(log, "Folding Math_Cx::"); @@ -1136,16 +1084,14 @@ TEST(render_graph, constant_fold_part_math_sub_0) INVALID_INFO_MESSAGE(log, "Folding Out::"); build_math_partial_test_graph(builder, NODE_MATH_SUBTRACT, 0.0f); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: partial folding for Math Mul with known 1. */ -TEST(render_graph, constant_fold_part_math_mul_1) +TEST_F(RenderGraph, constant_fold_part_math_mul_1) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); /* X * 1 == 1 * X == X */ CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Value to socket Attribute::Fac."); @@ -1153,16 +1099,14 @@ TEST(render_graph, constant_fold_part_math_mul_1) INVALID_INFO_MESSAGE(log, "Folding Out::"); build_math_partial_test_graph(builder, NODE_MATH_MULTIPLY, 1.0f); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: partial folding for Math Div with known 1. */ -TEST(render_graph, constant_fold_part_math_div_1) +TEST_F(RenderGraph, constant_fold_part_math_div_1) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); /* X / 1 == X */ INVALID_INFO_MESSAGE(log, "Folding Math_Cx::"); @@ -1170,16 +1114,14 @@ TEST(render_graph, constant_fold_part_math_div_1) INVALID_INFO_MESSAGE(log, "Folding Out::"); build_math_partial_test_graph(builder, NODE_MATH_DIVIDE, 1.0f); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: partial folding for Math Mul with known 0. */ -TEST(render_graph, constant_fold_part_math_mul_0) +TEST_F(RenderGraph, constant_fold_part_math_mul_0) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); /* X * 0 == 0 * X == 0 */ CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Value to constant (0)."); @@ -1188,16 +1130,14 @@ TEST(render_graph, constant_fold_part_math_mul_0) CORRECT_INFO_MESSAGE(log, "Discarding closure EmissionNode."); build_math_partial_test_graph(builder, NODE_MATH_MULTIPLY, 0.0f); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: partial folding for Math Div with known 0. */ -TEST(render_graph, constant_fold_part_math_div_0) +TEST_F(RenderGraph, constant_fold_part_math_div_0) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); /* 0 / X == 0 */ CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Value to constant (0)."); @@ -1205,16 +1145,14 @@ TEST(render_graph, constant_fold_part_math_div_0) INVALID_INFO_MESSAGE(log, "Folding Out::"); build_math_partial_test_graph(builder, NODE_MATH_DIVIDE, 0.0f); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: partial folding for Math Power with known 0. */ -TEST(render_graph, constant_fold_part_math_pow_0) +TEST_F(RenderGraph, constant_fold_part_math_pow_0) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); /* X ^ 0 == 1 */ INVALID_INFO_MESSAGE(log, "Folding Math_Cx::"); @@ -1222,16 +1160,14 @@ TEST(render_graph, constant_fold_part_math_pow_0) INVALID_INFO_MESSAGE(log, "Folding Out::"); build_math_partial_test_graph(builder, NODE_MATH_POWER, 0.0f); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: partial folding for Math Power with known 1. */ -TEST(render_graph, constant_fold_part_math_pow_1) +TEST_F(RenderGraph, constant_fold_part_math_pow_1) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); /* 1 ^ X == 1; X ^ 1 == X */ CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Value to constant (1)"); @@ -1239,16 +1175,14 @@ TEST(render_graph, constant_fold_part_math_pow_1) INVALID_INFO_MESSAGE(log, "Folding Out::"); build_math_partial_test_graph(builder, NODE_MATH_POWER, 1.0f); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: Vector Math with all constant inputs. */ -TEST(render_graph, constant_fold_vector_math) +TEST_F(RenderGraph, constant_fold_vector_math) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding VectorMath::Value to constant (1)."); CORRECT_INFO_MESSAGE(log, "Folding VectorMath::Vector to constant (3, 0, 0)."); @@ -1267,7 +1201,7 @@ TEST(render_graph, constant_fold_vector_math) .add_connection("VectorMath::Value", "Math::Value2") .output_color("Math::Value"); - graph.finalize(&scene); + graph.finalize(scene); } /* @@ -1299,10 +1233,8 @@ static void build_vecmath_partial_test_graph(ShaderGraphBuilder &builder, NodeVe /* * Tests: partial folding for Vector Math Add with known 0. */ -TEST(render_graph, constant_fold_part_vecmath_add_0) +TEST_F(RenderGraph, constant_fold_part_vecmath_add_0) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); /* X + 0 == 0 + X == X */ CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Vector to socket Attribute::Vector."); @@ -1310,16 +1242,14 @@ TEST(render_graph, constant_fold_part_vecmath_add_0) INVALID_INFO_MESSAGE(log, "Folding Out::"); build_vecmath_partial_test_graph(builder, NODE_VECTOR_MATH_ADD, make_float3(0,0,0)); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: partial folding for Vector Math Sub with known 0. */ -TEST(render_graph, constant_fold_part_vecmath_sub_0) +TEST_F(RenderGraph, constant_fold_part_vecmath_sub_0) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); /* X - 0 == X */ INVALID_INFO_MESSAGE(log, "Folding Math_Cx::"); @@ -1327,16 +1257,14 @@ TEST(render_graph, constant_fold_part_vecmath_sub_0) INVALID_INFO_MESSAGE(log, "Folding Out::"); build_vecmath_partial_test_graph(builder, NODE_VECTOR_MATH_SUBTRACT, make_float3(0,0,0)); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: partial folding for Vector Math Dot Product with known 0. */ -TEST(render_graph, constant_fold_part_vecmath_dot_0) +TEST_F(RenderGraph, constant_fold_part_vecmath_dot_0) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); /* X * 0 == 0 * X == X */ CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Vector to constant (0, 0, 0)."); @@ -1345,16 +1273,14 @@ TEST(render_graph, constant_fold_part_vecmath_dot_0) CORRECT_INFO_MESSAGE(log, "Discarding closure EmissionNode."); build_vecmath_partial_test_graph(builder, NODE_VECTOR_MATH_DOT_PRODUCT, make_float3(0,0,0)); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: partial folding for Vector Math Cross Product with known 0. */ -TEST(render_graph, constant_fold_part_vecmath_cross_0) +TEST_F(RenderGraph, constant_fold_part_vecmath_cross_0) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); /* X * 0 == 0 * X == X */ CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Vector to constant (0, 0, 0)."); @@ -1363,16 +1289,14 @@ TEST(render_graph, constant_fold_part_vecmath_cross_0) CORRECT_INFO_MESSAGE(log, "Discarding closure EmissionNode."); build_vecmath_partial_test_graph(builder, NODE_VECTOR_MATH_CROSS_PRODUCT, make_float3(0,0,0)); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: Bump with no height input folded to Normal input. */ -TEST(render_graph, constant_fold_bump) +TEST_F(RenderGraph, constant_fold_bump) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding Bump::Normal to socket Geometry1::Normal."); @@ -1382,16 +1306,14 @@ TEST(render_graph, constant_fold_bump) .add_connection("Geometry1::Normal", "Bump::Normal") .output_color("Bump::Normal"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: Bump with no inputs folded to Geometry::Normal. */ -TEST(render_graph, constant_fold_bump_no_input) +TEST_F(RenderGraph, constant_fold_bump_no_input) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding Bump::Normal to socket geometry::Normal."); @@ -1399,7 +1321,7 @@ TEST(render_graph, constant_fold_bump_no_input) .add_node(ShaderNodeBuilder<BumpNode>("Bump")) .output_color("Bump::Normal"); - graph.finalize(&scene); + graph.finalize(scene); } template<class T> @@ -1416,10 +1338,8 @@ void init_test_curve(array<T> &buffer, T start, T end, int steps) * Tests: * - Folding of RGB Curves with all constant inputs. */ -TEST(render_graph, constant_fold_rgb_curves) +TEST_F(RenderGraph, constant_fold_rgb_curves) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding Curves::Color to constant (0.275, 0.5, 0.475)."); @@ -1435,17 +1355,15 @@ TEST(render_graph, constant_fold_rgb_curves) .set("Color", make_float3(0.3f, 0.5f, 0.7f))) .output_color("Curves::Color"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: * - Folding of RGB Curves with zero Fac. */ -TEST(render_graph, constant_fold_rgb_curves_fac_0) +TEST_F(RenderGraph, constant_fold_rgb_curves_fac_0) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding Curves::Color to socket Attribute::Color."); @@ -1462,7 +1380,7 @@ TEST(render_graph, constant_fold_rgb_curves_fac_0) .add_connection("Attribute::Color", "Curves::Color") .output_color("Curves::Color"); - graph.finalize(&scene); + graph.finalize(scene); } @@ -1470,10 +1388,8 @@ TEST(render_graph, constant_fold_rgb_curves_fac_0) * Tests: * - Folding of RGB Curves with zero Fac and all constant inputs. */ -TEST(render_graph, constant_fold_rgb_curves_fac_0_const) +TEST_F(RenderGraph, constant_fold_rgb_curves_fac_0_const) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding Curves::Color to constant (0.3, 0.5, 0.7)."); @@ -1489,17 +1405,15 @@ TEST(render_graph, constant_fold_rgb_curves_fac_0_const) .set("Color", make_float3(0.3f, 0.5f, 0.7f))) .output_color("Curves::Color"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: * - Folding of Vector Curves with all constant inputs. */ -TEST(render_graph, constant_fold_vector_curves) +TEST_F(RenderGraph, constant_fold_vector_curves) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding Curves::Vector to constant (0.275, 0.5, 0.475)."); @@ -1515,17 +1429,15 @@ TEST(render_graph, constant_fold_vector_curves) .set("Vector", make_float3(0.3f, 0.5f, 0.7f))) .output_color("Curves::Vector"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: * - Folding of Vector Curves with zero Fac. */ -TEST(render_graph, constant_fold_vector_curves_fac_0) +TEST_F(RenderGraph, constant_fold_vector_curves_fac_0) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding Curves::Vector to socket Attribute::Vector."); @@ -1542,17 +1454,15 @@ TEST(render_graph, constant_fold_vector_curves_fac_0) .add_connection("Attribute::Vector", "Curves::Vector") .output_color("Curves::Vector"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: * - Folding of Color Ramp with all constant inputs. */ -TEST(render_graph, constant_fold_rgb_ramp) +TEST_F(RenderGraph, constant_fold_rgb_ramp) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding Ramp::Color to constant (0.14, 0.39, 0.64)."); CORRECT_INFO_MESSAGE(log, "Folding Ramp::Alpha to constant (0.89)."); @@ -1574,17 +1484,15 @@ TEST(render_graph, constant_fold_rgb_ramp) .add_connection("Ramp::Alpha", "Mix::Color2") .output_color("Mix::Color"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: * - Folding of Color Ramp with all constant inputs (interpolate false). */ -TEST(render_graph, constant_fold_rgb_ramp_flat) +TEST_F(RenderGraph, constant_fold_rgb_ramp_flat) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding Ramp::Color to constant (0.125, 0.375, 0.625)."); CORRECT_INFO_MESSAGE(log, "Folding Ramp::Alpha to constant (0.875)."); @@ -1606,17 +1514,15 @@ TEST(render_graph, constant_fold_rgb_ramp_flat) .add_connection("Ramp::Alpha", "Mix::Color2") .output_color("Mix::Color"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: * - Folding of redundant conversion of float to color to float. */ -TEST(render_graph, constant_fold_convert_float_color_float) +TEST_F(RenderGraph, constant_fold_convert_float_color_float) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding Invert::Color to socket convert_float_to_color::value_color."); CORRECT_INFO_MESSAGE(log, "Folding convert_color_to_float::value_float to socket Attribute::Fac."); @@ -1628,17 +1534,15 @@ TEST(render_graph, constant_fold_convert_float_color_float) .add_connection("Attribute::Fac", "Invert::Color") .output_value("Invert::Color"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: * - Folding of redundant conversion of color to vector to color. */ -TEST(render_graph, constant_fold_convert_color_vector_color) +TEST_F(RenderGraph, constant_fold_convert_color_vector_color) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding VecAdd::Vector to socket convert_color_to_vector::value_vector."); CORRECT_INFO_MESSAGE(log, "Folding convert_vector_to_color::value_color to socket Attribute::Color."); @@ -1651,17 +1555,15 @@ TEST(render_graph, constant_fold_convert_color_vector_color) .add_connection("Attribute::Color", "VecAdd::Vector1") .output_color("VecAdd::Vector"); - graph.finalize(&scene); + graph.finalize(scene); } /* * Tests: * - NOT folding conversion of color to float to color. */ -TEST(render_graph, constant_fold_convert_color_float_color) +TEST_F(RenderGraph, constant_fold_convert_color_float_color) { - DEFINE_COMMON_VARIABLES(builder, log); - EXPECT_ANY_MESSAGE(log); CORRECT_INFO_MESSAGE(log, "Folding MathAdd::Value to socket convert_color_to_float::value_float."); INVALID_INFO_MESSAGE(log, "Folding convert_float_to_color::"); @@ -1674,7 +1576,7 @@ TEST(render_graph, constant_fold_convert_color_float_color) .add_connection("Attribute::Color", "MathAdd::Value1") .output_color("MathAdd::Value"); - graph.finalize(&scene); + graph.finalize(scene); } CCL_NAMESPACE_END diff --git a/intern/cycles/test/util_string_test.cpp b/intern/cycles/test/util_string_test.cpp index 22ec8e0ee8e..6c059ba5d12 100644 --- a/intern/cycles/test/util_string_test.cpp +++ b/intern/cycles/test/util_string_test.cpp @@ -245,4 +245,41 @@ TEST(util_string_remove_trademark, both) EXPECT_EQ(str, "foo bar zzz"); } +TEST(util_string_remove_trademark, both_space) +{ + string str = string_remove_trademark("foo bar(TM) (R) zzz"); + EXPECT_EQ(str, "foo bar zzz"); +} + +TEST(util_string_remove_trademark, both_space_around) +{ + string str = string_remove_trademark("foo bar (TM) (R) zzz"); + EXPECT_EQ(str, "foo bar zzz"); +} + +TEST(util_string_remove_trademark, trademark_space_suffix) +{ + string str = string_remove_trademark("foo bar (TM)"); + EXPECT_EQ(str, "foo bar"); +} + +TEST(util_string_remove_trademark, trademark_space_middle) +{ + string str = string_remove_trademark("foo bar (TM) baz"); + EXPECT_EQ(str, "foo bar baz"); +} + + +TEST(util_string_remove_trademark, r_space_suffix) +{ + string str = string_remove_trademark("foo bar (R)"); + EXPECT_EQ(str, "foo bar"); +} + +TEST(util_string_remove_trademark, r_space_middle) +{ + string str = string_remove_trademark("foo bar (R) baz"); + EXPECT_EQ(str, "foo bar baz"); +} + CCL_NAMESPACE_END diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index d8abf671bd6..24043e2231b 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -1,6 +1,6 @@ set(INC - . + .. ../../glew-mx ) @@ -38,6 +38,7 @@ set(SRC_HEADERS util_atomic.h util_boundbox.h util_debug.h + util_defines.h util_guarded_allocator.cpp util_foreach.h util_function.h @@ -52,13 +53,23 @@ set(SRC_HEADERS util_math.h util_math_cdf.h util_math_fast.h + util_math_intersect.h + util_math_float2.h + util_math_float3.h + util_math_float4.h + util_math_int2.h + util_math_int3.h + util_math_int4.h + util_math_matrix.h util_md5.h util_opengl.h util_optimization.h util_param.h util_path.h util_progress.h + util_projection.h util_queue.h + util_rect.h util_set.h util_simd.h util_sky_model.cpp @@ -79,6 +90,32 @@ set(SRC_HEADERS util_time.h util_transform.h util_types.h + util_types_float2.h + util_types_float2_impl.h + util_types_float3.h + util_types_float3_impl.h + util_types_float4.h + util_types_float4_impl.h + util_types_int2.h + util_types_int2_impl.h + util_types_int3.h + util_types_int3_impl.h + util_types_int4.h + util_types_int4_impl.h + util_types_uchar2.h + util_types_uchar2_impl.h + util_types_uchar3.h + util_types_uchar3_impl.h + util_types_uchar4.h + util_types_uchar4_impl.h + util_types_uint2.h + util_types_uint2_impl.h + util_types_uint3.h + util_types_uint3_impl.h + util_types_uint4.h + util_types_uint4_impl.h + util_types_vector3.h + util_types_vector3_impl.h util_vector.h util_version.h util_view.h @@ -91,4 +128,4 @@ include_directories(SYSTEM ${INC_SYS}) add_definitions(${GL_DEFINITIONS}) -add_library(cycles_util ${SRC} ${SRC_HEADERS}) +cycles_add_library(cycles_util ${SRC} ${SRC_HEADERS}) diff --git a/intern/cycles/util/util_aligned_malloc.cpp b/intern/cycles/util/util_aligned_malloc.cpp index 15d2eb3271b..cc7252dcc58 100644 --- a/intern/cycles/util/util_aligned_malloc.cpp +++ b/intern/cycles/util/util_aligned_malloc.cpp @@ -14,8 +14,8 @@ * limitations under the License. */ -#include "util_aligned_malloc.h" -#include "util_guarded_allocator.h" +#include "util/util_aligned_malloc.h" +#include "util/util_guarded_allocator.h" #include <cassert> diff --git a/intern/cycles/util/util_aligned_malloc.h b/intern/cycles/util/util_aligned_malloc.h index ecc0f28c376..66d77c83454 100644 --- a/intern/cycles/util/util_aligned_malloc.h +++ b/intern/cycles/util/util_aligned_malloc.h @@ -17,10 +17,13 @@ #ifndef __UTIL_ALIGNED_MALLOC_H__ #define __UTIL_ALIGNED_MALLOC_H__ -#include "util_types.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN +/* Minimum alignment needed by all CPU native data types (SSE, AVX). */ +#define MIN_ALIGNMENT_CPU_DATA_TYPES 16 + /* Allocate block of size bytes at least aligned to a given value. */ void *util_aligned_malloc(size_t size, int alignment); diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h index 433e41fbbb6..f3c7ae546a0 100644 --- a/intern/cycles/util/util_atomic.h +++ b/intern/cycles/util/util_atomic.h @@ -22,15 +22,13 @@ /* Using atomic ops header from Blender. */ #include "atomic_ops.h" -ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value) -{ - size_t prev_value = *maximum_value; - while(prev_value < value) { - if(atomic_cas_z(maximum_value, prev_value, value) != prev_value) { - break; - } - } -} +#define atomic_add_and_fetch_float(p, x) atomic_add_and_fetch_fl((p), (x)) + +#define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1) +#define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_add_uint32((p), -1) + +#define CCL_LOCAL_MEM_FENCE 0 +#define ccl_barrier(flags) (void)0 #else /* __KERNEL_GPU__ */ @@ -39,7 +37,7 @@ ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value) /* Float atomics implementation credits: * http://suhorukov.blogspot.in/2011/12/opencl-11-atomic-operations-on-floating.html */ -ccl_device_inline void atomic_add_and_fetch_float(volatile ccl_global float *source, +ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *source, const float operand) { union { @@ -56,10 +54,32 @@ ccl_device_inline void atomic_add_and_fetch_float(volatile ccl_global float *sou } while(atomic_cmpxchg((volatile ccl_global unsigned int *)source, prev_value.int_value, new_value.int_value) != prev_value.int_value); + return new_value.float_value; } +#define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x)) +#define atomic_fetch_and_inc_uint32(p) atomic_inc((p)) +#define atomic_fetch_and_dec_uint32(p) atomic_dec((p)) + +#define CCL_LOCAL_MEM_FENCE CLK_LOCAL_MEM_FENCE +#define ccl_barrier(flags) barrier(flags) + #endif /* __KERNEL_OPENCL__ */ +#ifdef __KERNEL_CUDA__ + +#define atomic_add_and_fetch_float(p, x) (atomicAdd((float*)(p), (float)(x)) + (float)(x)) + +#define atomic_fetch_and_add_uint32(p, x) atomicAdd((unsigned int*)(p), (unsigned int)(x)) +#define atomic_fetch_and_sub_uint32(p, x) atomicSub((unsigned int*)(p), (unsigned int)(x)) +#define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1) +#define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_sub_uint32((p), 1) + +#define CCL_LOCAL_MEM_FENCE +#define ccl_barrier(flags) __syncthreads() + +#endif /* __KERNEL_CUDA__ */ + #endif /* __KERNEL_GPU__ */ #endif /* __UTIL_ATOMIC_H__ */ diff --git a/intern/cycles/util/util_boundbox.h b/intern/cycles/util/util_boundbox.h index dfe4977aef3..ed94ca20211 100644 --- a/intern/cycles/util/util_boundbox.h +++ b/intern/cycles/util/util_boundbox.h @@ -20,10 +20,10 @@ #include <math.h> #include <float.h> -#include "util_math.h" -#include "util_string.h" -#include "util_transform.h" -#include "util_types.h" +#include "util/util_math.h" +#include "util/util_string.h" +#include "util/util_transform.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_color.h b/intern/cycles/util/util_color.h index d3598f84b94..c73beab98dc 100644 --- a/intern/cycles/util/util_color.h +++ b/intern/cycles/util/util_color.h @@ -17,11 +17,11 @@ #ifndef __UTIL_COLOR_H__ #define __UTIL_COLOR_H__ -#include "util_math.h" -#include "util_types.h" +#include "util/util_math.h" +#include "util/util_types.h" #ifdef __KERNEL_SSE2__ -#include "util_simd.h" +#include "util/util_simd.h" #endif CCL_NAMESPACE_BEGIN @@ -157,16 +157,6 @@ ccl_device float3 xyz_to_rgb(float x, float y, float z) 0.055648f * x + -0.204043f * y + 1.057311f * z); } -#ifndef __KERNEL_OPENCL__ - -ccl_device float3 color_srgb_to_scene_linear(float3 c) -{ - return make_float3( - color_srgb_to_scene_linear(c.x), - color_srgb_to_scene_linear(c.y), - color_srgb_to_scene_linear(c.z)); -} - #ifdef __KERNEL_SSE2__ /* * Calculate initial guess for arg^exp based on float representation @@ -222,17 +212,38 @@ ccl_device ssef color_srgb_to_scene_linear(const ssef &c) ssef gte = fastpow24(gtebase); return select(cmp, lt, gte); } -#endif +#endif /* __KERNEL_SSE2__ */ -ccl_device float3 color_scene_linear_to_srgb(float3 c) +ccl_device float3 color_srgb_to_scene_linear_v3(float3 c) { - return make_float3( - color_scene_linear_to_srgb(c.x), - color_scene_linear_to_srgb(c.y), - color_scene_linear_to_srgb(c.z)); + return make_float3(color_srgb_to_scene_linear(c.x), + color_srgb_to_scene_linear(c.y), + color_srgb_to_scene_linear(c.z)); } +ccl_device float3 color_scene_linear_to_srgb_v3(float3 c) +{ + return make_float3(color_scene_linear_to_srgb(c.x), + color_scene_linear_to_srgb(c.y), + color_scene_linear_to_srgb(c.z)); +} + +ccl_device float4 color_srgb_to_scene_linear_v4(float4 c) +{ +#ifdef __KERNEL_SSE2__ + ssef r_ssef; + float4 &r = (float4 &)r_ssef; + r = c; + r_ssef = color_srgb_to_scene_linear(r_ssef); + r.w = c.w; + return r; +#else + return make_float4(color_srgb_to_scene_linear(c.x), + color_srgb_to_scene_linear(c.y), + color_srgb_to_scene_linear(c.z), + c.w); #endif +} ccl_device float linear_rgb_to_gray(float3 c) { diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp index 80d177d2cae..9a66a372822 100644 --- a/intern/cycles/util/util_debug.cpp +++ b/intern/cycles/util/util_debug.cpp @@ -14,12 +14,14 @@ * limitations under the License. */ -#include "util_debug.h" +#include "util/util_debug.h" #include <stdlib.h> -#include "util_logging.h" -#include "util_string.h" +#include "bvh/bvh_params.h" + +#include "util/util_logging.h" +#include "util/util_string.h" CCL_NAMESPACE_BEGIN @@ -29,7 +31,8 @@ DebugFlags::CPU::CPU() sse41(true), sse3(true), sse2(true), - qbvh(true) + bvh_layout(BVH_LAYOUT_DEFAULT), + split_kernel(false) { reset(); } @@ -54,11 +57,13 @@ void DebugFlags::CPU::reset() #undef STRINGIFY #undef CHECK_CPU_FLAGS - qbvh = true; + bvh_layout = BVH_LAYOUT_DEFAULT; + split_kernel = false; } DebugFlags::CUDA::CUDA() - : adaptive_compile(false) + : adaptive_compile(false), + split_kernel(false) { reset(); } @@ -67,12 +72,15 @@ void DebugFlags::CUDA::reset() { if(getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL) adaptive_compile = true; + + split_kernel = false; } DebugFlags::OpenCL::OpenCL() : device_type(DebugFlags::OpenCL::DEVICE_ALL), kernel_type(DebugFlags::OpenCL::KERNEL_DEFAULT), - debug(false) + debug(false), + single_program(false) { reset(); } @@ -112,16 +120,20 @@ void DebugFlags::OpenCL::reset() } /* Initialize other flags from environment variables. */ debug = (getenv("CYCLES_OPENCL_DEBUG") != NULL); + single_program = (getenv("CYCLES_OPENCL_MULTI_PROGRAM") == NULL); } DebugFlags::DebugFlags() +: viewport_static_bvh(false) { /* Nothing for now. */ } void DebugFlags::reset() { + viewport_static_bvh = false; cpu.reset(); + cuda.reset(); opencl.reset(); } @@ -129,11 +141,13 @@ std::ostream& operator <<(std::ostream &os, DebugFlagsConstRef debug_flags) { os << "CPU flags:\n" - << " AVX2 : " << string_from_bool(debug_flags.cpu.avx2) << "\n" - << " AVX : " << string_from_bool(debug_flags.cpu.avx) << "\n" - << " SSE4.1 : " << string_from_bool(debug_flags.cpu.sse41) << "\n" - << " SSE3 : " << string_from_bool(debug_flags.cpu.sse3) << "\n" - << " SSE2 : " << string_from_bool(debug_flags.cpu.sse2) << "\n"; + << " AVX2 : " << string_from_bool(debug_flags.cpu.avx2) << "\n" + << " AVX : " << string_from_bool(debug_flags.cpu.avx) << "\n" + << " SSE4.1 : " << string_from_bool(debug_flags.cpu.sse41) << "\n" + << " SSE3 : " << string_from_bool(debug_flags.cpu.sse3) << "\n" + << " SSE2 : " << string_from_bool(debug_flags.cpu.sse2) << "\n" + << " BVH layout : " << bvh_layout_name(debug_flags.cpu.bvh_layout) << "\n" + << " Split : " << string_from_bool(debug_flags.cpu.split_kernel) << "\n"; os << "CUDA flags:\n" << " Adaptive Compile: " << string_from_bool(debug_flags.cuda.adaptive_compile) << "\n"; @@ -172,10 +186,11 @@ std::ostream& operator <<(std::ostream &os, break; } os << "OpenCL flags:\n" - << " Device type : " << opencl_device_type << "\n" - << " Kernel type : " << opencl_kernel_type << "\n" - << " Debug : " << string_from_bool(debug_flags.opencl.debug) - << "\n"; + << " Device type : " << opencl_device_type << "\n" + << " Kernel type : " << opencl_kernel_type << "\n" + << " Debug : " << string_from_bool(debug_flags.opencl.debug) << "\n" + << " Single program : " << string_from_bool(debug_flags.opencl.single_program) << "\n" + << " Memory limit : " << string_human_readable_size(debug_flags.opencl.mem_limit) << "\n"; return os; } diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h index 73fd228b5d9..f17f8a560ee 100644 --- a/intern/cycles/util/util_debug.h +++ b/intern/cycles/util/util_debug.h @@ -20,7 +20,7 @@ #include <cassert> #include <iostream> -#include "util_static_assert.h" +#include "bvh/bvh_params.h" CCL_NAMESPACE_BEGIN @@ -30,6 +30,9 @@ CCL_NAMESPACE_BEGIN */ class DebugFlags { public: + /* Use static BVH in viewport, to match final render exactly. */ + bool viewport_static_bvh; + /* Descriptor of CPU feature-set to be used. */ struct CPU { CPU(); @@ -44,8 +47,24 @@ public: bool sse3; bool sse2; - /* Whether QBVH usage is allowed or not. */ - bool qbvh; + /* Check functions to see whether instructions up to the given one + * are allowed for use. + */ + bool has_avx2() { return has_avx() && avx2; } + bool has_avx() { return has_sse41() && avx; } + bool has_sse41() { return has_sse3() && sse41; } + bool has_sse3() { return has_sse2() && sse3; } + bool has_sse2() { return sse2; } + + /* Requested BVH size. + * + * Rendering will use widest possible BVH which is below or equal + * this one. + */ + BVHLayout bvh_layout; + + /* Whether split kernel is used */ + bool split_kernel; }; /* Descriptor of CUDA feature-set to be used. */ @@ -58,6 +77,9 @@ public: /* Whether adaptive feature based runtime compile is enabled or not. * Requires the CUDA Toolkit and only works on Linux atm. */ bool adaptive_compile; + + /* Whether split kernel is used */ + bool split_kernel; }; /* Descriptor of OpenCL feature-set to be used. */ @@ -106,6 +128,13 @@ public: /* Use debug version of the kernel. */ bool debug; + + /* Use single program */ + bool single_program; + + /* TODO(mai): Currently this is only for OpenCL, but we should have it implemented for all devices. */ + /* Artificial memory limit in bytes (0 if disabled). */ + size_t mem_limit; }; /* Get instance of debug flags registry. */ diff --git a/intern/cycles/util/util_defines.h b/intern/cycles/util/util_defines.h new file mode 100644 index 00000000000..98944a19022 --- /dev/null +++ b/intern/cycles/util/util_defines.h @@ -0,0 +1,136 @@ + +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_DEFINES_H__ +#define __UTIL_DEFINES_H__ + +/* Bitness */ + +#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) +# define __KERNEL_64_BIT__ +#endif + +/* Qualifiers for kernel code shared by CPU and GPU */ + +#ifndef __KERNEL_GPU__ +# define ccl_device static inline +# define ccl_device_noinline static +# define ccl_global +# define ccl_static_constant static const +# define ccl_constant const +# define ccl_local +# define ccl_local_param +# define ccl_private +# define ccl_restrict __restrict +# define ccl_ref & +# define __KERNEL_WITH_SSE_ALIGN__ + +# if defined(_WIN32) && !defined(FREE_WINDOWS) +# define ccl_device_inline static __forceinline +# define ccl_device_forceinline static __forceinline +# define ccl_align(...) __declspec(align(__VA_ARGS__)) +# ifdef __KERNEL_64_BIT__ +# define ccl_try_align(...) __declspec(align(__VA_ARGS__)) +# else /* __KERNEL_64_BIT__ */ +# undef __KERNEL_WITH_SSE_ALIGN__ +/* No support for function arguments (error C2719). */ +# define ccl_try_align(...) +# endif /* __KERNEL_64_BIT__ */ +# define ccl_may_alias +# define ccl_always_inline __forceinline +# define ccl_never_inline __declspec(noinline) +# define ccl_maybe_unused +# else /* _WIN32 && !FREE_WINDOWS */ +# define ccl_device_inline static inline __attribute__((always_inline)) +# define ccl_device_forceinline static inline __attribute__((always_inline)) +# define ccl_align(...) __attribute__((aligned(__VA_ARGS__))) +# ifndef FREE_WINDOWS64 +# define __forceinline inline __attribute__((always_inline)) +# endif +# define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__))) +# define ccl_may_alias __attribute__((__may_alias__)) +# define ccl_always_inline __attribute__((always_inline)) +# define ccl_never_inline __attribute__((noinline)) +# define ccl_maybe_unused __attribute__((used)) +# endif /* _WIN32 && !FREE_WINDOWS */ + +/* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). */ +# if defined(__GNUC__) && (__GNUC__ >= 7) /* gcc7.0+ only */ +# define ATTR_FALLTHROUGH __attribute__((fallthrough)) +# else +# define ATTR_FALLTHROUGH ((void)0) +# endif +#endif /* __KERNEL_GPU__ */ + +/* macros */ + +/* hints for branch prediction, only use in code that runs a _lot_ */ +#if defined(__GNUC__) && defined(__KERNEL_CPU__) +# define LIKELY(x) __builtin_expect(!!(x), 1) +# define UNLIKELY(x) __builtin_expect(!!(x), 0) +#else +# define LIKELY(x) (x) +# define UNLIKELY(x) (x) +#endif + +#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && _MSC_VER >= 1800)) +# define HAS_CPP11_FEATURES +#endif + +#if defined(__GNUC__) || defined(__clang__) +# if defined(HAS_CPP11_FEATURES) +/* Some magic to be sure we don't have reference in the type. */ +template<typename T> static inline T decltype_helper(T x) { return x; } +# define TYPEOF(x) decltype(decltype_helper(x)) +# else +# define TYPEOF(x) typeof(x) +# endif +#endif + +/* Causes warning: + * incompatible types when assigning to type 'Foo' from type 'Bar' + * ... the compiler optimizes away the temp var */ +#ifdef __GNUC__ +#define CHECK_TYPE(var, type) { \ + TYPEOF(var) *__tmp; \ + __tmp = (type *)NULL; \ + (void)__tmp; \ +} (void)0 + +#define CHECK_TYPE_PAIR(var_a, var_b) { \ + TYPEOF(var_a) *__tmp; \ + __tmp = (typeof(var_b) *)NULL; \ + (void)__tmp; \ +} (void)0 +#else +# define CHECK_TYPE(var, type) +# define CHECK_TYPE_PAIR(var_a, var_b) +#endif + +/* can be used in simple macros */ +#define CHECK_TYPE_INLINE(val, type) \ + ((void)(((type)0) != (val))) + +#ifndef __KERNEL_GPU__ +# include <cassert> +# define util_assert(statement) assert(statement) +#else +# define util_assert(statement) +#endif + +#endif /* __UTIL_DEFINES_H__ */ + diff --git a/intern/cycles/util/util_guarded_allocator.cpp b/intern/cycles/util/util_guarded_allocator.cpp index 615ac95f324..54fa6a80df5 100644 --- a/intern/cycles/util/util_guarded_allocator.cpp +++ b/intern/cycles/util/util_guarded_allocator.cpp @@ -14,8 +14,8 @@ * limitations under the License. */ -#include "util_guarded_allocator.h" -#include "util_stats.h" +#include "util/util_guarded_allocator.h" +#include "util/util_stats.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_guarded_allocator.h b/intern/cycles/util/util_guarded_allocator.h index 78453d214be..87c1526dee4 100644 --- a/intern/cycles/util/util_guarded_allocator.h +++ b/intern/cycles/util/util_guarded_allocator.h @@ -20,9 +20,6 @@ #include <cstddef> #include <memory> -#include "util_debug.h" -#include "util_types.h" - #ifdef WITH_BLENDER_GUARDEDALLOC # include "../../guardedalloc/MEM_guardedalloc.h" #endif @@ -50,9 +47,9 @@ public: T *allocate(size_t n, const void *hint = 0) { + (void)hint; size_t size = n * sizeof(T); util_guarded_mem_alloc(size); - (void)hint; if(n == 0) { return NULL; } diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h index 7285c6ef600..612228dd1c1 100644 --- a/intern/cycles/util/util_half.h +++ b/intern/cycles/util/util_half.h @@ -17,10 +17,11 @@ #ifndef __UTIL_HALF_H__ #define __UTIL_HALF_H__ -#include "util_types.h" +#include "util/util_types.h" +#include "util/util_math.h" #ifdef __KERNEL_SSE2__ -#include "util_simd.h" +#include "util/util_simd.h" #endif CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_hash.h b/intern/cycles/util/util_hash.h index 98c3a681ff2..2307ca158f0 100644 --- a/intern/cycles/util/util_hash.h +++ b/intern/cycles/util/util_hash.h @@ -17,7 +17,7 @@ #ifndef __UTIL_HASH_H__ #define __UTIL_HASH_H__ -#include "util_types.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN @@ -61,6 +61,11 @@ static inline uint hash_string(const char *str) } #endif +ccl_device_inline float hash_int_01(uint k) +{ + return (float)hash_int(k) * (1.0f/(float)0xFFFFFFFF); +} + CCL_NAMESPACE_END #endif /* __UTIL_HASH_H__ */ diff --git a/intern/cycles/util/util_image.h b/intern/cycles/util/util_image.h index c8efc551d97..18876841b5b 100644 --- a/intern/cycles/util/util_image.h +++ b/intern/cycles/util/util_image.h @@ -21,7 +21,7 @@ #include <OpenImageIO/imageio.h> -#include "util_vector.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN @@ -42,4 +42,4 @@ CCL_NAMESPACE_END #endif /* __UTIL_IMAGE_H__ */ -#include "util_image_impl.h" +#include "util/util_image_impl.h" diff --git a/intern/cycles/util/util_image_impl.h b/intern/cycles/util/util_image_impl.h index 4daf1eaac22..751f52aaa86 100644 --- a/intern/cycles/util/util_image_impl.h +++ b/intern/cycles/util/util_image_impl.h @@ -17,10 +17,9 @@ #ifndef __UTIL_IMAGE_IMPL_H__ #define __UTIL_IMAGE_IMPL_H__ -#include "util_algorithm.h" -#include "util_debug.h" -#include "util_half.h" -#include "util_image.h" +#include "util/util_algorithm.h" +#include "util/util_half.h" +#include "util/util_image.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_logging.cpp b/intern/cycles/util/util_logging.cpp index 03041723e15..f38683bf7de 100644 --- a/intern/cycles/util/util_logging.cpp +++ b/intern/cycles/util/util_logging.cpp @@ -14,9 +14,9 @@ * limitations under the License. */ -#include <util_logging.h> +#include "util/util_logging.h" -#include "util_math.h" +#include "util/util_math.h" #include <stdio.h> #ifdef _MSC_VER @@ -30,10 +30,10 @@ void util_logging_init(const char *argv0) #ifdef WITH_CYCLES_LOGGING using CYCLES_GFLAGS_NAMESPACE::SetCommandLineOption; - /* Make it so FATAL messages are always print into console. */ + /* Make it so ERROR messages are always print into console. */ char severity_fatal[32]; snprintf(severity_fatal, sizeof(severity_fatal), "%d", - google::GLOG_FATAL); + google::GLOG_ERROR); google::InitGoogleLogging(argv0); SetCommandLineOption("logtostderr", "1"); @@ -69,6 +69,15 @@ void util_logging_verbosity_set(int verbosity) } std::ostream& operator <<(std::ostream &os, + const int2 &value) +{ + os << "(" << value.x + << ", " << value.y + << ")"; + return os; +} + +std::ostream& operator <<(std::ostream &os, const float3 &value) { os << "(" << value.x diff --git a/intern/cycles/util/util_logging.h b/intern/cycles/util/util_logging.h index 2aa9c25b1a0..5c84b6593d3 100644 --- a/intern/cycles/util/util_logging.h +++ b/intern/cycles/util/util_logging.h @@ -18,33 +18,37 @@ #define __UTIL_LOGGING_H__ #if defined(WITH_CYCLES_LOGGING) && !defined(__KERNEL_GPU__) +# include <gflags/gflags.h> # include <glog/logging.h> -#else -# include <iostream> #endif +#include <iostream> + CCL_NAMESPACE_BEGIN #if !defined(WITH_CYCLES_LOGGING) || defined(__KERNEL_GPU__) -class StubStream : public std::ostream { - public: - StubStream() : std::ostream(NULL) { } +class StubStream { +public: + template<class T> + StubStream& operator<<(const T&) { + return *this; + } }; class LogMessageVoidify { public: LogMessageVoidify() { } - void operator&(::std::ostream&) { } + void operator&(StubStream&) { } }; # define LOG_SUPPRESS() (true) ? (void) 0 : LogMessageVoidify() & StubStream() # define LOG(severity) LOG_SUPPRESS() # define VLOG(severity) LOG_SUPPRESS() - #endif #define VLOG_ONCE(level, flag) if(!flag) flag = true, VLOG(level) +struct int2; struct float3; void util_logging_init(const char *argv0); @@ -52,6 +56,8 @@ void util_logging_start(void); void util_logging_verbosity_set(int verbosity); std::ostream& operator <<(std::ostream &os, + const int2 &value); +std::ostream& operator <<(std::ostream &os, const float3 &value); CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index 2b81c8c498a..d0e91a2a1c9 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -28,14 +28,12 @@ #ifndef __KERNEL_OPENCL__ +# include <float.h> +# include <math.h> +# include <stdio.h> +#endif /* __KERNEL_OPENCL__ */ -#include <float.h> -#include <math.h> -#include <stdio.h> - -#endif - -#include "util_types.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN @@ -43,49 +41,44 @@ CCL_NAMESPACE_BEGIN /* Division */ #ifndef M_PI_F -#define M_PI_F ((float)3.14159265358979323846264338327950288) /* pi */ +# define M_PI_F (3.1415926535897932f) /* pi */ #endif #ifndef M_PI_2_F -#define M_PI_2_F ((float)1.57079632679489661923132169163975144) /* pi/2 */ +# define M_PI_2_F (1.5707963267948966f) /* pi/2 */ #endif #ifndef M_PI_4_F -#define M_PI_4_F ((float)0.785398163397448309615660845819875721) /* pi/4 */ +# define M_PI_4_F (0.7853981633974830f) /* pi/4 */ #endif #ifndef M_1_PI_F -#define M_1_PI_F ((float)0.318309886183790671537767526745028724) /* 1/pi */ +# define M_1_PI_F (0.3183098861837067f) /* 1/pi */ #endif #ifndef M_2_PI_F -#define M_2_PI_F ((float)0.636619772367581343075535053490057448) /* 2/pi */ +# define M_2_PI_F (0.6366197723675813f) /* 2/pi */ #endif /* Multiplication */ #ifndef M_2PI_F -#define M_2PI_F ((float)6.283185307179586476925286766559005768) /* 2*pi */ +# define M_2PI_F (6.2831853071795864f) /* 2*pi */ #endif #ifndef M_4PI_F -#define M_4PI_F ((float)12.56637061435917295385057353311801153) /* 4*pi */ +# define M_4PI_F (12.566370614359172f) /* 4*pi */ #endif /* Float sqrt variations */ - #ifndef M_SQRT2_F -#define M_SQRT2_F ((float)1.41421356237309504880) /* sqrt(2) */ +# define M_SQRT2_F (1.4142135623730950f) /* sqrt(2) */ #endif - #ifndef M_LN2_F -#define M_LN2_F ((float)0.6931471805599453) /* ln(2) */ +# define M_LN2_F (0.6931471805599453f) /* ln(2) */ #endif - #ifndef M_LN10_F -#define M_LN10_F ((float)2.3025850929940457) /* ln(10) */ +# define M_LN10_F (2.3025850929940457f) /* ln(10) */ #endif /* Scalar */ #ifdef _WIN32 - -#ifndef __KERNEL_OPENCL__ - +# ifndef __KERNEL_OPENCL__ ccl_device_inline float fmaxf(float a, float b) { return (a > b)? a: b; @@ -95,15 +88,13 @@ ccl_device_inline float fminf(float a, float b) { return (a < b)? a: b; } - -#endif - -#endif +# endif /* !__KERNEL_OPENCL__ */ +#endif /* _WIN32 */ #ifndef __KERNEL_GPU__ - using std::isfinite; using std::isnan; +using std::sqrt; ccl_device_inline int abs(int x) { @@ -157,8 +148,7 @@ ccl_device_inline T max4(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); } - -#endif +#endif /* __KERNEL_GPU__ */ ccl_device_inline float min4(float a, float b, float c, float d) { @@ -170,525 +160,141 @@ ccl_device_inline float max4(float a, float b, float c, float d) return max(max(a, b), max(c, d)); } -ccl_device_inline float max3(float3 a) -{ - return max(max(a.x, a.y), a.z); -} - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline int clamp(int a, int mn, int mx) -{ - return min(max(a, mn), mx); -} - -ccl_device_inline float clamp(float a, float mn, float mx) -{ - return min(max(a, mn), mx); -} - -ccl_device_inline float mix(float a, float b, float t) -{ - return a + t*(b - a); -} - -#endif - -#ifndef __KERNEL_CUDA__ - -ccl_device_inline float saturate(float a) -{ - return clamp(a, 0.0f, 1.0f); -} - -#endif - -ccl_device_inline int float_to_int(float f) -{ - return (int)f; -} - -ccl_device_inline int floor_to_int(float f) -{ - return float_to_int(floorf(f)); -} - -ccl_device_inline int ceil_to_int(float f) -{ - return float_to_int(ceilf(f)); -} - -ccl_device_inline float signf(float f) -{ - return (f < 0.0f)? -1.0f: 1.0f; -} - -ccl_device_inline float nonzerof(float f, float eps) -{ - if(fabsf(f) < eps) - return signf(f)*eps; - else - return f; -} - -ccl_device_inline float smoothstepf(float f) -{ - float ff = f*f; - return (3.0f*ff - 2.0f*ff*f); -} - -ccl_device_inline int mod(int x, int m) -{ - return (x % m + m) % m; -} - -/* Float2 Vector */ - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline bool is_zero(const float2& a) -{ - return (a.x == 0.0f && a.y == 0.0f); -} - -#endif - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline float average(const float2& a) -{ - return (a.x + a.y)*(1.0f/2.0f); -} - -#endif - #ifndef __KERNEL_OPENCL__ +/* Int/Float conversion */ -ccl_device_inline float2 operator-(const float2& a) -{ - return make_float2(-a.x, -a.y); -} - -ccl_device_inline float2 operator*(const float2& a, const float2& b) -{ - return make_float2(a.x*b.x, a.y*b.y); -} - -ccl_device_inline float2 operator*(const float2& a, float f) -{ - return make_float2(a.x*f, a.y*f); -} - -ccl_device_inline float2 operator*(float f, const float2& a) -{ - return make_float2(a.x*f, a.y*f); -} - -ccl_device_inline float2 operator/(float f, const float2& a) -{ - return make_float2(f/a.x, f/a.y); -} - -ccl_device_inline float2 operator/(const float2& a, float f) -{ - float invf = 1.0f/f; - return make_float2(a.x*invf, a.y*invf); -} - -ccl_device_inline float2 operator/(const float2& a, const float2& b) +ccl_device_inline int as_int(uint i) { - return make_float2(a.x/b.x, a.y/b.y); + union { uint ui; int i; } u; + u.ui = i; + return u.i; } -ccl_device_inline float2 operator+(const float2& a, const float2& b) +ccl_device_inline uint as_uint(int i) { - return make_float2(a.x+b.x, a.y+b.y); + union { uint ui; int i; } u; + u.i = i; + return u.ui; } -ccl_device_inline float2 operator-(const float2& a, const float2& b) +ccl_device_inline uint as_uint(float f) { - return make_float2(a.x-b.x, a.y-b.y); + union { uint i; float f; } u; + u.f = f; + return u.i; } -ccl_device_inline float2 operator+=(float2& a, const float2& b) +ccl_device_inline int __float_as_int(float f) { - return a = a + b; + union { int i; float f; } u; + u.f = f; + return u.i; } -ccl_device_inline float2 operator*=(float2& a, const float2& b) +ccl_device_inline float __int_as_float(int i) { - return a = a * b; + union { int i; float f; } u; + u.i = i; + return u.f; } -ccl_device_inline float2 operator*=(float2& a, float f) +ccl_device_inline uint __float_as_uint(float f) { - return a = a * f; + union { uint i; float f; } u; + u.f = f; + return u.i; } -ccl_device_inline float2 operator/=(float2& a, const float2& b) +ccl_device_inline float __uint_as_float(uint i) { - return a = a / b; + union { uint i; float f; } u; + u.i = i; + return u.f; } +#endif /* __KERNEL_OPENCL__ */ -ccl_device_inline float2 operator/=(float2& a, float f) +/* Versions of functions which are safe for fast math. */ +ccl_device_inline bool isnan_safe(float f) { - float invf = 1.0f/f; - return a = a * invf; + unsigned int x = __float_as_uint(f); + return (x << 1) > 0xff000000u; } - -ccl_device_inline float dot(const float2& a, const float2& b) +ccl_device_inline bool isfinite_safe(float f) { - return a.x*b.x + a.y*b.y; + /* By IEEE 754 rule, 2*Inf equals Inf */ + unsigned int x = __float_as_uint(f); + return (f == f) && (x == 0 || x == (1u << 31) || (f != 2.0f*f)) && !((x << 1) > 0xff000000u); } -ccl_device_inline float cross(const float2& a, const float2& b) +ccl_device_inline float ensure_finite(float v) { - return (a.x*b.y - a.y*b.x); + return isfinite_safe(v)? v : 0.0f; } -#endif - #ifndef __KERNEL_OPENCL__ - -ccl_device_inline bool operator==(const int2 a, const int2 b) -{ - return (a.x == b.x && a.y == b.y); -} - -ccl_device_inline float len(const float2& a) -{ - return sqrtf(dot(a, a)); -} - -ccl_device_inline float2 normalize(const float2& a) -{ - return a/len(a); -} - -ccl_device_inline float2 normalize_len(const float2& a, float *t) -{ - *t = len(a); - return a/(*t); -} - -ccl_device_inline float2 safe_normalize(const float2& a) -{ - float t = len(a); - return (t != 0.0f)? a/t: a; -} - -ccl_device_inline bool operator==(const float2& a, const float2& b) -{ - return (a.x == b.x && a.y == b.y); -} - -ccl_device_inline bool operator!=(const float2& a, const float2& b) -{ - return !(a == b); -} - -ccl_device_inline float2 min(const float2& a, const float2& b) -{ - return make_float2(min(a.x, b.x), min(a.y, b.y)); -} - -ccl_device_inline float2 max(const float2& a, const float2& b) -{ - return make_float2(max(a.x, b.x), max(a.y, b.y)); -} - -ccl_device_inline float2 clamp(const float2& a, const float2& mn, const float2& mx) +ccl_device_inline int clamp(int a, int mn, int mx) { return min(max(a, mn), mx); } -ccl_device_inline float2 fabs(const float2& a) -{ - return make_float2(fabsf(a.x), fabsf(a.y)); -} - -ccl_device_inline float2 as_float2(const float4& a) -{ - return make_float2(a.x, a.y); -} - -#endif - -#ifndef __KERNEL_GPU__ - -ccl_device_inline void print_float2(const char *label, const float2& a) -{ - printf("%s: %.8f %.8f\n", label, (double)a.x, (double)a.y); -} - -#endif - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline float2 interp(const float2& a, const float2& b, float t) -{ - return a + t*(b - a); -} - -#endif - -/* Float3 Vector */ - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline float3 operator-(const float3& a) -{ -#ifdef __KERNEL_SSE__ - return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)))); -#else - return make_float3(-a.x, -a.y, -a.z); -#endif -} - -ccl_device_inline float3 operator*(const float3& a, const float3& b) -{ -#ifdef __KERNEL_SSE__ - return float3(_mm_mul_ps(a.m128,b.m128)); -#else - return make_float3(a.x*b.x, a.y*b.y, a.z*b.z); -#endif -} - -ccl_device_inline float3 operator*(const float3& a, const float f) -{ -#ifdef __KERNEL_SSE__ - return float3(_mm_mul_ps(a.m128,_mm_set1_ps(f))); -#else - return make_float3(a.x*f, a.y*f, a.z*f); -#endif -} - -ccl_device_inline float3 operator*(const float f, const float3& a) -{ - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */ -#if defined(__KERNEL_SSE__) && 0 - return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128)); -#else - return make_float3(a.x*f, a.y*f, a.z*f); -#endif -} - -ccl_device_inline float3 operator/(const float f, const float3& a) -{ - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */ -#if defined(__KERNEL_SSE__) && 0 - __m128 rc = _mm_rcp_ps(a.m128); - return float3(_mm_mul_ps(_mm_set1_ps(f),rc)); -#else - return make_float3(f / a.x, f / a.y, f / a.z); -#endif -} - -ccl_device_inline float3 operator/(const float3& a, const float f) -{ - float invf = 1.0f/f; - return a * invf; -} - -ccl_device_inline float3 operator/(const float3& a, const float3& b) -{ - /* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */ -#if defined(__KERNEL_SSE__) && 0 - __m128 rc = _mm_rcp_ps(b.m128); - return float3(_mm_mul_ps(a, rc)); -#else - return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); -#endif -} - -ccl_device_inline float3 operator+(const float3& a, const float3& b) -{ -#ifdef __KERNEL_SSE__ - return float3(_mm_add_ps(a.m128, b.m128)); -#else - return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); -#endif -} - -ccl_device_inline float3 operator-(const float3& a, const float3& b) -{ -#ifdef __KERNEL_SSE__ - return float3(_mm_sub_ps(a.m128, b.m128)); -#else - return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); -#endif -} - -ccl_device_inline float3 operator+=(float3& a, const float3& b) -{ - return a = a + b; -} - -ccl_device_inline float3 operator*=(float3& a, const float3& b) -{ - return a = a * b; -} - -ccl_device_inline float3 operator*=(float3& a, float f) -{ - return a = a * f; -} - -ccl_device_inline float3 operator/=(float3& a, const float3& b) -{ - return a = a / b; -} - -ccl_device_inline float3 operator/=(float3& a, float f) -{ - float invf = 1.0f/f; - return a = a * invf; -} - -ccl_device_inline float dot(const float3& a, const float3& b) -{ -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) - return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F)); -#else - return a.x*b.x + a.y*b.y + a.z*b.z; -#endif -} - -ccl_device_inline float dot_xy(const float3& a, const float3& b) -{ -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) - return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a,b),b)); -#else - return a.x*b.x + a.y*b.y; -#endif -} - -ccl_device_inline float dot(const float4& a, const float4& b) -{ -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) - return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF)); -#else - return (a.x*b.x + a.y*b.y) + (a.z*b.z + a.w*b.w); -#endif -} - -ccl_device_inline float3 cross(const float3& a, const float3& b) -{ - float3 r = make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); - return r; -} - -#endif - -ccl_device_inline float len(const float3 a) -{ -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) - return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(a.m128, a.m128, 0x7F))); -#else - return sqrtf(dot(a, a)); -#endif -} - -ccl_device_inline float len_squared(const float3 a) -{ - return dot(a, a); -} - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline float len_squared(const float4& a) -{ - return dot(a, a); -} - -ccl_device_inline float3 normalize(const float3& a) -{ -#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) - __m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F)); - return _mm_div_ps(a.m128, norm); -#else - return a/len(a); -#endif -} - -#endif - -ccl_device_inline float3 saturate3(float3 a) +ccl_device_inline float clamp(float a, float mn, float mx) { - return make_float3(saturate(a.x), saturate(a.y), saturate(a.z)); + return min(max(a, mn), mx); } -ccl_device_inline float3 normalize_len(const float3 a, float *t) +ccl_device_inline float mix(float a, float b, float t) { - *t = len(a); - float x = 1.0f / *t; - return a*x; + return a + t*(b - a); } +#endif /* __KERNEL_OPENCL__ */ -ccl_device_inline float3 safe_normalize(const float3 a) +#ifndef __KERNEL_CUDA__ +ccl_device_inline float saturate(float a) { - float t = len(a); - return (t != 0.0f)? a * (1.0f/t) : a; + return clamp(a, 0.0f, 1.0f); } +#endif /* __KERNEL_CUDA__ */ -ccl_device_inline float3 safe_normalize_len(const float3 a, float *t) +ccl_device_inline int float_to_int(float f) { - *t = len(a); - return (*t != 0.0f)? a/(*t): a; + return (int)f; } -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline bool operator==(const float3& a, const float3& b) +ccl_device_inline int floor_to_int(float f) { -#ifdef __KERNEL_SSE__ - return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7; -#else - return (a.x == b.x && a.y == b.y && a.z == b.z); -#endif + return float_to_int(floorf(f)); } -ccl_device_inline bool operator!=(const float3& a, const float3& b) +ccl_device_inline int ceil_to_int(float f) { - return !(a == b); + return float_to_int(ceilf(f)); } -ccl_device_inline float3 min(const float3& a, const float3& b) +ccl_device_inline float signf(float f) { -#ifdef __KERNEL_SSE__ - return _mm_min_ps(a.m128, b.m128); -#else - return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); -#endif + return (f < 0.0f)? -1.0f: 1.0f; } -ccl_device_inline float3 max(const float3& a, const float3& b) +ccl_device_inline float nonzerof(float f, float eps) { -#ifdef __KERNEL_SSE__ - return _mm_max_ps(a.m128, b.m128); -#else - return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); -#endif + if(fabsf(f) < eps) + return signf(f)*eps; + else + return f; } -ccl_device_inline float3 clamp(const float3& a, const float3& mn, const float3& mx) +ccl_device_inline float smoothstepf(float f) { - return min(max(a, mn), mx); + float ff = f*f; + return (3.0f*ff - 2.0f*ff*f); } -ccl_device_inline float3 fabs(const float3& a) +ccl_device_inline int mod(int x, int m) { -#ifdef __KERNEL_SSE__ - __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); - return _mm_and_ps(a.m128, mask); -#else - return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z)); -#endif + return (x % m + m) % m; } -#endif - ccl_device_inline float3 float2_to_float3(const float2 a) { return make_float3(a.x, a.y, 0.0f); @@ -704,557 +310,21 @@ ccl_device_inline float4 float3_to_float4(const float3 a) return make_float4(a.x, a.y, a.z, 1.0f); } -#ifndef __KERNEL_GPU__ - -ccl_device_inline void print_float3(const char *label, const float3& a) -{ - printf("%s: %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z); -} - -ccl_device_inline float3 rcp(const float3& a) -{ -#ifdef __KERNEL_SSE__ - float4 r = _mm_rcp_ps(a.m128); - return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); -#else - return make_float3(1.0f/a.x, 1.0f/a.y, 1.0f/a.z); -#endif -} - -#endif - -ccl_device_inline float3 interp(float3 a, float3 b, float t) -{ - return a + t*(b - a); -} - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline float3 mix(const float3& a, const float3& b, float t) -{ - return a + t*(b - a); -} - -#endif - -ccl_device_inline bool is_zero(const float3 a) -{ -#ifdef __KERNEL_SSE__ - return a == make_float3(0.0f); -#else - return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f); -#endif -} - -ccl_device_inline float reduce_add(const float3 a) -{ - return (a.x + a.y + a.z); -} - -ccl_device_inline float average(const float3 a) -{ - return reduce_add(a)*(1.0f/3.0f); -} - -ccl_device_inline bool isequal_float3(const float3 a, const float3 b) -{ -#ifdef __KERNEL_OPENCL__ - return all(a == b); -#else - return a == b; -#endif -} - -/* Float4 Vector */ - -#ifdef __KERNEL_SSE__ - -template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> __forceinline const float4 shuffle(const float4& b) -{ - return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0))); -} - -template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b) -{ - return _mm_moveldup_ps(b); -} - -template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b) -{ - return _mm_movehdup_ps(b); -} - -template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b) -{ - return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b))); -} - -#endif - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline float4 operator-(const float4& a) -{ -#ifdef __KERNEL_SSE__ - __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); - return _mm_xor_ps(a.m128, mask); -#else - return make_float4(-a.x, -a.y, -a.z, -a.w); -#endif -} - -ccl_device_inline float4 operator*(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return _mm_mul_ps(a.m128, b.m128); -#else - return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w); -#endif -} - -ccl_device_inline float4 operator*(const float4& a, float f) -{ -#if defined(__KERNEL_SSE__) - return a * make_float4(f); -#else - return make_float4(a.x*f, a.y*f, a.z*f, a.w*f); -#endif -} - -ccl_device_inline float4 operator*(float f, const float4& a) -{ - return a * f; -} - -ccl_device_inline float4 rcp(const float4& a) -{ -#ifdef __KERNEL_SSE__ - float4 r = _mm_rcp_ps(a.m128); - return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); -#else - return make_float4(1.0f/a.x, 1.0f/a.y, 1.0f/a.z, 1.0f/a.w); -#endif -} - -ccl_device_inline float4 operator/(const float4& a, float f) -{ - return a * (1.0f/f); -} - -ccl_device_inline float4 operator/(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return a * rcp(b); -#else - return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); -#endif - -} - -ccl_device_inline float4 operator+(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return _mm_add_ps(a.m128, b.m128); -#else - return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); -#endif -} - -ccl_device_inline float4 operator-(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return _mm_sub_ps(a.m128, b.m128); -#else - return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w); -#endif -} - -ccl_device_inline float4 operator+=(float4& a, const float4& b) -{ - return a = a + b; -} - -ccl_device_inline float4 operator*=(float4& a, const float4& b) -{ - return a = a * b; -} - -ccl_device_inline float4 operator/=(float4& a, float f) -{ - return a = a / f; -} - -ccl_device_inline int4 operator<(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return _mm_cvtps_epi32(_mm_cmplt_ps(a.m128, b.m128)); /* todo: avoid cvt */ -#else - return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w); -#endif -} - -ccl_device_inline int4 operator>=(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return _mm_cvtps_epi32(_mm_cmpge_ps(a.m128, b.m128)); /* todo: avoid cvt */ -#else - return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w); -#endif -} - -ccl_device_inline int4 operator<=(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return _mm_cvtps_epi32(_mm_cmple_ps(a.m128, b.m128)); /* todo: avoid cvt */ -#else - return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w); -#endif -} - -ccl_device_inline bool operator==(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15; -#else - return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w); -#endif -} - -ccl_device_inline float4 cross(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return (shuffle<1,2,0,0>(a)*shuffle<2,0,1,0>(b)) - (shuffle<2,0,1,0>(a)*shuffle<1,2,0,0>(b)); -#else - return make_float4(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x, 0.0f); -#endif -} - -ccl_device_inline bool is_zero(const float4& a) -{ -#ifdef __KERNEL_SSE__ - return a == make_float4(0.0f); -#else - return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f); -#endif -} - -ccl_device_inline float reduce_add(const float4& a) -{ -#ifdef __KERNEL_SSE__ - float4 h = shuffle<1,0,3,2>(a) + a; - return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h); /* todo: efficiency? */ -#else - return ((a.x + a.y) + (a.z + a.w)); -#endif -} - -ccl_device_inline float average(const float4& a) -{ - return reduce_add(a) * 0.25f; -} - -ccl_device_inline float len(const float4& a) -{ - return sqrtf(dot(a, a)); -} - -ccl_device_inline float4 normalize(const float4& a) -{ - return a/len(a); -} - -ccl_device_inline float4 safe_normalize(const float4& a) -{ - float t = len(a); - return (t != 0.0f)? a/t: a; -} - -ccl_device_inline float4 min(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return _mm_min_ps(a.m128, b.m128); -#else - return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); -#endif -} - -ccl_device_inline float4 max(const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return _mm_max_ps(a.m128, b.m128); -#else - return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); -#endif -} - -#endif - -#ifndef __KERNEL_GPU__ - -ccl_device_inline float4 select(const int4& mask, const float4& a, const float4& b) -{ -#ifdef __KERNEL_SSE__ - return _mm_or_ps(_mm_and_ps(_mm_cvtepi32_ps(mask), a), _mm_andnot_ps(_mm_cvtepi32_ps(mask), b)); /* todo: avoid cvt */ -#else - return make_float4((mask.x)? a.x: b.x, (mask.y)? a.y: b.y, (mask.z)? a.z: b.z, (mask.w)? a.w: b.w); -#endif -} - -ccl_device_inline float4 reduce_min(const float4& a) -{ -#ifdef __KERNEL_SSE__ - float4 h = min(shuffle<1,0,3,2>(a), a); - return min(shuffle<2,3,0,1>(h), h); -#else - return make_float4(min(min(a.x, a.y), min(a.z, a.w))); -#endif -} - -ccl_device_inline float4 reduce_max(const float4& a) -{ -#ifdef __KERNEL_SSE__ - float4 h = max(shuffle<1,0,3,2>(a), a); - return max(shuffle<2,3,0,1>(h), h); -#else - return make_float4(max(max(a.x, a.y), max(a.z, a.w))); -#endif -} - -#if 0 -ccl_device_inline float4 reduce_add(const float4& a) -{ -#ifdef __KERNEL_SSE__ - float4 h = shuffle<1,0,3,2>(a) + a; - return shuffle<2,3,0,1>(h) + h; -#else - return make_float4((a.x + a.y) + (a.z + a.w)); -#endif -} -#endif - -ccl_device_inline void print_float4(const char *label, const float4& a) -{ - printf("%s: %.8f %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z, (double)a.w); -} - -#endif - -/* Int2 */ - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline int2 operator+(const int2 &a, const int2 &b) -{ - return make_int2(a.x + b.x, a.y + b.y); -} - -ccl_device_inline int2 operator+=(int2 &a, const int2 &b) -{ - return a = a + b; -} - -ccl_device_inline int2 operator-(const int2 &a, const int2 &b) -{ - return make_int2(a.x - b.x, a.y - b.y); -} - -ccl_device_inline int2 operator*(const int2 &a, const int2 &b) -{ - return make_int2(a.x * b.x, a.y * b.y); -} - -ccl_device_inline int2 operator/(const int2 &a, const int2 &b) -{ - return make_int2(a.x / b.x, a.y / b.y); -} - -#endif - -/* Int3 */ - -#ifndef __KERNEL_OPENCL__ - -ccl_device_inline int3 min(int3 a, int3 b) -{ -#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) - return _mm_min_epi32(a.m128, b.m128); -#else - return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); -#endif -} - -ccl_device_inline int3 max(int3 a, int3 b) -{ -#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) - return _mm_max_epi32(a.m128, b.m128); -#else - return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); -#endif -} - -ccl_device_inline int3 clamp(const int3& a, int mn, int mx) -{ -#ifdef __KERNEL_SSE__ - return min(max(a, make_int3(mn)), make_int3(mx)); -#else - return make_int3(clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx)); -#endif -} - -ccl_device_inline int3 clamp(const int3& a, int3& mn, int mx) -{ -#ifdef __KERNEL_SSE__ - return min(max(a, mn), make_int3(mx)); -#else - return make_int3(clamp(a.x, mn.x, mx), clamp(a.y, mn.y, mx), clamp(a.z, mn.z, mx)); -#endif -} - -#endif - -#ifndef __KERNEL_GPU__ - -ccl_device_inline void print_int3(const char *label, const int3& a) -{ - printf("%s: %d %d %d\n", label, a.x, a.y, a.z); -} - -#endif - -/* Int4 */ - -#ifndef __KERNEL_GPU__ - -ccl_device_inline int4 operator+(const int4& a, const int4& b) -{ -#ifdef __KERNEL_SSE__ - return _mm_add_epi32(a.m128, b.m128); -#else - return make_int4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); -#endif -} - -ccl_device_inline int4 operator+=(int4& a, const int4& b) -{ - return a = a + b; -} - -ccl_device_inline int4 operator>>(const int4& a, int i) -{ -#ifdef __KERNEL_SSE__ - return _mm_srai_epi32(a.m128, i); -#else - return make_int4(a.x >> i, a.y >> i, a.z >> i, a.w >> i); -#endif -} - -ccl_device_inline int4 min(int4 a, int4 b) -{ -#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) - return _mm_min_epi32(a.m128, b.m128); -#else - return make_int4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); -#endif -} - -ccl_device_inline int4 max(int4 a, int4 b) -{ -#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) - return _mm_max_epi32(a.m128, b.m128); -#else - return make_int4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); -#endif -} - -ccl_device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx) -{ - return min(max(a, mn), mx); -} +CCL_NAMESPACE_END -ccl_device_inline int4 select(const int4& mask, const int4& a, const int4& b) -{ -#ifdef __KERNEL_SSE__ - __m128 m = _mm_cvtepi32_ps(mask); - return _mm_castps_si128(_mm_or_ps(_mm_and_ps(m, _mm_castsi128_ps(a)), _mm_andnot_ps(m, _mm_castsi128_ps(b)))); /* todo: avoid cvt */ -#else - return make_int4((mask.x)? a.x: b.x, (mask.y)? a.y: b.y, (mask.z)? a.z: b.z, (mask.w)? a.w: b.w); -#endif -} +#include "util/util_math_int2.h" +#include "util/util_math_int3.h" +#include "util/util_math_int4.h" -ccl_device_inline void print_int4(const char *label, const int4& a) -{ - printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w); -} +#include "util/util_math_float2.h" +#include "util/util_math_float3.h" +#include "util/util_math_float4.h" -#endif +#include "util/util_rect.h" -/* Int/Float conversion */ +CCL_NAMESPACE_BEGIN #ifndef __KERNEL_OPENCL__ - -ccl_device_inline int as_int(uint i) -{ - union { uint ui; int i; } u; - u.ui = i; - return u.i; -} - -ccl_device_inline uint as_uint(int i) -{ - union { uint ui; int i; } u; - u.i = i; - return u.ui; -} - -ccl_device_inline uint as_uint(float f) -{ - union { uint i; float f; } u; - u.f = f; - return u.i; -} - -ccl_device_inline int __float_as_int(float f) -{ - union { int i; float f; } u; - u.f = f; - return u.i; -} - -ccl_device_inline float __int_as_float(int i) -{ - union { int i; float f; } u; - u.i = i; - return u.f; -} - -ccl_device_inline uint __float_as_uint(float f) -{ - union { uint i; float f; } u; - u.f = f; - return u.i; -} - -ccl_device_inline float __uint_as_float(uint i) -{ - union { uint i; float f; } u; - u.i = i; - return u.f; -} - -/* Versions of functions which are safe for fast math. */ -ccl_device_inline bool isnan_safe(float f) -{ - unsigned int x = __float_as_uint(f); - return (x << 1) > 0xff000000u; -} - -ccl_device_inline bool isfinite_safe(float f) -{ - /* By IEEE 754 rule, 2*Inf equals Inf */ - unsigned int x = __float_as_uint(f); - return (f == f) && (x == 0 || (f != 2.0f*f)); -} - /* Interpolation */ template<class A, class B> A lerp(const A& a, const A& b, const B& t) @@ -1262,15 +332,23 @@ template<class A, class B> A lerp(const A& a, const A& b, const B& t) return (A)(a * ((B)1 - t) + b * t); } +#endif /* __KERNEL_OPENCL__ */ + /* Triangle */ -ccl_device_inline float triangle_area(const float3& v1, const float3& v2, const float3& v3) +#ifndef __KERNEL_OPENCL__ +ccl_device_inline float triangle_area(const float3& v1, + const float3& v2, + const float3& v3) +#else +ccl_device_inline float triangle_area(const float3 v1, + const float3 v2, + const float3 v3) +#endif { return len(cross(v3 - v2, v1 - v2))*0.5f; } -#endif - /* Orthonormal vectors */ ccl_device_inline void make_orthonormals(const float3 N, float3 *a, float3 *b) @@ -1329,7 +407,7 @@ ccl_device_inline float3 safe_divide_even_color(float3 a, float3 b) y = (b.y != 0.0f)? a.y/b.y: 0.0f; z = (b.z != 0.0f)? a.z/b.z: 0.0f; - /* try to get grey even if b is zero */ + /* try to get gray even if b is zero */ if(b.x == 0.0f) { if(b.y == 0.0f) { x = z; @@ -1366,16 +444,16 @@ ccl_device_inline float3 rotate_around_axis(float3 p, float3 axis, float angle) float3 r; r.x = ((costheta + (1 - costheta) * axis.x * axis.x) * p.x) + - (((1 - costheta) * axis.x * axis.y - axis.z * sintheta) * p.y) + - (((1 - costheta) * axis.x * axis.z + axis.y * sintheta) * p.z); + (((1 - costheta) * axis.x * axis.y - axis.z * sintheta) * p.y) + + (((1 - costheta) * axis.x * axis.z + axis.y * sintheta) * p.z); r.y = (((1 - costheta) * axis.x * axis.y + axis.z * sintheta) * p.x) + - ((costheta + (1 - costheta) * axis.y * axis.y) * p.y) + - (((1 - costheta) * axis.y * axis.z - axis.x * sintheta) * p.z); + ((costheta + (1 - costheta) * axis.y * axis.y) * p.y) + + (((1 - costheta) * axis.y * axis.z - axis.x * sintheta) * p.z); r.z = (((1 - costheta) * axis.x * axis.z - axis.y * sintheta) * p.x) + - (((1 - costheta) * axis.y * axis.z + axis.x * sintheta) * p.y) + - ((costheta + (1 - costheta) * axis.z * axis.z) * p.z); + (((1 - costheta) * axis.y * axis.z + axis.x * sintheta) * p.y) + + ((costheta + (1 - costheta) * axis.z * axis.z) * p.z); return r; } @@ -1424,22 +502,27 @@ ccl_device float safe_powf(float a, float b) return compatible_powf(a, b); } +ccl_device float safe_divide(float a, float b) +{ + return (b != 0.0f)? a/b: 0.0f; +} + ccl_device float safe_logf(float a, float b) { - if(UNLIKELY(a < 0.0f || b < 0.0f)) + if(UNLIKELY(a <= 0.0f || b <= 0.0f)) return 0.0f; - return logf(a)/logf(b); + return safe_divide(logf(a),logf(b)); } -ccl_device float safe_divide(float a, float b) +ccl_device float safe_modulo(float a, float b) { - return (b != 0.0f)? a/b: 0.0f; + return (b != 0.0f)? fmodf(a, b): 0.0f; } -ccl_device float safe_modulo(float a, float b) +ccl_device_inline float sqr(float a) { - return (b != 0.0f)? fmodf(a, b): 0.0f; + return a * a; } ccl_device_inline float beta(float x, float y) @@ -1451,181 +534,9 @@ ccl_device_inline float beta(float x, float y) #endif } -/* Ray Intersection */ - -ccl_device bool ray_sphere_intersect( - float3 ray_P, float3 ray_D, float ray_t, - float3 sphere_P, float sphere_radius, - float3 *isect_P, float *isect_t) -{ - float3 d = sphere_P - ray_P; - float radiussq = sphere_radius*sphere_radius; - float tsq = dot(d, d); - - if(tsq > radiussq) { /* ray origin outside sphere */ - float tp = dot(d, ray_D); - - if(tp < 0.0f) /* dir points away from sphere */ - return false; - - float dsq = tsq - tp*tp; /* pythagoras */ - - if(dsq > radiussq) /* closest point on ray outside sphere */ - return false; - - float t = tp - sqrtf(radiussq - dsq); /* pythagoras */ - - if(t < ray_t) { - *isect_t = t; - *isect_P = ray_P + ray_D*t; - return true; - } - } - - return false; -} - -ccl_device bool ray_aligned_disk_intersect( - float3 ray_P, float3 ray_D, float ray_t, - float3 disk_P, float disk_radius, - float3 *isect_P, float *isect_t) -{ - /* aligned disk normal */ - float disk_t; - float3 disk_N = normalize_len(ray_P - disk_P, &disk_t); - float div = dot(ray_D, disk_N); - - if(UNLIKELY(div == 0.0f)) - return false; - - /* compute t to intersection point */ - float t = -disk_t/div; - if(t < 0.0f || t > ray_t) - return false; - - /* test if within radius */ - float3 P = ray_P + ray_D*t; - if(len_squared(P - disk_P) > disk_radius*disk_radius) - return false; - - *isect_P = P; - *isect_t = t; - - return true; -} - -ccl_device bool ray_triangle_intersect( - float3 ray_P, float3 ray_D, float ray_t, - float3 v0, float3 v1, float3 v2, - float3 *isect_P, float *isect_t) -{ - /* Calculate intersection */ - float3 e1 = v1 - v0; - float3 e2 = v2 - v0; - float3 s1 = cross(ray_D, e2); - - const float divisor = dot(s1, e1); - if(UNLIKELY(divisor == 0.0f)) - return false; - - const float invdivisor = 1.0f/divisor; - - /* compute first barycentric coordinate */ - const float3 d = ray_P - v0; - const float u = dot(d, s1)*invdivisor; - if(u < 0.0f) - return false; - - /* Compute second barycentric coordinate */ - const float3 s2 = cross(d, e1); - const float v = dot(ray_D, s2)*invdivisor; - if(v < 0.0f) - return false; - - const float b0 = 1.0f - u - v; - if(b0 < 0.0f) - return false; - - /* compute t to intersection point */ - const float t = dot(e2, s2)*invdivisor; - if(t < 0.0f || t > ray_t) - return false; - - *isect_t = t; - *isect_P = ray_P + ray_D*t; - - return true; -} - -ccl_device_inline bool ray_triangle_intersect_uv( - float3 ray_P, float3 ray_D, float ray_t, - float3 v0, float3 v1, float3 v2, - float *isect_u, float *isect_v, float *isect_t) -{ - /* Calculate intersection */ - float3 e1 = v1 - v0; - float3 e2 = v2 - v0; - float3 s1 = cross(ray_D, e2); - - const float divisor = dot(s1, e1); - if(UNLIKELY(divisor == 0.0f)) - return false; - - const float invdivisor = 1.0f/divisor; - - /* compute first barycentric coordinate */ - const float3 d = ray_P - v0; - const float u = dot(d, s1)*invdivisor; - if(u < 0.0f) - return false; - - /* Compute second barycentric coordinate */ - const float3 s2 = cross(d, e1); - const float v = dot(ray_D, s2)*invdivisor; - if(v < 0.0f) - return false; - - const float b0 = 1.0f - u - v; - if(b0 < 0.0f) - return false; - - /* compute t to intersection point */ - const float t = dot(e2, s2)*invdivisor; - if(t < 0.0f || t > ray_t) - return false; - - *isect_u = u; - *isect_v = v; - *isect_t = t; - - return true; -} - -ccl_device bool ray_quad_intersect(float3 ray_P, float3 ray_D, float ray_mint, float ray_maxt, - float3 quad_P, float3 quad_u, float3 quad_v, float3 quad_n, - float3 *isect_P, float *isect_t, float *isect_u, float *isect_v) +ccl_device_inline float xor_signmask(float x, int y) { - float t = -(dot(ray_P, quad_n) - dot(quad_P, quad_n)) / dot(ray_D, quad_n); - if(t < ray_mint || t > ray_maxt) - return false; - - float3 hit = ray_P + t*ray_D; - float3 inplane = hit - quad_P; - - float u = dot(inplane, quad_u) / dot(quad_u, quad_u) + 0.5f; - if(u < 0.0f || u > 1.0f) - return false; - - float v = dot(inplane, quad_v) / dot(quad_v, quad_v) + 0.5f; - if(v < 0.0f || v > 1.0f) - return false; - - if(isect_P) *isect_P = hit; - if(isect_t) *isect_t = t; - if(isect_u) *isect_u = u; - if(isect_v) *isect_v = v; - - return true; + return __int_as_float(__float_as_int(x) ^ y); } /* projections */ @@ -1662,32 +573,6 @@ ccl_device_inline float2 map_to_sphere(const float3 co) return make_float2(u, v); } -ccl_device_inline int util_max_axis(float3 vec) -{ -#ifdef __KERNEL_SSE__ - __m128 a = shuffle<0,0,1,1>(vec.m128); - __m128 b = shuffle<1,2,2,1>(vec.m128); - __m128 c = _mm_cmpgt_ps(a, b); - int mask = _mm_movemask_ps(c) & 0x7; - static const char tab[8] = {2, 2, 2, 0, 1, 2, 1, 0}; - return tab[mask]; -#else - if(vec.x > vec.y) { - if(vec.x > vec.z) - return 0; - else - return 2; - } - else { - if(vec.y > vec.z) - return 1; - else - return 2; - } -#endif -} - CCL_NAMESPACE_END #endif /* __UTIL_MATH_H__ */ - diff --git a/intern/cycles/util/util_math_cdf.cpp b/intern/cycles/util/util_math_cdf.cpp index ec78ca15d88..c14d4793ea1 100644 --- a/intern/cycles/util/util_math_cdf.cpp +++ b/intern/cycles/util/util_math_cdf.cpp @@ -14,10 +14,10 @@ * limitations under the License. */ -#include "util_math_cdf.h" +#include "util/util_math_cdf.h" -#include "util_algorithm.h" -#include "util_math.h" +#include "util/util_algorithm.h" +#include "util/util_math.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_math_cdf.h b/intern/cycles/util/util_math_cdf.h index 47dfb68ba44..79643fe26e3 100644 --- a/intern/cycles/util/util_math_cdf.h +++ b/intern/cycles/util/util_math_cdf.h @@ -17,9 +17,9 @@ #ifndef __UTIL_MATH_CDF_H__ #define __UTIL_MATH_CDF_H__ -#include "util_algorithm.h" -#include "util_math.h" -#include "util_vector.h" +#include "util/util_algorithm.h" +#include "util/util_math.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_math_float2.h b/intern/cycles/util/util_math_float2.h new file mode 100644 index 00000000000..6f9d0855d50 --- /dev/null +++ b/intern/cycles/util/util_math_float2.h @@ -0,0 +1,227 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_MATH_FLOAT2_H__ +#define __UTIL_MATH_FLOAT2_H__ + +#ifndef __UTIL_MATH_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +/******************************************************************************* + * Declaration. + */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline float2 operator-(const float2& a); +ccl_device_inline float2 operator*(const float2& a, const float2& b); +ccl_device_inline float2 operator*(const float2& a, float f); +ccl_device_inline float2 operator*(float f, const float2& a); +ccl_device_inline float2 operator/(float f, const float2& a); +ccl_device_inline float2 operator/(const float2& a, float f); +ccl_device_inline float2 operator/(const float2& a, const float2& b); +ccl_device_inline float2 operator+(const float2& a, const float2& b); +ccl_device_inline float2 operator-(const float2& a, const float2& b); +ccl_device_inline float2 operator+=(float2& a, const float2& b); +ccl_device_inline float2 operator*=(float2& a, const float2& b); +ccl_device_inline float2 operator*=(float2& a, float f); +ccl_device_inline float2 operator/=(float2& a, const float2& b); +ccl_device_inline float2 operator/=(float2& a, float f); + +ccl_device_inline bool operator==(const float2& a, const float2& b); +ccl_device_inline bool operator!=(const float2& a, const float2& b); + +ccl_device_inline bool is_zero(const float2& a); +ccl_device_inline float average(const float2& a); +ccl_device_inline float dot(const float2& a, const float2& b); +ccl_device_inline float cross(const float2& a, const float2& b); +ccl_device_inline float len(const float2& a); +ccl_device_inline float2 normalize(const float2& a); +ccl_device_inline float2 normalize_len(const float2& a, float *t); +ccl_device_inline float2 safe_normalize(const float2& a); +ccl_device_inline float2 min(const float2& a, const float2& b); +ccl_device_inline float2 max(const float2& a, const float2& b); +ccl_device_inline float2 clamp(const float2& a, const float2& mn, const float2& mx); +ccl_device_inline float2 fabs(const float2& a); +ccl_device_inline float2 as_float2(const float4& a); +ccl_device_inline float2 interp(const float2& a, const float2& b, float t); +#endif /* !__KERNEL_OPENCL__ */ + +/******************************************************************************* + * Definition. + */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline float2 operator-(const float2& a) +{ + return make_float2(-a.x, -a.y); +} + +ccl_device_inline float2 operator*(const float2& a, const float2& b) +{ + return make_float2(a.x*b.x, a.y*b.y); +} + +ccl_device_inline float2 operator*(const float2& a, float f) +{ + return make_float2(a.x*f, a.y*f); +} + +ccl_device_inline float2 operator*(float f, const float2& a) +{ + return make_float2(a.x*f, a.y*f); +} + +ccl_device_inline float2 operator/(float f, const float2& a) +{ + return make_float2(f/a.x, f/a.y); +} + +ccl_device_inline float2 operator/(const float2& a, float f) +{ + float invf = 1.0f/f; + return make_float2(a.x*invf, a.y*invf); +} + +ccl_device_inline float2 operator/(const float2& a, const float2& b) +{ + return make_float2(a.x/b.x, a.y/b.y); +} + +ccl_device_inline float2 operator+(const float2& a, const float2& b) +{ + return make_float2(a.x+b.x, a.y+b.y); +} + +ccl_device_inline float2 operator-(const float2& a, const float2& b) +{ + return make_float2(a.x-b.x, a.y-b.y); +} + +ccl_device_inline float2 operator+=(float2& a, const float2& b) +{ + return a = a + b; +} + +ccl_device_inline float2 operator*=(float2& a, const float2& b) +{ + return a = a * b; +} + +ccl_device_inline float2 operator*=(float2& a, float f) +{ + return a = a * f; +} + +ccl_device_inline float2 operator/=(float2& a, const float2& b) +{ + return a = a / b; +} + +ccl_device_inline float2 operator/=(float2& a, float f) +{ + float invf = 1.0f/f; + return a = a * invf; +} + +ccl_device_inline bool operator==(const float2& a, const float2& b) +{ + return (a.x == b.x && a.y == b.y); +} + +ccl_device_inline bool operator!=(const float2& a, const float2& b) +{ + return !(a == b); +} + +ccl_device_inline bool is_zero(const float2& a) +{ + return (a.x == 0.0f && a.y == 0.0f); +} + +ccl_device_inline float average(const float2& a) +{ + return (a.x + a.y)*(1.0f/2.0f); +} + +ccl_device_inline float dot(const float2& a, const float2& b) +{ + return a.x*b.x + a.y*b.y; +} + +ccl_device_inline float cross(const float2& a, const float2& b) +{ + return (a.x*b.y - a.y*b.x); +} + +ccl_device_inline float len(const float2& a) +{ + return sqrtf(dot(a, a)); +} + +ccl_device_inline float2 normalize(const float2& a) +{ + return a/len(a); +} + +ccl_device_inline float2 normalize_len(const float2& a, float *t) +{ + *t = len(a); + return a/(*t); +} + +ccl_device_inline float2 safe_normalize(const float2& a) +{ + float t = len(a); + return (t != 0.0f)? a/t: a; +} + +ccl_device_inline float2 min(const float2& a, const float2& b) +{ + return make_float2(min(a.x, b.x), min(a.y, b.y)); +} + +ccl_device_inline float2 max(const float2& a, const float2& b) +{ + return make_float2(max(a.x, b.x), max(a.y, b.y)); +} + +ccl_device_inline float2 clamp(const float2& a, const float2& mn, const float2& mx) +{ + return min(max(a, mn), mx); +} + +ccl_device_inline float2 fabs(const float2& a) +{ + return make_float2(fabsf(a.x), fabsf(a.y)); +} + +ccl_device_inline float2 as_float2(const float4& a) +{ + return make_float2(a.x, a.y); +} + +ccl_device_inline float2 interp(const float2& a, const float2& b, float t) +{ + return a + t*(b - a); +} +#endif /* !__KERNEL_OPENCL__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_FLOAT2_H__ */ diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h new file mode 100644 index 00000000000..f5149fe13ed --- /dev/null +++ b/intern/cycles/util/util_math_float3.h @@ -0,0 +1,395 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_MATH_FLOAT3_H__ +#define __UTIL_MATH_FLOAT3_H__ + +#ifndef __UTIL_MATH_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +/******************************************************************************* + * Declaration. + */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline float3 operator-(const float3& a); +ccl_device_inline float3 operator*(const float3& a, const float3& b); +ccl_device_inline float3 operator*(const float3& a, const float f); +ccl_device_inline float3 operator*(const float f, const float3& a); +ccl_device_inline float3 operator/(const float f, const float3& a); +ccl_device_inline float3 operator/(const float3& a, const float f); +ccl_device_inline float3 operator/(const float3& a, const float3& b); +ccl_device_inline float3 operator+(const float3& a, const float3& b); +ccl_device_inline float3 operator-(const float3& a, const float3& b); +ccl_device_inline float3 operator+=(float3& a, const float3& b); +ccl_device_inline float3 operator-=(float3& a, const float3& b); +ccl_device_inline float3 operator*=(float3& a, const float3& b); +ccl_device_inline float3 operator*=(float3& a, float f); +ccl_device_inline float3 operator/=(float3& a, const float3& b); +ccl_device_inline float3 operator/=(float3& a, float f); + +ccl_device_inline bool operator==(const float3& a, const float3& b); +ccl_device_inline bool operator!=(const float3& a, const float3& b); + +ccl_device_inline float dot(const float3& a, const float3& b); +ccl_device_inline float dot_xy(const float3& a, const float3& b); +ccl_device_inline float3 cross(const float3& a, const float3& b); +ccl_device_inline float3 normalize(const float3& a); +ccl_device_inline float3 min(const float3& a, const float3& b); +ccl_device_inline float3 max(const float3& a, const float3& b); +ccl_device_inline float3 clamp(const float3& a, const float3& mn, const float3& mx); +ccl_device_inline float3 fabs(const float3& a); +ccl_device_inline float3 mix(const float3& a, const float3& b, float t); +ccl_device_inline float3 rcp(const float3& a); +ccl_device_inline float3 sqrt(const float3& a); +#endif /* !__KERNEL_OPENCL__ */ + +ccl_device_inline float min3(float3 a); +ccl_device_inline float max3(float3 a); +ccl_device_inline float len(const float3 a); +ccl_device_inline float len_squared(const float3 a); + +ccl_device_inline float3 saturate3(float3 a); +ccl_device_inline float3 safe_normalize(const float3 a); +ccl_device_inline float3 normalize_len(const float3 a, float *t); +ccl_device_inline float3 safe_normalize_len(const float3 a, float *t); +ccl_device_inline float3 interp(float3 a, float3 b, float t); + +ccl_device_inline bool is_zero(const float3 a); +ccl_device_inline float reduce_add(const float3 a); +ccl_device_inline float average(const float3 a); +ccl_device_inline bool isequal_float3(const float3 a, const float3 b); + +/******************************************************************************* + * Definition. + */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline float3 operator-(const float3& a) +{ +#ifdef __KERNEL_SSE__ + return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)))); +#else + return make_float3(-a.x, -a.y, -a.z); +#endif +} + +ccl_device_inline float3 operator*(const float3& a, const float3& b) +{ +#ifdef __KERNEL_SSE__ + return float3(_mm_mul_ps(a.m128,b.m128)); +#else + return make_float3(a.x*b.x, a.y*b.y, a.z*b.z); +#endif +} + +ccl_device_inline float3 operator*(const float3& a, const float f) +{ +#ifdef __KERNEL_SSE__ + return float3(_mm_mul_ps(a.m128,_mm_set1_ps(f))); +#else + return make_float3(a.x*f, a.y*f, a.z*f); +#endif +} + +ccl_device_inline float3 operator*(const float f, const float3& a) +{ +#if defined(__KERNEL_SSE__) + return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128)); +#else + return make_float3(a.x*f, a.y*f, a.z*f); +#endif +} + +ccl_device_inline float3 operator/(const float f, const float3& a) +{ +#if defined(__KERNEL_SSE__) + return float3(_mm_div_ps(_mm_set1_ps(f), a.m128)); +#else + return make_float3(f / a.x, f / a.y, f / a.z); +#endif +} + +ccl_device_inline float3 operator/(const float3& a, const float f) +{ + float invf = 1.0f/f; + return a * invf; +} + +ccl_device_inline float3 operator/(const float3& a, const float3& b) +{ +#if defined(__KERNEL_SSE__) + return float3(_mm_div_ps(a.m128, b.m128)); +#else + return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); +#endif +} + +ccl_device_inline float3 operator+(const float3& a, const float3& b) +{ +#ifdef __KERNEL_SSE__ + return float3(_mm_add_ps(a.m128, b.m128)); +#else + return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); +#endif +} + +ccl_device_inline float3 operator-(const float3& a, const float3& b) +{ +#ifdef __KERNEL_SSE__ + return float3(_mm_sub_ps(a.m128, b.m128)); +#else + return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); +#endif +} + +ccl_device_inline float3 operator+=(float3& a, const float3& b) +{ + return a = a + b; +} + +ccl_device_inline float3 operator-=(float3& a, const float3& b) +{ + return a = a - b; +} + +ccl_device_inline float3 operator*=(float3& a, const float3& b) +{ + return a = a * b; +} + +ccl_device_inline float3 operator*=(float3& a, float f) +{ + return a = a * f; +} + +ccl_device_inline float3 operator/=(float3& a, const float3& b) +{ + return a = a / b; +} + +ccl_device_inline float3 operator/=(float3& a, float f) +{ + float invf = 1.0f/f; + return a = a * invf; +} + +ccl_device_inline bool operator==(const float3& a, const float3& b) +{ +#ifdef __KERNEL_SSE__ + return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7; +#else + return (a.x == b.x && a.y == b.y && a.z == b.z); +#endif +} + +ccl_device_inline bool operator!=(const float3& a, const float3& b) +{ + return !(a == b); +} + +ccl_device_inline float dot(const float3& a, const float3& b) +{ +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) + return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F)); +#else + return a.x*b.x + a.y*b.y + a.z*b.z; +#endif +} + +ccl_device_inline float dot_xy(const float3& a, const float3& b) +{ +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) + return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a,b),b)); +#else + return a.x*b.x + a.y*b.y; +#endif +} + +ccl_device_inline float3 cross(const float3& a, const float3& b) +{ + float3 r = make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); + return r; +} + +ccl_device_inline float3 normalize(const float3& a) +{ +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) + __m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F)); + return float3(_mm_div_ps(a.m128, norm)); +#else + return a/len(a); +#endif +} + +ccl_device_inline float3 min(const float3& a, const float3& b) +{ +#ifdef __KERNEL_SSE__ + return float3(_mm_min_ps(a.m128, b.m128)); +#else + return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); +#endif +} + +ccl_device_inline float3 max(const float3& a, const float3& b) +{ +#ifdef __KERNEL_SSE__ + return float3(_mm_max_ps(a.m128, b.m128)); +#else + return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); +#endif +} + +ccl_device_inline float3 clamp(const float3& a, const float3& mn, const float3& mx) +{ + return min(max(a, mn), mx); +} + +ccl_device_inline float3 fabs(const float3& a) +{ +#ifdef __KERNEL_SSE__ + __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); + return float3(_mm_and_ps(a.m128, mask)); +#else + return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z)); +#endif +} + +ccl_device_inline float3 sqrt(const float3& a) +{ +#ifdef __KERNEL_SSE__ + return float3(_mm_sqrt_ps(a)); +#else + return make_float3(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z)); +#endif +} + +ccl_device_inline float3 mix(const float3& a, const float3& b, float t) +{ + return a + t*(b - a); +} + +ccl_device_inline float3 rcp(const float3& a) +{ +#ifdef __KERNEL_SSE__ + /* Don't use _mm_rcp_ps due to poor precision. */ + return float3(_mm_div_ps(_mm_set_ps1(1.0f), a.m128)); +#else + return make_float3(1.0f/a.x, 1.0f/a.y, 1.0f/a.z); +#endif +} +#endif /* !__KERNEL_OPENCL__ */ + +ccl_device_inline float min3(float3 a) +{ + return min(min(a.x, a.y), a.z); +} + +ccl_device_inline float max3(float3 a) +{ + return max(max(a.x, a.y), a.z); +} + +ccl_device_inline float len(const float3 a) +{ +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) + return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(a.m128, a.m128, 0x7F))); +#else + return sqrtf(dot(a, a)); +#endif +} + +ccl_device_inline float len_squared(const float3 a) +{ + return dot(a, a); +} + +ccl_device_inline float3 saturate3(float3 a) +{ + return make_float3(saturate(a.x), saturate(a.y), saturate(a.z)); +} + +ccl_device_inline float3 normalize_len(const float3 a, float *t) +{ + *t = len(a); + float x = 1.0f / *t; + return a*x; +} + +ccl_device_inline float3 safe_normalize(const float3 a) +{ + float t = len(a); + return (t != 0.0f)? a * (1.0f/t) : a; +} + +ccl_device_inline float3 safe_normalize_len(const float3 a, float *t) +{ + *t = len(a); + return (*t != 0.0f)? a/(*t): a; +} + +ccl_device_inline float3 interp(float3 a, float3 b, float t) +{ + return a + t*(b - a); +} + +ccl_device_inline bool is_zero(const float3 a) +{ +#ifdef __KERNEL_SSE__ + return a == make_float3(0.0f); +#else + return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f); +#endif +} + +ccl_device_inline float reduce_add(const float3 a) +{ + return (a.x + a.y + a.z); +} + +ccl_device_inline float average(const float3 a) +{ + return reduce_add(a)*(1.0f/3.0f); +} + +ccl_device_inline bool isequal_float3(const float3 a, const float3 b) +{ +#ifdef __KERNEL_OPENCL__ + return all(a == b); +#else + return a == b; +#endif +} + +ccl_device_inline bool isfinite3_safe(float3 v) +{ + return isfinite_safe(v.x) && isfinite_safe(v.y) && isfinite_safe(v.z); +} + +ccl_device_inline float3 ensure_finite3(float3 v) +{ + if(!isfinite_safe(v.x)) v.x = 0.0f; + if(!isfinite_safe(v.y)) v.y = 0.0f; + if(!isfinite_safe(v.z)) v.z = 0.0f; + return v; +} + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_FLOAT3_H__ */ diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h new file mode 100644 index 00000000000..aa7e56fefe9 --- /dev/null +++ b/intern/cycles/util/util_math_float4.h @@ -0,0 +1,448 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_MATH_FLOAT4_H__ +#define __UTIL_MATH_FLOAT4_H__ + +#ifndef __UTIL_MATH_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +/******************************************************************************* + * Declaration. + */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline float4 operator-(const float4& a); +ccl_device_inline float4 operator*(const float4& a, const float4& b); +ccl_device_inline float4 operator*(const float4& a, float f); +ccl_device_inline float4 operator*(float f, const float4& a); +ccl_device_inline float4 operator/(const float4& a, float f); +ccl_device_inline float4 operator/(const float4& a, const float4& b); +ccl_device_inline float4 operator+(const float4& a, const float4& b); +ccl_device_inline float4 operator-(const float4& a, const float4& b); +ccl_device_inline float4 operator+=(float4& a, const float4& b); +ccl_device_inline float4 operator*=(float4& a, const float4& b); +ccl_device_inline float4 operator/=(float4& a, float f); + +ccl_device_inline int4 operator<(const float4& a, const float4& b); +ccl_device_inline int4 operator>=(const float4& a, const float4& b); +ccl_device_inline int4 operator<=(const float4& a, const float4& b); +ccl_device_inline bool operator==(const float4& a, const float4& b); + +ccl_device_inline float dot(const float4& a, const float4& b); +ccl_device_inline float len_squared(const float4& a); +ccl_device_inline float4 rcp(const float4& a); +ccl_device_inline float4 sqrt(const float4& a); +ccl_device_inline float4 sqr(const float4& a); +ccl_device_inline float4 cross(const float4& a, const float4& b); +ccl_device_inline bool is_zero(const float4& a); +ccl_device_inline float average(const float4& a); +ccl_device_inline float len(const float4& a); +ccl_device_inline float4 normalize(const float4& a); +ccl_device_inline float4 safe_normalize(const float4& a); +ccl_device_inline float4 min(const float4& a, const float4& b); +ccl_device_inline float4 max(const float4& a, const float4& b); +ccl_device_inline float4 fabs(const float4& a); +#endif /* !__KERNEL_OPENCL__*/ + +#ifdef __KERNEL_SSE__ +template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> +__forceinline const float4 shuffle(const float4& b); +template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> +__forceinline const float4 shuffle(const float4& a, const float4& b); + +template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b); + +template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& a, const float4& b); +template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4& a, const float4& b); + +# ifdef __KERNEL_SSE3__ +template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b); +template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b); +# endif +#endif /* __KERNEL_SSE__ */ + +#ifndef __KERNEL_GPU__ +ccl_device_inline float4 select(const int4& mask, + const float4& a, + const float4& b); +ccl_device_inline float4 reduce_min(const float4& a); +ccl_device_inline float4 reduce_max(const float4& a); +ccl_device_inline float4 reduce_add(const float4& a); +#endif /* !__KERNEL_GPU__ */ + +/******************************************************************************* + * Definition. + */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline float4 operator-(const float4& a) +{ +#ifdef __KERNEL_SSE__ + __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + return float4(_mm_xor_ps(a.m128, mask)); +#else + return make_float4(-a.x, -a.y, -a.z, -a.w); +#endif +} + +ccl_device_inline float4 operator*(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_mul_ps(a.m128, b.m128)); +#else + return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w); +#endif +} + +ccl_device_inline float4 operator*(const float4& a, float f) +{ +#if defined(__KERNEL_SSE__) + return a * make_float4(f); +#else + return make_float4(a.x*f, a.y*f, a.z*f, a.w*f); +#endif +} + +ccl_device_inline float4 operator*(float f, const float4& a) +{ + return a * f; +} + +ccl_device_inline float4 operator/(const float4& a, float f) +{ + return a * (1.0f/f); +} + +ccl_device_inline float4 operator/(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_div_ps(a.m128, b.m128)); +#else + return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); +#endif + +} + +ccl_device_inline float4 operator+(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_add_ps(a.m128, b.m128)); +#else + return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); +#endif +} + +ccl_device_inline float4 operator-(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_sub_ps(a.m128, b.m128)); +#else + return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w); +#endif +} + +ccl_device_inline float4 operator+=(float4& a, const float4& b) +{ + return a = a + b; +} + +ccl_device_inline float4 operator*=(float4& a, const float4& b) +{ + return a = a * b; +} + +ccl_device_inline float4 operator/=(float4& a, float f) +{ + return a = a / f; +} + +ccl_device_inline int4 operator<(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128))); +#else + return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w); +#endif +} + +ccl_device_inline int4 operator>=(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128))); +#else + return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w); +#endif +} + +ccl_device_inline int4 operator<=(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + return int4(_mm_castps_si128(_mm_cmple_ps(a.m128, b.m128))); +#else + return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w); +#endif +} + +ccl_device_inline bool operator==(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15; +#else + return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w); +#endif +} + +ccl_device_inline float dot(const float4& a, const float4& b) +{ +#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__) + return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF)); +#else + return (a.x*b.x + a.y*b.y) + (a.z*b.z + a.w*b.w); +#endif +} + +ccl_device_inline float len_squared(const float4& a) +{ + return dot(a, a); +} + +ccl_device_inline float4 rcp(const float4& a) +{ +#ifdef __KERNEL_SSE__ + /* Don't use _mm_rcp_ps due to poor precision. */ + return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128)); +#else + return make_float4(1.0f/a.x, 1.0f/a.y, 1.0f/a.z, 1.0f/a.w); +#endif +} + +ccl_device_inline float4 sqrt(const float4& a) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_sqrt_ps(a.m128)); +#else + return make_float4(sqrtf(a.x), + sqrtf(a.y), + sqrtf(a.z), + sqrtf(a.w)); +#endif +} + +ccl_device_inline float4 sqr(const float4& a) +{ + return a * a; +} + +ccl_device_inline float4 cross(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + return (shuffle<1,2,0,0>(a)*shuffle<2,0,1,0>(b)) - + (shuffle<2,0,1,0>(a)*shuffle<1,2,0,0>(b)); +#else + return make_float4(a.y*b.z - a.z*b.y, + a.z*b.x - a.x*b.z, + a.x*b.y - a.y*b.x, + 0.0f); +#endif +} + +ccl_device_inline bool is_zero(const float4& a) +{ +#ifdef __KERNEL_SSE__ + return a == make_float4(0.0f); +#else + return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f); +#endif +} + +ccl_device_inline float4 reduce_add(const float4& a) +{ +#ifdef __KERNEL_SSE__ +# ifdef __KERNEL_SSE3__ + float4 h(_mm_hadd_ps(a.m128, a.m128)); + return float4( _mm_hadd_ps(h.m128, h.m128)); +# else + float4 h(shuffle<1,0,3,2>(a) + a); + return shuffle<2,3,0,1>(h) + h; +# endif +#else + float sum = (a.x + a.y) + (a.z + a.w); + return make_float4(sum, sum, sum, sum); +#endif +} + +ccl_device_inline float average(const float4& a) +{ + return reduce_add(a).x * 0.25f; +} + +ccl_device_inline float len(const float4& a) +{ + return sqrtf(dot(a, a)); +} + +ccl_device_inline float4 normalize(const float4& a) +{ + return a/len(a); +} + +ccl_device_inline float4 safe_normalize(const float4& a) +{ + float t = len(a); + return (t != 0.0f)? a/t: a; +} + +ccl_device_inline float4 min(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_min_ps(a.m128, b.m128)); +#else + return make_float4(min(a.x, b.x), + min(a.y, b.y), + min(a.z, b.z), + min(a.w, b.w)); +#endif +} + +ccl_device_inline float4 max(const float4& a, const float4& b) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_max_ps(a.m128, b.m128)); +#else + return make_float4(max(a.x, b.x), + max(a.y, b.y), + max(a.z, b.z), + max(a.w, b.w)); +#endif +} + +ccl_device_inline float4 fabs(const float4& a) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); +#else + return make_float4(fabsf(a.x), + fabsf(a.y), + fabsf(a.z), + fabsf(a.w)); +#endif +} +#endif /* !__KERNEL_OPENCL__*/ + +#ifdef __KERNEL_SSE__ +template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> +__forceinline const float4 shuffle(const float4& b) +{ + return float4(_mm_castsi128_ps( + _mm_shuffle_epi32(_mm_castps_si128(b), + _MM_SHUFFLE(index_3, index_2, index_1, index_0)))); +} + +template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> +__forceinline const float4 shuffle(const float4& a, const float4& b) +{ + return float4(_mm_shuffle_ps(a.m128, b.m128, + _MM_SHUFFLE(index_3, index_2, index_1, index_0))); +} + +template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b) +{ + return float4(_mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b)))); +} + +template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& a, const float4& b) +{ + return float4(_mm_movelh_ps(a.m128, b.m128)); +} + +template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4& a, const float4& b) +{ + return float4(_mm_movehl_ps(b.m128, a.m128)); +} + +# ifdef __KERNEL_SSE3__ +template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b) +{ + return float4(_mm_moveldup_ps(b)); +} + +template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b) +{ + return float4(_mm_movehdup_ps(b)); +} +# endif /* __KERNEL_SSE3__ */ +#endif /* __KERNEL_SSE__ */ + +#ifndef __KERNEL_GPU__ +ccl_device_inline float4 select(const int4& mask, + const float4& a, + const float4& b) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_blendv_ps(b.m128, a.m128, _mm_castsi128_ps(mask.m128))); +#else + return make_float4((mask.x)? a.x: b.x, + (mask.y)? a.y: b.y, + (mask.z)? a.z: b.z, + (mask.w)? a.w: b.w); +#endif +} + +ccl_device_inline float4 mask(const int4& mask, + const float4& a) +{ + /* Replace elements of x with zero where mask isn't set. */ + return select(mask, a, make_float4(0.0f)); +} + +ccl_device_inline float4 reduce_min(const float4& a) +{ +#ifdef __KERNEL_SSE__ + float4 h = min(shuffle<1,0,3,2>(a), a); + return min(shuffle<2,3,0,1>(h), h); +#else + return make_float4(min(min(a.x, a.y), min(a.z, a.w))); +#endif +} + +ccl_device_inline float4 reduce_max(const float4& a) +{ +#ifdef __KERNEL_SSE__ + float4 h = max(shuffle<1,0,3,2>(a), a); + return max(shuffle<2,3,0,1>(h), h); +#else + return make_float4(max(max(a.x, a.y), max(a.z, a.w))); +#endif +} + +ccl_device_inline float4 load_float4(const float *v) +{ +#ifdef __KERNEL_SSE__ + return float4(_mm_loadu_ps(v)); +#else + return make_float4(v[0], v[1], v[2], v[3]); +#endif +} + +#endif /* !__KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_FLOAT4_H__ */ diff --git a/intern/cycles/util/util_math_int2.h b/intern/cycles/util/util_math_int2.h new file mode 100644 index 00000000000..828c49a131c --- /dev/null +++ b/intern/cycles/util/util_math_int2.h @@ -0,0 +1,77 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_MATH_INT2_H__ +#define __UTIL_MATH_INT2_H__ + +#ifndef __UTIL_MATH_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +/******************************************************************************* + * Declaration. + */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline bool operator==(const int2 a, const int2 b); +ccl_device_inline int2 operator+(const int2 &a, const int2 &b); +ccl_device_inline int2 operator+=(int2 &a, const int2 &b); +ccl_device_inline int2 operator-(const int2 &a, const int2 &b); +ccl_device_inline int2 operator*(const int2 &a, const int2 &b); +ccl_device_inline int2 operator/(const int2 &a, const int2 &b); +#endif /* !__KERNEL_OPENCL__ */ + +/******************************************************************************* + * Definition. + */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline bool operator==(const int2 a, const int2 b) +{ + return (a.x == b.x && a.y == b.y); +} + +ccl_device_inline int2 operator+(const int2 &a, const int2 &b) +{ + return make_int2(a.x + b.x, a.y + b.y); +} + +ccl_device_inline int2 operator+=(int2 &a, const int2 &b) +{ + return a = a + b; +} + +ccl_device_inline int2 operator-(const int2 &a, const int2 &b) +{ + return make_int2(a.x - b.x, a.y - b.y); +} + +ccl_device_inline int2 operator*(const int2 &a, const int2 &b) +{ + return make_int2(a.x * b.x, a.y * b.y); +} + +ccl_device_inline int2 operator/(const int2 &a, const int2 &b) +{ + return make_int2(a.x / b.x, a.y / b.y); +} +#endif /* !__KERNEL_OPENCL__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_INT2_H__ */ diff --git a/intern/cycles/util/util_math_int3.h b/intern/cycles/util/util_math_int3.h new file mode 100644 index 00000000000..6eef8517665 --- /dev/null +++ b/intern/cycles/util/util_math_int3.h @@ -0,0 +1,98 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_MATH_INT3_H__ +#define __UTIL_MATH_INT3_H__ + +#ifndef __UTIL_MATH_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +/******************************************************************************* + * Declaration. + */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline int3 min(int3 a, int3 b); +ccl_device_inline int3 max(int3 a, int3 b); +ccl_device_inline int3 clamp(const int3& a, int mn, int mx); +ccl_device_inline int3 clamp(const int3& a, int3& mn, int mx); +#endif /* !__KERNEL_OPENCL__ */ + +/******************************************************************************* + * Definition. + */ + +#ifndef __KERNEL_OPENCL__ +ccl_device_inline int3 min(int3 a, int3 b) +{ +#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) + return int3(_mm_min_epi32(a.m128, b.m128)); +#else + return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); +#endif +} + +ccl_device_inline int3 max(int3 a, int3 b) +{ +#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) + return int3(_mm_max_epi32(a.m128, b.m128)); +#else + return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); +#endif +} + +ccl_device_inline int3 clamp(const int3& a, int mn, int mx) +{ +#ifdef __KERNEL_SSE__ + return min(max(a, make_int3(mn)), make_int3(mx)); +#else + return make_int3(clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx)); +#endif +} + +ccl_device_inline int3 clamp(const int3& a, int3& mn, int mx) +{ +#ifdef __KERNEL_SSE__ + return min(max(a, mn), make_int3(mx)); +#else + return make_int3(clamp(a.x, mn.x, mx), + clamp(a.y, mn.y, mx), + clamp(a.z, mn.z, mx)); +#endif +} + +ccl_device_inline bool operator==(const int3 &a, const int3 &b) +{ + return a.x == b.x && a.y == b.y && a.z == b.z; +} + +ccl_device_inline bool operator!=(const int3 &a, const int3 &b) +{ + return !(a == b); +} + +ccl_device_inline bool operator<(const int3 &a, const int3 &b) +{ + return a.x < b.x && a.y < b.y && a.z < b.z; +} +#endif /* !__KERNEL_OPENCL__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_INT3_H__ */ diff --git a/intern/cycles/util/util_math_int4.h b/intern/cycles/util/util_math_int4.h new file mode 100644 index 00000000000..79a8c0841e7 --- /dev/null +++ b/intern/cycles/util/util_math_int4.h @@ -0,0 +1,119 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_MATH_INT4_H__ +#define __UTIL_MATH_INT4_H__ + +#ifndef __UTIL_MATH_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +/******************************************************************************* + * Declaration. + */ + +#ifndef __KERNEL_GPU__ +ccl_device_inline int4 operator+(const int4& a, const int4& b); +ccl_device_inline int4 operator+=(int4& a, const int4& b); +ccl_device_inline int4 operator>>(const int4& a, int i); +ccl_device_inline int4 min(int4 a, int4 b); +ccl_device_inline int4 max(int4 a, int4 b); +ccl_device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx); +ccl_device_inline int4 select(const int4& mask, const int4& a, const int4& b); +#endif /* __KERNEL_GPU__ */ + +/******************************************************************************* + * Definition. + */ + +#ifndef __KERNEL_GPU__ +ccl_device_inline int4 operator+(const int4& a, const int4& b) +{ +#ifdef __KERNEL_SSE__ + return int4(_mm_add_epi32(a.m128, b.m128)); +#else + return make_int4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); +#endif +} + +ccl_device_inline int4 operator+=(int4& a, const int4& b) +{ + return a = a + b; +} + +ccl_device_inline int4 operator>>(const int4& a, int i) +{ +#ifdef __KERNEL_SSE__ + return int4(_mm_srai_epi32(a.m128, i)); +#else + return make_int4(a.x >> i, a.y >> i, a.z >> i, a.w >> i); +#endif +} + +ccl_device_inline int4 min(int4 a, int4 b) +{ +#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) + return int4(_mm_min_epi32(a.m128, b.m128)); +#else + return make_int4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); +#endif +} + +ccl_device_inline int4 max(int4 a, int4 b) +{ +#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__) + return int4(_mm_max_epi32(a.m128, b.m128)); +#else + return make_int4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); +#endif +} + +ccl_device_inline int4 clamp(const int4& a, const int4& mn, const int4& mx) +{ + return min(max(a, mn), mx); +} + +ccl_device_inline int4 select(const int4& mask, const int4& a, const int4& b) +{ +#ifdef __KERNEL_SSE__ + const __m128 m = _mm_cvtepi32_ps(mask); + /* TODO(sergey): avoid cvt. */ + return int4(_mm_castps_si128( + _mm_or_ps(_mm_and_ps(m, _mm_castsi128_ps(a)), + _mm_andnot_ps(m, _mm_castsi128_ps(b))))); +#else + return make_int4((mask.x)? a.x: b.x, + (mask.y)? a.y: b.y, + (mask.z)? a.z: b.z, + (mask.w)? a.w: b.w); +#endif +} + +ccl_device_inline int4 load_int4(const int *v) +{ +#ifdef __KERNEL_SSE__ + return int4(_mm_loadu_si128((__m128i*)v)); +#else + return make_int4(v[0], v[1], v[2], v[3]); +#endif +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_INT4_H__ */ diff --git a/intern/cycles/util/util_math_intersect.h b/intern/cycles/util/util_math_intersect.h new file mode 100644 index 00000000000..61ddcc38f50 --- /dev/null +++ b/intern/cycles/util/util_math_intersect.h @@ -0,0 +1,222 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_MATH_INTERSECT_H__ +#define __UTIL_MATH_INTERSECT_H__ + +CCL_NAMESPACE_BEGIN + +/* Ray Intersection */ + +ccl_device bool ray_sphere_intersect( + float3 ray_P, float3 ray_D, float ray_t, + float3 sphere_P, float sphere_radius, + float3 *isect_P, float *isect_t) +{ + const float3 d = sphere_P - ray_P; + const float radiussq = sphere_radius*sphere_radius; + const float tsq = dot(d, d); + + if(tsq > radiussq) { + /* Ray origin outside sphere. */ + const float tp = dot(d, ray_D); + if(tp < 0.0f) { + /* Ray points away from sphere. */ + return false; + } + const float dsq = tsq - tp*tp; /* pythagoras */ + if(dsq > radiussq) { + /* Closest point on ray outside sphere. */ + return false; + } + const float t = tp - sqrtf(radiussq - dsq); /* pythagoras */ + if(t < ray_t) { + *isect_t = t; + *isect_P = ray_P + ray_D*t; + return true; + } + } + return false; +} + +ccl_device bool ray_aligned_disk_intersect( + float3 ray_P, float3 ray_D, float ray_t, + float3 disk_P, float disk_radius, + float3 *isect_P, float *isect_t) +{ + /* Aligned disk normal. */ + float disk_t; + const float3 disk_N = normalize_len(ray_P - disk_P, &disk_t); + const float div = dot(ray_D, disk_N); + if(UNLIKELY(div == 0.0f)) { + return false; + } + /* Compute t to intersection point. */ + const float t = -disk_t/div; + if(t < 0.0f || t > ray_t) { + return false; + } + /* Test if within radius. */ + float3 P = ray_P + ray_D*t; + if(len_squared(P - disk_P) > disk_radius*disk_radius) { + return false; + } + *isect_P = P; + *isect_t = t; + return true; +} + +ccl_device_forceinline bool ray_triangle_intersect( + float3 ray_P, float3 ray_dir, float ray_t, +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + const ssef *ssef_verts, +#else + const float3 tri_a, const float3 tri_b, const float3 tri_c, +#endif + float *isect_u, float *isect_v, float *isect_t) +{ +#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__) + typedef ssef float3; + const float3 tri_a(ssef_verts[0]); + const float3 tri_b(ssef_verts[1]); + const float3 tri_c(ssef_verts[2]); + const float3 P(ray_P); + const float3 dir(ray_dir); +#else +# define dot3(a, b) dot(a, b) + const float3 P = ray_P; + const float3 dir = ray_dir; +#endif + + /* Calculate vertices relative to ray origin. */ + const float3 v0 = tri_c - P; + const float3 v1 = tri_a - P; + const float3 v2 = tri_b - P; + + /* Calculate triangle edges. */ + const float3 e0 = v2 - v0; + const float3 e1 = v0 - v1; + const float3 e2 = v1 - v2; + + /* Perform edge tests. */ +#if defined(__KERNEL_SSE2__) && defined (__KERNEL_SSE__) + const float3 crossU = cross(v2 + v0, e0); + const float3 crossV = cross(v0 + v1, e1); + const float3 crossW = cross(v1 + v2, e2); + + ssef crossX(crossU); + ssef crossY(crossV); + ssef crossZ(crossW); + ssef zero = _mm_setzero_ps(); + _MM_TRANSPOSE4_PS(crossX, crossY, crossZ, zero); + + const ssef dirX(ray_dir.x); + const ssef dirY(ray_dir.y); + const ssef dirZ(ray_dir.z); + + ssef UVWW = madd(crossX, dirX, madd(crossY, dirY, crossZ * dirZ)); +#else /* __KERNEL_SSE2__ */ + const float U = dot(cross(v2 + v0, e0), ray_dir); + const float V = dot(cross(v0 + v1, e1), ray_dir); + const float W = dot(cross(v1 + v2, e2), ray_dir); +#endif /* __KERNEL_SSE2__ */ + +#if defined(__KERNEL_SSE2__) && defined (__KERNEL_SSE__) + int uvw_sign = movemask(UVWW) & 0x7; + if (uvw_sign != 0) + { + if (uvw_sign != 0x7) + return false; + } +#else + const float minUVW = min(U, min(V, W)); + const float maxUVW = max(U, max(V, W)); + + if(minUVW < 0.0f && maxUVW > 0.0f) { + return false; + } +#endif + + + /* Calculate geometry normal and denominator. */ + const float3 Ng1 = cross(e1, e0); + //const Vec3vfM Ng1 = stable_triangle_normal(e2,e1,e0); + const float3 Ng = Ng1 + Ng1; + const float den = dot3(Ng, dir); + /* Avoid division by 0. */ + if(UNLIKELY(den == 0.0f)) { + return false; + } + + /* Perform depth test. */ + const float T = dot3(v0, Ng); + const int sign_den = (__float_as_int(den) & 0x80000000); + const float sign_T = xor_signmask(T, sign_den); + if((sign_T < 0.0f) || + (sign_T > ray_t * xor_signmask(den, sign_den))) + { + return false; + } + + const float inv_den = 1.0f / den; +#if defined(__KERNEL_SSE2__) && defined (__KERNEL_SSE__) + UVWW *= inv_den; + _mm_store_ss(isect_u, UVWW); + _mm_store_ss(isect_v, shuffle<1,1,3,3>(UVWW)); +#else + *isect_u = U * inv_den; + *isect_v = V * inv_den; +#endif + *isect_t = T * inv_den; + return true; + +#undef dot3 +} + +ccl_device bool ray_quad_intersect(float3 ray_P, float3 ray_D, + float ray_mint, float ray_maxt, + float3 quad_P, + float3 quad_u, float3 quad_v, float3 quad_n, + float3 *isect_P, float *isect_t, + float *isect_u, float *isect_v) +{ + /* Perform intersection test. */ + float t = -(dot(ray_P, quad_n) - dot(quad_P, quad_n)) / dot(ray_D, quad_n); + if(t < ray_mint || t > ray_maxt) { + return false; + } + const float3 hit = ray_P + t*ray_D; + const float3 inplane = hit - quad_P; + const float u = dot(inplane, quad_u) / dot(quad_u, quad_u) + 0.5f; + if(u < 0.0f || u > 1.0f) { + return false; + } + const float v = dot(inplane, quad_v) / dot(quad_v, quad_v) + 0.5f; + if(v < 0.0f || v > 1.0f) { + return false; + } + /* Store the result. */ + /* TODO(sergey): Check whether we can avoid some checks here. */ + if(isect_P != NULL) *isect_P = hit; + if(isect_t != NULL) *isect_t = t; + if(isect_u != NULL) *isect_u = u; + if(isect_v != NULL) *isect_v = v; + return true; +} + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_INTERSECT_H__ */ diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h new file mode 100644 index 00000000000..382dad64ea5 --- /dev/null +++ b/intern/cycles/util/util_math_matrix.h @@ -0,0 +1,407 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_MATH_MATRIX_H__ +#define __UTIL_MATH_MATRIX_H__ + +CCL_NAMESPACE_BEGIN + +#define MAT(A, size, row, col) A[(row)*(size)+(col)] + +/* Variants that use a constant stride on GPUS. */ +#ifdef __KERNEL_GPU__ +# define MATS(A, n, r, c, s) A[((r)*(n)+(c))*(s)] +/* Element access when only the lower-triangular elements are stored. */ +# define MATHS(A, r, c, s) A[((r)*((r)+1)/2+(c))*(s)] +# define VECS(V, i, s) V[(i)*(s)] +#else +# define MATS(A, n, r, c, s) MAT(A, n, r, c) +# define MATHS(A, r, c, s) A[(r)*((r)+1)/2+(c)] +# define VECS(V, i, s) V[i] +#endif + +/* Zeroing helpers. */ + +ccl_device_inline void math_vector_zero(float *v, int n) +{ + for(int i = 0; i < n; i++) { + v[i] = 0.0f; + } +} + +ccl_device_inline void math_matrix_zero(float *A, int n) +{ + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { + MAT(A, n, row, col) = 0.0f; + } + } +} + +/* Elementary vector operations. */ + +ccl_device_inline void math_vector_add(float *a, const float *ccl_restrict b, int n) +{ + for(int i = 0; i < n; i++) { + a[i] += b[i]; + } +} + +ccl_device_inline void math_vector_mul(float *a, const float *ccl_restrict b, int n) +{ + for(int i = 0; i < n; i++) { + a[i] *= b[i]; + } +} + +ccl_device_inline void math_vector_mul_strided(ccl_global float *a, const float *ccl_restrict b, int astride, int n) +{ + for(int i = 0; i < n; i++) { + a[i*astride] *= b[i]; + } +} + +ccl_device_inline void math_vector_scale(float *a, float b, int n) +{ + for(int i = 0; i < n; i++) { + a[i] *= b; + } +} + +ccl_device_inline void math_vector_max(float *a, const float *ccl_restrict b, int n) +{ + for(int i = 0; i < n; i++) { + a[i] = max(a[i], b[i]); + } +} + +ccl_device_inline void math_vec3_add(float3 *v, int n, float *x, float3 w) +{ + for(int i = 0; i < n; i++) { + v[i] += w*x[i]; + } +} + +ccl_device_inline void math_vec3_add_strided(ccl_global float3 *v, int n, float *x, float3 w, int stride) +{ + for(int i = 0; i < n; i++) { + ccl_global float *elem = (ccl_global float*) (v + i*stride); + atomic_add_and_fetch_float(elem+0, w.x*x[i]); + atomic_add_and_fetch_float(elem+1, w.y*x[i]); + atomic_add_and_fetch_float(elem+2, w.z*x[i]); + } +} + +/* Elementary matrix operations. + * Note: TriMatrix refers to a square matrix that is symmetric, and therefore its upper-triangular part isn't stored. */ + +ccl_device_inline void math_trimatrix_add_diagonal(ccl_global float *A, int n, float val, int stride) +{ + for(int row = 0; row < n; row++) { + MATHS(A, row, row, stride) += val; + } +} + +/* Add Gramian matrix of v to A. + * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */ +ccl_device_inline void math_matrix_add_gramian(float *A, + int n, + const float *ccl_restrict v, + float weight) +{ + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { + MAT(A, n, row, col) += v[row]*v[col]*weight; + } + } +} + +/* Add Gramian matrix of v to A. + * The Gramian matrix of v is vt*v, so element (i,j) is v[i]*v[j]. */ +ccl_device_inline void math_trimatrix_add_gramian_strided(ccl_global float *A, + int n, + const float *ccl_restrict v, + float weight, + int stride) +{ + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { + atomic_add_and_fetch_float(&MATHS(A, row, col, stride), v[row]*v[col]*weight); + } + } +} + +/* Transpose matrix A inplace. */ +ccl_device_inline void math_matrix_transpose(ccl_global float *A, int n, int stride) +{ + for(int i = 0; i < n; i++) { + for(int j = 0; j < i; j++) { + float temp = MATS(A, n, i, j, stride); + MATS(A, n, i, j, stride) = MATS(A, n, j, i, stride); + MATS(A, n, j, i, stride) = temp; + } + } +} + +/* Solvers for matrix problems */ + +/* In-place Cholesky-Banachiewicz decomposition of the square, positive-definite matrix A + * into a lower triangular matrix L so that A = L*L^T. A is being overwritten by L. + * Also, only the lower triangular part of A is ever accessed. */ +ccl_device void math_trimatrix_cholesky(ccl_global float *A, int n, int stride) +{ + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { + float sum_col = MATHS(A, row, col, stride); + for(int k = 0; k < col; k++) { + sum_col -= MATHS(A, row, k, stride) * MATHS(A, col, k, stride); + } + if(row == col) { + sum_col = sqrtf(max(sum_col, 0.0f)); + } + else { + sum_col /= MATHS(A, col, col, stride); + } + MATHS(A, row, col, stride) = sum_col; + } + } +} + +/* Solve A*S=y for S given A and y, where A is symmetrical positive-semidefinite and both inputs are destroyed in the process. + * + * We can apply Cholesky decomposition to find a lower triangular L so that L*Lt = A. + * With that we get (L*Lt)*S = L*(Lt*S) = L*b = y, defining b as Lt*S. + * Since L is lower triangular, finding b is relatively easy since y is known. + * Then, the remaining problem is Lt*S = b, which again can be solved easily. + * + * This is useful for solving the normal equation S=inv(Xt*W*X)*Xt*W*y, since Xt*W*X is + * symmetrical positive-semidefinite by construction, so we can just use this function with A=Xt*W*X and y=Xt*W*y. */ +ccl_device_inline void math_trimatrix_vec3_solve(ccl_global float *A, ccl_global float3 *y, int n, int stride) +{ + /* Since the first entry of the design row is always 1, the upper-left element of XtWX is a good + * heuristic for the amount of pixels considered (with weighting), therefore the amount of correction + * is scaled based on it. */ + math_trimatrix_add_diagonal(A, n, 3e-7f*A[0], stride); /* Improve the numerical stability. */ + math_trimatrix_cholesky(A, n, stride); /* Replace A with L so that L*Lt = A. */ + + /* Use forward substitution to solve L*b = y, replacing y by b. */ + for(int row = 0; row < n; row++) { + float3 sum = VECS(y, row, stride); + for(int col = 0; col < row; col++) + sum -= MATHS(A, row, col, stride) * VECS(y, col, stride); + VECS(y, row, stride) = sum / MATHS(A, row, row, stride); + } + + /* Use backward substitution to solve Lt*S = b, replacing b by S. */ + for(int row = n-1; row >= 0; row--) { + float3 sum = VECS(y, row, stride); + for(int col = row+1; col < n; col++) + sum -= MATHS(A, col, row, stride) * VECS(y, col, stride); + VECS(y, row, stride) = sum / MATHS(A, row, row, stride); + } +} + +/* Perform the Jacobi Eigenvalue Methon on matrix A. + * A is assumed to be a symmetrical matrix, therefore only the lower-triangular part is ever accessed. + * The algorithm overwrites the contents of A. + * + * After returning, A will be overwritten with D, which is (almost) diagonal, + * and V will contain the eigenvectors of the original A in its rows (!), + * so that A = V^T*D*V. Therefore, the diagonal elements of D are the (sorted) eigenvalues of A. + */ +ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float *V, int n, int v_stride) +{ + const float singular_epsilon = 1e-9f; + + for(int row = 0; row < n; row++) { + for(int col = 0; col < n; col++) { + MATS(V, n, row, col, v_stride) = (col == row) ? 1.0f : 0.0f; + } + } + + for(int sweep = 0; sweep < 8; sweep++) { + float off_diagonal = 0.0f; + for(int row = 1; row < n; row++) { + for(int col = 0; col < row; col++) { + off_diagonal += fabsf(MAT(A, n, row, col)); + } + } + if(off_diagonal < 1e-7f) { + /* The matrix has nearly reached diagonal form. + * Since the eigenvalues are only used to determine truncation, their exact values aren't required - a relative error of a few ULPs won't matter at all. */ + break; + } + + /* Set the threshold for the small element rotation skip in the first sweep: + * Skip all elements that are less than a tenth of the average off-diagonal element. */ + float threshold = 0.2f*off_diagonal / (n*n); + + for(int row = 1; row < n; row++) { + for(int col = 0; col < row; col++) { + /* Perform a Jacobi rotation on this element that reduces it to zero. */ + float element = MAT(A, n, row, col); + float abs_element = fabsf(element); + + /* If we're in a later sweep and the element already is very small, just set it to zero and skip the rotation. */ + if(sweep > 3 && abs_element <= singular_epsilon*fabsf(MAT(A, n, row, row)) && abs_element <= singular_epsilon*fabsf(MAT(A, n, col, col))) { + MAT(A, n, row, col) = 0.0f; + continue; + } + + if(element == 0.0f) { + continue; + } + + /* If we're in one of the first sweeps and the element is smaller than the threshold, skip it. */ + if(sweep < 3 && (abs_element < threshold)) { + continue; + } + + /* Determine rotation: The rotation is characterized by its angle phi - or, in the actual implementation, sin(phi) and cos(phi). + * To find those, we first compute their ratio - that might be unstable if the angle approaches 90°, so there's a fallback for that case. + * Then, we compute sin(phi) and cos(phi) themselves. */ + float singular_diff = MAT(A, n, row, row) - MAT(A, n, col, col); + float ratio; + if(abs_element > singular_epsilon*fabsf(singular_diff)) { + float cot_2phi = 0.5f*singular_diff / element; + ratio = 1.0f / (fabsf(cot_2phi) + sqrtf(1.0f + cot_2phi*cot_2phi)); + if(cot_2phi < 0.0f) ratio = -ratio; /* Copy sign. */ + } + else { + ratio = element / singular_diff; + } + + float c = 1.0f / sqrtf(1.0f + ratio*ratio); + float s = ratio*c; + /* To improve numerical stability by avoiding cancellation, the update equations are reformulized to use sin(phi) and tan(phi/2) instead. */ + float tan_phi_2 = s / (1.0f + c); + + /* Update the singular values in the diagonal. */ + float singular_delta = ratio*element; + MAT(A, n, row, row) += singular_delta; + MAT(A, n, col, col) -= singular_delta; + + /* Set the element itself to zero. */ + MAT(A, n, row, col) = 0.0f; + + /* Perform the actual rotations on the matrices. */ +#define ROT(M, r1, c1, r2, c2, stride) \ + { \ + float M1 = MATS(M, n, r1, c1, stride); \ + float M2 = MATS(M, n, r2, c2, stride); \ + MATS(M, n, r1, c1, stride) -= s*(M2 + tan_phi_2*M1); \ + MATS(M, n, r2, c2, stride) += s*(M1 - tan_phi_2*M2); \ + } + + /* Split into three parts to ensure correct accesses since we only store the lower-triangular part of A. */ + for(int i = 0 ; i < col; i++) ROT(A, col, i, row, i, 1); + for(int i = col+1; i < row; i++) ROT(A, i, col, row, i, 1); + for(int i = row+1; i < n ; i++) ROT(A, i, col, i, row, 1); + + for(int i = 0 ; i < n ; i++) ROT(V, col, i, row, i, v_stride); +#undef ROT + } + } + } + + /* Sort eigenvalues and the associated eigenvectors. */ + for(int i = 0; i < n - 1; i++) { + float v = MAT(A, n, i, i); + int k = i; + for(int j = i; j < n; j++) { + if(MAT(A, n, j, j) >= v) { + v = MAT(A, n, j, j); + k = j; + } + } + if(k != i) { + /* Swap eigenvalues. */ + MAT(A, n, k, k) = MAT(A, n, i, i); + MAT(A, n, i, i) = v; + /* Swap eigenvectors. */ + for(int j = 0; j < n; j++) { + float v = MATS(V, n, i, j, v_stride); + MATS(V, n, i, j, v_stride) = MATS(V, n, k, j, v_stride); + MATS(V, n, k, j, v_stride) = v; + } + } + } +} + +#ifdef __KERNEL_SSE3__ +ccl_device_inline void math_vector_zero_sse(float4 *A, int n) +{ + for(int i = 0; i < n; i++) { + A[i] = make_float4(0.0f); + } +} + +ccl_device_inline void math_matrix_zero_sse(float4 *A, int n) +{ + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { + MAT(A, n, row, col) = make_float4(0.0f); + } + } +} + +/* Add Gramian matrix of v to A. + * The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. */ +ccl_device_inline void math_matrix_add_gramian_sse(float4 *A, int n, const float4 *ccl_restrict v, float4 weight) +{ + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { + MAT(A, n, row, col) = MAT(A, n, row, col) + v[row] * v[col] * weight; + } + } +} + +ccl_device_inline void math_vector_add_sse(float4 *V, int n, const float4 *ccl_restrict a) +{ + for(int i = 0; i < n; i++) { + V[i] += a[i]; + } +} + +ccl_device_inline void math_vector_mul_sse(float4 *V, int n, const float4 *ccl_restrict a) +{ + for(int i = 0; i < n; i++) { + V[i] *= a[i]; + } +} + +ccl_device_inline void math_vector_max_sse(float4 *a, const float4 *ccl_restrict b, int n) +{ + for(int i = 0; i < n; i++) { + a[i] = max(a[i], b[i]); + } +} + +ccl_device_inline void math_matrix_hsum(float *A, int n, const float4 *ccl_restrict B) +{ + for(int row = 0; row < n; row++) { + for(int col = 0; col <= row; col++) { + MAT(A, n, row, col) = reduce_add(MAT(B, n, row, col))[0]; + } + } +} +#endif + +#undef MAT + +CCL_NAMESPACE_END + +#endif /* __UTIL_MATH_MATRIX_H__ */ diff --git a/intern/cycles/util/util_md5.cpp b/intern/cycles/util/util_md5.cpp index 19168135f01..749760d84f0 100644 --- a/intern/cycles/util/util_md5.cpp +++ b/intern/cycles/util/util_md5.cpp @@ -310,6 +310,13 @@ void MD5Hash::append(const uint8_t *data, int nbytes) memcpy(buf, p, left); } +void MD5Hash::append(const string& str) +{ + if(str.size()) { + append((const uint8_t*)str.c_str(), str.size()); + } +} + bool MD5Hash::append_file(const string& filepath) { FILE *f = path_fopen(filepath, "rb"); diff --git a/intern/cycles/util/util_md5.h b/intern/cycles/util/util_md5.h index d0af9fdb004..b043b591e67 100644 --- a/intern/cycles/util/util_md5.h +++ b/intern/cycles/util/util_md5.h @@ -30,8 +30,8 @@ #ifndef __UTIL_MD5_H__ #define __UTIL_MD5_H__ -#include "util_string.h" -#include "util_types.h" +#include "util/util_string.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN @@ -41,6 +41,7 @@ public: ~MD5Hash(); void append(const uint8_t *data, int size); + void append(const string& str); bool append_file(const string& filepath); string get_hex(); diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h index adc141a7b28..3c5785c4807 100644 --- a/intern/cycles/util/util_optimization.h +++ b/intern/cycles/util/util_optimization.h @@ -19,22 +19,15 @@ #ifndef __KERNEL_GPU__ -/* quiet unused define warnings */ -#if defined(__KERNEL_SSE2__) || \ - defined(__KERNEL_SSE3__) || \ - defined(__KERNEL_SSSE3__) || \ - defined(__KERNEL_SSE41__) || \ - defined(__KERNEL_AVX__) || \ - defined(__KERNEL_AVX2__) - /* do nothing */ -#endif - /* x86 * * Compile a regular, SSE2 and SSE3 kernel. */ #if defined(i386) || defined(_M_IX86) +/* We require minimum SSE2 support on x86, so auto enable. */ +# define __KERNEL_SSE2__ + # ifdef WITH_KERNEL_SSE2 # define WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 # endif @@ -73,48 +66,6 @@ #endif /* defined(__x86_64__) || defined(_M_X64) */ -/* SSE Experiment - * - * This is disabled code for an experiment to use SSE types globally for types - * such as float3 and float4. Currently this gives an overall slowdown. */ - -#if 0 -# define __KERNEL_SSE__ -# ifndef __KERNEL_SSE2__ -# define __KERNEL_SSE2__ -# endif -# ifndef __KERNEL_SSE3__ -# define __KERNEL_SSE3__ -# endif -# ifndef __KERNEL_SSSE3__ -# define __KERNEL_SSSE3__ -# endif -# ifndef __KERNEL_SSE4__ -# define __KERNEL_SSE4__ -# endif -#endif - -/* SSE Intrinsics includes - * - * We assume __KERNEL_SSEX__ flags to have been defined at this point */ - -/* SSE intrinsics headers */ -#ifndef FREE_WINDOWS64 - -#ifdef _MSC_VER -# include <intrin.h> -#elif (defined(__x86_64__) || defined(__i386__)) -# include <x86intrin.h> -#endif - -#else - -/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>. - * Since we can't avoid including <windows.h>, better only include that */ -#include "util_windows.h" - -#endif - #endif #endif /* __UTIL_OPTIMIZATION_H__ */ diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp index 5df262fcbbb..0e0371928ab 100644 --- a/intern/cycles/util/util_path.cpp +++ b/intern/cycles/util/util_path.cpp @@ -14,10 +14,9 @@ * limitations under the License. */ -#include "util_debug.h" -#include "util_md5.h" -#include "util_path.h" -#include "util_string.h" +#include "util/util_md5.h" +#include "util/util_path.h" +#include "util/util_string.h" #include <OpenImageIO/filesystem.h> #include <OpenImageIO/strutil.h> @@ -45,7 +44,8 @@ OIIO_NAMESPACE_USING # include <shlwapi.h> #endif -#include "util_windows.h" +#include "util/util_map.h" +#include "util/util_windows.h" CCL_NAMESPACE_BEGIN @@ -320,17 +320,18 @@ static char *path_specials(const string& sub) { static bool env_init = false; static char *env_shader_path; - static char *env_kernel_path; + static char *env_source_path; if(!env_init) { env_shader_path = getenv("CYCLES_SHADER_PATH"); - env_kernel_path = getenv("CYCLES_KERNEL_PATH"); + /* NOTE: It is KERNEL in env variable for compatibility reasons. */ + env_source_path = getenv("CYCLES_KERNEL_PATH"); env_init = true; } if(env_shader_path != NULL && sub == "shader") { return env_shader_path; } - else if(env_shader_path != NULL && sub == "kernel") { - return env_kernel_path; + else if(env_shader_path != NULL && sub == "source") { + return env_source_path; } return NULL; } @@ -767,64 +768,195 @@ bool path_remove(const string& path) return remove(path.c_str()) == 0; } -static string line_directive(const string& path, int line) -{ - string escaped_path = path; - string_replace(escaped_path, "\"", "\\\""); - string_replace(escaped_path, "\'", "\\\'"); - string_replace(escaped_path, "\?", "\\\?"); - string_replace(escaped_path, "\\", "\\\\"); +struct SourceReplaceState { + typedef map<string, string> ProcessedMapping; + /* Base director for all relative include headers. */ + string base; + /* Result of processed files. */ + ProcessedMapping processed_files; + /* Set of files which are considered "precompiled" and which are replaced + * with and empty string on a subsequent occurrence in include statement. + */ + set<string> precompiled_headers; +}; + +static string path_source_replace_includes_recursive( + const string& source, + const string& source_filepath, + SourceReplaceState *state); + +static string line_directive(const SourceReplaceState& state, + const string& path, + const int line) +{ + string unescaped_path = path; + /* First we make path relative. */ + if(string_startswith(unescaped_path, state.base.c_str())) { + const string base_file = path_filename(state.base); + const size_t base_len = state.base.length(); + unescaped_path = base_file + + unescaped_path.substr(base_len, + unescaped_path.length() - base_len); + } + /* Second, we replace all unsafe characters. */ + const size_t length = unescaped_path.length(); + string escaped_path = ""; + for(size_t i = 0; i < length; ++i) { + const char ch = unescaped_path[i]; + if(strchr("\"\'\?\\", ch) != NULL) { + escaped_path += "\\"; + } + escaped_path += ch; + } + /* TODO(sergey): Check whether using std::to_string combined with several + * concatenation operations is any faster. + */ return string_printf("#line %d \"%s\"", line, escaped_path.c_str()); } +static string path_source_handle_preprocessor( + const string& preprocessor_line, + const string& source_filepath, + const size_t line_number, + SourceReplaceState *state) +{ + string result = preprocessor_line; + string token = string_strip( + preprocessor_line.substr(1, preprocessor_line.size() - 1)); + if(string_startswith(token, "include")) { + token = string_strip(token.substr(7, token.size() - 7)); + if(token[0] == '"') { + const size_t n_start = 1; + const size_t n_end = token.find("\"", n_start); + const string filename = token.substr(n_start, n_end - n_start); + const bool is_precompiled = string_endswith(token, "// PRECOMPILED"); + string filepath = path_join(state->base, filename); + if(!path_exists(filepath)) { + filepath = path_join(path_dirname(source_filepath), + filename); + } + if(is_precompiled) { + state->precompiled_headers.insert(filepath); + } + string text; + if(path_read_text(filepath, text)) { + text = path_source_replace_includes_recursive( + text, filepath, state); + /* Use line directives for better error messages. */ + result = line_directive(*state, filepath, 1) + "\n" + + text + "\n" + + line_directive(*state, source_filepath, line_number + 1); + } + } + } + return result; +} -string path_source_replace_includes(const string& source, - const string& path, - const string& source_filename) +/* Our own little c preprocessor that replaces #includes with the file + * contents, to work around issue of OpenCL drivers not supporting + * include paths with spaces in them. + */ +static string path_source_replace_includes_recursive( + const string& source, + const string& source_filepath, + SourceReplaceState *state) { - /* Our own little c preprocessor that replaces #includes with the file - * contents, to work around issue of opencl drivers not supporting - * include paths with spaces in them. + /* Try to re-use processed file without spending time on replacing all + * include directives again. */ - + SourceReplaceState::ProcessedMapping::iterator replaced_file = + state->processed_files.find(source_filepath); + if(replaced_file != state->processed_files.end()) { + if(state->precompiled_headers.find(source_filepath) != + state->precompiled_headers.end()) { + return ""; + } + return replaced_file->second; + } + /* Perform full file processing. */ string result = ""; - vector<string> lines; - string_split(lines, source, "\n", false); - - for(size_t i = 0; i < lines.size(); ++i) { - string line = lines[i]; - if(line[0] == '#') { - string token = string_strip(line.substr(1, line.size() - 1)); - if(string_startswith(token, "include")) { - token = string_strip(token.substr(7, token.size() - 7)); - if(token[0] == '"') { - size_t n_start = 1; - size_t n_end = token.find("\"", n_start); - string filename = token.substr(n_start, n_end - n_start); - string text, filepath = path_join(path, filename); - if(path_read_text(filepath, text)) { - /* Replace include directories with both current path - * and path extracted from the include file. - * Not totally robust, but works fine for Cycles kernel - * and avoids having list of include directories.x - */ - text = path_source_replace_includes( - text, path_dirname(filepath), filename); - text = path_source_replace_includes(text, path, filename); - /* Use line directives for better error messages. */ - line = line_directive(filepath, 1) - + token.replace(0, n_end + 1, "\n" + text + "\n") - + line_directive(path_join(path, source_filename), i); - } - } + const size_t source_length = source.length(); + size_t index = 0; + /* Information about where we are in the source. */ + size_t line_number = 0, column_number = 1; + /* Currently gathered non-preprocessor token. + * Store as start/length rather than token itself to avoid overhead of + * memory re-allocations on each character concatenation. + */ + size_t token_start = 0, token_length = 0; + /* Denotes whether we're inside of preprocessor line, together with + * preprocessor line itself. + * + * TODO(sergey): Investigate whether using token start/end position + * gives measurable speedup. + */ + bool inside_preprocessor = false; + string preprocessor_line = ""; + /* Actual loop over the whole source. */ + while(index < source_length) { + const char ch = source[index]; + if(ch == '\n') { + if(inside_preprocessor) { + result += path_source_handle_preprocessor(preprocessor_line, + source_filepath, + line_number, + state); + /* Start gathering net part of the token. */ + token_start = index; + token_length = 0; + } + inside_preprocessor = false; + preprocessor_line = ""; + column_number = 0; + ++line_number; + } + else if(ch == '#' && column_number == 1 && !inside_preprocessor) { + /* Append all possible non-preprocessor token to the result. */ + if(token_length != 0) { + result.append(source, token_start, token_length); + token_start = index; + token_length = 0; } + inside_preprocessor = true; + } + if(inside_preprocessor) { + preprocessor_line += ch; } - result += line + "\n"; + else { + ++token_length; + } + ++index; + ++column_number; } - + /* Append possible tokens which happened before special events handled + * above. + */ + if(token_length != 0) { + result.append(source, token_start, token_length); + } + if(inside_preprocessor) { + result += path_source_handle_preprocessor(preprocessor_line, + source_filepath, + line_number, + state); + } + /* Store result for further reuse. */ + state->processed_files[source_filepath] = result; return result; } +string path_source_replace_includes(const string& source, + const string& path, + const string& source_filename) +{ + SourceReplaceState state; + state.base = path; + return path_source_replace_includes_recursive( + source, + path_join(path, source_filename), + &state); +} + FILE *path_fopen(const string& path, const string& mode) { #ifdef _WIN32 diff --git a/intern/cycles/util/util_path.h b/intern/cycles/util/util_path.h index 70dbb5ae403..0e5e2d2c837 100644 --- a/intern/cycles/util/util_path.h +++ b/intern/cycles/util/util_path.h @@ -24,10 +24,10 @@ #include <stdio.h> -#include "util_set.h" -#include "util_string.h" -#include "util_types.h" -#include "util_vector.h" +#include "util/util_set.h" +#include "util/util_string.h" +#include "util/util_types.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h index 14215056840..134383e88db 100644 --- a/intern/cycles/util/util_progress.h +++ b/intern/cycles/util/util_progress.h @@ -23,10 +23,10 @@ * update notifications from a job running in another thread. All methods * except for the constructor/destructor are thread safe. */ -#include "util_function.h" -#include "util_string.h" -#include "util_time.h" -#include "util_thread.h" +#include "util/util_function.h" +#include "util/util_string.h" +#include "util/util_time.h" +#include "util/util_thread.h" CCL_NAMESPACE_BEGIN @@ -37,9 +37,11 @@ public: pixel_samples = 0; total_pixel_samples = 0; current_tile_sample = 0; - finished_tiles = 0; + rendered_tiles = 0; + denoised_tiles = 0; start_time = time_dt(); render_start_time = time_dt(); + end_time = 0.0; status = "Initializing"; substatus = ""; sync_status = ""; @@ -75,9 +77,11 @@ public: pixel_samples = 0; total_pixel_samples = 0; current_tile_sample = 0; - finished_tiles = 0; + rendered_tiles = 0; + denoised_tiles = 0; start_time = time_dt(); render_start_time = time_dt(); + end_time = 0.0; status = "Initializing"; substatus = ""; sync_status = ""; @@ -144,6 +148,7 @@ public: thread_scoped_lock lock(progress_mutex); start_time = time_dt(); + end_time = 0.0; } void set_render_start_time() @@ -167,8 +172,15 @@ public: { thread_scoped_lock lock(progress_mutex); - total_time_ = time_dt() - start_time; - render_time_ = time_dt() - render_start_time; + double time = (end_time > 0) ? end_time : time_dt(); + + total_time_ = time - start_time; + render_time_ = time - render_start_time; + } + + void set_end_time() + { + end_time = time_dt(); } void reset_sample() @@ -177,7 +189,8 @@ public: pixel_samples = 0; current_tile_sample = 0; - finished_tiles = 0; + rendered_tiles = 0; + denoised_tiles = 0; } void set_total_pixel_samples(uint64_t total_pixel_samples_) @@ -209,23 +222,36 @@ public: set_update(); } - void add_finished_tile() + void add_finished_tile(bool denoised) { thread_scoped_lock lock(progress_mutex); - finished_tiles++; + if(denoised) { + denoised_tiles++; + } + else { + rendered_tiles++; + } } int get_current_sample() { + thread_scoped_lock lock(progress_mutex); /* Note that the value here always belongs to the last tile that updated, * so it's only useful if there is only one active tile. */ return current_tile_sample; } - int get_finished_tiles() + int get_rendered_tiles() + { + thread_scoped_lock lock(progress_mutex); + return rendered_tiles; + } + + int get_denoised_tiles() { - return finished_tiles; + thread_scoped_lock lock(progress_mutex); + return denoised_tiles; } /* status messages */ @@ -318,9 +344,11 @@ protected: int current_tile_sample; /* Stores the number of tiles that's already finished. * Used to determine whether all but the last tile are finished rendering, in which case the current_tile_sample is displayed. */ - int finished_tiles; + int rendered_tiles, denoised_tiles; double start_time, render_start_time; + /* End time written when render is done, so it doesn't keep increasing on redraws. */ + double end_time; string status; string substatus; diff --git a/intern/cycles/util/util_projection.h b/intern/cycles/util/util_projection.h new file mode 100644 index 00000000000..dbcb9877a48 --- /dev/null +++ b/intern/cycles/util/util_projection.h @@ -0,0 +1,177 @@ +/* + * Copyright 2011-2018 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_PROJECTION_H__ +#define __UTIL_PROJECTION_H__ + +#include "util/util_transform.h" + +CCL_NAMESPACE_BEGIN + +/* 4x4 projection matrix, perspective or orthographic. */ + +typedef struct ProjectionTransform { + float4 x, y, z, w; /* rows */ + +#ifndef __KERNEL_GPU__ + ProjectionTransform() + { + } + + explicit ProjectionTransform(const Transform& tfm) + : x(tfm.x), + y(tfm.y), + z(tfm.z), + w(make_float4(0.0f, 0.0f, 0.0f, 1.0f)) + { + } +#endif +} ProjectionTransform; + +typedef struct PerspectiveMotionTransform { + ProjectionTransform pre; + ProjectionTransform post; +} PerspectiveMotionTransform; + +/* Functions */ + +ccl_device_inline float3 transform_perspective(const ProjectionTransform *t, const float3 a) +{ + float4 b = make_float4(a.x, a.y, a.z, 1.0f); + float3 c = make_float3(dot(t->x, b), dot(t->y, b), dot(t->z, b)); + float w = dot(t->w, b); + + return (w != 0.0f)? c/w: make_float3(0.0f, 0.0f, 0.0f); +} + +ccl_device_inline float3 transform_perspective_direction(const ProjectionTransform *t, const float3 a) +{ + float3 c = make_float3( + a.x*t->x.x + a.y*t->x.y + a.z*t->x.z, + a.x*t->y.x + a.y*t->y.y + a.z*t->y.z, + a.x*t->z.x + a.y*t->z.y + a.z*t->z.z); + + return c; +} + +#ifndef __KERNEL_GPU__ + +ccl_device_inline Transform projection_to_transform(const ProjectionTransform& a) +{ + Transform tfm = {a.x, a.y, a.z}; + return tfm; +} + +ccl_device_inline ProjectionTransform projection_transpose(const ProjectionTransform& a) +{ + ProjectionTransform t; + + t.x.x = a.x.x; t.x.y = a.y.x; t.x.z = a.z.x; t.x.w = a.w.x; + t.y.x = a.x.y; t.y.y = a.y.y; t.y.z = a.z.y; t.y.w = a.w.y; + t.z.x = a.x.z; t.z.y = a.y.z; t.z.z = a.z.z; t.z.w = a.w.z; + t.w.x = a.x.w; t.w.y = a.y.w; t.w.z = a.z.w; t.w.w = a.w.w; + + return t; +} + +ProjectionTransform projection_inverse(const ProjectionTransform& a); + +ccl_device_inline ProjectionTransform make_projection( + float a, float b, float c, float d, + float e, float f, float g, float h, + float i, float j, float k, float l, + float m, float n, float o, float p) +{ + ProjectionTransform t; + + t.x.x = a; t.x.y = b; t.x.z = c; t.x.w = d; + t.y.x = e; t.y.y = f; t.y.z = g; t.y.w = h; + t.z.x = i; t.z.y = j; t.z.z = k; t.z.w = l; + t.w.x = m; t.w.y = n; t.w.z = o; t.w.w = p; + + return t; +} +ccl_device_inline ProjectionTransform projection_identity() +{ + return make_projection( + 1.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 1.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 1.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 1.0f); +} + +ccl_device_inline ProjectionTransform operator*(const ProjectionTransform& a, const ProjectionTransform& b) +{ + ProjectionTransform c = projection_transpose(b); + ProjectionTransform t; + + t.x = make_float4(dot(a.x, c.x), dot(a.x, c.y), dot(a.x, c.z), dot(a.x, c.w)); + t.y = make_float4(dot(a.y, c.x), dot(a.y, c.y), dot(a.y, c.z), dot(a.y, c.w)); + t.z = make_float4(dot(a.z, c.x), dot(a.z, c.y), dot(a.z, c.z), dot(a.z, c.w)); + t.w = make_float4(dot(a.w, c.x), dot(a.w, c.y), dot(a.w, c.z), dot(a.w, c.w)); + + return t; +} + +ccl_device_inline ProjectionTransform operator*(const ProjectionTransform& a, const Transform& b) +{ + return a * ProjectionTransform(b); +} + +ccl_device_inline ProjectionTransform operator*(const Transform& a, const ProjectionTransform& b) +{ + return ProjectionTransform(a) * b; +} + +ccl_device_inline void print_projection(const char *label, const ProjectionTransform& t) +{ + print_float4(label, t.x); + print_float4(label, t.y); + print_float4(label, t.z); + print_float4(label, t.w); + printf("\n"); +} + +ccl_device_inline ProjectionTransform projection_perspective(float fov, float n, float f) +{ + ProjectionTransform persp = make_projection( + 1, 0, 0, 0, + 0, 1, 0, 0, + 0, 0, f / (f - n), -f*n / (f - n), + 0, 0, 1, 0); + + float inv_angle = 1.0f/tanf(0.5f*fov); + + Transform scale = transform_scale(inv_angle, inv_angle, 1); + + return scale * persp; +} + +ccl_device_inline ProjectionTransform projection_orthographic(float znear, float zfar) +{ + Transform t = + transform_scale(1.0f, 1.0f, 1.0f / (zfar-znear)) * + transform_translate(0.0f, 0.0f, -znear); + + return ProjectionTransform(t); +} + +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_PROJECTION_H__ */ + diff --git a/intern/cycles/util/util_rect.h b/intern/cycles/util/util_rect.h new file mode 100644 index 00000000000..17a55a14d0b --- /dev/null +++ b/intern/cycles/util/util_rect.h @@ -0,0 +1,73 @@ +/* + * Copyright 2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_RECT_H__ +#define __UTIL_RECT_H__ + +#include "util/util_types.h" + +CCL_NAMESPACE_BEGIN + +/* Rectangles are represented as a int4 containing the coordinates of the lower-left and + * upper-right corners in the order (x0, y0, x1, y1). */ + +ccl_device_inline int4 rect_from_shape(int x0, int y0, int w, int h) +{ + return make_int4(x0, y0, x0 + w, y0 + h); +} + +ccl_device_inline int4 rect_expand(int4 rect, int d) +{ + return make_int4(rect.x - d, rect.y - d, rect.z + d, rect.w + d); +} + +/* Returns the intersection of two rects. */ +ccl_device_inline int4 rect_clip(int4 a, int4 b) +{ + return make_int4(max(a.x, b.x), max(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); +} + +ccl_device_inline bool rect_is_valid(int4 rect) +{ + return (rect.z > rect.x) && (rect.w > rect.y); +} + +/* Returns the local row-major index of the pixel inside the rect. */ +ccl_device_inline int coord_to_local_index(int4 rect, int x, int y) +{ + int w = rect.z - rect.x; + return (y - rect.y) * w + (x - rect.x); +} + +/* Finds the coordinates of a pixel given by its row-major index in the rect, + * and returns whether the pixel is inside it. */ +ccl_device_inline bool local_index_to_coord(int4 rect, int idx, int *x, int *y) +{ + int w = rect.z - rect.x; + *x = (idx % w) + rect.x; + *y = (idx / w) + rect.y; + return (*y < rect.w); +} + +ccl_device_inline int rect_size(int4 rect) +{ + return (rect.z - rect.x) * (rect.w - rect.y); +} + +CCL_NAMESPACE_END + +#endif /* __UTIL_RECT_H__ */ + diff --git a/intern/cycles/util/util_simd.cpp b/intern/cycles/util/util_simd.cpp index de2df612578..f90439c188b 100644 --- a/intern/cycles/util/util_simd.cpp +++ b/intern/cycles/util/util_simd.cpp @@ -19,7 +19,7 @@ (defined(WITH_KERNEL_NATIVE) && defined(__SSE2__)) #define __KERNEL_SSE2__ -#include "util_simd.h" +#include "util/util_simd.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h index 756bd15ed25..04341451afb 100644 --- a/intern/cycles/util/util_simd.h +++ b/intern/cycles/util/util_simd.h @@ -18,19 +18,37 @@ #ifndef __UTIL_SIMD_TYPES_H__ #define __UTIL_SIMD_TYPES_H__ +#ifndef __KERNEL_GPU__ + #include <limits> -#include "util_debug.h" -#include "util_types.h" +#include "util/util_defines.h" + +/* SSE Intrinsics includes + * + * We assume __KERNEL_SSEX__ flags to have been defined at this point */ + +/* SSE intrinsics headers */ +#ifndef FREE_WINDOWS64 + +#ifdef _MSC_VER +# include <intrin.h> +#elif (defined(__x86_64__) || defined(__i386__)) +# include <x86intrin.h> +#endif + +#else + +/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>. + * Since we can't avoid including <windows.h>, better only include that */ +#include "util/util_windows.h" + +#endif CCL_NAMESPACE_BEGIN #ifdef __KERNEL_SSE2__ -struct sseb; -struct ssei; -struct ssef; - extern const __m128 _mm_lookupmask_ps[16]; /* Special Types */ @@ -328,12 +346,12 @@ __forceinline size_t __bscf(size_t& v) #endif /* _WIN32 */ -static const unsigned int BITSCAN_NO_BIT_SET_32 = 32; -static const size_t BITSCAN_NO_BIT_SET_64 = 64; - -/* Emulation of SSE4 functions with SSE3 */ +/* Test __KERNEL_SSE41__ for MSVC which does not define __SSE4_1__, and test + * __SSE4_1__ to avoid OpenImageIO conflicts with our emulation macros on other + * platforms when compiling code outside the kernel. */ +#if !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) -#if defined(__KERNEL_SSE3) && !defined(__KERNEL_SSE4__) +/* Emulation of SSE4 functions with SSE2 */ #define _MM_FROUND_TO_NEAREST_INT 0x00 #define _MM_FROUND_TO_NEG_INF 0x01 @@ -341,45 +359,51 @@ static const size_t BITSCAN_NO_BIT_SET_64 = 64; #define _MM_FROUND_TO_ZERO 0x03 #define _MM_FROUND_CUR_DIRECTION 0x04 -#define _mm_blendv_ps __emu_mm_blendv_ps -__forceinline __m128 _mm_blendv_ps( __m128 value, __m128 input, __m128 mask ) { - return _mm_or_ps(_mm_and_ps(mask, input), _mm_andnot_ps(mask, value)); +#undef _mm_blendv_ps +#define _mm_blendv_ps _mm_blendv_ps_emu +__forceinline __m128 _mm_blendv_ps_emu( __m128 value, __m128 input, __m128 mask) +{ + __m128i isignmask = _mm_set1_epi32(0x80000000); + __m128 signmask = _mm_castsi128_ps(isignmask); + __m128i iandsign = _mm_castps_si128(_mm_and_ps(mask, signmask)); + __m128i icmpmask = _mm_cmpeq_epi32(iandsign, isignmask); + __m128 cmpmask = _mm_castsi128_ps(icmpmask); + return _mm_or_ps(_mm_and_ps(cmpmask, input), _mm_andnot_ps(cmpmask, value)); } -#define _mm_blend_ps __emu_mm_blend_ps -__forceinline __m128 _mm_blend_ps( __m128 value, __m128 input, const int mask ) { +#undef _mm_blend_ps +#define _mm_blend_ps _mm_blend_ps_emu +__forceinline __m128 _mm_blend_ps_emu( __m128 value, __m128 input, const int mask) +{ assert(mask < 0x10); return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]); } -#define _mm_blendv_epi8 __emu_mm_blendv_epi8 -__forceinline __m128i _mm_blendv_epi8( __m128i value, __m128i input, __m128i mask ) { +#undef _mm_blendv_epi8 +#define _mm_blendv_epi8 _mm_blendv_epi8_emu +__forceinline __m128i _mm_blendv_epi8_emu( __m128i value, __m128i input, __m128i mask) +{ return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value)); } -#define _mm_mullo_epi32 __emu_mm_mullo_epi32 -__forceinline __m128i _mm_mullo_epi32( __m128i value, __m128i input ) { - __m128i rvalue; - char* _r = (char*)(&rvalue + 1); - char* _v = (char*)(& value + 1); - char* _i = (char*)(& input + 1); - for( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32*)(_r + i)) = *((int32*)(_v + i))* *((int32*)(_i + i)); - return rvalue; -} - - -#define _mm_min_epi32 __emu_mm_min_epi32 -__forceinline __m128i _mm_min_epi32( __m128i value, __m128i input ) { +#undef _mm_min_epi32 +#define _mm_min_epi32 _mm_min_epi32_emu +__forceinline __m128i _mm_min_epi32_emu( __m128i value, __m128i input) +{ return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input)); } -#define _mm_max_epi32 __emu_mm_max_epi32 -__forceinline __m128i _mm_max_epi32( __m128i value, __m128i input ) { +#undef _mm_max_epi32 +#define _mm_max_epi32 _mm_max_epi32_emu +__forceinline __m128i _mm_max_epi32_emu( __m128i value, __m128i input) +{ return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input)); } -#define _mm_extract_epi32 __emu_mm_extract_epi32 -__forceinline int _mm_extract_epi32( __m128i input, const int index ) { - switch ( index ) { +#undef _mm_extract_epi32 +#define _mm_extract_epi32 _mm_extract_epi32_emu +__forceinline int _mm_extract_epi32_emu( __m128i input, const int index) +{ + switch(index) { case 0: return _mm_cvtsi128_si32(input); case 1: return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(1, 1, 1, 1))); case 2: return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(2, 2, 2, 2))); @@ -388,24 +412,27 @@ __forceinline int _mm_extract_epi32( __m128i input, const int index ) { } } -#define _mm_insert_epi32 __emu_mm_insert_epi32 -__forceinline __m128i _mm_insert_epi32( __m128i value, int input, const int index ) { +#undef _mm_insert_epi32 +#define _mm_insert_epi32 _mm_insert_epi32_emu +__forceinline __m128i _mm_insert_epi32_emu( __m128i value, int input, const int index) +{ assert(index >= 0 && index < 4); ((int*)&value)[index] = input; return value; } -#define _mm_extract_ps __emu_mm_extract_ps -__forceinline int _mm_extract_ps( __m128 input, const int index ) { - int32* ptr = (int32*)&input; return ptr[index]; +#undef _mm_insert_ps +#define _mm_insert_ps _mm_insert_ps_emu +__forceinline __m128 _mm_insert_ps_emu( __m128 value, __m128 input, const int index) +{ + assert(index < 0x100); + ((float*)&value)[(index >> 4)&0x3] = ((float*)&input)[index >> 6]; + return _mm_andnot_ps(_mm_lookupmask_ps[index&0xf], value); } -#define _mm_insert_ps __emu_mm_insert_ps -__forceinline __m128 _mm_insert_ps( __m128 value, __m128 input, const int index ) -{ assert(index < 0x100); ((float*)&value)[(index >> 4)&0x3] = ((float*)&input)[index >> 6]; return _mm_andnot_ps(_mm_lookupmask_ps[index&0xf], value); } - -#define _mm_round_ps __emu_mm_round_ps -__forceinline __m128 _mm_round_ps( __m128 value, const int flags ) +#undef _mm_round_ps +#define _mm_round_ps _mm_round_ps_emu +__forceinline __m128 _mm_round_ps_emu( __m128 value, const int flags) { - switch ( flags ) + switch(flags) { case _MM_FROUND_TO_NEAREST_INT: return _mm_cvtepi32_ps(_mm_cvtps_epi32(value)); case _MM_FROUND_TO_NEG_INF : return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(-0.5f)))); @@ -415,20 +442,7 @@ __forceinline __m128 _mm_round_ps( __m128 value, const int flags ) return value; } -#ifdef _M_X64 -#define _mm_insert_epi64 __emu_mm_insert_epi64 -__forceinline __m128i _mm_insert_epi64( __m128i value, __int64 input, const int index ) { - assert(size_t(index) < 4); ((__int64*)&value)[index] = input; return value; -} - -#define _mm_extract_epi64 __emu_mm_extract_epi64 -__forceinline __int64 _mm_extract_epi64( __m128i input, const int index ) { - assert(size_t(index) < 2); - return index == 0 ? _mm_cvtsi128_si64x(input) : _mm_cvtsi128_si64x(_mm_unpackhi_epi64(input, input)); -} -#endif - -#endif +#endif /* !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */ #else /* __KERNEL_SSE2__ */ @@ -440,22 +454,37 @@ ccl_device_inline int bitscan(int value) { assert(value != 0); int bit = 0; - while(value >>= 1) { + while((value & (1 << bit)) == 0) { ++bit; } return bit; } +ccl_device_inline int __bsr(int value) +{ + assert(value != 0); + int bit = 0; + while(value >>= 1) { + ++bit; + } + return bit; +} #endif /* __KERNEL_SSE2__ */ +/* quiet unused define warnings */ +#if defined(__KERNEL_SSE2__) || \ + defined(__KERNEL_SSE3__) || \ + defined(__KERNEL_SSSE3__) || \ + defined(__KERNEL_SSE41__) || \ + defined(__KERNEL_AVX__) || \ + defined(__KERNEL_AVX2__) + /* do nothing */ +#endif + CCL_NAMESPACE_END -#include "util_math.h" -#include "util_sseb.h" -#include "util_ssei.h" -#include "util_ssef.h" -#include "util_avxf.h" +#endif /* __KERNEL_GPU__ */ #endif /* __UTIL_SIMD_TYPES_H__ */ diff --git a/intern/cycles/util/util_sky_model.cpp b/intern/cycles/util/util_sky_model.cpp index 5730986cc4f..6dda8469907 100644 --- a/intern/cycles/util/util_sky_model.cpp +++ b/intern/cycles/util/util_sky_model.cpp @@ -97,8 +97,8 @@ All instructions on how to use this code are in the accompanying header file. */ -#include "util_sky_model.h" -#include "util_sky_model_data.h" +#include "util/util_sky_model.h" +#include "util/util_sky_model_data.h" #include <assert.h> #include <stdio.h> diff --git a/intern/cycles/util/util_sseb.h b/intern/cycles/util/util_sseb.h index 6e669701f3b..977976c3fc0 100644 --- a/intern/cycles/util/util_sseb.h +++ b/intern/cycles/util/util_sseb.h @@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN #ifdef __KERNEL_SSE2__ +struct ssei; +struct ssef; + /*! 4-wide SSE bool type. */ struct sseb { @@ -116,7 +119,7 @@ __forceinline const sseb unpacklo( const sseb& a, const sseb& b ) { return _mm_u __forceinline const sseb unpackhi( const sseb& a, const sseb& b ) { return _mm_unpackhi_ps(a, b); } template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const sseb shuffle( const sseb& a ) { - return _mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0)); + return _mm_castsi128_ps(_mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0))); } template<> __forceinline const sseb shuffle<0, 1, 0, 1>( const sseb& a ) { diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h index 2f5295b5463..bb007ff84a9 100644 --- a/intern/cycles/util/util_ssef.h +++ b/intern/cycles/util/util_ssef.h @@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN #ifdef __KERNEL_SSE2__ +struct sseb; +struct ssef; + /*! 4-wide SSE float type. */ struct ssef { @@ -514,12 +517,12 @@ ccl_device_inline float len3(const ssef& a) /* faster version for SSSE3 */ typedef ssei shuffle_swap_t; -ccl_device_inline const shuffle_swap_t shuffle_swap_identity(void) +ccl_device_inline shuffle_swap_t shuffle_swap_identity(void) { return _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); } -ccl_device_inline const shuffle_swap_t shuffle_swap_swap(void) +ccl_device_inline shuffle_swap_t shuffle_swap_swap(void) { return _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); } @@ -534,12 +537,12 @@ ccl_device_inline const ssef shuffle_swap(const ssef& a, const shuffle_swap_t& s /* somewhat slower version for SSE2 */ typedef int shuffle_swap_t; -ccl_device_inline const shuffle_swap_t shuffle_swap_identity(void) +ccl_device_inline shuffle_swap_t shuffle_swap_identity(void) { return 0; } -ccl_device_inline const shuffle_swap_t shuffle_swap_swap(void) +ccl_device_inline shuffle_swap_t shuffle_swap_swap(void) { return 1; } diff --git a/intern/cycles/util/util_ssei.h b/intern/cycles/util/util_ssei.h index 5f62569268c..ef2a9e68b7d 100644 --- a/intern/cycles/util/util_ssei.h +++ b/intern/cycles/util/util_ssei.h @@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN #ifdef __KERNEL_SSE2__ +struct sseb; +struct ssef; + /*! 4-wide SSE integer type. */ struct ssei { @@ -234,8 +237,10 @@ __forceinline size_t select_max(const sseb& valid, const ssei& v) { const ssei a #else -__forceinline int reduce_min(const ssei& v) { return min(min(v[0],v[1]),min(v[2],v[3])); } -__forceinline int reduce_max(const ssei& v) { return max(max(v[0],v[1]),max(v[2],v[3])); } +__forceinline int ssei_min(int a, int b) { return (a < b)? a: b; } +__forceinline int ssei_max(int a, int b) { return (a > b)? a: b; } +__forceinline int reduce_min(const ssei& v) { return ssei_min(ssei_min(v[0],v[1]),ssei_min(v[2],v[3])); } +__forceinline int reduce_max(const ssei& v) { return ssei_max(ssei_max(v[0],v[1]),ssei_max(v[2],v[3])); } __forceinline int reduce_add(const ssei& v) { return v[0]+v[1]+v[2]+v[3]; } #endif diff --git a/intern/cycles/util/util_stack_allocator.h b/intern/cycles/util/util_stack_allocator.h index d7aab5b250c..79a535bd170 100644 --- a/intern/cycles/util/util_stack_allocator.h +++ b/intern/cycles/util/util_stack_allocator.h @@ -20,9 +20,6 @@ #include <cstddef> #include <memory> -#include "util_debug.h" -#include "util_types.h" - CCL_NAMESPACE_BEGIN /* Stack allocator for the use with STL. */ diff --git a/intern/cycles/util/util_static_assert.h b/intern/cycles/util/util_static_assert.h index 033d85e8ec6..e90049254de 100644 --- a/intern/cycles/util/util_static_assert.h +++ b/intern/cycles/util/util_static_assert.h @@ -43,7 +43,9 @@ template <> class StaticAssertFailure<true> {}; # endif /* __COUNTER__ */ # endif /* C++11 or MSVC2015 */ #else /* __KERNEL_GPU__ */ -# define static_assert(statement, message) +# ifndef static_assert +# define static_assert(statement, message) +# endif #endif /* __KERNEL_GPU__ */ /* TODO(sergey): For until C++11 is a bare minimum for us, diff --git a/intern/cycles/util/util_stats.h b/intern/cycles/util/util_stats.h index c21a8488c81..7667f58eb7d 100644 --- a/intern/cycles/util/util_stats.h +++ b/intern/cycles/util/util_stats.h @@ -17,7 +17,7 @@ #ifndef __UTIL_STATS_H__ #define __UTIL_STATS_H__ -#include "util_atomic.h" +#include "util/util_atomic.h" CCL_NAMESPACE_BEGIN @@ -30,7 +30,7 @@ public: void mem_alloc(size_t size) { atomic_add_and_fetch_z(&mem_used, size); - atomic_update_max_z(&mem_peak, mem_used); + atomic_fetch_and_update_max_z(&mem_peak, mem_used); } void mem_free(size_t size) { diff --git a/intern/cycles/util/util_string.cpp b/intern/cycles/util/util_string.cpp index 5594aa8edb6..94ad512982c 100644 --- a/intern/cycles/util/util_string.cpp +++ b/intern/cycles/util/util_string.cpp @@ -17,9 +17,9 @@ #include <stdarg.h> #include <stdio.h> -#include "util_foreach.h" -#include "util_string.h" -#include "util_windows.h" +#include "util/util_foreach.h" +#include "util/util_string.h" +#include "util/util_windows.h" #ifdef _WIN32 # ifndef vsnprintf @@ -148,6 +148,12 @@ void string_replace(string& haystack, const string& needle, const string& other) string string_remove_trademark(const string &s) { string result = s; + + /* Special case, so we don;t leave sequential spaces behind. */ + /* TODO(sergey): Consider using regex perhaps? */ + string_replace(result, " (TM)", ""); + string_replace(result, " (R)", ""); + string_replace(result, "(TM)", ""); string_replace(result, "(R)", ""); diff --git a/intern/cycles/util/util_string.h b/intern/cycles/util/util_string.h index 7aeed96f00b..e2c105db9c1 100644 --- a/intern/cycles/util/util_string.h +++ b/intern/cycles/util/util_string.h @@ -21,7 +21,7 @@ #include <string> #include <sstream> -#include "util_vector.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp index 87d885c44cf..5f5211228c5 100644 --- a/intern/cycles/util/util_system.cpp +++ b/intern/cycles/util/util_system.cpp @@ -14,12 +14,11 @@ * limitations under the License. */ -#include "util_system.h" +#include "util/util_system.h" -#include "util_debug.h" -#include "util_logging.h" -#include "util_types.h" -#include "util_string.h" +#include "util/util_logging.h" +#include "util/util_types.h" +#include "util/util_string.h" #ifdef _WIN32 # if(!defined(FREE_WINDOWS)) @@ -234,35 +233,34 @@ static CPUCapabilities& system_cpu_capabilities() bool system_cpu_support_sse2() { CPUCapabilities& caps = system_cpu_capabilities(); - return DebugFlags().cpu.sse2 && caps.sse && caps.sse2; + return caps.sse && caps.sse2; } bool system_cpu_support_sse3() { CPUCapabilities& caps = system_cpu_capabilities(); - return DebugFlags().cpu.sse3 && - caps.sse && caps.sse2 && caps.sse3 && caps.ssse3; + return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3; } bool system_cpu_support_sse41() { CPUCapabilities& caps = system_cpu_capabilities(); - return DebugFlags().cpu.sse41 && - caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41; + return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41; } bool system_cpu_support_avx() { CPUCapabilities& caps = system_cpu_capabilities(); - return DebugFlags().cpu.avx && - caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx; + return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && + caps.sse41 && caps.avx; } bool system_cpu_support_avx2() { CPUCapabilities& caps = system_cpu_capabilities(); - return DebugFlags().cpu.avx2 && - caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && caps.avx && caps.f16c && caps.avx2 && caps.fma3 && caps.bmi1 && caps.bmi2; + return caps.sse && caps.sse2 && caps.sse3 && caps.ssse3 && caps.sse41 && + caps.avx && caps.f16c && caps.avx2 && caps.fma3 && caps.bmi1 && + caps.bmi2; } #else @@ -292,5 +290,26 @@ bool system_cpu_support_avx2() #endif +size_t system_physical_ram() +{ +#ifdef _WIN32 + MEMORYSTATUSEX ram; + ram.dwLength = sizeof (ram); + GlobalMemoryStatusEx(&ram); + return ram.ullTotalPhys * 1024; +#elif defined(__APPLE__) + uint64_t ram = 0; + size_t len = sizeof(ram); + if (sysctlbyname("hw.memsize", &ram, &len, NULL, 0) == 0) { + return ram; + } + return 0; +#else + size_t ps = sysconf(_SC_PAGESIZE); + size_t pn = sysconf(_SC_PHYS_PAGES); + return ps * pn; +#endif +} + CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_system.h b/intern/cycles/util/util_system.h index ff61b260bed..e55dd6dd136 100644 --- a/intern/cycles/util/util_system.h +++ b/intern/cycles/util/util_system.h @@ -17,7 +17,7 @@ #ifndef __UTIL_SYSTEM_H__ #define __UTIL_SYSTEM_H__ -#include "util_string.h" +#include "util/util_string.h" CCL_NAMESPACE_BEGIN @@ -42,6 +42,8 @@ bool system_cpu_support_sse41(); bool system_cpu_support_avx(); bool system_cpu_support_avx2(); +size_t system_physical_ram(); + CCL_NAMESPACE_END #endif /* __UTIL_SYSTEM_H__ */ diff --git a/intern/cycles/util/util_task.cpp b/intern/cycles/util/util_task.cpp index 0d1fed3ebbf..9df1096de8a 100644 --- a/intern/cycles/util/util_task.cpp +++ b/intern/cycles/util/util_task.cpp @@ -14,12 +14,11 @@ * limitations under the License. */ -#include "util_debug.h" -#include "util_foreach.h" -#include "util_logging.h" -#include "util_system.h" -#include "util_task.h" -#include "util_time.h" +#include "util/util_foreach.h" +#include "util/util_logging.h" +#include "util/util_system.h" +#include "util/util_task.h" +#include "util/util_time.h" //#define THREADING_DEBUG_ENABLED @@ -206,9 +205,9 @@ void TaskScheduler::init(int num_threads) threads.resize(num_threads); const int num_groups = system_cpu_group_count(); - unsigned short num_process_groups; + unsigned short num_process_groups = 0; vector<unsigned short> process_groups; - int current_group_threads; + int current_group_threads = 0; if(num_groups > 1) { process_groups.resize(num_groups); num_process_groups = system_cpu_process_groups(num_groups, diff --git a/intern/cycles/util/util_task.h b/intern/cycles/util/util_task.h index 0b82f14f66f..3ebfb007e40 100644 --- a/intern/cycles/util/util_task.h +++ b/intern/cycles/util/util_task.h @@ -17,10 +17,10 @@ #ifndef __UTIL_TASK_H__ #define __UTIL_TASK_H__ -#include "util_list.h" -#include "util_string.h" -#include "util_thread.h" -#include "util_vector.h" +#include "util/util_list.h" +#include "util/util_string.h" +#include "util/util_thread.h" +#include "util/util_vector.h" CCL_NAMESPACE_BEGIN diff --git a/intern/cycles/util/util_texture.h b/intern/cycles/util/util_texture.h index aff928ea2ee..4b5f630427d 100644 --- a/intern/cycles/util/util_texture.h +++ b/intern/cycles/util/util_texture.h @@ -20,63 +20,7 @@ CCL_NAMESPACE_BEGIN /* Texture limits on devices. */ - -/* CPU */ -#define TEX_NUM_FLOAT4_CPU 1024 -#define TEX_NUM_BYTE4_CPU 1024 -#define TEX_NUM_HALF4_CPU 1024 -#define TEX_NUM_FLOAT_CPU 1024 -#define TEX_NUM_BYTE_CPU 1024 -#define TEX_NUM_HALF_CPU 1024 -#define TEX_START_FLOAT4_CPU 0 -#define TEX_START_BYTE4_CPU TEX_NUM_FLOAT4_CPU -#define TEX_START_HALF4_CPU (TEX_NUM_FLOAT4_CPU + TEX_NUM_BYTE4_CPU) -#define TEX_START_FLOAT_CPU (TEX_NUM_FLOAT4_CPU + TEX_NUM_BYTE4_CPU + TEX_NUM_HALF4_CPU) -#define TEX_START_BYTE_CPU (TEX_NUM_FLOAT4_CPU + TEX_NUM_BYTE4_CPU + TEX_NUM_HALF4_CPU + TEX_NUM_FLOAT_CPU) -#define TEX_START_HALF_CPU (TEX_NUM_FLOAT4_CPU + TEX_NUM_BYTE4_CPU + TEX_NUM_HALF4_CPU + TEX_NUM_FLOAT_CPU + TEX_NUM_BYTE_CPU) - -/* CUDA (Geforce 4xx and 5xx) */ -#define TEX_NUM_FLOAT4_CUDA 5 -#define TEX_NUM_BYTE4_CUDA 85 -#define TEX_NUM_HALF4_CUDA 0 -#define TEX_NUM_FLOAT_CUDA 0 -#define TEX_NUM_BYTE_CUDA 0 -#define TEX_NUM_HALF_CUDA 0 -#define TEX_START_FLOAT4_CUDA 0 -#define TEX_START_BYTE4_CUDA TEX_NUM_FLOAT4_CUDA -#define TEX_START_HALF4_CUDA (TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA) -#define TEX_START_FLOAT_CUDA (TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA + TEX_NUM_HALF4_CUDA) -#define TEX_START_BYTE_CUDA (TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA + TEX_NUM_HALF4_CUDA + TEX_NUM_FLOAT_CUDA) -#define TEX_START_HALF_CUDA (TEX_NUM_FLOAT4_CUDA + TEX_NUM_BYTE4_CUDA + TEX_NUM_HALF4_CUDA + TEX_NUM_FLOAT_CUDA + TEX_NUM_BYTE_CUDA) - -/* CUDA (Kepler, Geforce 6xx and above) */ -#define TEX_NUM_FLOAT4_CUDA_KEPLER 1024 -#define TEX_NUM_BYTE4_CUDA_KEPLER 1024 -#define TEX_NUM_HALF4_CUDA_KEPLER 1024 -#define TEX_NUM_FLOAT_CUDA_KEPLER 1024 -#define TEX_NUM_BYTE_CUDA_KEPLER 1024 -#define TEX_NUM_HALF_CUDA_KEPLER 1024 -#define TEX_START_FLOAT4_CUDA_KEPLER 0 -#define TEX_START_BYTE4_CUDA_KEPLER TEX_NUM_FLOAT4_CUDA_KEPLER -#define TEX_START_HALF4_CUDA_KEPLER (TEX_NUM_FLOAT4_CUDA_KEPLER + TEX_NUM_BYTE4_CUDA_KEPLER) -#define TEX_START_FLOAT_CUDA_KEPLER (TEX_NUM_FLOAT4_CUDA_KEPLER + TEX_NUM_BYTE4_CUDA_KEPLER + TEX_NUM_HALF4_CUDA_KEPLER) -#define TEX_START_BYTE_CUDA_KEPLER (TEX_NUM_FLOAT4_CUDA_KEPLER + TEX_NUM_BYTE4_CUDA_KEPLER + TEX_NUM_HALF4_CUDA_KEPLER + TEX_NUM_FLOAT_CUDA_KEPLER) -#define TEX_START_HALF_CUDA_KEPLER (TEX_NUM_FLOAT4_CUDA_KEPLER + TEX_NUM_BYTE4_CUDA_KEPLER + TEX_NUM_HALF4_CUDA_KEPLER + TEX_NUM_FLOAT_CUDA_KEPLER + TEX_NUM_BYTE_CUDA_KEPLER) - -/* OpenCL */ -#define TEX_NUM_FLOAT4_OPENCL 1024 -#define TEX_NUM_BYTE4_OPENCL 1024 -#define TEX_NUM_HALF4_OPENCL 0 -#define TEX_NUM_FLOAT_OPENCL 1024 -#define TEX_NUM_BYTE_OPENCL 1024 -#define TEX_NUM_HALF_OPENCL 0 -#define TEX_START_FLOAT4_OPENCL 0 -#define TEX_START_BYTE4_OPENCL TEX_NUM_FLOAT4_OPENCL -#define TEX_START_HALF4_OPENCL (TEX_NUM_FLOAT4_OPENCL + TEX_NUM_BYTE4_OPENCL) -#define TEX_START_FLOAT_OPENCL (TEX_NUM_FLOAT4_OPENCL + TEX_NUM_BYTE4_OPENCL + TEX_NUM_HALF4_OPENCL) -#define TEX_START_BYTE_OPENCL (TEX_NUM_FLOAT4_OPENCL + TEX_NUM_BYTE4_OPENCL + TEX_NUM_HALF4_OPENCL + TEX_NUM_FLOAT_OPENCL) -#define TEX_START_HALF_OPENCL (TEX_NUM_FLOAT4_OPENCL + TEX_NUM_BYTE4_OPENCL + TEX_NUM_HALF4_OPENCL + TEX_NUM_FLOAT_OPENCL + TEX_NUM_BYTE_OPENCL) - +#define TEX_NUM_MAX (INT_MAX >> 4) /* Color to use when textures are not found. */ #define TEX_IMAGE_MISSING_R 1 @@ -84,6 +28,63 @@ CCL_NAMESPACE_BEGIN #define TEX_IMAGE_MISSING_B 1 #define TEX_IMAGE_MISSING_A 1 +/* Texture type. */ +#define kernel_tex_type(tex) (tex & IMAGE_DATA_TYPE_MASK) + +/* Interpolation types for textures + * cuda also use texture space to store other objects */ +typedef enum InterpolationType { + INTERPOLATION_NONE = -1, + INTERPOLATION_LINEAR = 0, + INTERPOLATION_CLOSEST = 1, + INTERPOLATION_CUBIC = 2, + INTERPOLATION_SMART = 3, + + INTERPOLATION_NUM_TYPES, +} InterpolationType; + +/* Texture types + * Since we store the type in the lower bits of a flat index, + * the shift and bit mask constant below need to be kept in sync. */ +typedef enum ImageDataType { + IMAGE_DATA_TYPE_FLOAT4 = 0, + IMAGE_DATA_TYPE_BYTE4 = 1, + IMAGE_DATA_TYPE_HALF4 = 2, + IMAGE_DATA_TYPE_FLOAT = 3, + IMAGE_DATA_TYPE_BYTE = 4, + IMAGE_DATA_TYPE_HALF = 5, + + IMAGE_DATA_NUM_TYPES +} ImageDataType; + +#define IMAGE_DATA_TYPE_SHIFT 3 +#define IMAGE_DATA_TYPE_MASK 0x7 + +/* Extension types for textures. + * + * Defines how the image is extrapolated past its original bounds. */ +typedef enum ExtensionType { + /* Cause the image to repeat horizontally and vertically. */ + EXTENSION_REPEAT = 0, + /* Extend by repeating edge pixels of the image. */ + EXTENSION_EXTEND = 1, + /* Clip to image size and set exterior pixels as transparent. */ + EXTENSION_CLIP = 2, + + EXTENSION_NUM_TYPES, +} ExtensionType; + +typedef struct TextureInfo { + /* Pointer, offset or texture depending on device. */ + uint64_t data; + /* Buffer number for OpenCL. */ + uint cl_buffer; + /* Interpolation and extension type. */ + uint interpolation, extension; + /* Dimensions. */ + uint width, height, depth; +} TextureInfo; + CCL_NAMESPACE_END #endif /* __UTIL_TEXTURE_H__ */ diff --git a/intern/cycles/util/util_thread.cpp b/intern/cycles/util/util_thread.cpp index 3db8b4bd197..c66aa484264 100644 --- a/intern/cycles/util/util_thread.cpp +++ b/intern/cycles/util/util_thread.cpp @@ -14,10 +14,10 @@ * limitations under the License. */ -#include "util_thread.h" +#include "util/util_thread.h" -#include "util_system.h" -#include "util_windows.h" +#include "util/util_system.h" +#include "util/util_windows.h" CCL_NAMESPACE_BEGIN @@ -26,7 +26,11 @@ thread::thread(function<void(void)> run_cb, int group) joined_(false), group_(group) { +#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) + thread_ = std::thread(&thread::run, this); +#else pthread_create(&pthread_id_, NULL, run, (void*)this); +#endif } thread::~thread() @@ -60,7 +64,17 @@ void *thread::run(void *arg) bool thread::join() { joined_ = true; +#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) + try { + thread_.join(); + return true; + } + catch (const std::system_error&) { + return false; + } +#else return pthread_join(pthread_id_, NULL) == 0; +#endif } CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_thread.h b/intern/cycles/util/util_thread.h index 427c633d2ce..4d8f464359c 100644 --- a/intern/cycles/util/util_thread.h +++ b/intern/cycles/util/util_thread.h @@ -24,15 +24,21 @@ # include <functional> #else # include <boost/thread.hpp> +# include <pthread.h> #endif -#include <pthread.h> #include <queue> +#ifdef _WIN32 +# include "util_windows.h" +#else +# include <pthread.h> +#endif + #ifdef __APPLE__ # include <libkern/OSAtomic.h> #endif -#include "util_function.h" +#include "util/util_function.h" CCL_NAMESPACE_BEGIN @@ -60,7 +66,11 @@ public: protected: function<void(void)> run_cb_; +#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) + std::thread thread_; +#else pthread_t pthread_id_; +#endif bool joined_; int group_; }; @@ -81,7 +91,24 @@ public: inline void unlock() { OSSpinLockUnlock(&spin_); } -#else /* __APPLE__ */ +#elif defined(_WIN32) + inline thread_spin_lock() { + const DWORD SPIN_COUNT = 50000; + InitializeCriticalSectionAndSpinCount(&cs_, SPIN_COUNT); + } + + inline ~thread_spin_lock() { + DeleteCriticalSection(&cs_); + } + + inline void lock() { + EnterCriticalSection(&cs_); + } + + inline void unlock() { + LeaveCriticalSection(&cs_); + } +#else inline thread_spin_lock() { pthread_spin_init(&spin_, 0); } @@ -97,15 +124,34 @@ public: inline void unlock() { pthread_spin_unlock(&spin_); } -#endif /* __APPLE__ */ +#endif protected: #ifdef __APPLE__ OSSpinLock spin_; +#elif defined(_WIN32) + CRITICAL_SECTION cs_; #else pthread_spinlock_t spin_; #endif }; +class thread_scoped_spin_lock { +public: + explicit thread_scoped_spin_lock(thread_spin_lock& lock) + : lock_(lock) { + lock_.lock(); + } + + ~thread_scoped_spin_lock() { + lock_.unlock(); + } + + /* TODO(sergey): Implement manual control over lock/unlock. */ + +protected: + thread_spin_lock& lock_; +}; + CCL_NAMESPACE_END #endif /* __UTIL_THREAD_H__ */ diff --git a/intern/cycles/util/util_time.cpp b/intern/cycles/util/util_time.cpp index 59c963cfafb..7c39aa294bf 100644 --- a/intern/cycles/util/util_time.cpp +++ b/intern/cycles/util/util_time.cpp @@ -16,8 +16,8 @@ #include <stdlib.h> -#include "util_time.h" -#include "util_windows.h" +#include "util/util_time.h" +#include "util/util_windows.h" #ifdef _WIN32 diff --git a/intern/cycles/util/util_time.h b/intern/cycles/util/util_time.h index 65798244111..f03aa590e9b 100644 --- a/intern/cycles/util/util_time.h +++ b/intern/cycles/util/util_time.h @@ -37,7 +37,7 @@ public: ~scoped_timer() { if(value_ != NULL) { - *value_ = time_dt() - time_start_; + *value_ = get_time(); } } @@ -46,6 +46,11 @@ public: return time_start_; } + double get_time() const + { + return time_dt() - time_start_; + } + protected: double *value_; double time_start_; diff --git a/intern/cycles/util/util_transform.cpp b/intern/cycles/util/util_transform.cpp index 2f10540c94e..206c3da23eb 100644 --- a/intern/cycles/util/util_transform.cpp +++ b/intern/cycles/util/util_transform.cpp @@ -46,10 +46,11 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "util_transform.h" +#include "util/util_projection.h" +#include "util/util_transform.h" -#include "util_boundbox.h" -#include "util_math.h" +#include "util/util_boundbox.h" +#include "util/util_math.h" CCL_NAMESPACE_BEGIN @@ -129,9 +130,9 @@ static bool transform_matrix4_gj_inverse(float R[][4], float M[][4]) return true; } -Transform transform_inverse(const Transform& tfm) +ProjectionTransform projection_inverse(const ProjectionTransform& tfm) { - Transform tfmR = transform_identity(); + ProjectionTransform tfmR = projection_identity(); float M[4][4], R[4][4]; memcpy(R, &tfmR, sizeof(R)); @@ -145,7 +146,7 @@ Transform transform_inverse(const Transform& tfm) M[2][2] += 1e-8f; if(UNLIKELY(!transform_matrix4_gj_inverse(R, M))) { - return transform_identity(); + return projection_identity(); } } @@ -154,6 +155,19 @@ Transform transform_inverse(const Transform& tfm) return tfmR; } +Transform transform_inverse(const Transform& tfm) +{ + ProjectionTransform projection(tfm); + return projection_to_transform(projection_inverse(projection)); +} + +Transform transform_transposed_inverse(const Transform& tfm) +{ + ProjectionTransform projection(tfm); + ProjectionTransform iprojection = projection_inverse(projection); + return projection_to_transform(projection_transpose(iprojection)); +} + /* Motion Transform */ float4 transform_to_quat(const Transform& tfm) @@ -202,14 +216,14 @@ float4 transform_to_quat(const Transform& tfm) return qt; } -static void transform_decompose(Transform *decomp, const Transform *tfm) +static void transform_decompose(DecomposedTransform *decomp, const Transform *tfm) { /* extract translation */ decomp->y = make_float4(tfm->x.w, tfm->y.w, tfm->z.w, 0.0f); /* extract rotation */ Transform M = *tfm; - M.x.w = 0.0f; M.y.w = 0.0f; M.z.w = 0.0f; M.w.w = 1.0f; + M.x.w = 0.0f; M.y.w = 0.0f; M.z.w = 0.0f; Transform R = M; float norm; @@ -217,9 +231,9 @@ static void transform_decompose(Transform *decomp, const Transform *tfm) do { Transform Rnext; - Transform Rit = transform_inverse(transform_transpose(R)); + Transform Rit = transform_transposed_inverse(R); - for(int i = 0; i < 4; i++) + for(int i = 0; i < 3; i++) for(int j = 0; j < 4; j++) Rnext[i][j] = 0.5f * (R[i][j] + Rit[i][j]); @@ -247,30 +261,18 @@ static void transform_decompose(Transform *decomp, const Transform *tfm) decomp->w = make_float4(scale.y.z, scale.z.x, scale.z.y, scale.z.z); } -void transform_motion_decompose(DecompMotionTransform *decomp, const MotionTransform *motion, const Transform *mid) +void transform_motion_decompose(DecomposedTransform *decomp, const Transform *motion, size_t size) { - Transform pre, post; - - transform_decompose(&pre, &motion->pre); - transform_decompose(&decomp->mid, mid); - transform_decompose(&post, &motion->post); - - /* ensure rotation around shortest angle, negated quaternions are the same - * but this means we don't have to do the check in quat_interpolate */ - if(dot(decomp->mid.x, post.x) < 0.0f) - decomp->mid.x = -decomp->mid.x; - if(dot(pre.x, decomp->mid.x) < 0.0f) - pre.x = -pre.x; - - /* drop scale of pre/post */ - pre.y.w = decomp->mid.y.w; - post.y.w = decomp->mid.y.w; - - /* store translation/rotation part of pre/post */ - decomp->pre_x = pre.x; - decomp->pre_y = pre.y; - decomp->post_x = post.x; - decomp->post_y = post.y; + for(size_t i = 0; i < size; i++) { + transform_decompose(decomp + i, motion + i); + + if(i > 0) { + /* Ensure rotation around shortest angle, negated quaternions are the same + * but this means we don't have to do the check in quat_interpolate */ + if(dot(decomp[i-1].x, decomp[i].x) < 0.0f) + decomp[i-1].x = -decomp[i-1].x; + } + } } Transform transform_from_viewplane(BoundBox2D& viewplane) diff --git a/intern/cycles/util/util_transform.h b/intern/cycles/util/util_transform.h index a0695f20488..987f4dac777 100644 --- a/intern/cycles/util/util_transform.h +++ b/intern/cycles/util/util_transform.h @@ -21,15 +21,15 @@ #include <string.h> #endif -#include "util_math.h" -#include "util_types.h" +#include "util/util_math.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN -/* Data Types */ +/* Affine transformation, stored as 4x3 matrix. */ typedef struct Transform { - float4 x, y, z, w; /* rows */ + float4 x, y, z; #ifndef __KERNEL_GPU__ float4 operator[](int i) const { return *(&x + i); } @@ -37,40 +37,16 @@ typedef struct Transform { #endif } Transform; -/* transform decomposed in rotation/translation/scale. we use the same data +/* Transform decomposed in rotation/translation/scale. we use the same data * structure as Transform, and tightly pack decomposition into it. first the - * rotation (4), then translation (3), then 3x3 scale matrix (9). - * - * For the DecompMotionTransform we drop scale from pre/post. */ - -typedef struct ccl_may_alias MotionTransform { - Transform pre; - Transform mid; - Transform post; -} MotionTransform; - -typedef struct DecompMotionTransform { - Transform mid; - float4 pre_x, pre_y; - float4 post_x, post_y; -} DecompMotionTransform; + * rotation (4), then translation (3), then 3x3 scale matrix (9). */ -typedef struct PerspectiveMotionTransform { - Transform pre; - Transform post; -} PerspectiveMotionTransform; +typedef struct DecomposedTransform { + float4 x, y, z, w; +} DecomposedTransform; /* Functions */ -ccl_device_inline float3 transform_perspective(const Transform *t, const float3 a) -{ - float4 b = make_float4(a.x, a.y, a.z, 1.0f); - float3 c = make_float3(dot(t->x, b), dot(t->y, b), dot(t->z, b)); - float w = dot(t->w, b); - - return (w != 0.0f)? c/w: make_float3(0.0f, 0.0f, 0.0f); -} - ccl_device_inline float3 transform_point(const Transform *t, const float3 a) { /* TODO(sergey): Disabled for now, causes crashes in certain cases. */ @@ -81,7 +57,7 @@ ccl_device_inline float3 transform_point(const Transform *t, const float3 a) x = _mm_loadu_ps(&t->x.x); y = _mm_loadu_ps(&t->y.x); z = _mm_loadu_ps(&t->z.x); - w = _mm_loadu_ps(&t->w.x); + w = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f); _MM_TRANSPOSE4_PS(x, y, z, w); @@ -137,29 +113,15 @@ ccl_device_inline float3 transform_direction_transposed(const Transform *t, cons return make_float3(dot(x, a), dot(y, a), dot(z, a)); } -ccl_device_inline Transform transform_transpose(const Transform a) -{ - Transform t; - - t.x.x = a.x.x; t.x.y = a.y.x; t.x.z = a.z.x; t.x.w = a.w.x; - t.y.x = a.x.y; t.y.y = a.y.y; t.y.z = a.z.y; t.y.w = a.w.y; - t.z.x = a.x.z; t.z.y = a.y.z; t.z.z = a.z.z; t.z.w = a.w.z; - t.w.x = a.x.w; t.w.y = a.y.w; t.w.z = a.z.w; t.w.w = a.w.w; - - return t; -} - ccl_device_inline Transform make_transform(float a, float b, float c, float d, float e, float f, float g, float h, - float i, float j, float k, float l, - float m, float n, float o, float p) + float i, float j, float k, float l) { Transform t; t.x.x = a; t.x.y = b; t.x.z = c; t.x.w = d; t.y.x = e; t.y.y = f; t.y.z = g; t.y.w = h; t.z.x = i; t.z.y = j; t.z.z = k; t.z.w = l; - t.w.x = m; t.w.y = n; t.w.z = o; t.w.w = p; return t; } @@ -173,21 +135,22 @@ ccl_device_inline Transform make_transform_frame(float3 N) const float3 dy = normalize(cross(N, dx)); return make_transform(dx.x, dx.y, dx.z, 0.0f, dy.x, dy.y, dy.z, 0.0f, - N.x , N.y, N.z, 0.0f, - 0.0f, 0.0f, 0.0f, 1.0f); + N.x , N.y, N.z, 0.0f); } #ifndef __KERNEL_GPU__ ccl_device_inline Transform operator*(const Transform a, const Transform b) { - Transform c = transform_transpose(b); - Transform t; + float4 c_x = make_float4(b.x.x, b.y.x, b.z.x, 0.0f); + float4 c_y = make_float4(b.x.y, b.y.y, b.z.y, 0.0f); + float4 c_z = make_float4(b.x.z, b.y.z, b.z.z, 0.0f); + float4 c_w = make_float4(b.x.w, b.y.w, b.z.w, 1.0f); - t.x = make_float4(dot(a.x, c.x), dot(a.x, c.y), dot(a.x, c.z), dot(a.x, c.w)); - t.y = make_float4(dot(a.y, c.x), dot(a.y, c.y), dot(a.y, c.z), dot(a.y, c.w)); - t.z = make_float4(dot(a.z, c.x), dot(a.z, c.y), dot(a.z, c.z), dot(a.z, c.w)); - t.w = make_float4(dot(a.w, c.x), dot(a.w, c.y), dot(a.w, c.z), dot(a.w, c.w)); + Transform t; + t.x = make_float4(dot(a.x, c_x), dot(a.x, c_y), dot(a.x, c_z), dot(a.x, c_w)); + t.y = make_float4(dot(a.y, c_x), dot(a.y, c_y), dot(a.y, c_z), dot(a.y, c_w)); + t.z = make_float4(dot(a.z, c_x), dot(a.z, c_y), dot(a.z, c_z), dot(a.z, c_w)); return t; } @@ -197,7 +160,6 @@ ccl_device_inline void print_transform(const char *label, const Transform& t) print_float4(label, t.x); print_float4(label, t.y); print_float4(label, t.z); - print_float4(label, t.w); printf("\n"); } @@ -206,8 +168,7 @@ ccl_device_inline Transform transform_translate(float3 t) return make_transform( 1, 0, 0, t.x, 0, 1, 0, t.y, - 0, 0, 1, t.z, - 0, 0, 0, 1); + 0, 0, 1, t.z); } ccl_device_inline Transform transform_translate(float x, float y, float z) @@ -220,8 +181,7 @@ ccl_device_inline Transform transform_scale(float3 s) return make_transform( s.x, 0, 0, 0, 0, s.y, 0, 0, - 0, 0, s.z, 0, - 0, 0, 0, 1); + 0, 0, s.z, 0); } ccl_device_inline Transform transform_scale(float x, float y, float z) @@ -229,21 +189,6 @@ ccl_device_inline Transform transform_scale(float x, float y, float z) return transform_scale(make_float3(x, y, z)); } -ccl_device_inline Transform transform_perspective(float fov, float n, float f) -{ - Transform persp = make_transform( - 1, 0, 0, 0, - 0, 1, 0, 0, - 0, 0, f / (f - n), -f*n / (f - n), - 0, 0, 1, 0); - - float inv_angle = 1.0f/tanf(0.5f*fov); - - Transform scale = transform_scale(inv_angle, inv_angle, 1); - - return scale * persp; -} - ccl_device_inline Transform transform_rotate(float angle, float3 axis) { float s = sinf(angle); @@ -266,9 +211,7 @@ ccl_device_inline Transform transform_rotate(float angle, float3 axis) axis.z*axis.x*t - s*axis.y, axis.z*axis.y*t + s*axis.x, axis.z*axis.z*t + c, - 0.0f, - - 0.0f, 0.0f, 0.0f, 1.0f); + 0.0f); } /* Euler is assumed to be in XYZ order. */ @@ -280,12 +223,6 @@ ccl_device_inline Transform transform_euler(float3 euler) transform_rotate(euler.x, make_float3(1.0f, 0.0f, 0.0f)); } -ccl_device_inline Transform transform_orthographic(float znear, float zfar) -{ - return transform_scale(1.0f, 1.0f, 1.0f / (zfar-znear)) * - transform_translate(0.0f, 0.0f, -znear); -} - ccl_device_inline Transform transform_identity() { return transform_scale(1.0f, 1.0f, 1.0f); @@ -314,20 +251,20 @@ ccl_device_inline void transform_set_column(Transform *t, int column, float3 val } Transform transform_inverse(const Transform& a); +Transform transform_transposed_inverse(const Transform& a); ccl_device_inline bool transform_uniform_scale(const Transform& tfm, float& scale) { /* the epsilon here is quite arbitrary, but this function is only used for - * surface area and bump, where we except it to not be so sensitive */ - Transform ttfm = transform_transpose(tfm); + * surface area and bump, where we expect it to not be so sensitive */ float eps = 1e-6f; float sx = len_squared(float4_to_float3(tfm.x)); float sy = len_squared(float4_to_float3(tfm.y)); float sz = len_squared(float4_to_float3(tfm.z)); - float stx = len_squared(float4_to_float3(ttfm.x)); - float sty = len_squared(float4_to_float3(ttfm.y)); - float stz = len_squared(float4_to_float3(ttfm.z)); + float stx = len_squared(transform_get_column(&tfm, 0)); + float sty = len_squared(transform_get_column(&tfm, 1)); + float stz = len_squared(transform_get_column(&tfm, 2)); if(fabsf(sx - sy) < eps && fabsf(sx - sz) < eps && fabsf(sx - stx) < eps && fabsf(sx - sty) < eps && @@ -365,7 +302,6 @@ ccl_device_inline Transform transform_empty() return make_transform( 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0); } @@ -422,12 +358,11 @@ ccl_device_inline Transform transform_quick_inverse(Transform M) R.x = make_float4(Rx.x, Rx.y, Rx.z, dot(Rx, T)); R.y = make_float4(Ry.x, Ry.y, Ry.z, dot(Ry, T)); R.z = make_float4(Rz.x, Rz.y, Rz.z, dot(Rz, T)); - R.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f); return R; } -ccl_device_inline void transform_compose(Transform *tfm, const Transform *decomp) +ccl_device_inline void transform_compose(Transform *tfm, const DecomposedTransform *decomp) { /* rotation */ float q0, q1, q2, q3, qda, qdb, qdc, qaa, qab, qac, qbb, qbc, qcc; @@ -460,59 +395,30 @@ ccl_device_inline void transform_compose(Transform *tfm, const Transform *decomp tfm->x = make_float4(dot(rotation_x, scale_x), dot(rotation_x, scale_y), dot(rotation_x, scale_z), decomp->y.x); tfm->y = make_float4(dot(rotation_y, scale_x), dot(rotation_y, scale_y), dot(rotation_y, scale_z), decomp->y.y); tfm->z = make_float4(dot(rotation_z, scale_x), dot(rotation_z, scale_y), dot(rotation_z, scale_z), decomp->y.z); - tfm->w = make_float4(0.0f, 0.0f, 0.0f, 1.0f); } -/* Disabled for now, need arc-length parametrization for constant speed motion. - * #define CURVED_MOTION_INTERPOLATE */ - -ccl_device void transform_motion_interpolate(Transform *tfm, const DecompMotionTransform *motion, float t) +/* Interpolate from array of decomposed transforms. */ +ccl_device void transform_motion_array_interpolate(Transform *tfm, + const ccl_global DecomposedTransform *motion, + uint numsteps, + float time) { - /* possible optimization: is it worth it adding a check to skip scaling? - * it's probably quite uncommon to have scaling objects. or can we skip - * just shearing perhaps? */ - Transform decomp; - -#ifdef CURVED_MOTION_INTERPOLATE - /* 3 point bezier curve interpolation for position */ - float3 Ppre = float4_to_float3(motion->pre_y); - float3 Pmid = float4_to_float3(motion->mid.y); - float3 Ppost = float4_to_float3(motion->post_y); - - float3 Pcontrol = 2.0f*Pmid - 0.5f*(Ppre + Ppost); - float3 P = Ppre*t*t + Pcontrol*2.0f*t*(1.0f - t) + Ppost*(1.0f - t)*(1.0f - t); - - decomp.y.x = P.x; - decomp.y.y = P.y; - decomp.y.z = P.z; -#endif - - /* linear interpolation for rotation and scale */ - if(t < 0.5f) { - t *= 2.0f; - - decomp.x = quat_interpolate(motion->pre_x, motion->mid.x, t); -#ifdef CURVED_MOTION_INTERPOLATE - decomp.y.w = (1.0f - t)*motion->pre_y.w + t*motion->mid.y.w; -#else - decomp.y = (1.0f - t)*motion->pre_y + t*motion->mid.y; -#endif - } - else { - t = (t - 0.5f)*2.0f; - - decomp.x = quat_interpolate(motion->mid.x, motion->post_x, t); -#ifdef CURVED_MOTION_INTERPOLATE - decomp.y.w = (1.0f - t)*motion->mid.y.w + t*motion->post_y.w; -#else - decomp.y = (1.0f - t)*motion->mid.y + t*motion->post_y; -#endif - } - - decomp.z = motion->mid.z; - decomp.w = motion->mid.w; - - /* compose rotation, translation, scale into matrix */ + /* Figure out which steps we need to interpolate. */ + int maxstep = numsteps-1; + int step = min((int)(time*maxstep), maxstep-1); + float t = time*maxstep - step; + + const ccl_global DecomposedTransform *a = motion + step; + const ccl_global DecomposedTransform *b = motion + step + 1; + + /* Interpolate rotation, translation and scale. */ + DecomposedTransform decomp; + decomp.x = quat_interpolate(a->x, b->x, t); + decomp.y = (1.0f - t)*a->y + t*b->y; + decomp.z = (1.0f - t)*a->z + t*b->z; + decomp.w = (1.0f - t)*a->w + t*b->w; + + /* Compose rotation, translation, scale into matrix. */ transform_compose(tfm, &decomp); } @@ -520,13 +426,13 @@ ccl_device void transform_motion_interpolate(Transform *tfm, const DecompMotionT class BoundBox2D; -ccl_device_inline bool operator==(const MotionTransform& A, const MotionTransform& B) +ccl_device_inline bool operator==(const DecomposedTransform& A, const DecomposedTransform& B) { - return (A.pre == B.pre && A.post == B.post); + return memcmp(&A, &B, sizeof(DecomposedTransform)) == 0; } float4 transform_to_quat(const Transform& tfm); -void transform_motion_decompose(DecompMotionTransform *decomp, const MotionTransform *motion, const Transform *mid); +void transform_motion_decompose(DecomposedTransform *decomp, const Transform *motion, size_t size); Transform transform_from_viewplane(BoundBox2D& viewplane); #endif diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index a000fae4bd6..84206a7ba5a 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -18,72 +18,20 @@ #define __UTIL_TYPES_H__ #ifndef __KERNEL_OPENCL__ - -#include <stdlib.h> - -#endif - -/* Bitness */ - -#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) -#define __KERNEL_64_BIT__ -#endif - -/* Qualifiers for kernel code shared by CPU and GPU */ - -#ifndef __KERNEL_GPU__ - -#define ccl_device static inline -#define ccl_device_noinline static -#define ccl_global -#define ccl_constant -#define ccl_restrict __restrict -#define __KERNEL_WITH_SSE_ALIGN__ - -#if defined(_WIN32) && !defined(FREE_WINDOWS) -#define ccl_device_inline static __forceinline -#define ccl_device_forceinline static __forceinline -#define ccl_align(...) __declspec(align(__VA_ARGS__)) -#ifdef __KERNEL_64_BIT__ -#define ccl_try_align(...) __declspec(align(__VA_ARGS__)) -#else -#undef __KERNEL_WITH_SSE_ALIGN__ -#define ccl_try_align(...) /* not support for function arguments (error C2719) */ -#endif -#define ccl_may_alias -#define ccl_always_inline __forceinline -#define ccl_maybe_unused - -#else - -#define ccl_device_inline static inline __attribute__((always_inline)) -#define ccl_device_forceinline static inline __attribute__((always_inline)) -#define ccl_align(...) __attribute__((aligned(__VA_ARGS__))) -#ifndef FREE_WINDOWS64 -#define __forceinline inline __attribute__((always_inline)) -#endif -#define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__))) -#define ccl_may_alias __attribute__((__may_alias__)) -#define ccl_always_inline __attribute__((always_inline)) -#define ccl_maybe_unused __attribute__((used)) - -#endif - +# include <stdlib.h> #endif /* Standard Integer Types */ -#ifndef __KERNEL_GPU__ - -/* int8_t, uint16_t, and friends */ -#ifndef _WIN32 -#include <stdint.h> +#if !defined(__KERNEL_GPU__) && !defined(_WIN32) +# include <stdint.h> #endif -/* SIMD Types */ - -#include "util_optimization.h" +#include "util/util_defines.h" +#ifndef __KERNEL_GPU__ +# include "util/util_optimization.h" +# include "util/util_simd.h" #endif CCL_NAMESPACE_BEGIN @@ -97,18 +45,19 @@ CCL_NAMESPACE_BEGIN /* Shorter Unsigned Names */ #ifndef __KERNEL_OPENCL__ - typedef unsigned char uchar; typedef unsigned int uint; - +typedef unsigned short ushort; #endif -#ifndef __KERNEL_GPU__ - /* Fixed Bits Types */ -#ifdef _WIN32 +#ifdef __KERNEL_OPENCL__ +typedef ulong uint64_t; +#endif +#ifndef __KERNEL_GPU__ +# ifdef _WIN32 typedef signed char int8_t; typedef unsigned char uint8_t; @@ -120,440 +69,85 @@ typedef unsigned int uint32_t; typedef long long int64_t; typedef unsigned long long uint64_t; - -#ifdef __KERNEL_64_BIT__ +# ifdef __KERNEL_64_BIT__ typedef int64_t ssize_t; -#else +# else typedef int32_t ssize_t; -#endif - -#endif +# endif +# endif /* _WIN32 */ /* Generic Memory Pointer */ typedef uint64_t device_ptr; +#endif /* __KERNEL_GPU__ */ -/* Vector Types */ - -struct uchar2 { - uchar x, y; - - __forceinline uchar operator[](int i) const { return *(&x + i); } - __forceinline uchar& operator[](int i) { return *(&x + i); } -}; - -struct uchar3 { - uchar x, y, z; - - __forceinline uchar operator[](int i) const { return *(&x + i); } - __forceinline uchar& operator[](int i) { return *(&x + i); } -}; - -struct uchar4 { - uchar x, y, z, w; - - __forceinline uchar operator[](int i) const { return *(&x + i); } - __forceinline uchar& operator[](int i) { return *(&x + i); } -}; - -struct int2 { - int x, y; - - __forceinline int operator[](int i) const { return *(&x + i); } - __forceinline int& operator[](int i) { return *(&x + i); } -}; - -struct ccl_try_align(16) int3 { -#ifdef __KERNEL_SSE__ - union { - __m128i m128; - struct { int x, y, z, w; }; - }; - - __forceinline int3() {} - __forceinline int3(const __m128i a) : m128(a) {} - __forceinline operator const __m128i&(void) const { return m128; } - __forceinline operator __m128i&(void) { return m128; } - - int3(const int3& a) { m128 = a.m128; } - int3& operator =(const int3& a) { m128 = a.m128; return *this; } -#else - int x, y, z, w; -#endif - - __forceinline int operator[](int i) const { return *(&x + i); } - __forceinline int& operator[](int i) { return *(&x + i); } -}; - -struct ccl_try_align(16) int4 { -#ifdef __KERNEL_SSE__ - union { - __m128i m128; - struct { int x, y, z, w; }; - }; - - __forceinline int4() {} - __forceinline int4(const __m128i a) : m128(a) {} - __forceinline operator const __m128i&(void) const { return m128; } - __forceinline operator __m128i&(void) { return m128; } - - int4(const int4& a) : m128(a.m128) {} - int4& operator=(const int4& a) { m128 = a.m128; return *this; } -#else - int x, y, z, w; -#endif - - __forceinline int operator[](int i) const { return *(&x + i); } - __forceinline int& operator[](int i) { return *(&x + i); } -}; - -struct uint2 { - uint x, y; - - __forceinline uint operator[](uint i) const { return *(&x + i); } - __forceinline uint& operator[](uint i) { return *(&x + i); } -}; - -struct uint3 { - uint x, y, z; - - __forceinline uint operator[](uint i) const { return *(&x + i); } - __forceinline uint& operator[](uint i) { return *(&x + i); } -}; - -struct uint4 { - uint x, y, z, w; - - __forceinline uint operator[](uint i) const { return *(&x + i); } - __forceinline uint& operator[](uint i) { return *(&x + i); } -}; - -struct float2 { - float x, y; - - __forceinline float operator[](int i) const { return *(&x + i); } - __forceinline float& operator[](int i) { return *(&x + i); } -}; - -struct ccl_try_align(16) float3 { -#ifdef __KERNEL_SSE__ - union { - __m128 m128; - struct { float x, y, z, w; }; - }; - - __forceinline float3() {} - __forceinline float3(const __m128& a) : m128(a) {} - __forceinline operator const __m128&(void) const { return m128; } - __forceinline operator __m128&(void) { return m128; } - - __forceinline float3(const float3& a) : m128(a.m128) {} - __forceinline float3& operator =(const float3& a) { m128 = a.m128; return *this; } -#else - float x, y, z, w; -#endif - - __forceinline float operator[](int i) const { return *(&x + i); } - __forceinline float& operator[](int i) { return *(&x + i); } -}; - -struct ccl_try_align(16) float4 { -#ifdef __KERNEL_SSE__ - union { - __m128 m128; - struct { float x, y, z, w; }; - }; - - __forceinline float4() {} - __forceinline float4(const __m128 a) : m128(a) {} - __forceinline operator const __m128&(void) const { return m128; } - __forceinline operator __m128&(void) { return m128; } - - __forceinline float4(const float4& a) : m128(a.m128) {} - __forceinline float4& operator =(const float4& a) { m128 = a.m128; return *this; } - -#else - float x, y, z, w; -#endif - - __forceinline float operator[](int i) const { return *(&x + i); } - __forceinline float& operator[](int i) { return *(&x + i); } -}; - -template<typename T> -class vector3 -{ -public: - T x, y, z; - - ccl_always_inline vector3() {} - ccl_always_inline vector3(const T& a) - : x(a), y(a), z(a) {} - ccl_always_inline vector3(const T& x, const T& y, const T& z) - : x(x), y(y), z(z) {} -}; - -#endif - -#ifndef __KERNEL_GPU__ - -/* Vector Type Constructors - * - * OpenCL does not support C++ class, so we use these instead. */ - -ccl_device_inline uchar2 make_uchar2(uchar x, uchar y) -{ - uchar2 a = {x, y}; - return a; -} - -ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z) -{ - uchar3 a = {x, y, z}; - return a; -} - -ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w) -{ - uchar4 a = {x, y, z, w}; - return a; -} - -ccl_device_inline int2 make_int2(int x, int y) -{ - int2 a = {x, y}; - return a; -} - -ccl_device_inline int3 make_int3(int x, int y, int z) -{ -#ifdef __KERNEL_SSE__ - int3 a; - a.m128 = _mm_set_epi32(0, z, y, x); -#else - int3 a = {x, y, z, 0}; -#endif - - return a; -} - -ccl_device_inline int4 make_int4(int x, int y, int z, int w) -{ -#ifdef __KERNEL_SSE__ - int4 a; - a.m128 = _mm_set_epi32(w, z, y, x); -#else - int4 a = {x, y, z, w}; -#endif - - return a; -} - -ccl_device_inline uint2 make_uint2(uint x, uint y) -{ - uint2 a = {x, y}; - return a; -} - -ccl_device_inline uint3 make_uint3(uint x, uint y, uint z) -{ - uint3 a = {x, y, z}; - return a; -} - -ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w) -{ - uint4 a = {x, y, z, w}; - return a; -} - -ccl_device_inline float2 make_float2(float x, float y) -{ - float2 a = {x, y}; - return a; -} - -ccl_device_inline float3 make_float3(float x, float y, float z) -{ -#ifdef __KERNEL_SSE__ - float3 a; - a.m128 = _mm_set_ps(0.0f, z, y, x); -#else - float3 a = {x, y, z, 0.0f}; -#endif - - return a; -} - -ccl_device_inline float4 make_float4(float x, float y, float z, float w) -{ -#ifdef __KERNEL_SSE__ - float4 a; - a.m128 = _mm_set_ps(w, z, y, x); -#else - float4 a = {x, y, z, w}; -#endif - - return a; -} - -ccl_device_inline int align_up(int offset, int alignment) +ccl_device_inline size_t align_up(size_t offset, size_t alignment) { return (offset + alignment - 1) & ~(alignment - 1); } -ccl_device_inline int3 make_int3(int i) +ccl_device_inline size_t divide_up(size_t x, size_t y) { -#ifdef __KERNEL_SSE__ - int3 a; - a.m128 = _mm_set1_epi32(i); -#else - int3 a = {i, i, i, i}; -#endif - - return a; + return (x + y - 1) / y; } -ccl_device_inline int4 make_int4(int i) +ccl_device_inline size_t round_up(size_t x, size_t multiple) { -#ifdef __KERNEL_SSE__ - int4 a; - a.m128 = _mm_set1_epi32(i); -#else - int4 a = {i, i, i, i}; -#endif - - return a; + return ((x + multiple - 1) / multiple) * multiple; } -ccl_device_inline float3 make_float3(float f) +ccl_device_inline size_t round_down(size_t x, size_t multiple) { -#ifdef __KERNEL_SSE__ - float3 a; - a.m128 = _mm_set1_ps(f); -#else - float3 a = {f, f, f, f}; -#endif - - return a; + return (x / multiple) * multiple; } -ccl_device_inline float4 make_float4(float f) -{ -#ifdef __KERNEL_SSE__ - float4 a; - a.m128 = _mm_set1_ps(f); -#else - float4 a = {f, f, f, f}; -#endif - - return a; -} +CCL_NAMESPACE_END -ccl_device_inline float4 make_float4(const int4& i) -{ -#ifdef __KERNEL_SSE__ - float4 a; - a.m128 = _mm_cvtepi32_ps(i.m128); -#else - float4 a = {(float)i.x, (float)i.y, (float)i.z, (float)i.w}; -#endif +/* Vectorized types declaration. */ +#include "util/util_types_uchar2.h" +#include "util/util_types_uchar3.h" +#include "util/util_types_uchar4.h" - return a; -} +#include "util/util_types_int2.h" +#include "util/util_types_int3.h" +#include "util/util_types_int4.h" -ccl_device_inline int4 make_int4(const float3& f) -{ -#ifdef __KERNEL_SSE__ - int4 a; - a.m128 = _mm_cvtps_epi32(f.m128); -#else - int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w}; -#endif +#include "util/util_types_uint2.h" +#include "util/util_types_uint3.h" +#include "util/util_types_uint4.h" - return a; -} +#include "util/util_types_float2.h" +#include "util/util_types_float3.h" +#include "util/util_types_float4.h" -#endif +#include "util/util_types_vector3.h" -/* Interpolation types for textures - * cuda also use texture space to store other objects */ -enum InterpolationType { - INTERPOLATION_NONE = -1, - INTERPOLATION_LINEAR = 0, - INTERPOLATION_CLOSEST = 1, - INTERPOLATION_CUBIC = 2, - INTERPOLATION_SMART = 3, +/* Vectorized types implementation. */ +#include "util/util_types_uchar2_impl.h" +#include "util/util_types_uchar3_impl.h" +#include "util/util_types_uchar4_impl.h" - INTERPOLATION_NUM_TYPES, -}; +#include "util/util_types_int2_impl.h" +#include "util/util_types_int3_impl.h" +#include "util/util_types_int4_impl.h" -/* Extension types for textures. - * - * Defines how the image is extrapolated past its original bounds. - */ -enum ExtensionType { - /* Cause the image to repeat horizontally and vertically. */ - EXTENSION_REPEAT = 0, - /* Extend by repeating edge pixels of the image. */ - EXTENSION_EXTEND = 1, - /* Clip to image size and set exterior pixels as transparent. */ - EXTENSION_CLIP = 2, - - EXTENSION_NUM_TYPES, -}; - -/* macros */ - -/* hints for branch prediction, only use in code that runs a _lot_ */ -#if defined(__GNUC__) && defined(__KERNEL_CPU__) -# define LIKELY(x) __builtin_expect(!!(x), 1) -# define UNLIKELY(x) __builtin_expect(!!(x), 0) -#else -# define LIKELY(x) (x) -# define UNLIKELY(x) (x) -#endif +#include "util/util_types_uint2_impl.h" +#include "util/util_types_uint3_impl.h" +#include "util/util_types_uint4_impl.h" -#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && _MSC_VER >= 1800)) -# define HAS_CPP11_FEATURES -#endif +#include "util/util_types_float2_impl.h" +#include "util/util_types_float3_impl.h" +#include "util/util_types_float4_impl.h" -#if defined(__GNUC__) || defined(__clang__) -# if defined(HAS_CPP11_FEATURES) -/* Some magic to be sure we don't have reference in the type. */ -template<typename T> static inline T decltype_helper(T x) { return x; } -# define TYPEOF(x) decltype(decltype_helper(x)) -# else -# define TYPEOF(x) typeof(x) -# endif -#endif +#include "util/util_types_vector3_impl.h" -/* Causes warning: - * incompatible types when assigning to type 'Foo' from type 'Bar' - * ... the compiler optimizes away the temp var */ -#ifdef __GNUC__ -#define CHECK_TYPE(var, type) { \ - TYPEOF(var) *__tmp; \ - __tmp = (type *)NULL; \ - (void)__tmp; \ -} (void)0 - -#define CHECK_TYPE_PAIR(var_a, var_b) { \ - TYPEOF(var_a) *__tmp; \ - __tmp = (typeof(var_b) *)NULL; \ - (void)__tmp; \ -} (void)0 -#else -# define CHECK_TYPE(var, type) -# define CHECK_TYPE_PAIR(var_a, var_b) +/* SSE types. */ +#ifndef __KERNEL_GPU__ +# include "util/util_sseb.h" +# include "util/util_ssei.h" +# include "util/util_ssef.h" +# include "util/util_avxf.h" #endif -/* can be used in simple macros */ -#define CHECK_TYPE_INLINE(val, type) \ - ((void)(((type)0) != (val))) - - -CCL_NAMESPACE_END - #endif /* __UTIL_TYPES_H__ */ diff --git a/intern/cycles/util/util_types_float2.h b/intern/cycles/util/util_types_float2.h new file mode 100644 index 00000000000..ec7a1f717a1 --- /dev/null +++ b/intern/cycles/util/util_types_float2.h @@ -0,0 +1,40 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_FLOAT2_H__ +#define __UTIL_TYPES_FLOAT2_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct float2 { + float x, y; + + __forceinline float operator[](int i) const; + __forceinline float& operator[](int i); +}; + +ccl_device_inline float2 make_float2(float x, float y); +ccl_device_inline void print_float2(const char *label, const float2& a); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_FLOAT2_H__ */ diff --git a/intern/cycles/util/util_types_float2_impl.h b/intern/cycles/util/util_types_float2_impl.h new file mode 100644 index 00000000000..782dda195eb --- /dev/null +++ b/intern/cycles/util/util_types_float2_impl.h @@ -0,0 +1,59 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_FLOAT2_IMPL_H__ +#define __UTIL_TYPES_FLOAT2_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +#ifndef __KERNEL_GPU__ +# include <cstdio> +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +__forceinline float float2::operator[](int i) const +{ + util_assert(i >= 0); + util_assert(i < 2); + return *(&x + i); +} + +__forceinline float& float2::operator[](int i) +{ + util_assert(i >= 0); + util_assert(i < 2); + return *(&x + i); +} + +ccl_device_inline float2 make_float2(float x, float y) +{ + float2 a = {x, y}; + return a; +} + +ccl_device_inline void print_float2(const char *label, const float2& a) +{ + printf("%s: %.8f %.8f\n", label, (double)a.x, (double)a.y); +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_FLOAT2_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_float3.h b/intern/cycles/util/util_types_float3.h new file mode 100644 index 00000000000..28146ad04f7 --- /dev/null +++ b/intern/cycles/util/util_types_float3.h @@ -0,0 +1,57 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_FLOAT3_H__ +#define __UTIL_TYPES_FLOAT3_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct ccl_try_align(16) float3 { +#ifdef __KERNEL_SSE__ + union { + __m128 m128; + struct { float x, y, z, w; }; + }; + + __forceinline float3(); + __forceinline float3(const float3& a); + __forceinline explicit float3(const __m128& a); + + __forceinline operator const __m128&(void) const; + __forceinline operator __m128&(void); + + __forceinline float3& operator =(const float3& a); +#else /* __KERNEL_SSE__ */ + float x, y, z, w; +#endif /* __KERNEL_SSE__ */ + + __forceinline float operator[](int i) const; + __forceinline float& operator[](int i); +}; + +ccl_device_inline float3 make_float3(float f); +ccl_device_inline float3 make_float3(float x, float y, float z); +ccl_device_inline void print_float3(const char *label, const float3& a); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_FLOAT3_H__ */ diff --git a/intern/cycles/util/util_types_float3_impl.h b/intern/cycles/util/util_types_float3_impl.h new file mode 100644 index 00000000000..45f61767d3f --- /dev/null +++ b/intern/cycles/util/util_types_float3_impl.h @@ -0,0 +1,105 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_FLOAT3_IMPL_H__ +#define __UTIL_TYPES_FLOAT3_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +#ifndef __KERNEL_GPU__ +# include <cstdio> +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +#ifdef __KERNEL_SSE__ +__forceinline float3::float3() +{ +} + +__forceinline float3::float3(const float3& a) + : m128(a.m128) +{ +} + +__forceinline float3::float3(const __m128& a) + : m128(a) +{ +} + +__forceinline float3::operator const __m128&(void) const +{ + return m128; +} + +__forceinline float3::operator __m128&(void) +{ + return m128; +} + +__forceinline float3& float3::operator =(const float3& a) +{ + m128 = a.m128; + return *this; +} +#endif /* __KERNEL_SSE__ */ + +__forceinline float float3::operator[](int i) const +{ + util_assert(i >= 0); + util_assert(i < 3); + return *(&x + i); +} + +__forceinline float& float3::operator[](int i) +{ + util_assert(i >= 0); + util_assert(i < 3); + return *(&x + i); +} + +ccl_device_inline float3 make_float3(float f) +{ +#ifdef __KERNEL_SSE__ + float3 a(_mm_set1_ps(f)); +#else + float3 a = {f, f, f, f}; +#endif + return a; +} + +ccl_device_inline float3 make_float3(float x, float y, float z) +{ +#ifdef __KERNEL_SSE__ + float3 a(_mm_set_ps(0.0f, z, y, x)); +#else + float3 a = {x, y, z, 0.0f}; +#endif + return a; +} + +ccl_device_inline void print_float3(const char *label, const float3& a) +{ + printf("%s: %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z); +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_FLOAT3_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_float4.h b/intern/cycles/util/util_types_float4.h new file mode 100644 index 00000000000..154391f6881 --- /dev/null +++ b/intern/cycles/util/util_types_float4.h @@ -0,0 +1,60 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_FLOAT4_H__ +#define __UTIL_TYPES_FLOAT4_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct int4; + +struct ccl_try_align(16) float4 { +#ifdef __KERNEL_SSE__ + union { + __m128 m128; + struct { float x, y, z, w; }; + }; + + __forceinline float4(); + __forceinline explicit float4(const __m128& a); + + __forceinline operator const __m128&(void) const; + __forceinline operator __m128&(void); + + __forceinline float4& operator =(const float4& a); + +#else /* __KERNEL_SSE__ */ + float x, y, z, w; +#endif /* __KERNEL_SSE__ */ + + __forceinline float operator[](int i) const; + __forceinline float& operator[](int i); +}; + +ccl_device_inline float4 make_float4(float f); +ccl_device_inline float4 make_float4(float x, float y, float z, float w); +ccl_device_inline float4 make_float4(const int4& i); +ccl_device_inline void print_float4(const char *label, const float4& a); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_FLOAT4_H__ */ diff --git a/intern/cycles/util/util_types_float4_impl.h b/intern/cycles/util/util_types_float4_impl.h new file mode 100644 index 00000000000..09f45f47d38 --- /dev/null +++ b/intern/cycles/util/util_types_float4_impl.h @@ -0,0 +1,112 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_FLOAT4_IMPL_H__ +#define __UTIL_TYPES_FLOAT4_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +#ifndef __KERNEL_GPU__ +# include <cstdio> +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +#ifdef __KERNEL_SSE__ +__forceinline float4::float4() +{ +} + +__forceinline float4::float4(const __m128& a) + : m128(a) +{ +} + +__forceinline float4::operator const __m128&(void) const +{ + return m128; +} + +__forceinline float4::operator __m128&(void) +{ + return m128; +} + +__forceinline float4& float4::operator =(const float4& a) +{ + m128 = a.m128; + return *this; +} +#endif /* __KERNEL_SSE__ */ + +__forceinline float float4::operator[](int i) const +{ + util_assert(i >= 0); + util_assert(i < 4); + return *(&x + i); +} + +__forceinline float& float4::operator[](int i) +{ + util_assert(i >= 0); + util_assert(i < 4); + return *(&x + i); +} + +ccl_device_inline float4 make_float4(float f) +{ +#ifdef __KERNEL_SSE__ + float4 a(_mm_set1_ps(f)); +#else + float4 a = {f, f, f, f}; +#endif + return a; +} + +ccl_device_inline float4 make_float4(float x, float y, float z, float w) +{ +#ifdef __KERNEL_SSE__ + float4 a(_mm_set_ps(w, z, y, x)); +#else + float4 a = {x, y, z, w}; +#endif + return a; +} + +ccl_device_inline float4 make_float4(const int4& i) +{ +#ifdef __KERNEL_SSE__ + float4 a(_mm_cvtepi32_ps(i.m128)); +#else + float4 a = {(float)i.x, (float)i.y, (float)i.z, (float)i.w}; +#endif + return a; +} + +ccl_device_inline void print_float4(const char *label, const float4& a) +{ + printf("%s: %.8f %.8f %.8f %.8f\n", + label, + (double)a.x, (double)a.y, (double)a.z, (double)a.w); +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_FLOAT4_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_int2.h b/intern/cycles/util/util_types_int2.h new file mode 100644 index 00000000000..82e860f89eb --- /dev/null +++ b/intern/cycles/util/util_types_int2.h @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_INT2_H__ +#define __UTIL_TYPES_INT2_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct int2 { + int x, y; + + __forceinline int operator[](int i) const; + __forceinline int& operator[](int i); +}; + +ccl_device_inline int2 make_int2(int x, int y); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_INT2_H__ */ diff --git a/intern/cycles/util/util_types_int2_impl.h b/intern/cycles/util/util_types_int2_impl.h new file mode 100644 index 00000000000..c7d3942e723 --- /dev/null +++ b/intern/cycles/util/util_types_int2_impl.h @@ -0,0 +1,50 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_INT2_IMPL_H__ +#define __UTIL_TYPES_INT2_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +int int2::operator[](int i) const +{ + util_assert(i >= 0); + util_assert(i < 2); + return *(&x + i); +} + +int& int2::operator[](int i) +{ + util_assert(i >= 0); + util_assert(i < 2); + return *(&x + i); +} + +ccl_device_inline int2 make_int2(int x, int y) +{ + int2 a = {x, y}; + return a; +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_INT2_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_int3.h b/intern/cycles/util/util_types_int3.h new file mode 100644 index 00000000000..9d43b201c02 --- /dev/null +++ b/intern/cycles/util/util_types_int3.h @@ -0,0 +1,57 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_INT3_H__ +#define __UTIL_TYPES_INT3_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct ccl_try_align(16) int3 { +#ifdef __KERNEL_SSE__ + union { + __m128i m128; + struct { int x, y, z, w; }; + }; + + __forceinline int3(); + __forceinline int3(const int3& a); + __forceinline explicit int3(const __m128i& a); + + __forceinline operator const __m128i&(void) const; + __forceinline operator __m128i&(void); + + __forceinline int3& operator =(const int3& a); +#else /* __KERNEL_SSE__ */ + int x, y, z, w; +#endif /* __KERNEL_SSE__ */ + + __forceinline int operator[](int i) const; + __forceinline int& operator[](int i); +}; + +ccl_device_inline int3 make_int3(int i); +ccl_device_inline int3 make_int3(int x, int y, int z); +ccl_device_inline void print_int3(const char *label, const int3& a); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_INT3_H__ */ diff --git a/intern/cycles/util/util_types_int3_impl.h b/intern/cycles/util/util_types_int3_impl.h new file mode 100644 index 00000000000..ada50c4812c --- /dev/null +++ b/intern/cycles/util/util_types_int3_impl.h @@ -0,0 +1,106 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_INT3_IMPL_H__ +#define __UTIL_TYPES_INT3_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +#ifndef __KERNEL_GPU__ +# include <cstdio> +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +#ifdef __KERNEL_SSE__ +__forceinline int3::int3() +{ +} + +__forceinline int3::int3(const __m128i& a) + : m128(a) +{ +} + +__forceinline int3::int3(const int3& a) + : m128(a.m128) +{ +} + +__forceinline int3::operator const __m128i&(void) const +{ + return m128; +} + +__forceinline int3::operator __m128i&(void) +{ + return m128; +} + +__forceinline int3& int3::operator =(const int3& a) +{ + m128 = a.m128; + return *this; +} +#endif /* __KERNEL_SSE__ */ + +__forceinline int int3::operator[](int i) const +{ + util_assert(i >= 0); + util_assert(i < 3); + return *(&x + i); +} + +__forceinline int& int3::operator[](int i) +{ + util_assert(i >= 0); + util_assert(i < 3); + return *(&x + i); +} + +ccl_device_inline int3 make_int3(int i) +{ +#ifdef __KERNEL_SSE__ + int3 a(_mm_set1_epi32(i)); +#else + int3 a = {i, i, i, i}; +#endif + return a; +} + +ccl_device_inline int3 make_int3(int x, int y, int z) +{ +#ifdef __KERNEL_SSE__ + int3 a(_mm_set_epi32(0, z, y, x)); +#else + int3 a = {x, y, z, 0}; +#endif + + return a; +} + +ccl_device_inline void print_int3(const char *label, const int3& a) +{ + printf("%s: %d %d %d\n", label, a.x, a.y, a.z); +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_INT3_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_int4.h b/intern/cycles/util/util_types_int4.h new file mode 100644 index 00000000000..cdd0ecbdae5 --- /dev/null +++ b/intern/cycles/util/util_types_int4.h @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_INT4_H__ +#define __UTIL_TYPES_INT4_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ + +struct float3; + +struct ccl_try_align(16) int4 { +#ifdef __KERNEL_SSE__ + union { + __m128i m128; + struct { int x, y, z, w; }; + }; + + __forceinline int4(); + __forceinline int4(const int4& a); + __forceinline explicit int4(const __m128i& a); + + __forceinline operator const __m128i&(void) const; + __forceinline operator __m128i&(void); + + __forceinline int4& operator=(const int4& a); +#else /* __KERNEL_SSE__ */ + int x, y, z, w; +#endif /* __KERNEL_SSE__ */ + + __forceinline int operator[](int i) const; + __forceinline int& operator[](int i); +}; + +ccl_device_inline int4 make_int4(int i); +ccl_device_inline int4 make_int4(int x, int y, int z, int w); +ccl_device_inline int4 make_int4(const float3& f); +ccl_device_inline void print_int4(const char *label, const int4& a); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_INT4_H__ */ diff --git a/intern/cycles/util/util_types_int4_impl.h b/intern/cycles/util/util_types_int4_impl.h new file mode 100644 index 00000000000..07cdc88f2dc --- /dev/null +++ b/intern/cycles/util/util_types_int4_impl.h @@ -0,0 +1,115 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_INT4_IMPL_H__ +#define __UTIL_TYPES_INT4_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +#ifndef __KERNEL_GPU__ +# include <cstdio> +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +#ifdef __KERNEL_SSE__ +__forceinline int4::int4() +{ +} + +__forceinline int4::int4(const int4& a) + : m128(a.m128) +{ +} + +__forceinline int4::int4(const __m128i& a) + : m128(a) +{ +} + +__forceinline int4::operator const __m128i&(void) const +{ + return m128; +} + +__forceinline int4::operator __m128i&(void) +{ + return m128; +} + +__forceinline int4& int4::operator=(const int4& a) +{ + m128 = a.m128; + return *this; +} +#endif /* __KERNEL_SSE__ */ + +__forceinline int int4::operator[](int i) const +{ + util_assert(i >= 0); + util_assert(i < 4); + return *(&x + i); +} + +__forceinline int& int4::operator[](int i) +{ + util_assert(i >= 0); + util_assert(i < 4); + return *(&x + i); +} + +ccl_device_inline int4 make_int4(int i) +{ +#ifdef __KERNEL_SSE__ + int4 a(_mm_set1_epi32(i)); +#else + int4 a = {i, i, i, i}; +#endif + return a; +} + +ccl_device_inline int4 make_int4(int x, int y, int z, int w) +{ +#ifdef __KERNEL_SSE__ + int4 a(_mm_set_epi32(w, z, y, x)); +#else + int4 a = {x, y, z, w}; +#endif + return a; +} + +ccl_device_inline int4 make_int4(const float3& f) +{ +#ifdef __KERNEL_SSE__ + int4 a(_mm_cvtps_epi32(f.m128)); +#else + int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w}; +#endif + return a; +} + +ccl_device_inline void print_int4(const char *label, const int4& a) +{ + printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w); +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_INT4_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_uchar2.h b/intern/cycles/util/util_types_uchar2.h new file mode 100644 index 00000000000..f618a2234ca --- /dev/null +++ b/intern/cycles/util/util_types_uchar2.h @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UCHAR2_H__ +#define __UTIL_TYPES_UCHAR2_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct uchar2 { + uchar x, y; + + __forceinline uchar operator[](int i) const; + __forceinline uchar& operator[](int i); +}; + +ccl_device_inline uchar2 make_uchar2(uchar x, uchar y); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UCHAR2_H__ */ diff --git a/intern/cycles/util/util_types_uchar2_impl.h b/intern/cycles/util/util_types_uchar2_impl.h new file mode 100644 index 00000000000..d5f196d0ce0 --- /dev/null +++ b/intern/cycles/util/util_types_uchar2_impl.h @@ -0,0 +1,50 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UCHAR2_IMPL_H__ +#define __UTIL_TYPES_UCHAR2_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +uchar uchar2::operator[](int i) const +{ + util_assert(i >= 0); + util_assert(i < 2); + return *(&x + i); +} + +uchar& uchar2::operator[](int i) +{ + util_assert(i >= 0); + util_assert(i < 2); + return *(&x + i); +} + +ccl_device_inline uchar2 make_uchar2(uchar x, uchar y) +{ + uchar2 a = {x, y}; + return a; +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UCHAR2_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_uchar3.h b/intern/cycles/util/util_types_uchar3.h new file mode 100644 index 00000000000..1e3644e6fd6 --- /dev/null +++ b/intern/cycles/util/util_types_uchar3.h @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UCHAR3_H__ +#define __UTIL_TYPES_UCHAR3_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct uchar3 { + uchar x, y, z; + + __forceinline uchar operator[](int i) const; + __forceinline uchar& operator[](int i); +}; + +ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UCHAR3_H__ */ diff --git a/intern/cycles/util/util_types_uchar3_impl.h b/intern/cycles/util/util_types_uchar3_impl.h new file mode 100644 index 00000000000..611021efb7f --- /dev/null +++ b/intern/cycles/util/util_types_uchar3_impl.h @@ -0,0 +1,50 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UCHAR3_IMPL_H__ +#define __UTIL_TYPES_UCHAR3_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +uchar uchar3::operator[](int i) const +{ + util_assert(i >= 0); + util_assert(i < 3); + return *(&x + i); +} + +uchar& uchar3::operator[](int i) +{ + util_assert(i >= 0); + util_assert(i < 3); + return *(&x + i); +} + +ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z) +{ + uchar3 a = {x, y, z}; + return a; +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UCHAR3_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_uchar4.h b/intern/cycles/util/util_types_uchar4.h new file mode 100644 index 00000000000..3802cebbfb9 --- /dev/null +++ b/intern/cycles/util/util_types_uchar4.h @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UCHAR4_H__ +#define __UTIL_TYPES_UCHAR4_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct uchar4 { + uchar x, y, z, w; + + __forceinline uchar operator[](int i) const; + __forceinline uchar& operator[](int i); +}; + +ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UCHAR4_H__ */ diff --git a/intern/cycles/util/util_types_uchar4_impl.h b/intern/cycles/util/util_types_uchar4_impl.h new file mode 100644 index 00000000000..03039f60c54 --- /dev/null +++ b/intern/cycles/util/util_types_uchar4_impl.h @@ -0,0 +1,50 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UCHAR4_IMPL_H__ +#define __UTIL_TYPES_UCHAR4_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +uchar uchar4::operator[](int i) const +{ + util_assert(i >= 0); + util_assert(i < 4); + return *(&x + i); +} + +uchar& uchar4::operator[](int i) +{ + util_assert(i >= 0); + util_assert(i < 4); + return *(&x + i); +} + +ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w) +{ + uchar4 a = {x, y, z, w}; + return a; +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UCHAR4_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_uint2.h b/intern/cycles/util/util_types_uint2.h new file mode 100644 index 00000000000..c4a31899614 --- /dev/null +++ b/intern/cycles/util/util_types_uint2.h @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UINT2_H__ +#define __UTIL_TYPES_UINT2_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct uint2 { + uint x, y; + + __forceinline uint operator[](uint i) const; + __forceinline uint& operator[](uint i); +}; + +ccl_device_inline uint2 make_uint2(uint x, uint y); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UINT2_H__ */ diff --git a/intern/cycles/util/util_types_uint2_impl.h b/intern/cycles/util/util_types_uint2_impl.h new file mode 100644 index 00000000000..b50ffa2667f --- /dev/null +++ b/intern/cycles/util/util_types_uint2_impl.h @@ -0,0 +1,48 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UINT2_IMPL_H__ +#define __UTIL_TYPES_UINT2_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +__forceinline uint uint2::operator[](uint i) const +{ + util_assert(i < 2); + return *(&x + i); +} + +__forceinline uint& uint2::operator[](uint i) +{ + util_assert(i < 2); + return *(&x + i); +} + +ccl_device_inline uint2 make_uint2(uint x, uint y) +{ + uint2 a = {x, y}; + return a; +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UINT2_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_uint3.h b/intern/cycles/util/util_types_uint3.h new file mode 100644 index 00000000000..aeeecd2df06 --- /dev/null +++ b/intern/cycles/util/util_types_uint3.h @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UINT3_H__ +#define __UTIL_TYPES_UINT3_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct uint3 { + uint x, y, z; + + __forceinline uint operator[](uint i) const; + __forceinline uint& operator[](uint i); +}; + +ccl_device_inline uint3 make_uint3(uint x, uint y, uint z); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UINT3_H__ */ diff --git a/intern/cycles/util/util_types_uint3_impl.h b/intern/cycles/util/util_types_uint3_impl.h new file mode 100644 index 00000000000..26005d5baff --- /dev/null +++ b/intern/cycles/util/util_types_uint3_impl.h @@ -0,0 +1,48 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UINT3_IMPL_H__ +#define __UTIL_TYPES_UINT3_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +__forceinline uint uint3::operator[](uint i) const +{ + util_assert(i < 3); + return *(&x + i); +} + +__forceinline uint& uint3::operator[](uint i) +{ + util_assert(i < 3); + return *(&x + i); +} + +ccl_device_inline uint3 make_uint3(uint x, uint y, uint z) +{ + uint3 a = {x, y, z}; + return a; +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UINT3_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_uint4.h b/intern/cycles/util/util_types_uint4.h new file mode 100644 index 00000000000..2d3a7bb85e4 --- /dev/null +++ b/intern/cycles/util/util_types_uint4.h @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UINT4_H__ +#define __UTIL_TYPES_UINT4_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +struct uint4 { + uint x, y, z, w; + + __forceinline uint operator[](uint i) const; + __forceinline uint& operator[](uint i); +}; + +ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w); +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UINT4_H__ */ diff --git a/intern/cycles/util/util_types_uint4_impl.h b/intern/cycles/util/util_types_uint4_impl.h new file mode 100644 index 00000000000..6d48131a446 --- /dev/null +++ b/intern/cycles/util/util_types_uint4_impl.h @@ -0,0 +1,48 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_UINT4_IMPL_H__ +#define __UTIL_TYPES_UINT4_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +__forceinline uint uint4::operator[](uint i) const +{ + util_assert(i < 3); + return *(&x + i); +} + +__forceinline uint& uint4::operator[](uint i) +{ + util_assert(i < 3); + return *(&x + i); +} + +ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w) +{ + uint4 a = {x, y, z, w}; + return a; +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_UINT4_IMPL_H__ */ diff --git a/intern/cycles/util/util_types_vector3.h b/intern/cycles/util/util_types_vector3.h new file mode 100644 index 00000000000..12acf9dc959 --- /dev/null +++ b/intern/cycles/util/util_types_vector3.h @@ -0,0 +1,41 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_VECTOR3_H__ +#define __UTIL_TYPES_VECTOR3_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +template<typename T> +class vector3 +{ +public: + T x, y, z; + + __forceinline vector3(); + __forceinline vector3(const T& a); + __forceinline vector3(const T& x, const T& y, const T& z); +}; +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_VECTOR3_H__ */ diff --git a/intern/cycles/util/util_types_vector3_impl.h b/intern/cycles/util/util_types_vector3_impl.h new file mode 100644 index 00000000000..2f6b8368540 --- /dev/null +++ b/intern/cycles/util/util_types_vector3_impl.h @@ -0,0 +1,47 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_TYPES_VECTOR3_IMPL_H__ +#define __UTIL_TYPES_VECTOR3_IMPL_H__ + +#ifndef __UTIL_TYPES_H__ +# error "Do not include this file directly, include util_types.h instead." +#endif + +CCL_NAMESPACE_BEGIN + +#ifndef __KERNEL_GPU__ +template<typename T> +ccl_always_inline vector3<T>::vector3() +{ +} + +template<typename T> +ccl_always_inline vector3<T>::vector3(const T& a) + : x(a), y(a), z(a) +{ +} + +template<typename T> +ccl_always_inline vector3<T>::vector3(const T& x, const T& y, const T& z) + : x(x), y(y), z(z) +{ +} +#endif /* __KERNEL_GPU__ */ + +CCL_NAMESPACE_END + +#endif /* __UTIL_TYPES_VECTOR3_IMPL_H__ */ diff --git a/intern/cycles/util/util_vector.h b/intern/cycles/util/util_vector.h index 546b17570bb..e98e4e34181 100644 --- a/intern/cycles/util/util_vector.h +++ b/intern/cycles/util/util_vector.h @@ -23,9 +23,9 @@ #include <cstring> #include <vector> -#include "util_aligned_malloc.h" -#include "util_guarded_allocator.h" -#include "util_types.h" +#include "util/util_aligned_malloc.h" +#include "util/util_guarded_allocator.h" +#include "util/util_types.h" CCL_NAMESPACE_BEGIN @@ -86,9 +86,9 @@ public: * this was actually showing up in profiles quite significantly. it * also does not run any constructors/destructors * - if this is used, we are not tempted to use inefficient operations - * - aligned allocation for SSE data types */ + * - aligned allocation for CPU native data types */ -template<typename T, size_t alignment = 16> +template<typename T, size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES> class array { public: @@ -162,6 +162,11 @@ public: return memcmp(data_, other.data_, datasize_*sizeof(T)) == 0; } + bool operator!=(const array<T>& other) const + { + return !(*this == other); + } + void steal_data(array& from) { if(this != &from) { @@ -177,6 +182,14 @@ public: } } + T *steal_pointer() + { + T *ptr = data_; + data_ = NULL; + clear(); + return ptr; + } + T* resize(size_t newsize) { if(newsize == 0) { @@ -202,6 +215,18 @@ public: return data_; } + T* resize(size_t newsize, const T& value) + { + size_t oldsize = size(); + resize(newsize); + + for(size_t i = oldsize; i < size(); i++) { + data_[i] = value; + } + + return data_; + } + void clear() { if(data_ != NULL) { @@ -273,6 +298,15 @@ public: push_back_slow(t); } + void append(const array<T>& from) + { + if(from.size()) { + size_t old_size = size(); + resize(old_size + from.size()); + memcpy(data_ + old_size, from.data(), sizeof(T) * from.size()); + } + } + protected: inline T* mem_allocate(size_t N) { diff --git a/intern/cycles/util/util_version.h b/intern/cycles/util/util_version.h index d609c739ac7..112255f447b 100644 --- a/intern/cycles/util/util_version.h +++ b/intern/cycles/util/util_version.h @@ -22,8 +22,8 @@ CCL_NAMESPACE_BEGIN #define CYCLES_VERSION_MAJOR 1 -#define CYCLES_VERSION_MINOR 8 -#define CYCLES_VERSION_PATCH 1 +#define CYCLES_VERSION_MINOR 9 +#define CYCLES_VERSION_PATCH 0 #define CYCLES_MAKE_VERSION_STRING2(a,b,c) #a "." #b "." #c #define CYCLES_MAKE_VERSION_STRING(a,b,c) CYCLES_MAKE_VERSION_STRING2(a,b,c) diff --git a/intern/cycles/util/util_view.cpp b/intern/cycles/util/util_view.cpp index 9796a5f896d..10d86167921 100644 --- a/intern/cycles/util/util_view.cpp +++ b/intern/cycles/util/util_view.cpp @@ -17,11 +17,11 @@ #include <stdio.h> #include <stdlib.h> -#include "util_opengl.h" -#include "util_string.h" -#include "util_time.h" -#include "util_version.h" -#include "util_view.h" +#include "util/util_opengl.h" +#include "util/util_string.h" +#include "util/util_time.h" +#include "util/util_version.h" +#include "util/util_view.h" #ifdef __APPLE__ #include <GLUT/glut.h> diff --git a/intern/cycles/util/util_windows.cpp b/intern/cycles/util/util_windows.cpp index 4de8483564b..073db2a27db 100644 --- a/intern/cycles/util/util_windows.cpp +++ b/intern/cycles/util/util_windows.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "util_windows.h" +#include "util/util_windows.h" #ifdef _WIN32 diff --git a/intern/cycles/util/util_xml.h b/intern/cycles/util/util_xml.h index cfd0afc95f7..6f06f17937b 100644 --- a/intern/cycles/util/util_xml.h +++ b/intern/cycles/util/util_xml.h @@ -25,7 +25,17 @@ CCL_NAMESPACE_BEGIN OIIO_NAMESPACE_USING +#ifdef WITH_SYSTEM_PUGIXML +# define PUGIXML_NAMESPACE pugi +#else +# define PUGIXML_NAMESPACE OIIO_NAMESPACE::pugi +#endif + +using PUGIXML_NAMESPACE::xml_attribute; +using PUGIXML_NAMESPACE::xml_document; +using PUGIXML_NAMESPACE::xml_node; +using PUGIXML_NAMESPACE::xml_parse_result; + CCL_NAMESPACE_END #endif /* __UTIL_XML_H__ */ - |