diff options
Diffstat (limited to 'intern/cycles/kernel')
72 files changed, 3216 insertions, 418 deletions
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 9bb0455b9d5..bef869f34b4 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -10,7 +10,23 @@ set(INC_SYS set(SRC kernels/cpu/kernel.cpp + kernels/cpu/kernel_sse2.cpp + kernels/cpu/kernel_sse3.cpp + kernels/cpu/kernel_sse41.cpp + kernels/cpu/kernel_avx.cpp + kernels/cpu/kernel_avx2.cpp kernels/cpu/kernel_split.cpp + kernels/cpu/kernel_split_sse2.cpp + kernels/cpu/kernel_split_sse3.cpp + kernels/cpu/kernel_split_sse41.cpp + kernels/cpu/kernel_split_avx.cpp + kernels/cpu/kernel_split_avx2.cpp + kernels/cpu/filter.cpp + kernels/cpu/filter_sse2.cpp + kernels/cpu/filter_sse3.cpp + kernels/cpu/filter_sse41.cpp + kernels/cpu/filter_avx.cpp + kernels/cpu/filter_avx2.cpp kernels/opencl/kernel.cl kernels/opencl/kernel_state_buffer_size.cl kernels/opencl/kernel_split.cl @@ -32,8 +48,10 @@ set(SRC kernels/opencl/kernel_next_iteration_setup.cl kernels/opencl/kernel_indirect_subsurface.cl kernels/opencl/kernel_buffer_update.cl + kernels/opencl/filter.cl kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu + kernels/cuda/filter.cu ) set(SRC_BVH_HEADERS @@ -95,6 +113,8 @@ set(SRC_KERNELS_CPU_HEADERS kernels/cpu/kernel_cpu.h kernels/cpu/kernel_cpu_impl.h kernels/cpu/kernel_cpu_image.h + kernels/cpu/filter_cpu.h + kernels/cpu/filter_cpu_impl.h ) set(SRC_KERNELS_CUDA_HEADERS @@ -190,6 +210,21 @@ set(SRC_GEOM_HEADERS geom/geom_volume.h ) +set(SRC_FILTER_HEADERS + filter/filter.h + filter/filter_defines.h + filter/filter_features.h + filter/filter_features_sse.h + filter/filter_kernel.h + filter/filter_nlm_cpu.h + filter/filter_nlm_gpu.h + filter/filter_prefilter.h + filter/filter_reconstruction.h + filter/filter_transform.h + filter/filter_transform_gpu.h + filter/filter_transform_sse.h +) + set(SRC_UTIL_HEADERS ../util/util_atomic.h ../util/util_color.h @@ -204,6 +239,7 @@ set(SRC_UTIL_HEADERS ../util/util_math_int2.h ../util/util_math_int3.h ../util/util_math_int4.h + ../util/util_math_matrix.h ../util/util_static_assert.h ../util/util_transform.h ../util/util_texture.h @@ -295,23 +331,21 @@ if(WITH_CYCLES_CUDA_BINARIES) ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS} ) + set(cuda_filter_sources kernels/cuda/filter.cu + ${SRC_HEADERS} + ${SRC_KERNELS_CUDA_HEADERS} + ${SRC_FILTER_HEADERS} + ${SRC_UTIL_HEADERS} + ) set(cuda_cubins) - macro(CYCLES_CUDA_KERNEL_ADD arch split experimental) - if(${split}) - set(cuda_extra_flags "-D__SPLIT__") - set(cuda_cubin kernel_split) - else() - set(cuda_extra_flags "") - set(cuda_cubin kernel) - endif() - + macro(CYCLES_CUDA_KERNEL_ADD arch name flags sources experimental) if(${experimental}) - set(cuda_extra_flags ${cuda_extra_flags} -D__KERNEL_EXPERIMENTAL__) - set(cuda_cubin ${cuda_cubin}_experimental) + set(flags ${flags} -D__KERNEL_EXPERIMENTAL__) + set(name ${name}_experimental) endif() - set(cuda_cubin ${cuda_cubin}_${arch}.cubin) + set(cuda_cubin ${name}_${arch}.cubin) if(WITH_CYCLES_DEBUG) set(cuda_debug_flags "-D__KERNEL_DEBUG__") @@ -325,11 +359,7 @@ if(WITH_CYCLES_CUDA_BINARIES) set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}") set(cuda_math_flags "--use_fast_math") - if(split) - set(cuda_kernel_src "/kernels/cuda/kernel_split.cu") - else() - set(cuda_kernel_src "/kernels/cuda/kernel.cu") - endif() + set(cuda_kernel_src "/kernels/cuda/${name}.cu") add_custom_command( OUTPUT ${cuda_cubin} @@ -343,13 +373,13 @@ if(WITH_CYCLES_CUDA_BINARIES) ${cuda_arch_flags} ${cuda_version_flags} ${cuda_math_flags} - ${cuda_extra_flags} + ${flags} ${cuda_debug_flags} -I${CMAKE_CURRENT_SOURCE_DIR}/.. -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC - DEPENDS ${cuda_sources}) + DEPENDS ${sources}) delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib) list(APPEND cuda_cubins ${cuda_cubin}) @@ -363,11 +393,12 @@ if(WITH_CYCLES_CUDA_BINARIES) foreach(arch ${CYCLES_CUDA_BINARIES_ARCH}) # Compile regular kernel - CYCLES_CUDA_KERNEL_ADD(${arch} FALSE FALSE) + CYCLES_CUDA_KERNEL_ADD(${arch} kernel "" "${cuda_sources}" FALSE) + CYCLES_CUDA_KERNEL_ADD(${arch} filter "" "${cuda_filter_sources}" FALSE) if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES) # Compile split kernel - CYCLES_CUDA_KERNEL_ADD(${arch} TRUE FALSE) + CYCLES_CUDA_KERNEL_ADD(${arch} kernel_split "-D__SPLIT__" ${cuda_sources} FALSE) endif() endforeach() @@ -388,41 +419,30 @@ include_directories(SYSTEM ${INC_SYS}) set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") +set_source_files_properties(kernels/cpu/filter.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}") if(CXX_HAS_SSE) - list(APPEND SRC - kernels/cpu/kernel_sse2.cpp - kernels/cpu/kernel_sse3.cpp - kernels/cpu/kernel_sse41.cpp - kernels/cpu/kernel_split_sse2.cpp - kernels/cpu/kernel_split_sse3.cpp - kernels/cpu/kernel_split_sse41.cpp - ) - set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX) - list(APPEND SRC - kernels/cpu/kernel_avx.cpp - kernels/cpu/kernel_split_avx.cpp - ) set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX2) - list(APPEND SRC - kernels/cpu/kernel_avx2.cpp - kernels/cpu/kernel_split_avx2.cpp - ) set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/filter_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") endif() add_library(cycles_kernel @@ -432,6 +452,7 @@ add_library(cycles_kernel ${SRC_KERNELS_CUDA_HEADERS} ${SRC_BVH_HEADERS} ${SRC_CLOSURE_HEADERS} + ${SRC_FILTER_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_SPLIT_HEADERS} @@ -472,12 +493,15 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocke delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_subsurface.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_buffer_update.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/filter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/filter.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util) diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h index a6bba8bf74d..a04c157dc40 100644 --- a/intern/cycles/kernel/closure/bsdf.h +++ b/intern/cycles/kernel/closure/bsdf.h @@ -435,5 +435,23 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b) #endif } +/* Classifies a closure as diffuse-like or specular-like. + * This is needed for the denoising feature pass generation, + * which are written on the first bounce where more than 25% + * of the sampling weight belongs to diffuse-line closures. */ +ccl_device_inline bool bsdf_is_specular_like(ShaderClosure *sc) +{ + if(CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { + return true; + } + + if(CLOSURE_IS_BSDF_MICROFACET(sc->type)) { + MicrofacetBsdf *bsdf = (MicrofacetBsdf*) sc; + return (bsdf->alpha_x*bsdf->alpha_y <= 0.075f*0.075f); + } + + return false; +} + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h index 7e0f5a7ec75..a5ba2cb2972 100644 --- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h +++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h @@ -40,7 +40,6 @@ typedef ccl_addr_space struct VelvetBsdf { float sigma; float invsigma2; - float3 N; } VelvetBsdf; ccl_device int bsdf_ashikhmin_velvet_setup(VelvetBsdf *bsdf) diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h index dcd187f9305..ec6f1f20996 100644 --- a/intern/cycles/kernel/closure/bsdf_diffuse.h +++ b/intern/cycles/kernel/closure/bsdf_diffuse.h @@ -37,7 +37,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct DiffuseBsdf { SHADER_CLOSURE_BASE; - float3 N; } DiffuseBsdf; /* DIFFUSE */ diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h index 2d982a95fe4..24f40af46a3 100644 --- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h +++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h @@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct DiffuseRampBsdf { SHADER_CLOSURE_BASE; - float3 N; float3 *colors; } DiffuseRampBsdf; diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h index 3fe7572e4ce..30cc8b90330 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet.h @@ -46,7 +46,6 @@ typedef ccl_addr_space struct MicrofacetBsdf { float alpha_x, alpha_y, ior; MicrofacetExtra *extra; float3 T; - float3 N; } MicrofacetBsdf; /* Beckmann and GGX microfacet importance sampling. */ diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h index 57f1e733ee7..30644946840 100644 --- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h +++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h @@ -42,7 +42,7 @@ ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha) /* Sample slope distribution (based on page 14 of the supplemental implementation). */ ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU) { - if(cosI > 0.9999f || cosI < 1e-6f) { + if(cosI > 0.9999f || fabsf(cosI) < 1e-6f) { const float r = sqrtf(randU.x / max(1.0f - randU.x, 1e-7f)); const float phi = M_2PI_F * randU.y; return make_float2(r*cosf(phi), r*sinf(phi)); diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h index cb342a026ef..6b770fc0c16 100644 --- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h +++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h @@ -22,7 +22,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct OrenNayarBsdf { SHADER_CLOSURE_BASE; - float3 N; float roughness; float a; float b; diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h index e152a8780db..420f94755ee 100644 --- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h +++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h @@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct PhongRampBsdf { SHADER_CLOSURE_BASE; - float3 N; float exponent; float3 *colors; } PhongRampBsdf; diff --git a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h index 8a116693bdb..215c32e1ffb 100644 --- a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h +++ b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h @@ -28,7 +28,6 @@ typedef ccl_addr_space struct PrincipledDiffuseBsdf { SHADER_CLOSURE_BASE; float roughness; - float3 N; } PrincipledDiffuseBsdf; ccl_device float3 calculate_principled_diffuse_brdf(const PrincipledDiffuseBsdf *bsdf, diff --git a/intern/cycles/kernel/closure/bsdf_principled_sheen.h b/intern/cycles/kernel/closure/bsdf_principled_sheen.h index 58df4f7ddbb..f4476bfecd0 100644 --- a/intern/cycles/kernel/closure/bsdf_principled_sheen.h +++ b/intern/cycles/kernel/closure/bsdf_principled_sheen.h @@ -26,7 +26,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct PrincipledSheenBsdf { SHADER_CLOSURE_BASE; - float3 N; } PrincipledSheenBsdf; ccl_device float3 calculate_principled_sheen_brdf(const PrincipledSheenBsdf *bsdf, diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h index 28e775bcbc8..d8b6d8ddead 100644 --- a/intern/cycles/kernel/closure/bsdf_toon.h +++ b/intern/cycles/kernel/closure/bsdf_toon.h @@ -38,7 +38,6 @@ CCL_NAMESPACE_BEGIN typedef ccl_addr_space struct ToonBsdf { SHADER_CLOSURE_BASE; - float3 N; float size; float smooth; } ToonBsdf; diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h index f9236a6e52c..f733ea4c517 100644 --- a/intern/cycles/kernel/closure/bssrdf.h +++ b/intern/cycles/kernel/closure/bssrdf.h @@ -28,7 +28,6 @@ typedef ccl_addr_space struct Bssrdf { float texture_blur; float albedo; float roughness; - float3 N; } Bssrdf; /* Planar Truncated Gaussian diff --git a/intern/cycles/kernel/filter/filter.h b/intern/cycles/kernel/filter/filter.h new file mode 100644 index 00000000000..f6e474d6702 --- /dev/null +++ b/intern/cycles/kernel/filter/filter.h @@ -0,0 +1,52 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __FILTER_H__ +#define __FILTER_H__ + +/* CPU Filter Kernel Interface */ + +#include "util/util_types.h" + +#include "kernel/filter/filter_defines.h" + +CCL_NAMESPACE_BEGIN + +#define KERNEL_NAME_JOIN(x, y, z) x ## _ ## y ## _ ## z +#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name) +#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name) + +#define KERNEL_ARCH cpu +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/filter_cpu.h" + +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/filter_cpu.h" + +CCL_NAMESPACE_END + +#endif /* __FILTER_H__ */ diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h new file mode 100644 index 00000000000..ce96f733aff --- /dev/null +++ b/intern/cycles/kernel/filter/filter_defines.h @@ -0,0 +1,38 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __FILTER_DEFINES_H__ +#define __FILTER_DEFINES_H__ + +#define DENOISE_FEATURES 10 +#define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES) +#define XTWX_SIZE (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2) +#define XTWY_SIZE (DENOISE_FEATURES+1) + +typedef struct TilesInfo { + int offsets[9]; + int strides[9]; + int x[4]; + int y[4]; + /* TODO(lukas): CUDA doesn't have uint64_t... */ +#ifdef __KERNEL_OPENCL__ + ccl_global float *buffers[9]; +#else + long long int buffers[9]; +#endif +} TilesInfo; + +#endif /* __FILTER_DEFINES_H__*/ diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h new file mode 100644 index 00000000000..f5a40d49997 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_features.h @@ -0,0 +1,120 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + CCL_NAMESPACE_BEGIN + +#define ccl_get_feature(buffer, pass) buffer[(pass)*pass_stride] + +/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y). + * pixel_buffer always points to the current pixel in the first pass. */ +#define FOR_PIXEL_WINDOW pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \ + for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ + for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) { + +#define END_FOR_PIXEL_WINDOW } \ + pixel_buffer += buffer_w - (high.x - low.x); \ + } + +ccl_device_inline void filter_get_features(int2 pixel, ccl_global float ccl_restrict_ptr buffer, float *features, float ccl_restrict_ptr mean, int pass_stride) +{ + features[0] = pixel.x; + features[1] = pixel.y; + features[2] = ccl_get_feature(buffer, 0); + features[3] = ccl_get_feature(buffer, 1); + features[4] = ccl_get_feature(buffer, 2); + features[5] = ccl_get_feature(buffer, 3); + features[6] = ccl_get_feature(buffer, 4); + features[7] = ccl_get_feature(buffer, 5); + features[8] = ccl_get_feature(buffer, 6); + features[9] = ccl_get_feature(buffer, 7); + if(mean) { + for(int i = 0; i < DENOISE_FEATURES; i++) + features[i] -= mean[i]; + } +} + +ccl_device_inline void filter_get_feature_scales(int2 pixel, ccl_global float ccl_restrict_ptr buffer, float *scales, float ccl_restrict_ptr mean, int pass_stride) +{ + scales[0] = fabsf(pixel.x - mean[0]); + scales[1] = fabsf(pixel.y - mean[1]); + scales[2] = fabsf(ccl_get_feature(buffer, 0) - mean[2]); + scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3], + ccl_get_feature(buffer, 2) - mean[4], + ccl_get_feature(buffer, 3) - mean[5])); + scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]); + scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7], + ccl_get_feature(buffer, 6) - mean[8], + ccl_get_feature(buffer, 7) - mean[9])); +} + +ccl_device_inline void filter_calculate_scale(float *scale) +{ + scale[0] = 1.0f/max(scale[0], 0.01f); + scale[1] = 1.0f/max(scale[1], 0.01f); + scale[2] = 1.0f/max(scale[2], 0.01f); + scale[6] = 1.0f/max(scale[4], 0.01f); + scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f); + scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f); +} + +ccl_device_inline float3 filter_get_pixel_color(ccl_global float ccl_restrict_ptr buffer, int pass_stride) +{ + return make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2)); +} + +ccl_device_inline float filter_get_pixel_variance(ccl_global float ccl_restrict_ptr buffer, int pass_stride) +{ + return average(make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2))); +} + +ccl_device_inline void design_row_add(float *design_row, + int rank, + ccl_global float ccl_restrict_ptr transform, + int stride, + int row, + float feature) +{ + for(int i = 0; i < rank; i++) { + design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature; + } +} + +/* Fill the design row. */ +ccl_device_inline void filter_get_design_row_transform(int2 p_pixel, + ccl_global float ccl_restrict_ptr p_buffer, + int2 q_pixel, + ccl_global float ccl_restrict_ptr q_buffer, + int pass_stride, + int rank, + float *design_row, + ccl_global float ccl_restrict_ptr transform, + int stride) +{ + design_row[0] = 1.0f; + math_vector_zero(design_row+1, rank); + design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x); + design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y); + design_row_add(design_row, rank, transform, stride, 2, ccl_get_feature(q_buffer, 0) - ccl_get_feature(p_buffer, 0)); + design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1)); + design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2)); + design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3)); + design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4)); + design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5)); + design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6)); + design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7)); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h new file mode 100644 index 00000000000..303c8f482e3 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_features_sse.h @@ -0,0 +1,95 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride) + +/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time. + * pixel_buffer always points to the first of the 4 current pixel in the first pass. + * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. */ + +#define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \ + for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ + __m128 y4 = _mm_set1_ps(pixel.y); \ + for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \ + __m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \ + __m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x)); + +#define END_FOR_PIXEL_WINDOW_SSE } \ + pixel_buffer += buffer_w - (pixel.x - low.x); \ + } + +ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *features, __m128 ccl_restrict_ptr mean, int pass_stride) +{ + features[0] = x; + features[1] = y; + features[2] = ccl_get_feature_sse(0); + features[3] = ccl_get_feature_sse(1); + features[4] = ccl_get_feature_sse(2); + features[5] = ccl_get_feature_sse(3); + features[6] = ccl_get_feature_sse(4); + features[7] = ccl_get_feature_sse(5); + features[8] = ccl_get_feature_sse(6); + features[9] = ccl_get_feature_sse(7); + if(mean) { + for(int i = 0; i < DENOISE_FEATURES; i++) + features[i] = _mm_sub_ps(features[i], mean[i]); + } + for(int i = 0; i < DENOISE_FEATURES; i++) + features[i] = _mm_mask_ps(features[i], active_pixels); +} + +ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *scales, __m128 ccl_restrict_ptr mean, int pass_stride) +{ + scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels); + scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels); + + scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(0), mean[2])), active_pixels); + + __m128 diff, scale; + diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]); + scale = _mm_mul_ps(diff, diff); + diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]); + scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); + diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]); + scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); + scales[3] = _mm_mask_ps(scale, active_pixels); + + scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels); + + diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]); + scale = _mm_mul_ps(diff, diff); + diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]); + scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); + diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]); + scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); + scales[5] = _mm_mask_ps(scale, active_pixels); +} + +ccl_device_inline void filter_calculate_scale_sse(__m128 *scale) +{ + scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f))); + scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f))); + scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f))); + scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f))); + + scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f))); + scale[3] = scale[4] = scale[5] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f))); +} + + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_kernel.h b/intern/cycles/kernel/filter/filter_kernel.h new file mode 100644 index 00000000000..2ef03dc0a02 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_kernel.h @@ -0,0 +1,50 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "util/util_color.h" +#include "util/util_math.h" +#include "util/util_math_fast.h" +#include "util/util_texture.h" + +#include "util/util_atomic.h" +#include "util/util_math_matrix.h" + +#include "kernel/filter/filter_defines.h" + +#include "kernel/filter/filter_features.h" +#ifdef __KERNEL_SSE3__ +# include "kernel/filter/filter_features_sse.h" +#endif + +#include "kernel/filter/filter_prefilter.h" + +#ifdef __KERNEL_GPU__ +# include "kernel/filter/filter_transform_gpu.h" +#else +# ifdef __KERNEL_SSE3__ +# include "kernel/filter/filter_transform_sse.h" +# else +# include "kernel/filter/filter_transform.h" +# endif +#endif + +#include "kernel/filter/filter_reconstruction.h" + +#ifdef __KERNEL_CPU__ +# include "kernel/filter/filter_nlm_cpu.h" +#else +# include "kernel/filter/filter_nlm_gpu.h" +#endif diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h new file mode 100644 index 00000000000..1a314b100be --- /dev/null +++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h @@ -0,0 +1,163 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, float ccl_restrict_ptr weightImage, float ccl_restrict_ptr varianceImage, float *differenceImage, int4 rect, int w, int channel_offset, float a, float k_2) +{ + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + float diff = 0.0f; + int numChannels = channel_offset? 3 : 1; + for(int c = 0; c < numChannels; c++) { + float cdiff = weightImage[c*channel_offset + y*w+x] - weightImage[c*channel_offset + (y+dy)*w+(x+dx)]; + float pvar = varianceImage[c*channel_offset + y*w+x]; + float qvar = varianceImage[c*channel_offset + (y+dy)*w+(x+dx)]; + diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar)); + } + if(numChannels > 1) { + diff *= 1.0f/numChannels; + } + differenceImage[y*w+x] = diff; + } + } +} + +ccl_device_inline void kernel_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) +{ +#ifdef __KERNEL_SSE3__ + int aligned_lowx = (rect.x & ~(3)); + int aligned_highx = ((rect.z + 3) & ~(3)); +#endif + for(int y = rect.y; y < rect.w; y++) { + const int low = max(rect.y, y-f); + const int high = min(rect.w, y+f+1); + for(int x = rect.x; x < rect.z; x++) { + outImage[y*w+x] = 0.0f; + } + for(int y1 = low; y1 < high; y1++) { +#ifdef __KERNEL_SSE3__ + for(int x = aligned_lowx; x < aligned_highx; x+=4) { + _mm_store_ps(outImage + y*w+x, _mm_add_ps(_mm_load_ps(outImage + y*w+x), _mm_load_ps(differenceImage + y1*w+x))); + } +#else + for(int x = rect.x; x < rect.z; x++) { + outImage[y*w+x] += differenceImage[y1*w+x]; + } +#endif + } + for(int x = rect.x; x < rect.z; x++) { + outImage[y*w+x] *= 1.0f/(high - low); + } + } +} + +ccl_device_inline void kernel_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) +{ + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + outImage[y*w+x] = 0.0f; + } + } + for(int dx = -f; dx <= f; dx++) { + int pos_dx = max(0, dx); + int neg_dx = min(0, dx); + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) { + outImage[y*w+x] += differenceImage[y*w+dx+x]; + } + } + } + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + outImage[y*w+x] = expf(-max(outImage[y*w+x] * (1.0f/(high - low)), 0.0f)); + } + } +} + +ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, float ccl_restrict_ptr differenceImage, float ccl_restrict_ptr image, float *outImage, float *accumImage, int4 rect, int w, int f) +{ + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + float sum = 0.0f; + for(int x1 = low; x1 < high; x1++) { + sum += differenceImage[y*w+x1]; + } + float weight = sum * (1.0f/(high - low)); + accumImage[y*w+x] += weight; + outImage[y*w+x] += weight*image[(y+dy)*w+(x+dx)]; + } + } +} + +ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, + float ccl_restrict_ptr differenceImage, + float ccl_restrict_ptr buffer, + float *color_pass, + float *variance_pass, + float *transform, + int *rank, + float *XtWX, + float3 *XtWY, + int4 rect, + int4 filter_rect, + int w, int h, int f, + int pass_stride) +{ + /* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */ + for(int fy = max(0, rect.y-filter_rect.y); fy < min(filter_rect.w, rect.w-filter_rect.y); fy++) { + int y = fy + filter_rect.y; + for(int fx = max(0, rect.x-filter_rect.x); fx < min(filter_rect.z, rect.z-filter_rect.x); fx++) { + int x = fx + filter_rect.x; + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + float sum = 0.0f; + for(int x1 = low; x1 < high; x1++) { + sum += differenceImage[y*w+x1]; + } + float weight = sum * (1.0f/(high - low)); + + int storage_ofs = fy*filter_rect.z + fx; + float *l_transform = transform + storage_ofs*TRANSFORM_SIZE; + float *l_XtWX = XtWX + storage_ofs*XTWX_SIZE; + float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE; + int *l_rank = rank + storage_ofs; + + kernel_filter_construct_gramian(x, y, 1, + dx, dy, w, h, + pass_stride, + buffer, + color_pass, variance_pass, + l_transform, l_rank, + weight, l_XtWX, l_XtWY, 0); + } + } +} + +ccl_device_inline void kernel_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w) +{ + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + outImage[y*w+x] /= accumImage[y*w+x]; + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h new file mode 100644 index 00000000000..b5ba7cf51a5 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h @@ -0,0 +1,147 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y, + int dx, int dy, + ccl_global float ccl_restrict_ptr weightImage, + ccl_global float ccl_restrict_ptr varianceImage, + ccl_global float *differenceImage, + int4 rect, int w, + int channel_offset, + float a, float k_2) +{ + float diff = 0.0f; + int numChannels = channel_offset? 3 : 1; + for(int c = 0; c < numChannels; c++) { + float cdiff = weightImage[c*channel_offset + y*w+x] - weightImage[c*channel_offset + (y+dy)*w+(x+dx)]; + float pvar = varianceImage[c*channel_offset + y*w+x]; + float qvar = varianceImage[c*channel_offset + (y+dy)*w+(x+dx)]; + diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar)); + } + if(numChannels > 1) { + diff *= 1.0f/numChannels; + } + differenceImage[y*w+x] = diff; +} + +ccl_device_inline void kernel_filter_nlm_blur(int x, int y, + ccl_global float ccl_restrict_ptr differenceImage, + ccl_global float *outImage, + int4 rect, int w, int f) +{ + float sum = 0.0f; + const int low = max(rect.y, y-f); + const int high = min(rect.w, y+f+1); + for(int y1 = low; y1 < high; y1++) { + sum += differenceImage[y1*w+x]; + } + sum *= 1.0f/(high-low); + outImage[y*w+x] = sum; +} + +ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y, + ccl_global float ccl_restrict_ptr differenceImage, + ccl_global float *outImage, + int4 rect, int w, int f) +{ + float sum = 0.0f; + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + for(int x1 = low; x1 < high; x1++) { + sum += differenceImage[y*w+x1]; + } + sum *= 1.0f/(high-low); + outImage[y*w+x] = expf(-max(sum, 0.0f)); +} + +ccl_device_inline void kernel_filter_nlm_update_output(int x, int y, + int dx, int dy, + ccl_global float ccl_restrict_ptr differenceImage, + ccl_global float ccl_restrict_ptr image, + ccl_global float *outImage, + ccl_global float *accumImage, + int4 rect, int w, int f) +{ + float sum = 0.0f; + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + for(int x1 = low; x1 < high; x1++) { + sum += differenceImage[y*w+x1]; + } + sum *= 1.0f/(high-low); + if(outImage) { + accumImage[y*w+x] += sum; + outImage[y*w+x] += sum*image[(y+dy)*w+(x+dx)]; + } + else { + accumImage[y*w+x] = sum; + } +} + +ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy, + int dx, int dy, + ccl_global float ccl_restrict_ptr differenceImage, + ccl_global float ccl_restrict_ptr buffer, + ccl_global float *color_pass, + ccl_global float *variance_pass, + ccl_global float ccl_restrict_ptr transform, + ccl_global int *rank, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 rect, + int4 filter_rect, + int w, int h, int f, + int pass_stride, + int localIdx) +{ + int y = fy + filter_rect.y; + int x = fx + filter_rect.x; + const int low = max(rect.x, x-f); + const int high = min(rect.z, x+f+1); + float sum = 0.0f; + for(int x1 = low; x1 < high; x1++) { + sum += differenceImage[y*w+x1]; + } + float weight = sum * (1.0f/(high - low)); + + int storage_ofs = fy*filter_rect.z + fx; + transform += storage_ofs; + rank += storage_ofs; + XtWX += storage_ofs; + XtWY += storage_ofs; + + kernel_filter_construct_gramian(x, y, + filter_rect.z*filter_rect.w, + dx, dy, w, h, + pass_stride, + buffer, + color_pass, variance_pass, + transform, rank, + weight, XtWX, XtWY, + localIdx); +} + +ccl_device_inline void kernel_filter_nlm_normalize(int x, int y, + ccl_global float *outImage, + ccl_global float ccl_restrict_ptr accumImage, + int4 rect, int w) +{ + outImage[y*w+x] /= accumImage[y*w+x]; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h new file mode 100644 index 00000000000..54bcf888052 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_prefilter.h @@ -0,0 +1,145 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* First step of the shadow prefiltering, performs the shadow division and stores all data + * in a nice and easy rectangular array that can be passed to the NLM filter. + * + * Calculates: + * unfiltered: Contains the two half images of the shadow feature pass + * sampleVariance: The sample-based variance calculated in the kernel. Note: This calculation is biased in general, and especially here since the variance of the ratio can only be approximated. + * sampleVarianceV: Variance of the sample variance estimation, quite noisy (since it's essentially the buffer variance of the two variance halves) + * bufferVariance: The buffer-based variance of the shadow feature. Unbiased, but quite noisy. + */ +ccl_device void kernel_filter_divide_shadow(int sample, + ccl_global TilesInfo *tiles, + int x, int y, + ccl_global float *unfilteredA, + ccl_global float *unfilteredB, + ccl_global float *sampleVariance, + ccl_global float *sampleVarianceV, + ccl_global float *bufferVariance, + int4 rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ + int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2); + int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2); + int tile = ytile*3+xtile; + + int offset = tiles->offsets[tile]; + int stride = tiles->strides[tile]; + ccl_global float ccl_restrict_ptr center_buffer = (ccl_global float*) tiles->buffers[tile]; + center_buffer += (y*stride + x + offset)*buffer_pass_stride; + center_buffer += buffer_denoising_offset + 14; + + int buffer_w = align_up(rect.z - rect.x, 4); + int idx = (y-rect.y)*buffer_w + (x - rect.x); + unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f); + unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f); + + float varA = center_buffer[2]; + float varB = center_buffer[5]; + int odd_sample = (sample+1)/2; + int even_sample = sample/2; + if(use_split_variance) { + varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample); + varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample); + } + varA /= (odd_sample - 1); + varB /= (even_sample - 1); + + sampleVariance[idx] = 0.5f*(varA + varB) / sample; + sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample*sample); + bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) * (unfilteredA[idx] - unfilteredB[idx]); +} + +/* Load a regular feature from the render buffers into the denoise buffer. + * Parameters: + * - sample: The sample amount in the buffer, used to normalize the buffer. + * - m_offset, v_offset: Render Buffer Pass offsets of mean and variance of the feature. + * - x, y: Current pixel + * - mean, variance: Target denoise buffers. + * - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive). + */ +ccl_device void kernel_filter_get_feature(int sample, + ccl_global TilesInfo *tiles, + int m_offset, int v_offset, + int x, int y, + ccl_global float *mean, + ccl_global float *variance, + int4 rect, int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ + int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2); + int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2); + int tile = ytile*3+xtile; + ccl_global float *center_buffer = ((ccl_global float*) tiles->buffers[tile]) + (tiles->offsets[tile] + y*tiles->strides[tile] + x)*buffer_pass_stride + buffer_denoising_offset; + + int buffer_w = align_up(rect.z - rect.x, 4); + int idx = (y-rect.y)*buffer_w + (x - rect.x); + + mean[idx] = center_buffer[m_offset] / sample; + if(use_split_variance) { + variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1))); + } + else { + variance[idx] = center_buffer[v_offset] / (sample * (sample-1)); + } +} + +/* Combine A/B buffers. + * Calculates the combined mean and the buffer variance. */ +ccl_device void kernel_filter_combine_halves(int x, int y, + ccl_global float *mean, + ccl_global float *variance, + ccl_global float *a, + ccl_global float *b, + int4 rect, int r) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + int idx = (y-rect.y)*buffer_w + (x - rect.x); + + if(mean) mean[idx] = 0.5f * (a[idx]+b[idx]); + if(variance) { + if(r == 0) variance[idx] = 0.25f * (a[idx]-b[idx])*(a[idx]-b[idx]); + else { + variance[idx] = 0.0f; + float values[25]; + int numValues = 0; + for(int py = max(y-r, rect.y); py < min(y+r+1, rect.w); py++) { + for(int px = max(x-r, rect.x); px < min(x+r+1, rect.z); px++) { + int pidx = (py-rect.y)*buffer_w + (px-rect.x); + values[numValues++] = 0.25f * (a[pidx]-b[pidx])*(a[pidx]-b[pidx]); + } + } + /* Insertion-sort the variances (fast enough for 25 elements). */ + for(int i = 1; i < numValues; i++) { + float v = values[i]; + int j; + for(j = i-1; j >= 0 && values[j] > v; j--) + values[j+1] = values[j]; + values[j+1] = v; + } + variance[idx] = values[(7*numValues)/8]; + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h new file mode 100644 index 00000000000..02f3802fa0c --- /dev/null +++ b/intern/cycles/kernel/filter/filter_reconstruction.h @@ -0,0 +1,103 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device_inline void kernel_filter_construct_gramian(int x, int y, + int storage_stride, + int dx, int dy, + int w, int h, + int pass_stride, + ccl_global float ccl_restrict_ptr buffer, + ccl_global float *color_pass, + ccl_global float *variance_pass, + ccl_global float ccl_restrict_ptr transform, + ccl_global int *rank, + float weight, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int localIdx) +{ + int p_offset = y *w + x; + int q_offset = (y+dy)*w + (x+dx); + +#ifdef __KERNEL_CPU__ + const int stride = 1; + (void)storage_stride; + (void)localIdx; + float design_row[DENOISE_FEATURES+1]; +#elif defined(__KERNEL_CUDA__) + const int stride = storage_stride; + ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE]; + ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1); +#else + const int stride = storage_stride; + float design_row[DENOISE_FEATURES+1]; +#endif + + float3 p_color = filter_get_pixel_color(color_pass + p_offset, pass_stride); + float3 q_color = filter_get_pixel_color(color_pass + q_offset, pass_stride); + + float p_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + p_offset, pass_stride)); + float q_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + q_offset, pass_stride)); + + if(average(fabs(p_color - q_color)) > 3.0f*(p_std_dev + q_std_dev + 1e-3f)) { + return; + } + + filter_get_design_row_transform(make_int2(x, y), buffer + p_offset, + make_int2(x+dx, y+dy), buffer + q_offset, + pass_stride, *rank, design_row, transform, stride); + + math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride); + math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride); +} + +ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h, + ccl_global float *buffer, + ccl_global int *rank, + int storage_stride, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 buffer_params, + int sample) +{ +#ifdef __KERNEL_CPU__ + const int stride = 1; + (void)storage_stride; +#else + const int stride = storage_stride; +#endif + + math_trimatrix_vec3_solve(XtWX, XtWY, (*rank)+1, stride); + + float3 final_color = XtWY[0]; + + ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z; + final_color *= sample; + if(buffer_params.w) { + final_color.x += combined_buffer[buffer_params.w+0]; + final_color.y += combined_buffer[buffer_params.w+1]; + final_color.z += combined_buffer[buffer_params.w+2]; + } + combined_buffer[0] = final_color.x; + combined_buffer[1] = final_color.y; + combined_buffer[2] = final_color.z; +} + +#undef STORAGE_TYPE + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h new file mode 100644 index 00000000000..139dc402d21 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_transform.h @@ -0,0 +1,113 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer, + int x, int y, int4 rect, + int pass_stride, + float *transform, int *rank, + int radius, float pca_threshold) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + + float features[DENOISE_FEATURES]; + + /* Temporary storage, used in different steps of the algorithm. */ + float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES]; + float tempvector[2*DENOISE_FEATURES]; + float ccl_restrict_ptr pixel_buffer; + int2 pixel; + + + + + /* === Calculate denoising window. === */ + int2 low = make_int2(max(rect.x, x - radius), + max(rect.y, y - radius)); + int2 high = make_int2(min(rect.z, x + radius + 1), + min(rect.w, y + radius + 1)); + + + + + /* === Shift feature passes to have mean 0. === */ + float feature_means[DENOISE_FEATURES]; + math_vector_zero(feature_means, DENOISE_FEATURES); + FOR_PIXEL_WINDOW { + filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride); + math_vector_add(feature_means, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW + + float pixel_scale = 1.0f / ((high.y - low.y) * (high.x - low.x)); + math_vector_scale(feature_means, pixel_scale, DENOISE_FEATURES); + + /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */ + float *feature_scale = tempvector; + math_vector_zero(feature_scale, DENOISE_FEATURES); + + FOR_PIXEL_WINDOW { + filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride); + math_vector_max(feature_scale, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW + + filter_calculate_scale(feature_scale); + + + /* === Generate the feature transformation. === + * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space + * which generally has fewer dimensions. This mainly helps to prevent overfitting. */ + float* feature_matrix = tempmatrix; + math_matrix_zero(feature_matrix, DENOISE_FEATURES); + FOR_PIXEL_WINDOW { + filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride); + math_vector_mul(features, feature_scale, DENOISE_FEATURES); + math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f); + } END_FOR_PIXEL_WINDOW + + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1); + *rank = 0; + if(pca_threshold < 0.0f) { + float threshold_energy = 0.0f; + for(int i = 0; i < DENOISE_FEATURES; i++) { + threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + } + threshold_energy *= 1.0f - (-pca_threshold); + + float reduced_energy = 0.0f; + for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) { + if(i >= 2 && reduced_energy >= threshold_energy) + break; + float s = feature_matrix[i*DENOISE_FEATURES+i]; + reduced_energy += s; + } + } + else { + for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) { + float s = feature_matrix[i*DENOISE_FEATURES+i]; + if(i >= 2 && sqrtf(s) < pca_threshold) + break; + } + } + + /* Bake the feature scaling into the transformation matrix. */ + for(int i = 0; i < (*rank); i++) { + math_vector_mul(transform + i*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES); + } + math_matrix_transpose(transform, DENOISE_FEATURES, 1); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h new file mode 100644 index 00000000000..68304e14143 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_transform_gpu.h @@ -0,0 +1,117 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer, + int x, int y, int4 rect, + int pass_stride, + ccl_global float *transform, + ccl_global int *rank, + int radius, float pca_threshold, + int transform_stride, int localIdx) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + +#ifdef __KERNEL_CUDA__ + ccl_local float shared_features[DENOISE_FEATURES*CCL_MAX_LOCAL_SIZE]; + ccl_local_param float *features = shared_features + localIdx*DENOISE_FEATURES; +#else + float features[DENOISE_FEATURES]; +#endif + + /* === Calculate denoising window. === */ + int2 low = make_int2(max(rect.x, x - radius), + max(rect.y, y - radius)); + int2 high = make_int2(min(rect.z, x + radius + 1), + min(rect.w, y + radius + 1)); + ccl_global float ccl_restrict_ptr pixel_buffer; + int2 pixel; + + + + + /* === Shift feature passes to have mean 0. === */ + float feature_means[DENOISE_FEATURES]; + math_vector_zero(feature_means, DENOISE_FEATURES); + FOR_PIXEL_WINDOW { + filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride); + math_vector_add(feature_means, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW + + float pixel_scale = 1.0f / ((high.y - low.y) * (high.x - low.x)); + math_vector_scale(feature_means, pixel_scale, DENOISE_FEATURES); + + /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */ + float feature_scale[DENOISE_FEATURES]; + math_vector_zero(feature_scale, DENOISE_FEATURES); + + FOR_PIXEL_WINDOW { + filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride); + math_vector_max(feature_scale, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW + + filter_calculate_scale(feature_scale); + + + + /* === Generate the feature transformation. === + * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space + * which generally has fewer dimensions. This mainly helps to prevent overfitting. */ + float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; + math_matrix_zero(feature_matrix, DENOISE_FEATURES); + FOR_PIXEL_WINDOW { + filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride); + math_vector_mul(features, feature_scale, DENOISE_FEATURES); + math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f); + } END_FOR_PIXEL_WINDOW + + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride); + *rank = 0; + if(pca_threshold < 0.0f) { + float threshold_energy = 0.0f; + for(int i = 0; i < DENOISE_FEATURES; i++) { + threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + } + threshold_energy *= 1.0f - (-pca_threshold); + + float reduced_energy = 0.0f; + for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) { + if(i >= 2 && reduced_energy >= threshold_energy) + break; + float s = feature_matrix[i*DENOISE_FEATURES+i]; + reduced_energy += s; + } + } + else { + for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) { + float s = feature_matrix[i*DENOISE_FEATURES+i]; + if(i >= 2 && sqrtf(s) < pca_threshold) + break; + } + } + + math_matrix_transpose(transform, DENOISE_FEATURES, transform_stride); + + /* Bake the feature scaling into the transformation matrix. */ + for(int i = 0; i < DENOISE_FEATURES; i++) { + for(int j = 0; j < (*rank); j++) { + transform[(i*DENOISE_FEATURES + j)*transform_stride] *= feature_scale[i]; + } + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h new file mode 100644 index 00000000000..ed3a92f6241 --- /dev/null +++ b/intern/cycles/kernel/filter/filter_transform_sse.h @@ -0,0 +1,102 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer, + int x, int y, int4 rect, + int pass_stride, + float *transform, int *rank, + int radius, float pca_threshold) +{ + int buffer_w = align_up(rect.z - rect.x, 4); + + __m128 features[DENOISE_FEATURES]; + float ccl_restrict_ptr pixel_buffer; + int2 pixel; + + int2 low = make_int2(max(rect.x, x - radius), + max(rect.y, y - radius)); + int2 high = make_int2(min(rect.z, x + radius + 1), + min(rect.w, y + radius + 1)); + + __m128 feature_means[DENOISE_FEATURES]; + math_vector_zero_sse(feature_means, DENOISE_FEATURES); + FOR_PIXEL_WINDOW_SSE { + filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride); + math_vector_add_sse(feature_means, DENOISE_FEATURES, features); + } END_FOR_PIXEL_WINDOW_SSE + + __m128 pixel_scale = _mm_set1_ps(1.0f / ((high.y - low.y) * (high.x - low.x))); + for(int i = 0; i < DENOISE_FEATURES; i++) { + feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale); + } + + __m128 feature_scale[DENOISE_FEATURES]; + math_vector_zero_sse(feature_scale, DENOISE_FEATURES); + FOR_PIXEL_WINDOW_SSE { + filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); + math_vector_max_sse(feature_scale, features, DENOISE_FEATURES); + } END_FOR_PIXEL_WINDOW_SSE + + filter_calculate_scale_sse(feature_scale); + + __m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES]; + math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES); + FOR_PIXEL_WINDOW_SSE { + filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride); + math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale); + math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f)); + } END_FOR_PIXEL_WINDOW_SSE + + float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; + math_matrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse); + + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1); + + *rank = 0; + if(pca_threshold < 0.0f) { + float threshold_energy = 0.0f; + for(int i = 0; i < DENOISE_FEATURES; i++) { + threshold_energy += feature_matrix[i*DENOISE_FEATURES+i]; + } + threshold_energy *= 1.0f - (-pca_threshold); + + float reduced_energy = 0.0f; + for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) { + if(i >= 2 && reduced_energy >= threshold_energy) + break; + float s = feature_matrix[i*DENOISE_FEATURES+i]; + reduced_energy += s; + } + } + else { + for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) { + float s = feature_matrix[i*DENOISE_FEATURES+i]; + if(i >= 2 && sqrtf(s) < pca_threshold) + break; + } + } + + math_matrix_transpose(transform, DENOISE_FEATURES, 1); + + /* Bake the feature scaling into the transformation matrix. */ + for(int i = 0; i < DENOISE_FEATURES; i++) { + math_vector_scale(transform + i*DENOISE_FEATURES, _mm_cvtss_f32(feature_scale[i]), *rank); + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h index 47778553b94..105aee8da15 100644 --- a/intern/cycles/kernel/geom/geom_triangle.h +++ b/intern/cycles/kernel/geom/geom_triangle.h @@ -76,7 +76,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3 /* Interpolate smooth vertex normal from vertices */ -ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v) +ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v) { /* load triangle vertices */ const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); @@ -84,7 +84,9 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y)); float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z)); - return normalize((1.0f - u - v)*n2 + u*n0 + v*n1); + float3 N = safe_normalize((1.0f - u - v)*n2 + u*n0 + v*n1); + + return is_zero(N)? Ng: N; } /* Ray differentials on triangle */ diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h index 06c0fb2fbca..84a988f1dbc 100644 --- a/intern/cycles/kernel/kernel.h +++ b/intern/cycles/kernel/kernel.h @@ -50,30 +50,20 @@ void kernel_tex_copy(KernelGlobals *kg, #define KERNEL_ARCH cpu #include "kernel/kernels/cpu/kernel_cpu.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 -# define KERNEL_ARCH cpu_sse2 -# include "kernel/kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 -# define KERNEL_ARCH cpu_sse3 -# include "kernel/kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 -# define KERNEL_ARCH cpu_sse41 -# include "kernel/kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX -# define KERNEL_ARCH cpu_avx -# include "kernel/kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ - -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 -# define KERNEL_ARCH cpu_avx2 -# include "kernel/kernels/cpu/kernel_cpu.h" -#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/kernel_cpu.h" + +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/kernel_cpu.h" CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h index 823d30dde78..06728415c15 100644 --- a/intern/cycles/kernel/kernel_accumulate.h +++ b/intern/cycles/kernel/kernel_accumulate.h @@ -222,6 +222,12 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass) L->path_total_shaded = make_float3(0.0f, 0.0f, 0.0f); L->shadow_color = make_float3(0.0f, 0.0f, 0.0f); #endif + +#ifdef __DENOISING_FEATURES__ + L->denoising_normal = make_float3(0.0f, 0.0f, 0.0f); + L->denoising_albedo = make_float3(0.0f, 0.0f, 0.0f); + L->denoising_depth = 0.0f; +#endif /* __DENOISING_FEATURES__ */ } ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput, @@ -277,15 +283,15 @@ ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 thro } ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, + ccl_addr_space PathState *state, float3 throughput, float3 alpha, float3 bsdf, - float3 ao, - int bounce) + float3 ao) { #ifdef __PASSES__ if(L->use_light_pass) { - if(bounce == 0) { + if(state->bounce == 0) { /* directly visible lighting */ L->direct_diffuse += throughput*bsdf*ao; L->ao += alpha*throughput*ao; @@ -302,31 +308,43 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, } #ifdef __SHADOW_TRICKS__ - float3 light = throughput * bsdf; - L->path_total += light; - L->path_total_shaded += ao * light; + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + float3 light = throughput * bsdf; + L->path_total += light; + L->path_total_shaded += ao * light; + } #endif } ccl_device_inline void path_radiance_accum_total_ao( PathRadiance *L, + ccl_addr_space PathState *state, float3 throughput, float3 bsdf) { #ifdef __SHADOW_TRICKS__ - L->path_total += throughput * bsdf; + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + L->path_total += throughput * bsdf; + } #else (void) L; + (void) state; (void) throughput; (void) bsdf; #endif } -ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 throughput, BsdfEval *bsdf_eval, float3 shadow, float shadow_fac, int bounce, bool is_lamp) +ccl_device_inline void path_radiance_accum_light(PathRadiance *L, + ccl_addr_space PathState *state, + float3 throughput, + BsdfEval *bsdf_eval, + float3 shadow, + float shadow_fac, + bool is_lamp) { #ifdef __PASSES__ if(L->use_light_pass) { - if(bounce == 0) { + if(state->bounce == 0) { /* directly visible lighting */ L->direct_diffuse += throughput*bsdf_eval->diffuse*shadow; L->direct_glossy += throughput*bsdf_eval->glossy*shadow; @@ -352,21 +370,27 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through } #ifdef __SHADOW_TRICKS__ - float3 light = throughput * bsdf_eval->sum_no_mis; - L->path_total += light; - L->path_total_shaded += shadow * light; + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + float3 light = throughput * bsdf_eval->sum_no_mis; + L->path_total += light; + L->path_total_shaded += shadow * light; + } #endif } ccl_device_inline void path_radiance_accum_total_light( PathRadiance *L, + ccl_addr_space PathState *state, float3 throughput, const BsdfEval *bsdf_eval) { #ifdef __SHADOW_TRICKS__ - L->path_total += throughput * bsdf_eval->sum_no_mis; + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + L->path_total += throughput * bsdf_eval->sum_no_mis; + } #else (void) L; + (void) state; (void) throughput; (void) bsdf_eval; #endif @@ -393,11 +417,17 @@ ccl_device_inline void path_radiance_accum_background(PathRadiance *L, } #ifdef __SHADOW_TRICKS__ - L->path_total += throughput * value; - if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) { - L->path_total_shaded += throughput * value; + if(state->flag & PATH_RAY_STORE_SHADOW_INFO) { + L->path_total += throughput * value; + if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) { + L->path_total_shaded += throughput * value; + } } #endif + +#ifdef __DENOISING_FEATURES__ + L->denoising_albedo += state->denoising_feature_weight * value; +#endif /* __DENOISING_FEATURES__ */ } ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L) @@ -555,6 +585,38 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi return L_sum; } +ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg, PathRadiance *L, float3 *noisy, float3 *clean) +{ +#ifdef __PASSES__ + kernel_assert(L->use_light_pass); + + *clean = L->emission + L->background; + *noisy = L->direct_scatter + L->indirect_scatter; + +# define ADD_COMPONENT(flag, component) \ + if(kernel_data.film.denoising_flags & flag) \ + *clean += component; \ + else \ + *noisy += component; + + ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_DIR, L->direct_diffuse); + ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_IND, L->indirect_diffuse); + ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_DIR, L->direct_glossy); + ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_IND, L->indirect_glossy); + ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_DIR, L->direct_transmission); + ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_IND, L->indirect_transmission); + ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_DIR, L->direct_subsurface); + ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_IND, L->indirect_subsurface); +# undef ADD_COMPONENT +#else + *noisy = L->emission; + *clean = make_float3(0.0f, 0.0f, 0.0f); +#endif + + *noisy = ensure_finite3(*noisy); + *clean = ensure_finite3(*clean); +} + ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample, int num_samples) { float fac = 1.0f/num_samples; @@ -595,12 +657,12 @@ ccl_device_inline float path_radiance_sum_shadow(const PathRadiance *L) /* Calculate final light sum and transparency for shadow catcher object. */ ccl_device_inline float3 path_radiance_sum_shadowcatcher(KernelGlobals *kg, const PathRadiance *L, - ccl_addr_space float* L_transparent) + float* alpha) { const float shadow = path_radiance_sum_shadow(L); float3 L_sum; if(kernel_data.background.transparent) { - *L_transparent = shadow; + *alpha = 1.0f-shadow; L_sum = make_float3(0.0f, 0.0f, 0.0f); } else { diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h index 21da180bb8e..7595e74e2d5 100644 --- a/intern/cycles/kernel/kernel_compat_cpu.h +++ b/intern/cycles/kernel/kernel_compat_cpu.h @@ -42,6 +42,8 @@ #include "util/util_types.h" #include "util/util_texture.h" +#define ccl_restrict_ptr const * __restrict + #define ccl_addr_space #define ccl_local_id(d) 0 diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index c375d17a95f..80d7401fbcf 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -55,6 +55,10 @@ #define ccl_restrict __restrict__ #define ccl_align(n) __align__(n) +#define ccl_restrict_ptr const * __restrict__ +#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH) + + /* No assert supported for CUDA */ #define kernel_assert(cond) diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h index c2263ac0d49..15cf4b81b21 100644 --- a/intern/cycles/kernel/kernel_compat_opencl.h +++ b/intern/cycles/kernel/kernel_compat_opencl.h @@ -50,6 +50,8 @@ # define ccl_addr_space #endif +#define ccl_restrict_ptr const * __restrict__ + #define ccl_local_id(d) get_local_id(d) #define ccl_global_id(d) get_global_id(d) diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h index a2909cec1a1..9baa9d54957 100644 --- a/intern/cycles/kernel/kernel_light.h +++ b/intern/cycles/kernel/kernel_light.h @@ -102,7 +102,7 @@ ccl_device_inline float area_light_sample(float3 P, float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f); cu = clamp(cu, -1.0f, 1.0f); /* Compute xu. */ - float xu = -(cu * z0) / sqrtf(1.0f - cu * cu); + float xu = -(cu * z0) / max(sqrtf(1.0f - cu * cu), 1e-7f); xu = clamp(xu, x0, x1); /* Compute yv. */ float z0sq = z0 * z0; diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h index ed523696571..8ab4c724829 100644 --- a/intern/cycles/kernel/kernel_passes.h +++ b/intern/cycles/kernel/kernel_passes.h @@ -60,6 +60,135 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sa #endif /* __SPLIT_KERNEL__ */ } +#ifdef __DENOISING_FEATURES__ +ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer, int sample, float value) +{ + kernel_write_pass_float(buffer, sample, value); + + /* The online one-pass variance update that's used for the megakernel can't easily be implemented + * with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. */ +# ifdef __SPLIT_KERNEL__ + kernel_write_pass_float(buffer+1, sample, value*value); +# else + if(sample == 0) { + kernel_write_pass_float(buffer+1, sample, 0.0f); + } + else { + float new_mean = buffer[0] * (1.0f / (sample + 1)); + float old_mean = (buffer[0] - value) * (1.0f / sample); + kernel_write_pass_float(buffer+1, sample, (value - new_mean) * (value - old_mean)); + } +# endif +} + +# if defined(__SPLIT_KERNEL__) +# define kernel_write_pass_float3_unaligned kernel_write_pass_float3 +# else +ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, int sample, float3 value) +{ + buffer[0] = (sample == 0)? value.x: buffer[0] + value.x; + buffer[1] = (sample == 0)? value.y: buffer[1] + value.y; + buffer[2] = (sample == 0)? value.z: buffer[2] + value.z; +} +# endif + +ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, int sample, float3 value) +{ + kernel_write_pass_float3_unaligned(buffer, sample, value); +# ifdef __SPLIT_KERNEL__ + kernel_write_pass_float3_unaligned(buffer+3, sample, value*value); +# else + if(sample == 0) { + kernel_write_pass_float3_unaligned(buffer+3, sample, make_float3(0.0f, 0.0f, 0.0f)); + } + else { + float3 sum = make_float3(buffer[0], buffer[1], buffer[2]); + float3 new_mean = sum * (1.0f / (sample + 1)); + float3 old_mean = (sum - value) * (1.0f / sample); + kernel_write_pass_float3_unaligned(buffer+3, sample, (value - new_mean) * (value - old_mean)); + } +# endif +} + +ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_global float *buffer, + int sample, float path_total, float path_total_shaded) +{ + if(kernel_data.film.pass_denoising_data == 0) + return; + + buffer += (sample & 1)? DENOISING_PASS_SHADOW_B : DENOISING_PASS_SHADOW_A; + + path_total = ensure_finite(path_total); + path_total_shaded = ensure_finite(path_total_shaded); + + kernel_write_pass_float(buffer, sample/2, path_total); + kernel_write_pass_float(buffer+1, sample/2, path_total_shaded); + + float value = path_total_shaded / max(path_total, 1e-7f); +# ifdef __SPLIT_KERNEL__ + kernel_write_pass_float(buffer+2, sample/2, value*value); +# else + if(sample < 2) { + kernel_write_pass_float(buffer+2, sample/2, 0.0f); + } + else { + float old_value = (buffer[1] - path_total_shaded) / max(buffer[0] - path_total, 1e-7f); + float new_value = buffer[1] / max(buffer[0], 1e-7f); + kernel_write_pass_float(buffer+2, sample, (value - new_value) * (value - old_value)); + } +# endif +} +#endif /* __DENOISING_FEATURES__ */ + +ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg, + ShaderData *sd, + ccl_global PathState *state, + PathRadiance *L) +{ +#ifdef __DENOISING_FEATURES__ + if(state->denoising_feature_weight == 0.0f) { + return; + } + + L->denoising_depth += ensure_finite(state->denoising_feature_weight * sd->ray_length); + + float3 normal = make_float3(0.0f, 0.0f, 0.0f); + float3 albedo = make_float3(0.0f, 0.0f, 0.0f); + float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f; + + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + + if(!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) + continue; + + /* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. */ + normal += sc->N * sc->sample_weight; + sum_weight += sc->sample_weight; + if(!bsdf_is_specular_like(sc)) { + albedo += sc->weight; + sum_nonspecular_weight += sc->sample_weight; + } + } + + /* Wait for next bounce if 75% or more sample weight belongs to specular-like closures. */ + if((sum_weight == 0.0f) || (sum_nonspecular_weight*4.0f > sum_weight)) { + if(sum_weight != 0.0f) { + normal /= sum_weight; + } + L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal); + L->denoising_albedo += ensure_finite3(state->denoising_feature_weight * albedo); + + state->denoising_feature_weight = 0.0f; + } +#else + (void) kg; + (void) sd; + (void) state; + (void) L; +#endif /* __DENOISING_FEATURES__ */ +} + ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L, ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput) { @@ -199,5 +328,79 @@ ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global f #endif } +ccl_device_inline void kernel_write_result(KernelGlobals *kg, ccl_global float *buffer, + int sample, PathRadiance *L, float alpha, bool is_shadow_catcher) +{ + if(L) { + float3 L_sum; +#ifdef __SHADOW_TRICKS__ + if(is_shadow_catcher) { + L_sum = path_radiance_sum_shadowcatcher(kg, L, &alpha); + } + else +#endif /* __SHADOW_TRICKS__ */ + { + L_sum = path_radiance_clamp_and_sum(kg, L); + } + + kernel_write_pass_float4(buffer, sample, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha)); + + kernel_write_light_passes(kg, buffer, L, sample); + +#ifdef __DENOISING_FEATURES__ + if(kernel_data.film.pass_denoising_data) { +# ifdef __SHADOW_TRICKS__ + kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, average(L->path_total), average(L->path_total_shaded)); +# else + kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f); +# endif + if(kernel_data.film.pass_denoising_clean) { + float3 noisy, clean; + path_radiance_split_denoising(kg, L, &noisy, &clean); + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, + sample, noisy); + kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean, + sample, clean); + } + else { + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, + sample, L_sum); + } + + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL, + sample, L->denoising_normal); + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO, + sample, L->denoising_albedo); + kernel_write_pass_float_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH, + sample, L->denoising_depth); + } +#endif /* __DENOISING_FEATURES__ */ + } + else { + kernel_write_pass_float4(buffer, sample, make_float4(0.0f, 0.0f, 0.0f, 0.0f)); + +#ifdef __DENOISING_FEATURES__ + if(kernel_data.film.pass_denoising_data) { + kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f); + + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, + sample, make_float3(0.0f, 0.0f, 0.0f)); + + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL, + sample, make_float3(0.0f, 0.0f, 0.0f)); + kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO, + sample, make_float3(0.0f, 0.0f, 0.0f)); + kernel_write_pass_float_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH, + sample, 0.0f); + + if(kernel_data.film.pass_denoising_clean) { + kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean, + sample, make_float3(0.0f, 0.0f, 0.0f)); + } + } +#endif /* __DENOISING_FEATURES__ */ + } +} + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index 58da141aed3..0d31ae32aa6 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -90,10 +90,10 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, light_ray.dD = differential3_zero(); if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) { - path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce); + path_radiance_accum_ao(L, state, throughput, ao_alpha, ao_bsdf, ao_shadow); } else { - path_radiance_accum_total_ao(L, throughput, ao_bsdf); + path_radiance_accum_total_ao(L, state, throughput, ao_bsdf); } } } @@ -366,6 +366,8 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, throughput /= probability; } + kernel_update_denoising_features(kg, sd, state, L); + #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) { @@ -427,18 +429,19 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, } -ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, - RNG *rng, - int sample, - Ray ray, - ccl_global float *buffer) +ccl_device_inline float kernel_path_integrate(KernelGlobals *kg, + RNG *rng, + int sample, + Ray ray, + ccl_global float *buffer, + PathRadiance *L, + bool *is_shadow_catcher) { /* initialize */ - PathRadiance L; float3 throughput = make_float3(1.0f, 1.0f, 1.0f); float L_transparent = 0.0f; - path_radiance_init(&L, kernel_data.film.use_light_pass); + path_radiance_init(L, kernel_data.film.use_light_pass); /* shader data memory used for both volumes and surfaces, saves stack space */ ShaderData sd; @@ -517,7 +520,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, float3 emission; if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission)) - path_radiance_accum_emission(&L, throughput, emission, state.bounce); + path_radiance_accum_emission(L, throughput, emission, state.bounce); } #endif /* __LAMP_MIS__ */ @@ -549,7 +552,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, /* emission */ if(volume_segment.closure_flag & SD_EMISSION) - path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce); + path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce); /* scattering */ VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED; @@ -559,7 +562,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, /* direct light sampling */ kernel_branched_path_volume_connect_light(kg, rng, &sd, - &emission_sd, throughput, &state, &L, all, + &emission_sd, throughput, &state, L, all, &volume_ray, &volume_segment); /* indirect sample. if we use distance sampling and take just @@ -577,7 +580,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, kernel_volume_decoupled_free(kg, &volume_segment); if(result == VOLUME_PATH_SCATTERED) { - if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray)) + if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray)) continue; else break; @@ -591,15 +594,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, { /* integrate along volume segment with distance sampling */ VolumeIntegrateResult result = kernel_volume_integrate( - kg, &state, &sd, &volume_ray, &L, &throughput, rng, heterogeneous); + kg, &state, &sd, &volume_ray, L, &throughput, rng, heterogeneous); # ifdef __VOLUME_SCATTER__ if(result == VOLUME_PATH_SCATTERED) { /* direct lighting */ - kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L); + kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L); /* indirect light bounce */ - if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray)) + if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray)) continue; else break; @@ -623,7 +626,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, #ifdef __BACKGROUND__ /* sample background shader */ float3 L_background = indirect_background(kg, &emission_sd, &state, &ray); - path_radiance_accum_background(&L, &state, throughput, L_background); + path_radiance_accum_background(L, &state, throughput, L_background); #endif /* __BACKGROUND__ */ break; @@ -640,10 +643,10 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, #ifdef __SHADOW_TRICKS__ if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) { if(state.flag & PATH_RAY_CAMERA) { - state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY); + state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO); state.catcher_object = sd.object; if(!kernel_data.background.transparent) { - L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray); + L->shadow_color = indirect_background(kg, &emission_sd, &state, &ray); } } } @@ -677,7 +680,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, #endif /* __HOLDOUT__ */ /* holdout mask objects do not write data passes */ - kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput); + kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput); /* blurring of bsdf after bounces, for rays that have a small likelihood * of following this particular path (diffuse, rough glossy) */ @@ -695,7 +698,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, if(sd.flag & SD_EMISSION) { /* todo: is isect.t wrong here for transparent surfaces? */ float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf); - path_radiance_accum_emission(&L, throughput, emission, state.bounce); + path_radiance_accum_emission(L, throughput, emission, state.bounce); } #endif /* __EMISSION__ */ @@ -715,10 +718,12 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, throughput /= probability; } + kernel_update_denoising_features(kg, &sd, &state, L); + #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { - kernel_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd)); + kernel_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd)); } #endif /* __AO__ */ @@ -729,7 +734,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, if(kernel_path_subsurface_scatter(kg, &sd, &emission_sd, - &L, + L, &state, rng, &ray, @@ -742,15 +747,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, #endif /* __SUBSURFACE__ */ /* direct lighting */ - kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L); + kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L); /* compute direct lighting and next bounce */ - if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, &L, &ray)) + if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, L, &ray)) break; } #ifdef __SUBSURFACE__ - kernel_path_subsurface_accum_indirect(&ss_indirect, &L); + kernel_path_subsurface_accum_indirect(&ss_indirect, L); /* Trace indirect subsurface rays by restarting the loop. this uses less * stack memory than invoking kernel_path_indirect. @@ -760,7 +765,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, &ss_indirect, &state, &ray, - &L, + L, &throughput); } else { @@ -769,24 +774,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg, } #endif /* __SUBSURFACE__ */ - float3 L_sum; #ifdef __SHADOW_TRICKS__ - if(state.flag & PATH_RAY_SHADOW_CATCHER) { - L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent); - } - else + *is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER); #endif /* __SHADOW_TRICKS__ */ - { - L_sum = path_radiance_clamp_and_sum(kg, &L); - } - - kernel_write_light_passes(kg, buffer, &L, sample); #ifdef __KERNEL_DEBUG__ kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample); #endif /* __KERNEL_DEBUG__ */ - return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent); + return 1.0f - L_transparent; } ccl_device void kernel_path_trace(KernelGlobals *kg, @@ -807,15 +803,16 @@ ccl_device void kernel_path_trace(KernelGlobals *kg, kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray); /* integrate */ - float4 L; - - if(ray.t != 0.0f) - L = kernel_path_integrate(kg, &rng, sample, ray, buffer); - else - L = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + PathRadiance L; + bool is_shadow_catcher; - /* accumulate result in output buffer */ - kernel_write_pass_float4(buffer, sample, L); + if(ray.t != 0.0f) { + float alpha = kernel_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher); + kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher); + } + else { + kernel_write_result(kg, buffer, sample, NULL, 0.0f, false); + } path_rng_end(kg, rng_state, rng); } diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h index ddcb57161ea..10816d3e5d1 100644 --- a/intern/cycles/kernel/kernel_path_branched.h +++ b/intern/cycles/kernel/kernel_path_branched.h @@ -56,10 +56,10 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg, light_ray.dD = differential3_zero(); if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) { - path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce); + path_radiance_accum_ao(L, state, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow); } else { - path_radiance_accum_total_ao(L, throughput*num_samples_inv, ao_bsdf); + path_radiance_accum_total_ao(L, state, throughput*num_samples_inv, ao_bsdf); } } } @@ -72,14 +72,32 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd, float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L) { + float sum_sample_weight = 0.0f; +#ifdef __DENOISING_FEATURES__ + if(state->denoising_feature_weight > 0.0f) { + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + /* transparency is not handled here, but in outer loop */ + if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { + continue; + } + + sum_sample_weight += sc->sample_weight; + } + } + else { + sum_sample_weight = 1.0f; + } +#endif /* __DENOISING_FEATURES__ */ + for(int i = 0; i < sd->num_closure; i++) { const ShaderClosure *sc = &sd->closure[i]; - if(!CLOSURE_IS_BSDF(sc->type)) - continue; /* transparency is not handled here, but in outer loop */ - if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) + if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { continue; + } int num_samples; @@ -111,7 +129,8 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba &tp, &ps, L, - &bsdf_ray)) + &bsdf_ray, + sum_sample_weight)) { continue; } @@ -243,14 +262,19 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, } #endif /* __SUBSURFACE__ */ -ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer) +ccl_device float kernel_branched_path_integrate(KernelGlobals *kg, + RNG *rng, + int sample, + Ray ray, + ccl_global float *buffer, + PathRadiance *L, + bool *is_shadow_catcher) { /* initialize */ - PathRadiance L; float3 throughput = make_float3(1.0f, 1.0f, 1.0f); float L_transparent = 0.0f; - path_radiance_init(&L, kernel_data.film.use_light_pass); + path_radiance_init(L, kernel_data.film.use_light_pass); /* shader data memory used for both volumes and surfaces, saves stack space */ ShaderData sd; @@ -330,7 +354,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in int all = kernel_data.integrator.sample_all_lights_direct; kernel_branched_path_volume_connect_light(kg, rng, &sd, - &emission_sd, throughput, &state, &L, all, + &emission_sd, throughput, &state, L, all, &volume_ray, &volume_segment); /* indirect light sampling */ @@ -362,7 +386,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in &sd, &tp, &ps, - &L, + L, &pray)) { kernel_path_indirect(kg, @@ -373,19 +397,19 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in tp*num_samples_inv, num_samples, &ps, - &L); + L); /* for render passes, sum and reset indirect light pass variables * for the next samples */ - path_radiance_sum_indirect(&L); - path_radiance_reset_indirect(&L); + path_radiance_sum_indirect(L); + path_radiance_reset_indirect(L); } } } /* emission and transmittance */ if(volume_segment.closure_flag & SD_EMISSION) - path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce); + path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce); throughput *= volume_segment.accum_transmittance; /* free cached steps */ @@ -407,20 +431,20 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in path_state_branch(&ps, j, num_samples); VolumeIntegrateResult result = kernel_volume_integrate( - kg, &ps, &sd, &volume_ray, &L, &tp, rng, heterogeneous); + kg, &ps, &sd, &volume_ray, L, &tp, rng, heterogeneous); #ifdef __VOLUME_SCATTER__ if(result == VOLUME_PATH_SCATTERED) { /* todo: support equiangular, MIS and all light sampling. * alternatively get decoupled ray marching working on the GPU */ - kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, &L); + kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, L); if(kernel_path_volume_bounce(kg, rng, &sd, &tp, &ps, - &L, + L, &pray)) { kernel_path_indirect(kg, @@ -431,12 +455,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in tp, num_samples, &ps, - &L); + L); /* for render passes, sum and reset indirect light pass variables * for the next samples */ - path_radiance_sum_indirect(&L); - path_radiance_reset_indirect(&L); + path_radiance_sum_indirect(L); + path_radiance_reset_indirect(L); } } #endif /* __VOLUME_SCATTER__ */ @@ -462,7 +486,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #ifdef __BACKGROUND__ /* sample background shader */ float3 L_background = indirect_background(kg, &emission_sd, &state, &ray); - path_radiance_accum_background(&L, &state, throughput, L_background); + path_radiance_accum_background(L, &state, throughput, L_background); #endif /* __BACKGROUND__ */ break; @@ -476,10 +500,10 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #ifdef __SHADOW_TRICKS__ if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) { if(state.flag & PATH_RAY_CAMERA) { - state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY); + state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO); state.catcher_object = sd.object; if(!kernel_data.background.transparent) { - L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray); + L->shadow_color = indirect_background(kg, &emission_sd, &state, &ray); } } } @@ -509,13 +533,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #endif /* __HOLDOUT__ */ /* holdout mask objects do not write data passes */ - kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput); + kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput); #ifdef __EMISSION__ /* emission */ if(sd.flag & SD_EMISSION) { float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf); - path_radiance_accum_emission(&L, throughput, emission, state.bounce); + path_radiance_accum_emission(L, throughput, emission, state.bounce); } #endif /* __EMISSION__ */ @@ -539,10 +563,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in } } + kernel_update_denoising_features(kg, &sd, &state, L); + #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { - kernel_branched_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput); + kernel_branched_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput); } #endif /* __AO__ */ @@ -550,7 +576,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in /* bssrdf scatter to a different location on the same object */ if(sd.flag & SD_BSSRDF) { kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd, - &L, &state, rng, &ray, throughput); + L, &state, rng, &ray, throughput); } #endif /* __SUBSURFACE__ */ @@ -563,13 +589,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in int all = (kernel_data.integrator.sample_all_lights_direct) || (state.flag & PATH_RAY_SHADOW_CATCHER); kernel_branched_path_surface_connect_light(kg, rng, - &sd, &emission_sd, &hit_state, throughput, 1.0f, &L, all); + &sd, &emission_sd, &hit_state, throughput, 1.0f, L, all); } #endif /* __EMISSION__ */ /* indirect light */ kernel_branched_path_surface_indirect_light(kg, rng, - &sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, &L); + &sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, L); /* continue in case of transparency */ throughput *= shader_bsdf_transparency(kg, &sd); @@ -598,24 +624,15 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #endif /* __VOLUME__ */ } - float3 L_sum; #ifdef __SHADOW_TRICKS__ - if(state.flag & PATH_RAY_SHADOW_CATCHER) { - L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent); - } - else + *is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER); #endif /* __SHADOW_TRICKS__ */ - { - L_sum = path_radiance_clamp_and_sum(kg, &L); - } - - kernel_write_light_passes(kg, buffer, &L, sample); #ifdef __KERNEL_DEBUG__ kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample); #endif /* __KERNEL_DEBUG__ */ - return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent); + return 1.0f - L_transparent; } ccl_device void kernel_branched_path_trace(KernelGlobals *kg, @@ -636,15 +653,16 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg, kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray); /* integrate */ - float4 L; - - if(ray.t != 0.0f) - L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer); - else - L = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + PathRadiance L; + bool is_shadow_catcher; - /* accumulate result in output buffer */ - kernel_write_pass_float4(buffer, sample, L); + if(ray.t != 0.0f) { + float alpha = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher); + kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher); + } + else { + kernel_write_result(kg, buffer, sample, NULL, 0.0f, false); + } path_rng_end(kg, rng_state, rng); } @@ -654,4 +672,3 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg, #endif /* __BRANCHED_PATH__ */ CCL_NAMESPACE_END - diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h index c0cd2a63120..0fa77d9e8bd 100644 --- a/intern/cycles/kernel/kernel_path_state.h +++ b/intern/cycles/kernel/kernel_path_state.h @@ -35,6 +35,16 @@ ccl_device_inline void path_state_init(KernelGlobals *kg, state->transmission_bounce = 0; state->transparent_bounce = 0; +#ifdef __DENOISING_FEATURES__ + if(kernel_data.film.pass_denoising_data) { + state->flag |= PATH_RAY_STORE_SHADOW_INFO; + state->denoising_feature_weight = 1.0f; + } + else { + state->denoising_feature_weight = 0.0f; + } +#endif /* __DENOISING_FEATURES__ */ + state->min_ray_pdf = FLT_MAX; state->ray_pdf = 0.0f; #ifdef __LAMP_MIS__ @@ -128,6 +138,10 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta /* random number generator next bounce */ state->rng_offset += PRNG_BOUNCE_NUM; + + if((state->denoising_feature_weight == 0.0f) && !(state->flag & PATH_RAY_SHADOW_CATCHER)) { + state->flag &= ~PATH_RAY_STORE_SHADOW_INFO; + } } ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *state) diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h index bd4ba775b4d..e676ea0f3ae 100644 --- a/intern/cycles/kernel/kernel_path_surface.h +++ b/intern/cycles/kernel/kernel_path_surface.h @@ -70,10 +70,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } else { - path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light); + path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light); } } } @@ -107,10 +107,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } else { - path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light); + path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light); } } } @@ -133,10 +133,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, is_lamp); } else { - path_radiance_accum_total_light(L, throughput*num_samples_adjust, &L_light); + path_radiance_accum_total_light(L, state, throughput*num_samples_adjust, &L_light); } } } @@ -155,7 +155,8 @@ ccl_device bool kernel_branched_path_surface_bounce( ccl_addr_space float3 *throughput, ccl_addr_space PathState *state, PathRadiance *L, - ccl_addr_space Ray *ray) + ccl_addr_space Ray *ray, + float sum_sample_weight) { /* sample BSDF */ float bsdf_pdf; @@ -175,6 +176,10 @@ ccl_device bool kernel_branched_path_surface_bounce( /* modify throughput */ path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label); +#ifdef __DENOISING_FEATURES__ + state->denoising_feature_weight *= sc->sample_weight / (sum_sample_weight * num_samples); +#endif + /* modify path state */ path_state_next(kg, state, label); @@ -257,10 +262,10 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp); } else { - path_radiance_accum_total_light(L, throughput, &L_light); + path_radiance_accum_total_light(L, state, throughput, &L_light); } } } diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h index 371f2c1c7cb..dcedf51e479 100644 --- a/intern/cycles/kernel/kernel_path_volume.h +++ b/intern/cycles/kernel/kernel_path_volume.h @@ -55,7 +55,7 @@ ccl_device_inline void kernel_path_volume_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp); } } } @@ -184,7 +184,7 @@ ccl_device void kernel_branched_path_volume_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } } } @@ -233,7 +233,7 @@ ccl_device void kernel_branched_path_volume_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp); + path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp); } } } @@ -271,7 +271,7 @@ ccl_device void kernel_branched_path_volume_connect_light( if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) { /* accumulate */ - path_radiance_accum_light(L, tp, &L_light, shadow, 1.0f, state->bounce, is_lamp); + path_radiance_accum_light(L, state, tp, &L_light, shadow, 1.0f, is_lamp); } } } diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h index 9a2b0884a7e..cbb2442d1dc 100644 --- a/intern/cycles/kernel/kernel_projection.h +++ b/intern/cycles/kernel/kernel_projection.h @@ -57,6 +57,9 @@ ccl_device float3 spherical_to_direction(float theta, float phi) ccl_device float2 direction_to_equirectangular_range(float3 dir, float4 range) { + if(is_zero(dir)) + return make_float2(0.0f, 0.0f); + float u = (atan2f(dir.y, dir.x) - range.y) / range.x; float v = (acosf(dir.z / len(dir)) - range.w) / range.z; diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h index 8c0c5e90a3e..c66f52255f0 100644 --- a/intern/cycles/kernel/kernel_shader.h +++ b/intern/cycles/kernel/kernel_shader.h @@ -99,7 +99,7 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg, /* smooth normal */ if(sd->shader & SHADER_SMOOTH_NORMAL) - sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v); + sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); #ifdef __DPDU__ /* dPdu/dPdv */ @@ -186,7 +186,7 @@ void shader_setup_from_subsurface( sd->N = Ng; if(sd->shader & SHADER_SMOOTH_NORMAL) - sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v); + sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); # ifdef __DPDU__ /* dPdu/dPdv */ @@ -300,7 +300,7 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg, if(sd->type & PRIMITIVE_TRIANGLE) { /* smooth normal */ if(sd->shader & SHADER_SMOOTH_NORMAL) { - sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v); + sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v); #ifdef __INSTANCING__ if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 9b354457b91..dd1fa1b82f7 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -173,6 +173,8 @@ CCL_NAMESPACE_BEGIN #define __PATCH_EVAL__ #define __SHADOW_TRICKS__ +#define __DENOISING_FEATURES__ + #ifdef __KERNEL_SHADING__ # define __SVM__ # define __EMISSION__ @@ -314,31 +316,32 @@ enum SamplingPattern { /* these flags values correspond to raytypes in osl.cpp, so keep them in sync! */ enum PathRayFlag { - PATH_RAY_CAMERA = 1, - PATH_RAY_REFLECT = 2, - PATH_RAY_TRANSMIT = 4, - PATH_RAY_DIFFUSE = 8, - PATH_RAY_GLOSSY = 16, - PATH_RAY_SINGULAR = 32, - PATH_RAY_TRANSPARENT = 64, - - PATH_RAY_SHADOW_OPAQUE = 128, - PATH_RAY_SHADOW_TRANSPARENT = 256, + PATH_RAY_CAMERA = (1 << 0), + PATH_RAY_REFLECT = (1 << 1), + PATH_RAY_TRANSMIT = (1 << 2), + PATH_RAY_DIFFUSE = (1 << 3), + PATH_RAY_GLOSSY = (1 << 4), + PATH_RAY_SINGULAR = (1 << 5), + PATH_RAY_TRANSPARENT = (1 << 6), + + PATH_RAY_SHADOW_OPAQUE = (1 << 7), + PATH_RAY_SHADOW_TRANSPARENT = (1 << 8), PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT), - PATH_RAY_CURVE = 512, /* visibility flag to define curve segments */ - PATH_RAY_VOLUME_SCATTER = 1024, /* volume scattering */ + PATH_RAY_CURVE = (1 << 9), /* visibility flag to define curve segments */ + PATH_RAY_VOLUME_SCATTER = (1 << 10), /* volume scattering */ /* Special flag to tag unaligned BVH nodes. */ - PATH_RAY_NODE_UNALIGNED = 2048, + PATH_RAY_NODE_UNALIGNED = (1 << 11), - PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024|2048), + PATH_RAY_ALL_VISIBILITY = ((1 << 12)-1), - PATH_RAY_MIS_SKIP = 4096, - PATH_RAY_DIFFUSE_ANCESTOR = 8192, - PATH_RAY_SINGLE_PASS_DONE = 16384, - PATH_RAY_SHADOW_CATCHER = 32768, - PATH_RAY_SHADOW_CATCHER_ONLY = 65536, + PATH_RAY_MIS_SKIP = (1 << 12), + PATH_RAY_DIFFUSE_ANCESTOR = (1 << 13), + PATH_RAY_SINGLE_PASS_DONE = (1 << 14), + PATH_RAY_SHADOW_CATCHER = (1 << 15), + PATH_RAY_SHADOW_CATCHER_ONLY = (1 << 16), + PATH_RAY_STORE_SHADOW_INFO = (1 << 17), }; /* Closure Label */ @@ -394,6 +397,22 @@ typedef enum PassType { #define PASS_ALL (~0) +typedef enum DenoisingPassOffsets { + DENOISING_PASS_NORMAL = 0, + DENOISING_PASS_NORMAL_VAR = 3, + DENOISING_PASS_ALBEDO = 6, + DENOISING_PASS_ALBEDO_VAR = 9, + DENOISING_PASS_DEPTH = 12, + DENOISING_PASS_DEPTH_VAR = 13, + DENOISING_PASS_SHADOW_A = 14, + DENOISING_PASS_SHADOW_B = 17, + DENOISING_PASS_COLOR = 20, + DENOISING_PASS_COLOR_VAR = 23, + + DENOISING_PASS_SIZE_BASE = 26, + DENOISING_PASS_SIZE_CLEAN = 3, +} DenoisingPassOffsets; + typedef enum BakePassFilter { BAKE_FILTER_NONE = 0, BAKE_FILTER_DIRECT = (1 << 0), @@ -427,6 +446,18 @@ typedef enum BakePassFilterCombos { BAKE_FILTER_SUBSURFACE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_SUBSURFACE), } BakePassFilterCombos; +typedef enum DenoiseFlag { + DENOISING_CLEAN_DIFFUSE_DIR = (1 << 0), + DENOISING_CLEAN_DIFFUSE_IND = (1 << 1), + DENOISING_CLEAN_GLOSSY_DIR = (1 << 2), + DENOISING_CLEAN_GLOSSY_IND = (1 << 3), + DENOISING_CLEAN_TRANSMISSION_DIR = (1 << 4), + DENOISING_CLEAN_TRANSMISSION_IND = (1 << 5), + DENOISING_CLEAN_SUBSURFACE_DIR = (1 << 6), + DENOISING_CLEAN_SUBSURFACE_IND = (1 << 7), + DENOISING_CLEAN_ALL_PASSES = (1 << 8)-1, +} DenoiseFlag; + typedef ccl_addr_space struct PathRadiance { #ifdef __PASSES__ int use_light_pass; @@ -482,6 +513,12 @@ typedef ccl_addr_space struct PathRadiance { /* Color of the background on which shadow is alpha-overed. */ float3 shadow_color; #endif + +#ifdef __DENOISING_FEATURES__ + float3 denoising_normal; + float3 denoising_albedo; + float denoising_depth; +#endif /* __DENOISING_FEATURES__ */ } PathRadiance; typedef struct BsdfEval { @@ -724,12 +761,13 @@ typedef struct AttributeDescriptor { #define SHADER_CLOSURE_BASE \ float3 weight; \ ClosureType type; \ - float sample_weight \ + float sample_weight; \ + float3 N typedef ccl_addr_space struct ccl_align(16) ShaderClosure { SHADER_CLOSURE_BASE; - float data[14]; /* pad to 80 bytes */ + float data[10]; /* pad to 80 bytes */ } ShaderClosure; /* Shader Context @@ -960,6 +998,10 @@ typedef struct PathState { int transmission_bounce; int transparent_bounce; +#ifdef __DENOISING_FEATURES__ + float denoising_feature_weight; +#endif /* __DENOISING_FEATURES__ */ + /* multiple importance sampling */ float min_ray_pdf; /* smallest bounce pdf over entire path up to now */ float ray_pdf; /* last bounce pdf */ @@ -1137,6 +1179,11 @@ typedef struct KernelFilm { float mist_inv_depth; float mist_falloff; + int pass_denoising_data; + int pass_denoising_clean; + int denoising_flags; + int pad; + #ifdef __KERNEL_DEBUG__ int pass_bvh_traversed_nodes; int pass_bvh_traversed_instances; diff --git a/intern/cycles/kernel/kernels/cpu/filter.cpp b/intern/cycles/kernel/kernels/cpu/filter.cpp new file mode 100644 index 00000000000..2ff1a392dc3 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter.cpp @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CPU kernel entry points */ + +/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this + * one with SSE2 intrinsics. + */ +#if defined(__x86_64__) || defined(_M_X64) +# define __KERNEL_SSE2__ +#endif + +/* When building kernel for native machine detect kernel features from the flags + * set by compiler. + */ +#ifdef WITH_KERNEL_NATIVE +# ifdef __SSE2__ +# ifndef __KERNEL_SSE2__ +# define __KERNEL_SSE2__ +# endif +# endif +# ifdef __SSE3__ +# define __KERNEL_SSE3__ +# endif +# ifdef __SSSE3__ +# define __KERNEL_SSSE3__ +# endif +# ifdef __SSE4_1__ +# define __KERNEL_SSE41__ +# endif +# ifdef __AVX__ +# define __KERNEL_SSE__ +# define __KERNEL_AVX__ +# endif +# ifdef __AVX2__ +# define __KERNEL_SSE__ +# define __KERNEL_AVX2__ +# endif +#endif + +/* quiet unused define warnings */ +#if defined(__KERNEL_SSE2__) + /* do nothing */ +#endif + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp new file mode 100644 index 00000000000..4a9e6047ecf --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp new file mode 100644 index 00000000000..c22ec576254 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp @@ -0,0 +1,40 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# define __KERNEL_AVX2__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h new file mode 100644 index 00000000000..10007ee2635 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h @@ -0,0 +1,132 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Templated common declaration part of all CPU kernels. */ + +void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, + TilesInfo *tiles, + int x, + int y, + float *unfilteredA, + float *unfilteredB, + float *sampleV, + float *sampleVV, + float *bufferV, + int* prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance); + +void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, + TilesInfo *tiles, + int m_offset, + int v_offset, + int x, + int y, + float *mean, + float *variance, + int* prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance); + +void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y, + float *mean, + float *variance, + float *a, + float *b, + int* prefilter_rect, + int r); + +void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, + int x, + int y, + int storage_ofs, + float *transform, + int *rank, + int* rect, + int pass_stride, + int radius, + float pca_threshold); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, + int dy, + float *weightImage, + float *variance, + float *differenceImage, + int* rect, + int w, + int channel_offset, + float a, + float k_2); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage, + float *outImage, + int* rect, + int w, + int f); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage, + float *outImage, + int* rect, + int w, + int f); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, + int dy, + float *differenceImage, + float *image, + float *outImage, + float *accumImage, + int* rect, + int w, + int f); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, + int dy, + float *differenceImage, + float *buffer, + float *color_pass, + float *variance_pass, + float *transform, + int *rank, + float *XtWX, + float3 *XtWY, + int *rect, + int *filter_rect, + int w, + int h, + int f, + int pass_stride); + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *outImage, + float *accumImage, + int* rect, + int w); + +void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x, + int y, + int storage_ofs, + int w, + int h, + float *buffer, + int *rank, + float *XtWX, + float3 *XtWY, + int *buffer_params, + int sample); + +#undef KERNEL_ARCH diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h new file mode 100644 index 00000000000..3b71e50ca3b --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h @@ -0,0 +1,259 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Templated common implementation part of all CPU kernels. + * + * The idea is that particular .cpp files sets needed optimization flags and + * simply includes this file without worry of copying actual implementation over. + */ + +#include "kernel/kernel_compat_cpu.h" + +#include "kernel/filter/filter_kernel.h" + +#ifdef KERNEL_STUB +# include "util/util_debug.h" +# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!")) +#endif + +CCL_NAMESPACE_BEGIN + + +/* Denoise filter */ + +void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, + TilesInfo *tiles, + int x, + int y, + float *unfilteredA, + float *unfilteredB, + float *sampleVariance, + float *sampleVarianceV, + float *bufferVariance, + int* prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow); +#else + kernel_filter_divide_shadow(sample, tiles, + x, y, + unfilteredA, + unfilteredB, + sampleVariance, + sampleVarianceV, + bufferVariance, + load_int4(prefilter_rect), + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, + TilesInfo *tiles, + int m_offset, + int v_offset, + int x, + int y, + float *mean, float *variance, + int* prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_get_feature); +#else + kernel_filter_get_feature(sample, tiles, + m_offset, v_offset, + x, y, + mean, variance, + load_int4(prefilter_rect), + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y, + float *mean, + float *variance, + float *a, + float *b, + int* prefilter_rect, + int r) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_combine_halves); +#else + kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, + int x, + int y, + int storage_ofs, + float *transform, + int *rank, + int* prefilter_rect, + int pass_stride, + int radius, + float pca_threshold) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_construct_transform); +#else + rank += storage_ofs; + transform += storage_ofs*TRANSFORM_SIZE; + kernel_filter_construct_transform(buffer, + x, y, + load_int4(prefilter_rect), + pass_stride, + transform, + rank, + radius, + pca_threshold); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, + int dy, + float *weightImage, + float *variance, + float *differenceImage, + int *rect, + int w, + int channel_offset, + float a, + float k_2) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference); +#else + kernel_filter_nlm_calc_difference(dx, dy, weightImage, variance, differenceImage, load_int4(rect), w, channel_offset, a, k_2); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage, + float *outImage, + int *rect, + int w, + int f) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur); +#else + kernel_filter_nlm_blur(differenceImage, outImage, load_int4(rect), w, f); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage, + float *outImage, + int *rect, + int w, + int f) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight); +#else + kernel_filter_nlm_calc_weight(differenceImage, outImage, load_int4(rect), w, f); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, + int dy, + float *differenceImage, + float *image, + float *outImage, + float *accumImage, + int *rect, + int w, + int f) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output); +#else + kernel_filter_nlm_update_output(dx, dy, differenceImage, image, outImage, accumImage, load_int4(rect), w, f); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, + int dy, + float *differenceImage, + float *buffer, + float *color_pass, + float *variance_pass, + float *transform, + int *rank, + float *XtWX, + float3 *XtWY, + int *rect, + int *filter_rect, + int w, + int h, + int f, + int pass_stride) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian); +#else + kernel_filter_nlm_construct_gramian(dx, dy, differenceImage, buffer, color_pass, variance_pass, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *outImage, + float *accumImage, + int *rect, + int w) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize); +#else + kernel_filter_nlm_normalize(outImage, accumImage, load_int4(rect), w); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x, + int y, + int storage_ofs, + int w, + int h, + float *buffer, + int *rank, + float *XtWX, + float3 *XtWY, + int *buffer_params, + int sample) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_finalize); +#else + XtWX += storage_ofs*XTWX_SIZE; + XtWY += storage_ofs*XTWY_SIZE; + rank += storage_ofs; + kernel_filter_finalize(x, y, w, h, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample); +#endif +} + +#undef KERNEL_STUB +#undef STUB_ASSERT +#undef KERNEL_ARCH + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp new file mode 100644 index 00000000000..f7c9935f1d0 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp @@ -0,0 +1,34 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp new file mode 100644 index 00000000000..070b95a3505 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp @@ -0,0 +1,36 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp new file mode 100644 index 00000000000..1a7b2040da1 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp @@ -0,0 +1,37 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ + +#include "kernel/filter/filter.h" +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/filter_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp index 2600d977972..a645fb4d8dd 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp @@ -17,21 +17,23 @@ /* Optimized CPU kernel entry points. This file is compiled with AVX * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ - -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -#endif #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_avx -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp index dba15d037ac..6bbb87727b9 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp @@ -18,21 +18,23 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -# define __KERNEL_AVX2__ -#endif - #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_avx2 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# define __KERNEL_AVX2__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h index 39c9a9cf33c..9895080d328 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h @@ -89,6 +89,4 @@ DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup) DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update) -void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func)); - #undef KERNEL_ARCH diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h index 8c05dd1d9ef..b9d82781840 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h @@ -57,6 +57,11 @@ # include "kernel/split/kernel_buffer_update.h" #endif +#ifdef KERNEL_STUB +# include "util/util_debug.h" +# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!")) +#endif + CCL_NAMESPACE_BEGIN #ifndef __SPLIT_KERNEL__ @@ -71,7 +76,10 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg, int offset, int stride) { -#ifdef __BRANCHED_PATH__ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, path_trace); +#else +# ifdef __BRANCHED_PATH__ if(kernel_data.integrator.branched) { kernel_branched_path_trace(kg, buffer, @@ -82,10 +90,11 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg, stride); } else -#endif +# endif { kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); } +#endif /* KERNEL_STUB */ } /* Film */ @@ -98,6 +107,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg, int offset, int stride) { +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, convert_to_byte); +#else kernel_film_convert_to_byte(kg, rgba, buffer, @@ -105,6 +117,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg, x, y, offset, stride); +#endif /* KERNEL_STUB */ } void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, @@ -115,6 +128,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, int offset, int stride) { +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, convert_to_half_float); +#else kernel_film_convert_to_half_float(kg, rgba, buffer, @@ -122,6 +138,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, x, y, offset, stride); +#endif /* KERNEL_STUB */ } /* Shader Evaluate */ @@ -136,9 +153,12 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, int offset, int sample) { +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, shader); +#else if(type >= SHADER_EVAL_BAKE) { kernel_assert(output_luma == NULL); -#ifdef __BAKING__ +# ifdef __BAKING__ kernel_bake_evaluate(kg, input, output, @@ -147,7 +167,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, i, offset, sample); -#endif +# endif } else { kernel_shader_evaluate(kg, @@ -158,17 +178,26 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, i, sample); } +#endif /* KERNEL_STUB */ } #else /* __SPLIT_KERNEL__ */ /* Split Kernel Path Tracing */ -#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ +#ifdef KERNEL_STUB +# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ + { \ + STUB_ASSERT(KERNEL_ARCH, name); \ + } +#else +# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ { \ kernel_##name(kg); \ } +#endif /* KERNEL_STUB */ #define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ @@ -194,42 +223,10 @@ DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint) - -void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func)) -{ -#define REGISTER_NAME_STRING(name) #name -#define REGISTER_EVAL_NAME(name) REGISTER_NAME_STRING(name) -#define REGISTER(name) reg(REGISTER_EVAL_NAME(KERNEL_FUNCTION_FULL_NAME(name)), (void*)KERNEL_FUNCTION_FULL_NAME(name)); - - REGISTER(path_trace); - REGISTER(convert_to_byte); - REGISTER(convert_to_half_float); - REGISTER(shader); - - REGISTER(data_init); - REGISTER(path_init); - REGISTER(scene_intersect); - REGISTER(lamp_emission); - REGISTER(do_volume); - REGISTER(queue_enqueue); - REGISTER(indirect_background); - REGISTER(shader_setup); - REGISTER(shader_sort); - REGISTER(shader_eval); - REGISTER(holdout_emission_blurring_pathtermination_ao); - REGISTER(subsurface_scatter); - REGISTER(direct_lighting); - REGISTER(shadow_blocked_ao); - REGISTER(shadow_blocked_dl); - REGISTER(next_iteration_setup); - REGISTER(indirect_subsurface); - REGISTER(buffer_update); - -#undef REGISTER -#undef REGISTER_EVAL_NAME -#undef REGISTER_NAME_STRING -} - #endif /* __SPLIT_KERNEL__ */ +#undef KERNEL_STUB +#undef STUB_ASSERT +#undef KERNEL_ARCH + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp index 27a746a0799..6ba3425a343 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp @@ -17,22 +17,25 @@ /* Optimized CPU kernel entry points. This file is compiled with AVX * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ - -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -#endif #define __SPLIT_KERNEL__ #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_avx -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_avx +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp index 364d279a189..76b2d77ebb8 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp @@ -18,23 +18,25 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE__ -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -# define __KERNEL_AVX__ -# define __KERNEL_AVX2__ -#endif - #define __SPLIT_KERNEL__ #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_avx2 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# define __KERNEL_AVX2__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_avx2 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp index 0afb481296f..b468b6f44c8 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp @@ -18,17 +18,19 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -#endif - #define __SPLIT_KERNEL__ #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse2 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp index 13d00813591..3e5792d0b17 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp @@ -18,19 +18,21 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -#endif - #define __SPLIT_KERNEL__ #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse3 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp index a4312071edc..3629f21cd29 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp @@ -18,20 +18,22 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -#endif - #define __SPLIT_KERNEL__ #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse41 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp index 1acfaa91ac9..57530c88710 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp @@ -18,15 +18,17 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -#endif - #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse2 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse2 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp index f7b6a2e21fe..c607753bc4b 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp @@ -18,17 +18,19 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -#endif - #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse3 -# include "kernel/kernels/cpu/kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse3 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp index 1900c6e3012..a278554731c 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp +++ b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp @@ -18,18 +18,20 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ -/* SSE optimization disabled for now on 32 bit, see bug #36316 */ -#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) -# define __KERNEL_SSE2__ -# define __KERNEL_SSE3__ -# define __KERNEL_SSSE3__ -# define __KERNEL_SSE41__ -#endif - #include "util/util_optimization.h" -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 -# include "kernel/kernel.h" -# define KERNEL_ARCH cpu_sse41 -# include "kernel/kernels/cpu//kernel_cpu_impl.h" +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# endif #endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ + +#include "kernel/kernel.h" +#define KERNEL_ARCH cpu_sse41 +#include "kernel/kernels/cpu/kernel_cpu_impl.h" diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu new file mode 100644 index 00000000000..50f73f9728d --- /dev/null +++ b/intern/cycles/kernel/kernels/cuda/filter.cu @@ -0,0 +1,235 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CUDA kernel entry points */ + +#ifdef __CUDA_ARCH__ + +#include "kernel_config.h" + +#include "kernel/kernel_compat_cuda.h" + +#include "kernel/filter/filter_kernel.h" + +/* kernels */ + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_divide_shadow(int sample, + TilesInfo *tiles, + float *unfilteredA, + float *unfilteredB, + float *sampleVariance, + float *sampleVarianceV, + float *bufferVariance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ + int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; + int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_divide_shadow(sample, + tiles, + x, y, + unfilteredA, + unfilteredB, + sampleVariance, + sampleVarianceV, + bufferVariance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_get_feature(int sample, + TilesInfo *tiles, + int m_offset, + int v_offset, + float *mean, + float *variance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + bool use_split_variance) +{ + int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; + int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_get_feature(sample, + tiles, + m_offset, v_offset, + x, y, + mean, variance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r) +{ + int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; + int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_construct_transform(float const* __restrict__ buffer, + float *transform, int *rank, + int4 filter_area, int4 rect, + int radius, float pca_threshold, + int pass_stride) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x; + int y = blockDim.y*blockIdx.y + threadIdx.y; + if(x < filter_area.z && y < filter_area.w) { + int *l_rank = rank + y*filter_area.z + x; + float *l_transform = transform + y*filter_area.z + x; + kernel_filter_construct_transform(buffer, + x + filter_area.x, y + filter_area.y, + rect, pass_stride, + l_transform, l_rank, + radius, pca_threshold, + filter_area.z*filter_area.w, + threadIdx.y*blockDim.x + threadIdx.x); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_calc_difference(int dx, int dy, + float ccl_restrict_ptr weightImage, + float ccl_restrict_ptr varianceImage, + float *differenceImage, + int4 rect, int w, + int channel_offset, + float a, float k_2) { + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_calc_difference(x, y, dx, dy, weightImage, varianceImage, differenceImage, rect, w, channel_offset, a, k_2); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) { + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_blur(x, y, differenceImage, outImage, rect, w, f); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) { + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_calc_weight(x, y, differenceImage, outImage, rect, w, f); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_update_output(int dx, int dy, + float ccl_restrict_ptr differenceImage, + float ccl_restrict_ptr image, + float *outImage, float *accumImage, + int4 rect, int w, + int f) { + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_update_output(x, y, dx, dy, differenceImage, image, outImage, accumImage, rect, w, f); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w) { + int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; + int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_normalize(x, y, outImage, accumImage, rect, w); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_nlm_construct_gramian(int dx, int dy, + float ccl_restrict_ptr differenceImage, + float ccl_restrict_ptr buffer, + float *color_pass, + float *variance_pass, + float const* __restrict__ transform, + int *rank, + float *XtWX, + float3 *XtWY, + int4 rect, + int4 filter_rect, + int w, int h, int f, + int pass_stride) { + int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x); + int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y); + if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) { + kernel_filter_nlm_construct_gramian(x, y, + dx, dy, + differenceImage, + buffer, + color_pass, variance_pass, + transform, rank, + XtWX, XtWY, + rect, filter_rect, + w, h, f, + pass_stride, + threadIdx.y*blockDim.x + threadIdx.x); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_finalize(int w, int h, + float *buffer, int *rank, + float *XtWX, float3 *XtWY, + int4 filter_area, int4 buffer_params, + int sample) { + int x = blockDim.x*blockIdx.x + threadIdx.x; + int y = blockDim.y*blockIdx.y + threadIdx.y; + if(x < filter_area.z && y < filter_area.w) { + int storage_ofs = y*filter_area.z+x; + rank += storage_ofs; + XtWX += storage_ofs; + XtWY += storage_ofs; + kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample); + } +} + +#endif + diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl new file mode 100644 index 00000000000..3d82bff9892 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/filter.cl @@ -0,0 +1,262 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* OpenCL kernel entry points */ + +#include "kernel/kernel_compat_opencl.h" + +#include "kernel/filter/filter_kernel.h" + +/* kernels */ + +__kernel void kernel_ocl_filter_divide_shadow(int sample, + ccl_global TilesInfo *tiles, + ccl_global float *unfilteredA, + ccl_global float *unfilteredB, + ccl_global float *sampleVariance, + ccl_global float *sampleVarianceV, + ccl_global float *bufferVariance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + char use_split_variance) +{ + int x = prefilter_rect.x + get_global_id(0); + int y = prefilter_rect.y + get_global_id(1); + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_divide_shadow(sample, + tiles, + x, y, + unfilteredA, + unfilteredB, + sampleVariance, + sampleVarianceV, + bufferVariance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); + } +} + +__kernel void kernel_ocl_filter_get_feature(int sample, + ccl_global TilesInfo *tiles, + int m_offset, + int v_offset, + ccl_global float *mean, + ccl_global float *variance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, + char use_split_variance) +{ + int x = prefilter_rect.x + get_global_id(0); + int y = prefilter_rect.y + get_global_id(1); + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_get_feature(sample, + tiles, + m_offset, v_offset, + x, y, + mean, variance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset, + use_split_variance); + } +} + +__kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean, + ccl_global float *variance, + ccl_global float *a, + ccl_global float *b, + int4 prefilter_rect, + int r) +{ + int x = prefilter_rect.x + get_global_id(0); + int y = prefilter_rect.y + get_global_id(1); + if(x < prefilter_rect.z && y < prefilter_rect.w) { + kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r); + } +} + +__kernel void kernel_ocl_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer, + ccl_global float *transform, + ccl_global int *rank, + int4 filter_area, + int4 rect, + int pass_stride, + int radius, + float pca_threshold) +{ + int x = get_global_id(0); + int y = get_global_id(1); + if(x < filter_area.z && y < filter_area.w) { + ccl_global int *l_rank = rank + y*filter_area.z + x; + ccl_global float *l_transform = transform + y*filter_area.z + x; + kernel_filter_construct_transform(buffer, + x + filter_area.x, y + filter_area.y, + rect, pass_stride, + l_transform, l_rank, + radius, pca_threshold, + filter_area.z*filter_area.w, + get_local_id(1)*get_local_size(0) + get_local_id(0)); + } +} + +__kernel void kernel_ocl_filter_nlm_calc_difference(int dx, + int dy, + ccl_global float ccl_restrict_ptr weightImage, + ccl_global float ccl_restrict_ptr varianceImage, + ccl_global float *differenceImage, + int4 rect, + int w, + int channel_offset, + float a, + float k_2) { + int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_calc_difference(x, y, dx, dy, weightImage, varianceImage, differenceImage, rect, w, channel_offset, a, k_2); + } +} + +__kernel void kernel_ocl_filter_nlm_blur(ccl_global float ccl_restrict_ptr differenceImage, + ccl_global float *outImage, + int4 rect, + int w, + int f) { + int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_blur(x, y, differenceImage, outImage, rect, w, f); + } +} + +__kernel void kernel_ocl_filter_nlm_calc_weight(ccl_global float ccl_restrict_ptr differenceImage, + ccl_global float *outImage, + int4 rect, + int w, + int f) { + int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_calc_weight(x, y, differenceImage, outImage, rect, w, f); + } +} + +__kernel void kernel_ocl_filter_nlm_update_output(int dx, + int dy, + ccl_global float ccl_restrict_ptr differenceImage, + ccl_global float ccl_restrict_ptr image, + ccl_global float *outImage, + ccl_global float *accumImage, + int4 rect, + int w, + int f) { + int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_update_output(x, y, dx, dy, differenceImage, image, outImage, accumImage, rect, w, f); + } +} + +__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *outImage, + ccl_global float ccl_restrict_ptr accumImage, + int4 rect, + int w) { + int x = get_global_id(0) + rect.x; + int y = get_global_id(1) + rect.y; + if(x < rect.z && y < rect.w) { + kernel_filter_nlm_normalize(x, y, outImage, accumImage, rect, w); + } +} + +__kernel void kernel_ocl_filter_nlm_construct_gramian(int dx, + int dy, + ccl_global float ccl_restrict_ptr differenceImage, + ccl_global float ccl_restrict_ptr buffer, + ccl_global float *color_pass, + ccl_global float *variance_pass, + ccl_global float ccl_restrict_ptr transform, + ccl_global int *rank, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 rect, + int4 filter_rect, + int w, + int h, + int f, + int pass_stride) { + int x = get_global_id(0) + max(0, rect.x-filter_rect.x); + int y = get_global_id(1) + max(0, rect.y-filter_rect.y); + if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) { + kernel_filter_nlm_construct_gramian(x, y, + dx, dy, + differenceImage, + buffer, + color_pass, variance_pass, + transform, rank, + XtWX, XtWY, + rect, filter_rect, + w, h, f, + pass_stride, + get_local_id(1)*get_local_size(0) + get_local_id(0)); + } +} + +__kernel void kernel_ocl_filter_finalize(int w, + int h, + ccl_global float *buffer, + ccl_global int *rank, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 filter_area, + int4 buffer_params, + int sample) { + int x = get_global_id(0); + int y = get_global_id(1); + if(x < filter_area.z && y < filter_area.w) { + int storage_ofs = y*filter_area.z+x; + rank += storage_ofs; + XtWX += storage_ofs; + XtWY += storage_ofs; + kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample); + } +} + +__kernel void kernel_ocl_filter_set_tiles(ccl_global TilesInfo* tiles, + ccl_global float *buffer_1, + ccl_global float *buffer_2, + ccl_global float *buffer_3, + ccl_global float *buffer_4, + ccl_global float *buffer_5, + ccl_global float *buffer_6, + ccl_global float *buffer_7, + ccl_global float *buffer_8, + ccl_global float *buffer_9) +{ + if((get_global_id(0) == 0) && (get_global_id(1) == 0)) { + tiles->buffers[0] = buffer_1; + tiles->buffers[1] = buffer_2; + tiles->buffers[2] = buffer_3; + tiles->buffers[3] = buffer_4; + tiles->buffers[4] = buffer_5; + tiles->buffers[5] = buffer_6; + tiles->buffers[6] = buffer_7; + tiles->buffers[7] = buffer_8; + tiles->buffers[8] = buffer_9; + } +} diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h index c7bc1b4df0a..dc74a2ada53 100644 --- a/intern/cycles/kernel/split/kernel_branched.h +++ b/intern/cycles/kernel/split/kernel_branched.h @@ -76,6 +76,26 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter( RNG rng = kernel_split_state.rng[ray_index]; PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; float3 throughput = branched_state->throughput; + ccl_global PathState *ps = &kernel_split_state.path_state[ray_index]; + + float sum_sample_weight = 0.0f; +#ifdef __DENOISING_FEATURES__ + if(ps->denoising_feature_weight > 0.0f) { + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + /* transparency is not handled here, but in outer loop */ + if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) { + continue; + } + + sum_sample_weight += sc->sample_weight; + } + } + else { + sum_sample_weight = 1.0f; + } +#endif /* __DENOISING_FEATURES__ */ for(int i = branched_state->next_closure; i < sd->num_closure; i++) { const ShaderClosure *sc = &sd->closure[i]; @@ -103,7 +123,6 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter( RNG bsdf_rng = cmj_hash(rng, i); for(int j = branched_state->next_sample; j < num_samples; j++) { - ccl_global PathState *ps = &kernel_split_state.path_state[ray_index]; if(reset_path_state) { *ps = branched_state->path_state; } @@ -122,7 +141,8 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter( tp, ps, L, - bsdf_ray)) + bsdf_ray, + sum_sample_weight)) { continue; } diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h index 859c221d976..1f6dce0253c 100644 --- a/intern/cycles/kernel/split/kernel_buffer_update.h +++ b/intern/cycles/kernel/split/kernel_buffer_update.h @@ -111,24 +111,15 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg, buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride; if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { - float3 L_sum; -#ifdef __SHADOW_TRICKS__ - if(state->flag & PATH_RAY_SHADOW_CATCHER) { - L_sum = path_radiance_sum_shadowcatcher(kg, L, L_transparent); - } - else -#endif /* __SHADOW_TRICKS__ */ - { - L_sum = path_radiance_clamp_and_sum(kg, L); - } kernel_write_light_passes(kg, buffer, L, sample); #ifdef __KERNEL_DEBUG__ kernel_write_debug_passes(kg, buffer, state, debug_data, sample); #endif - float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent)); /* accumulate result in output buffer */ - kernel_write_pass_float4(buffer, sample, L_rad); + bool is_shadow_catcher = (state->flag & PATH_RAY_SHADOW_CATCHER); + kernel_write_result(kg, buffer, sample, L, 1.0f - (*L_transparent), is_shadow_catcher); + path_rng_end(kg, rng_state, rng); ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h index 87498910d38..670a557f084 100644 --- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h +++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h @@ -125,7 +125,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( #ifdef __SHADOW_TRICKS__ if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) { if(state->flag & PATH_RAY_CAMERA) { - state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY); + state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO); state->catcher_object = sd->object; if(!kernel_data.background.transparent) { PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; @@ -246,6 +246,8 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( kernel_split_state.throughput[ray_index] = throughput/probability; } } + + kernel_update_denoising_features(kg, sd, state, L); } } diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h index 452b6e45a36..386fbbc4d09 100644 --- a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h +++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h @@ -89,10 +89,10 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg) &shadow)) { /* accumulate */ - path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp); + path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp); } else { - path_radiance_accum_total_light(L, throughput, &L_light); + path_radiance_accum_total_light(L, state, throughput, &L_light); } } diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h index 407f8e784c0..7918c640175 100644 --- a/intern/cycles/kernel/svm/svm_closure.h +++ b/intern/cycles/kernel/svm/svm_closure.h @@ -444,6 +444,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight); if(bsdf) { + bsdf->N = N; sd->flag |= bsdf_transparent_setup(bsdf); } break; @@ -704,6 +705,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight); if(bsdf) { + bsdf->N = N; /* todo: giving a fixed weight here will cause issues when * mixing multiple BSDFS. energy will not be conserved and * the throughput can blow up after multiple bounces. we diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h index c94fa130af7..656357be52d 100644 --- a/intern/cycles/kernel/svm/svm_displace.h +++ b/intern/cycles/kernel/svm/svm_displace.h @@ -63,8 +63,13 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac strength = max(strength, 0.0f); /* compute and output perturbed normal */ - float3 normal_out = normalize(absdet*normal_in - distance*signf(det)*surfgrad); - normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in); + float3 normal_out = safe_normalize(absdet*normal_in - distance*signf(det)*surfgrad); + if(is_zero(normal_out)) { + normal_out = normal_in; + } + else { + normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in); + } if(use_object_space) { object_normal_transform(kg, sd, &normal_out); diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h index 4a09d9f6653..cce4e89e715 100644 --- a/intern/cycles/kernel/svm/svm_geometry.h +++ b/intern/cycles/kernel/svm/svm_geometry.h @@ -37,6 +37,7 @@ ccl_device_inline void svm_node_geometry(KernelGlobals *kg, #ifdef __UV__ case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break; #endif + default: data = make_float3(0.0f, 0.0f, 0.0f); } stack_store_float3(stack, out_offset, data); diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h index 328ff79223b..8e45dbfa5ff 100644 --- a/intern/cycles/kernel/svm/svm_image.h +++ b/intern/cycles/kernel/svm/svm_image.h @@ -317,8 +317,8 @@ ccl_device void svm_node_tex_environment(KernelGlobals *kg, ShaderData *sd, floa float3 co = stack_load_float3(stack, co_offset); float2 uv; - co = normalize(co); - + co = safe_normalize(co); + if(projection == 0) uv = direction_to_equirectangular(co); else diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h index cc9b840b13f..d859cae1708 100644 --- a/intern/cycles/kernel/svm/svm_types.h +++ b/intern/cycles/kernel/svm/svm_types.h @@ -402,7 +402,6 @@ typedef enum ClosureType { CLOSURE_BSDF_DIFFUSE_TOON_ID, /* Glossy */ - CLOSURE_BSDF_GLOSSY_ID, CLOSURE_BSDF_REFLECTION_ID, CLOSURE_BSDF_MICROFACET_GGX_ID, CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID, @@ -423,14 +422,13 @@ typedef enum ClosureType { CLOSURE_BSDF_HAIR_REFLECTION_ID, /* Transmission */ - CLOSURE_BSDF_TRANSMISSION_ID, CLOSURE_BSDF_TRANSLUCENT_ID, CLOSURE_BSDF_REFRACTION_ID, CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID, CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID, + CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID, CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID, CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID, - CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID, CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID, CLOSURE_BSDF_SHARP_GLASS_ID, CLOSURE_BSDF_HAIR_TRANSMISSION_ID, @@ -465,13 +463,16 @@ typedef enum ClosureType { /* watch this, being lazy with memory usage */ #define CLOSURE_IS_BSDF(type) (type <= CLOSURE_BSDF_TRANSPARENT_ID) #define CLOSURE_IS_BSDF_DIFFUSE(type) (type >= CLOSURE_BSDF_DIFFUSE_ID && type <= CLOSURE_BSDF_DIFFUSE_TOON_ID) -#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_GLOSSY_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID) -#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSMISSION_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID) +#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID) +#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSLUCENT_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID) #define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID || type == CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID) +#define CLOSURE_IS_BSDF_TRANSPARENT(type) (type == CLOSURE_BSDF_TRANSPARENT_ID) #define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type >= CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) #define CLOSURE_IS_BSDF_MULTISCATTER(type) (type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID ||\ type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID || \ type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID) +#define CLOSURE_IS_BSDF_MICROFACET(type) ((type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) ||\ + (type >= CLOSURE_BSDF_REFRACTION_ID && type <= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID)) #define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_BURLEY_ID) #define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID) #define CLOSURE_IS_VOLUME(type) (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) @@ -480,7 +481,7 @@ typedef enum ClosureType { #define CLOSURE_IS_BACKGROUND(type) (type == CLOSURE_BACKGROUND_ID) #define CLOSURE_IS_AMBIENT_OCCLUSION(type) (type == CLOSURE_AMBIENT_OCCLUSION_ID) #define CLOSURE_IS_PHASE(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) -#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID) +#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID) #define CLOSURE_IS_PRINCIPLED(type) (type == CLOSURE_BSDF_PRINCIPLED_ID) #define CLOSURE_WEIGHT_CUTOFF 1e-5f |