Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'intern/cycles/kernel')
-rw-r--r--intern/cycles/kernel/CMakeLists.txt100
-rw-r--r--intern/cycles/kernel/closure/bsdf.h18
-rw-r--r--intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h1
-rw-r--r--intern/cycles/kernel/closure/bsdf_diffuse.h1
-rw-r--r--intern/cycles/kernel/closure/bsdf_diffuse_ramp.h1
-rw-r--r--intern/cycles/kernel/closure/bsdf_microfacet.h1
-rw-r--r--intern/cycles/kernel/closure/bsdf_microfacet_multi.h2
-rw-r--r--intern/cycles/kernel/closure/bsdf_oren_nayar.h1
-rw-r--r--intern/cycles/kernel/closure/bsdf_phong_ramp.h1
-rw-r--r--intern/cycles/kernel/closure/bsdf_principled_diffuse.h1
-rw-r--r--intern/cycles/kernel/closure/bsdf_principled_sheen.h1
-rw-r--r--intern/cycles/kernel/closure/bsdf_toon.h1
-rw-r--r--intern/cycles/kernel/closure/bssrdf.h1
-rw-r--r--intern/cycles/kernel/filter/filter.h52
-rw-r--r--intern/cycles/kernel/filter/filter_defines.h38
-rw-r--r--intern/cycles/kernel/filter/filter_features.h120
-rw-r--r--intern/cycles/kernel/filter/filter_features_sse.h95
-rw-r--r--intern/cycles/kernel/filter/filter_kernel.h50
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_cpu.h163
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_gpu.h147
-rw-r--r--intern/cycles/kernel/filter/filter_prefilter.h145
-rw-r--r--intern/cycles/kernel/filter/filter_reconstruction.h103
-rw-r--r--intern/cycles/kernel/filter/filter_transform.h113
-rw-r--r--intern/cycles/kernel/filter/filter_transform_gpu.h117
-rw-r--r--intern/cycles/kernel/filter/filter_transform_sse.h102
-rw-r--r--intern/cycles/kernel/geom/geom_triangle.h6
-rw-r--r--intern/cycles/kernel/kernel.h38
-rw-r--r--intern/cycles/kernel/kernel_accumulate.h98
-rw-r--r--intern/cycles/kernel/kernel_compat_cpu.h2
-rw-r--r--intern/cycles/kernel/kernel_compat_cuda.h4
-rw-r--r--intern/cycles/kernel/kernel_compat_opencl.h2
-rw-r--r--intern/cycles/kernel/kernel_light.h2
-rw-r--r--intern/cycles/kernel/kernel_passes.h203
-rw-r--r--intern/cycles/kernel/kernel_path.h89
-rw-r--r--intern/cycles/kernel/kernel_path_branched.h117
-rw-r--r--intern/cycles/kernel/kernel_path_state.h14
-rw-r--r--intern/cycles/kernel/kernel_path_surface.h23
-rw-r--r--intern/cycles/kernel/kernel_path_volume.h8
-rw-r--r--intern/cycles/kernel/kernel_projection.h3
-rw-r--r--intern/cycles/kernel/kernel_shader.h6
-rw-r--r--intern/cycles/kernel/kernel_types.h89
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter.cpp61
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_avx.cpp39
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_avx2.cpp40
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_cpu.h132
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h259
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_sse2.cpp34
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_sse3.cpp36
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_sse41.cpp37
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_avx.cpp30
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp32
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_cpu.h2
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h79
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp29
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp32
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp20
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp24
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp26
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp20
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp24
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp26
-rw-r--r--intern/cycles/kernel/kernels/cuda/filter.cu235
-rw-r--r--intern/cycles/kernel/kernels/opencl/filter.cl262
-rw-r--r--intern/cycles/kernel/split/kernel_branched.h24
-rw-r--r--intern/cycles/kernel/split/kernel_buffer_update.h15
-rw-r--r--intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h4
-rw-r--r--intern/cycles/kernel/split/kernel_shadow_blocked_dl.h4
-rw-r--r--intern/cycles/kernel/svm/svm_closure.h2
-rw-r--r--intern/cycles/kernel/svm/svm_displace.h9
-rw-r--r--intern/cycles/kernel/svm/svm_geometry.h1
-rw-r--r--intern/cycles/kernel/svm/svm_image.h4
-rw-r--r--intern/cycles/kernel/svm/svm_types.h13
72 files changed, 3216 insertions, 418 deletions
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 9bb0455b9d5..bef869f34b4 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -10,7 +10,23 @@ set(INC_SYS
set(SRC
kernels/cpu/kernel.cpp
+ kernels/cpu/kernel_sse2.cpp
+ kernels/cpu/kernel_sse3.cpp
+ kernels/cpu/kernel_sse41.cpp
+ kernels/cpu/kernel_avx.cpp
+ kernels/cpu/kernel_avx2.cpp
kernels/cpu/kernel_split.cpp
+ kernels/cpu/kernel_split_sse2.cpp
+ kernels/cpu/kernel_split_sse3.cpp
+ kernels/cpu/kernel_split_sse41.cpp
+ kernels/cpu/kernel_split_avx.cpp
+ kernels/cpu/kernel_split_avx2.cpp
+ kernels/cpu/filter.cpp
+ kernels/cpu/filter_sse2.cpp
+ kernels/cpu/filter_sse3.cpp
+ kernels/cpu/filter_sse41.cpp
+ kernels/cpu/filter_avx.cpp
+ kernels/cpu/filter_avx2.cpp
kernels/opencl/kernel.cl
kernels/opencl/kernel_state_buffer_size.cl
kernels/opencl/kernel_split.cl
@@ -32,8 +48,10 @@ set(SRC
kernels/opencl/kernel_next_iteration_setup.cl
kernels/opencl/kernel_indirect_subsurface.cl
kernels/opencl/kernel_buffer_update.cl
+ kernels/opencl/filter.cl
kernels/cuda/kernel.cu
kernels/cuda/kernel_split.cu
+ kernels/cuda/filter.cu
)
set(SRC_BVH_HEADERS
@@ -95,6 +113,8 @@ set(SRC_KERNELS_CPU_HEADERS
kernels/cpu/kernel_cpu.h
kernels/cpu/kernel_cpu_impl.h
kernels/cpu/kernel_cpu_image.h
+ kernels/cpu/filter_cpu.h
+ kernels/cpu/filter_cpu_impl.h
)
set(SRC_KERNELS_CUDA_HEADERS
@@ -190,6 +210,21 @@ set(SRC_GEOM_HEADERS
geom/geom_volume.h
)
+set(SRC_FILTER_HEADERS
+ filter/filter.h
+ filter/filter_defines.h
+ filter/filter_features.h
+ filter/filter_features_sse.h
+ filter/filter_kernel.h
+ filter/filter_nlm_cpu.h
+ filter/filter_nlm_gpu.h
+ filter/filter_prefilter.h
+ filter/filter_reconstruction.h
+ filter/filter_transform.h
+ filter/filter_transform_gpu.h
+ filter/filter_transform_sse.h
+)
+
set(SRC_UTIL_HEADERS
../util/util_atomic.h
../util/util_color.h
@@ -204,6 +239,7 @@ set(SRC_UTIL_HEADERS
../util/util_math_int2.h
../util/util_math_int3.h
../util/util_math_int4.h
+ ../util/util_math_matrix.h
../util/util_static_assert.h
../util/util_transform.h
../util/util_texture.h
@@ -295,23 +331,21 @@ if(WITH_CYCLES_CUDA_BINARIES)
${SRC_CLOSURE_HEADERS}
${SRC_UTIL_HEADERS}
)
+ set(cuda_filter_sources kernels/cuda/filter.cu
+ ${SRC_HEADERS}
+ ${SRC_KERNELS_CUDA_HEADERS}
+ ${SRC_FILTER_HEADERS}
+ ${SRC_UTIL_HEADERS}
+ )
set(cuda_cubins)
- macro(CYCLES_CUDA_KERNEL_ADD arch split experimental)
- if(${split})
- set(cuda_extra_flags "-D__SPLIT__")
- set(cuda_cubin kernel_split)
- else()
- set(cuda_extra_flags "")
- set(cuda_cubin kernel)
- endif()
-
+ macro(CYCLES_CUDA_KERNEL_ADD arch name flags sources experimental)
if(${experimental})
- set(cuda_extra_flags ${cuda_extra_flags} -D__KERNEL_EXPERIMENTAL__)
- set(cuda_cubin ${cuda_cubin}_experimental)
+ set(flags ${flags} -D__KERNEL_EXPERIMENTAL__)
+ set(name ${name}_experimental)
endif()
- set(cuda_cubin ${cuda_cubin}_${arch}.cubin)
+ set(cuda_cubin ${name}_${arch}.cubin)
if(WITH_CYCLES_DEBUG)
set(cuda_debug_flags "-D__KERNEL_DEBUG__")
@@ -325,11 +359,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}")
set(cuda_math_flags "--use_fast_math")
- if(split)
- set(cuda_kernel_src "/kernels/cuda/kernel_split.cu")
- else()
- set(cuda_kernel_src "/kernels/cuda/kernel.cu")
- endif()
+ set(cuda_kernel_src "/kernels/cuda/${name}.cu")
add_custom_command(
OUTPUT ${cuda_cubin}
@@ -343,13 +373,13 @@ if(WITH_CYCLES_CUDA_BINARIES)
${cuda_arch_flags}
${cuda_version_flags}
${cuda_math_flags}
- ${cuda_extra_flags}
+ ${flags}
${cuda_debug_flags}
-I${CMAKE_CURRENT_SOURCE_DIR}/..
-DCCL_NAMESPACE_BEGIN=
-DCCL_NAMESPACE_END=
-DNVCC
- DEPENDS ${cuda_sources})
+ DEPENDS ${sources})
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
list(APPEND cuda_cubins ${cuda_cubin})
@@ -363,11 +393,12 @@ if(WITH_CYCLES_CUDA_BINARIES)
foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
# Compile regular kernel
- CYCLES_CUDA_KERNEL_ADD(${arch} FALSE FALSE)
+ CYCLES_CUDA_KERNEL_ADD(${arch} kernel "" "${cuda_sources}" FALSE)
+ CYCLES_CUDA_KERNEL_ADD(${arch} filter "" "${cuda_filter_sources}" FALSE)
if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES)
# Compile split kernel
- CYCLES_CUDA_KERNEL_ADD(${arch} TRUE FALSE)
+ CYCLES_CUDA_KERNEL_ADD(${arch} kernel_split "-D__SPLIT__" ${cuda_sources} FALSE)
endif()
endforeach()
@@ -388,41 +419,30 @@ include_directories(SYSTEM ${INC_SYS})
set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
+set_source_files_properties(kernels/cpu/filter.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
if(CXX_HAS_SSE)
- list(APPEND SRC
- kernels/cpu/kernel_sse2.cpp
- kernels/cpu/kernel_sse3.cpp
- kernels/cpu/kernel_sse41.cpp
- kernels/cpu/kernel_split_sse2.cpp
- kernels/cpu/kernel_split_sse3.cpp
- kernels/cpu/kernel_split_sse41.cpp
- )
-
set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+ set_source_files_properties(kernels/cpu/filter_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+ set_source_files_properties(kernels/cpu/filter_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
+ set_source_files_properties(kernels/cpu/filter_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX)
- list(APPEND SRC
- kernels/cpu/kernel_avx.cpp
- kernels/cpu/kernel_split_avx.cpp
- )
set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+ set_source_files_properties(kernels/cpu/filter_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX2)
- list(APPEND SRC
- kernels/cpu/kernel_avx2.cpp
- kernels/cpu/kernel_split_avx2.cpp
- )
set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+ set_source_files_properties(kernels/cpu/filter_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
endif()
add_library(cycles_kernel
@@ -432,6 +452,7 @@ add_library(cycles_kernel
${SRC_KERNELS_CUDA_HEADERS}
${SRC_BVH_HEADERS}
${SRC_CLOSURE_HEADERS}
+ ${SRC_FILTER_HEADERS}
${SRC_SVM_HEADERS}
${SRC_GEOM_HEADERS}
${SRC_SPLIT_HEADERS}
@@ -472,12 +493,15 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocke
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_subsurface.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_buffer_update.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/filter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/filter.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util)
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index a6bba8bf74d..a04c157dc40 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -435,5 +435,23 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b)
#endif
}
+/* Classifies a closure as diffuse-like or specular-like.
+ * This is needed for the denoising feature pass generation,
+ * which are written on the first bounce where more than 25%
+ * of the sampling weight belongs to diffuse-line closures. */
+ccl_device_inline bool bsdf_is_specular_like(ShaderClosure *sc)
+{
+ if(CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
+ return true;
+ }
+
+ if(CLOSURE_IS_BSDF_MICROFACET(sc->type)) {
+ MicrofacetBsdf *bsdf = (MicrofacetBsdf*) sc;
+ return (bsdf->alpha_x*bsdf->alpha_y <= 0.075f*0.075f);
+ }
+
+ return false;
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
index 7e0f5a7ec75..a5ba2cb2972 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
@@ -40,7 +40,6 @@ typedef ccl_addr_space struct VelvetBsdf {
float sigma;
float invsigma2;
- float3 N;
} VelvetBsdf;
ccl_device int bsdf_ashikhmin_velvet_setup(VelvetBsdf *bsdf)
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h
index dcd187f9305..ec6f1f20996 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse.h
@@ -37,7 +37,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct DiffuseBsdf {
SHADER_CLOSURE_BASE;
- float3 N;
} DiffuseBsdf;
/* DIFFUSE */
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
index 2d982a95fe4..24f40af46a3 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
@@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct DiffuseRampBsdf {
SHADER_CLOSURE_BASE;
- float3 N;
float3 *colors;
} DiffuseRampBsdf;
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index 3fe7572e4ce..30cc8b90330 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -46,7 +46,6 @@ typedef ccl_addr_space struct MicrofacetBsdf {
float alpha_x, alpha_y, ior;
MicrofacetExtra *extra;
float3 T;
- float3 N;
} MicrofacetBsdf;
/* Beckmann and GGX microfacet importance sampling. */
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
index 57f1e733ee7..30644946840 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
@@ -42,7 +42,7 @@ ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha)
/* Sample slope distribution (based on page 14 of the supplemental implementation). */
ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU)
{
- if(cosI > 0.9999f || cosI < 1e-6f) {
+ if(cosI > 0.9999f || fabsf(cosI) < 1e-6f) {
const float r = sqrtf(randU.x / max(1.0f - randU.x, 1e-7f));
const float phi = M_2PI_F * randU.y;
return make_float2(r*cosf(phi), r*sinf(phi));
diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
index cb342a026ef..6b770fc0c16 100644
--- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h
+++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
@@ -22,7 +22,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct OrenNayarBsdf {
SHADER_CLOSURE_BASE;
- float3 N;
float roughness;
float a;
float b;
diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
index e152a8780db..420f94755ee 100644
--- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
@@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct PhongRampBsdf {
SHADER_CLOSURE_BASE;
- float3 N;
float exponent;
float3 *colors;
} PhongRampBsdf;
diff --git a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
index 8a116693bdb..215c32e1ffb 100644
--- a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
@@ -28,7 +28,6 @@ typedef ccl_addr_space struct PrincipledDiffuseBsdf {
SHADER_CLOSURE_BASE;
float roughness;
- float3 N;
} PrincipledDiffuseBsdf;
ccl_device float3 calculate_principled_diffuse_brdf(const PrincipledDiffuseBsdf *bsdf,
diff --git a/intern/cycles/kernel/closure/bsdf_principled_sheen.h b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
index 58df4f7ddbb..f4476bfecd0 100644
--- a/intern/cycles/kernel/closure/bsdf_principled_sheen.h
+++ b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
@@ -26,7 +26,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct PrincipledSheenBsdf {
SHADER_CLOSURE_BASE;
- float3 N;
} PrincipledSheenBsdf;
ccl_device float3 calculate_principled_sheen_brdf(const PrincipledSheenBsdf *bsdf,
diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h
index 28e775bcbc8..d8b6d8ddead 100644
--- a/intern/cycles/kernel/closure/bsdf_toon.h
+++ b/intern/cycles/kernel/closure/bsdf_toon.h
@@ -38,7 +38,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct ToonBsdf {
SHADER_CLOSURE_BASE;
- float3 N;
float size;
float smooth;
} ToonBsdf;
diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h
index f9236a6e52c..f733ea4c517 100644
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@@ -28,7 +28,6 @@ typedef ccl_addr_space struct Bssrdf {
float texture_blur;
float albedo;
float roughness;
- float3 N;
} Bssrdf;
/* Planar Truncated Gaussian
diff --git a/intern/cycles/kernel/filter/filter.h b/intern/cycles/kernel/filter/filter.h
new file mode 100644
index 00000000000..f6e474d6702
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FILTER_H__
+#define __FILTER_H__
+
+/* CPU Filter Kernel Interface */
+
+#include "util/util_types.h"
+
+#include "kernel/filter/filter_defines.h"
+
+CCL_NAMESPACE_BEGIN
+
+#define KERNEL_NAME_JOIN(x, y, z) x ## _ ## y ## _ ## z
+#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
+#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
+
+#define KERNEL_ARCH cpu
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+CCL_NAMESPACE_END
+
+#endif /* __FILTER_H__ */
diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h
new file mode 100644
index 00000000000..ce96f733aff
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_defines.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FILTER_DEFINES_H__
+#define __FILTER_DEFINES_H__
+
+#define DENOISE_FEATURES 10
+#define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES)
+#define XTWX_SIZE (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2)
+#define XTWY_SIZE (DENOISE_FEATURES+1)
+
+typedef struct TilesInfo {
+ int offsets[9];
+ int strides[9];
+ int x[4];
+ int y[4];
+ /* TODO(lukas): CUDA doesn't have uint64_t... */
+#ifdef __KERNEL_OPENCL__
+ ccl_global float *buffers[9];
+#else
+ long long int buffers[9];
+#endif
+} TilesInfo;
+
+#endif /* __FILTER_DEFINES_H__*/
diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h
new file mode 100644
index 00000000000..f5a40d49997
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_features.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ CCL_NAMESPACE_BEGIN
+
+#define ccl_get_feature(buffer, pass) buffer[(pass)*pass_stride]
+
+/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).
+ * pixel_buffer always points to the current pixel in the first pass. */
+#define FOR_PIXEL_WINDOW pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
+ for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
+ for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
+
+#define END_FOR_PIXEL_WINDOW } \
+ pixel_buffer += buffer_w - (high.x - low.x); \
+ }
+
+ccl_device_inline void filter_get_features(int2 pixel, ccl_global float ccl_restrict_ptr buffer, float *features, float ccl_restrict_ptr mean, int pass_stride)
+{
+ features[0] = pixel.x;
+ features[1] = pixel.y;
+ features[2] = ccl_get_feature(buffer, 0);
+ features[3] = ccl_get_feature(buffer, 1);
+ features[4] = ccl_get_feature(buffer, 2);
+ features[5] = ccl_get_feature(buffer, 3);
+ features[6] = ccl_get_feature(buffer, 4);
+ features[7] = ccl_get_feature(buffer, 5);
+ features[8] = ccl_get_feature(buffer, 6);
+ features[9] = ccl_get_feature(buffer, 7);
+ if(mean) {
+ for(int i = 0; i < DENOISE_FEATURES; i++)
+ features[i] -= mean[i];
+ }
+}
+
+ccl_device_inline void filter_get_feature_scales(int2 pixel, ccl_global float ccl_restrict_ptr buffer, float *scales, float ccl_restrict_ptr mean, int pass_stride)
+{
+ scales[0] = fabsf(pixel.x - mean[0]);
+ scales[1] = fabsf(pixel.y - mean[1]);
+ scales[2] = fabsf(ccl_get_feature(buffer, 0) - mean[2]);
+ scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3],
+ ccl_get_feature(buffer, 2) - mean[4],
+ ccl_get_feature(buffer, 3) - mean[5]));
+ scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]);
+ scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7],
+ ccl_get_feature(buffer, 6) - mean[8],
+ ccl_get_feature(buffer, 7) - mean[9]));
+}
+
+ccl_device_inline void filter_calculate_scale(float *scale)
+{
+ scale[0] = 1.0f/max(scale[0], 0.01f);
+ scale[1] = 1.0f/max(scale[1], 0.01f);
+ scale[2] = 1.0f/max(scale[2], 0.01f);
+ scale[6] = 1.0f/max(scale[4], 0.01f);
+ scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f);
+ scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f);
+}
+
+ccl_device_inline float3 filter_get_pixel_color(ccl_global float ccl_restrict_ptr buffer, int pass_stride)
+{
+ return make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2));
+}
+
+ccl_device_inline float filter_get_pixel_variance(ccl_global float ccl_restrict_ptr buffer, int pass_stride)
+{
+ return average(make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2)));
+}
+
+ccl_device_inline void design_row_add(float *design_row,
+ int rank,
+ ccl_global float ccl_restrict_ptr transform,
+ int stride,
+ int row,
+ float feature)
+{
+ for(int i = 0; i < rank; i++) {
+ design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature;
+ }
+}
+
+/* Fill the design row. */
+ccl_device_inline void filter_get_design_row_transform(int2 p_pixel,
+ ccl_global float ccl_restrict_ptr p_buffer,
+ int2 q_pixel,
+ ccl_global float ccl_restrict_ptr q_buffer,
+ int pass_stride,
+ int rank,
+ float *design_row,
+ ccl_global float ccl_restrict_ptr transform,
+ int stride)
+{
+ design_row[0] = 1.0f;
+ math_vector_zero(design_row+1, rank);
+ design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x);
+ design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y);
+ design_row_add(design_row, rank, transform, stride, 2, ccl_get_feature(q_buffer, 0) - ccl_get_feature(p_buffer, 0));
+ design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
+ design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
+ design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
+ design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4));
+ design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5));
+ design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6));
+ design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7));
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h
new file mode 100644
index 00000000000..303c8f482e3
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_features_sse.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride)
+
+/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time.
+ * pixel_buffer always points to the first of the 4 current pixel in the first pass.
+ * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. */
+
+#define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
+ for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
+ __m128 y4 = _mm_set1_ps(pixel.y); \
+ for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
+ __m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \
+ __m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x));
+
+#define END_FOR_PIXEL_WINDOW_SSE } \
+ pixel_buffer += buffer_w - (pixel.x - low.x); \
+ }
+
+ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *features, __m128 ccl_restrict_ptr mean, int pass_stride)
+{
+ features[0] = x;
+ features[1] = y;
+ features[2] = ccl_get_feature_sse(0);
+ features[3] = ccl_get_feature_sse(1);
+ features[4] = ccl_get_feature_sse(2);
+ features[5] = ccl_get_feature_sse(3);
+ features[6] = ccl_get_feature_sse(4);
+ features[7] = ccl_get_feature_sse(5);
+ features[8] = ccl_get_feature_sse(6);
+ features[9] = ccl_get_feature_sse(7);
+ if(mean) {
+ for(int i = 0; i < DENOISE_FEATURES; i++)
+ features[i] = _mm_sub_ps(features[i], mean[i]);
+ }
+ for(int i = 0; i < DENOISE_FEATURES; i++)
+ features[i] = _mm_mask_ps(features[i], active_pixels);
+}
+
+ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *scales, __m128 ccl_restrict_ptr mean, int pass_stride)
+{
+ scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels);
+ scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels);
+
+ scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(0), mean[2])), active_pixels);
+
+ __m128 diff, scale;
+ diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]);
+ scale = _mm_mul_ps(diff, diff);
+ diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]);
+ scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
+ diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]);
+ scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
+ scales[3] = _mm_mask_ps(scale, active_pixels);
+
+ scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels);
+
+ diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]);
+ scale = _mm_mul_ps(diff, diff);
+ diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]);
+ scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
+ diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]);
+ scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
+ scales[5] = _mm_mask_ps(scale, active_pixels);
+}
+
+ccl_device_inline void filter_calculate_scale_sse(__m128 *scale)
+{
+ scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f)));
+ scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f)));
+ scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f)));
+ scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f)));
+
+ scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f)));
+ scale[3] = scale[4] = scale[5] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f)));
+}
+
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_kernel.h b/intern/cycles/kernel/filter/filter_kernel.h
new file mode 100644
index 00000000000..2ef03dc0a02
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_kernel.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "util/util_color.h"
+#include "util/util_math.h"
+#include "util/util_math_fast.h"
+#include "util/util_texture.h"
+
+#include "util/util_atomic.h"
+#include "util/util_math_matrix.h"
+
+#include "kernel/filter/filter_defines.h"
+
+#include "kernel/filter/filter_features.h"
+#ifdef __KERNEL_SSE3__
+# include "kernel/filter/filter_features_sse.h"
+#endif
+
+#include "kernel/filter/filter_prefilter.h"
+
+#ifdef __KERNEL_GPU__
+# include "kernel/filter/filter_transform_gpu.h"
+#else
+# ifdef __KERNEL_SSE3__
+# include "kernel/filter/filter_transform_sse.h"
+# else
+# include "kernel/filter/filter_transform.h"
+# endif
+#endif
+
+#include "kernel/filter/filter_reconstruction.h"
+
+#ifdef __KERNEL_CPU__
+# include "kernel/filter/filter_nlm_cpu.h"
+#else
+# include "kernel/filter/filter_nlm_gpu.h"
+#endif
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
new file mode 100644
index 00000000000..1a314b100be
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, float ccl_restrict_ptr weightImage, float ccl_restrict_ptr varianceImage, float *differenceImage, int4 rect, int w, int channel_offset, float a, float k_2)
+{
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ float diff = 0.0f;
+ int numChannels = channel_offset? 3 : 1;
+ for(int c = 0; c < numChannels; c++) {
+ float cdiff = weightImage[c*channel_offset + y*w+x] - weightImage[c*channel_offset + (y+dy)*w+(x+dx)];
+ float pvar = varianceImage[c*channel_offset + y*w+x];
+ float qvar = varianceImage[c*channel_offset + (y+dy)*w+(x+dx)];
+ diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
+ }
+ if(numChannels > 1) {
+ diff *= 1.0f/numChannels;
+ }
+ differenceImage[y*w+x] = diff;
+ }
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f)
+{
+#ifdef __KERNEL_SSE3__
+ int aligned_lowx = (rect.x & ~(3));
+ int aligned_highx = ((rect.z + 3) & ~(3));
+#endif
+ for(int y = rect.y; y < rect.w; y++) {
+ const int low = max(rect.y, y-f);
+ const int high = min(rect.w, y+f+1);
+ for(int x = rect.x; x < rect.z; x++) {
+ outImage[y*w+x] = 0.0f;
+ }
+ for(int y1 = low; y1 < high; y1++) {
+#ifdef __KERNEL_SSE3__
+ for(int x = aligned_lowx; x < aligned_highx; x+=4) {
+ _mm_store_ps(outImage + y*w+x, _mm_add_ps(_mm_load_ps(outImage + y*w+x), _mm_load_ps(differenceImage + y1*w+x)));
+ }
+#else
+ for(int x = rect.x; x < rect.z; x++) {
+ outImage[y*w+x] += differenceImage[y1*w+x];
+ }
+#endif
+ }
+ for(int x = rect.x; x < rect.z; x++) {
+ outImage[y*w+x] *= 1.0f/(high - low);
+ }
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f)
+{
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ outImage[y*w+x] = 0.0f;
+ }
+ }
+ for(int dx = -f; dx <= f; dx++) {
+ int pos_dx = max(0, dx);
+ int neg_dx = min(0, dx);
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) {
+ outImage[y*w+x] += differenceImage[y*w+dx+x];
+ }
+ }
+ }
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ outImage[y*w+x] = expf(-max(outImage[y*w+x] * (1.0f/(high - low)), 0.0f));
+ }
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, float ccl_restrict_ptr differenceImage, float ccl_restrict_ptr image, float *outImage, float *accumImage, int4 rect, int w, int f)
+{
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ float sum = 0.0f;
+ for(int x1 = low; x1 < high; x1++) {
+ sum += differenceImage[y*w+x1];
+ }
+ float weight = sum * (1.0f/(high - low));
+ accumImage[y*w+x] += weight;
+ outImage[y*w+x] += weight*image[(y+dy)*w+(x+dx)];
+ }
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
+ float ccl_restrict_ptr differenceImage,
+ float ccl_restrict_ptr buffer,
+ float *color_pass,
+ float *variance_pass,
+ float *transform,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int4 rect,
+ int4 filter_rect,
+ int w, int h, int f,
+ int pass_stride)
+{
+ /* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
+ for(int fy = max(0, rect.y-filter_rect.y); fy < min(filter_rect.w, rect.w-filter_rect.y); fy++) {
+ int y = fy + filter_rect.y;
+ for(int fx = max(0, rect.x-filter_rect.x); fx < min(filter_rect.z, rect.z-filter_rect.x); fx++) {
+ int x = fx + filter_rect.x;
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ float sum = 0.0f;
+ for(int x1 = low; x1 < high; x1++) {
+ sum += differenceImage[y*w+x1];
+ }
+ float weight = sum * (1.0f/(high - low));
+
+ int storage_ofs = fy*filter_rect.z + fx;
+ float *l_transform = transform + storage_ofs*TRANSFORM_SIZE;
+ float *l_XtWX = XtWX + storage_ofs*XTWX_SIZE;
+ float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE;
+ int *l_rank = rank + storage_ofs;
+
+ kernel_filter_construct_gramian(x, y, 1,
+ dx, dy, w, h,
+ pass_stride,
+ buffer,
+ color_pass, variance_pass,
+ l_transform, l_rank,
+ weight, l_XtWX, l_XtWY, 0);
+ }
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w)
+{
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ outImage[y*w+x] /= accumImage[y*w+x];
+ }
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h
new file mode 100644
index 00000000000..b5ba7cf51a5
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
+ int dx, int dy,
+ ccl_global float ccl_restrict_ptr weightImage,
+ ccl_global float ccl_restrict_ptr varianceImage,
+ ccl_global float *differenceImage,
+ int4 rect, int w,
+ int channel_offset,
+ float a, float k_2)
+{
+ float diff = 0.0f;
+ int numChannels = channel_offset? 3 : 1;
+ for(int c = 0; c < numChannels; c++) {
+ float cdiff = weightImage[c*channel_offset + y*w+x] - weightImage[c*channel_offset + (y+dy)*w+(x+dx)];
+ float pvar = varianceImage[c*channel_offset + y*w+x];
+ float qvar = varianceImage[c*channel_offset + (y+dy)*w+(x+dx)];
+ diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
+ }
+ if(numChannels > 1) {
+ diff *= 1.0f/numChannels;
+ }
+ differenceImage[y*w+x] = diff;
+}
+
+ccl_device_inline void kernel_filter_nlm_blur(int x, int y,
+ ccl_global float ccl_restrict_ptr differenceImage,
+ ccl_global float *outImage,
+ int4 rect, int w, int f)
+{
+ float sum = 0.0f;
+ const int low = max(rect.y, y-f);
+ const int high = min(rect.w, y+f+1);
+ for(int y1 = low; y1 < high; y1++) {
+ sum += differenceImage[y1*w+x];
+ }
+ sum *= 1.0f/(high-low);
+ outImage[y*w+x] = sum;
+}
+
+ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y,
+ ccl_global float ccl_restrict_ptr differenceImage,
+ ccl_global float *outImage,
+ int4 rect, int w, int f)
+{
+ float sum = 0.0f;
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ for(int x1 = low; x1 < high; x1++) {
+ sum += differenceImage[y*w+x1];
+ }
+ sum *= 1.0f/(high-low);
+ outImage[y*w+x] = expf(-max(sum, 0.0f));
+}
+
+ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
+ int dx, int dy,
+ ccl_global float ccl_restrict_ptr differenceImage,
+ ccl_global float ccl_restrict_ptr image,
+ ccl_global float *outImage,
+ ccl_global float *accumImage,
+ int4 rect, int w, int f)
+{
+ float sum = 0.0f;
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ for(int x1 = low; x1 < high; x1++) {
+ sum += differenceImage[y*w+x1];
+ }
+ sum *= 1.0f/(high-low);
+ if(outImage) {
+ accumImage[y*w+x] += sum;
+ outImage[y*w+x] += sum*image[(y+dy)*w+(x+dx)];
+ }
+ else {
+ accumImage[y*w+x] = sum;
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy,
+ int dx, int dy,
+ ccl_global float ccl_restrict_ptr differenceImage,
+ ccl_global float ccl_restrict_ptr buffer,
+ ccl_global float *color_pass,
+ ccl_global float *variance_pass,
+ ccl_global float ccl_restrict_ptr transform,
+ ccl_global int *rank,
+ ccl_global float *XtWX,
+ ccl_global float3 *XtWY,
+ int4 rect,
+ int4 filter_rect,
+ int w, int h, int f,
+ int pass_stride,
+ int localIdx)
+{
+ int y = fy + filter_rect.y;
+ int x = fx + filter_rect.x;
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ float sum = 0.0f;
+ for(int x1 = low; x1 < high; x1++) {
+ sum += differenceImage[y*w+x1];
+ }
+ float weight = sum * (1.0f/(high - low));
+
+ int storage_ofs = fy*filter_rect.z + fx;
+ transform += storage_ofs;
+ rank += storage_ofs;
+ XtWX += storage_ofs;
+ XtWY += storage_ofs;
+
+ kernel_filter_construct_gramian(x, y,
+ filter_rect.z*filter_rect.w,
+ dx, dy, w, h,
+ pass_stride,
+ buffer,
+ color_pass, variance_pass,
+ transform, rank,
+ weight, XtWX, XtWY,
+ localIdx);
+}
+
+ccl_device_inline void kernel_filter_nlm_normalize(int x, int y,
+ ccl_global float *outImage,
+ ccl_global float ccl_restrict_ptr accumImage,
+ int4 rect, int w)
+{
+ outImage[y*w+x] /= accumImage[y*w+x];
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h
new file mode 100644
index 00000000000..54bcf888052
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_prefilter.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* First step of the shadow prefiltering, performs the shadow division and stores all data
+ * in a nice and easy rectangular array that can be passed to the NLM filter.
+ *
+ * Calculates:
+ * unfiltered: Contains the two half images of the shadow feature pass
+ * sampleVariance: The sample-based variance calculated in the kernel. Note: This calculation is biased in general, and especially here since the variance of the ratio can only be approximated.
+ * sampleVarianceV: Variance of the sample variance estimation, quite noisy (since it's essentially the buffer variance of the two variance halves)
+ * bufferVariance: The buffer-based variance of the shadow feature. Unbiased, but quite noisy.
+ */
+ccl_device void kernel_filter_divide_shadow(int sample,
+ ccl_global TilesInfo *tiles,
+ int x, int y,
+ ccl_global float *unfilteredA,
+ ccl_global float *unfilteredB,
+ ccl_global float *sampleVariance,
+ ccl_global float *sampleVarianceV,
+ ccl_global float *bufferVariance,
+ int4 rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+ int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
+ int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
+ int tile = ytile*3+xtile;
+
+ int offset = tiles->offsets[tile];
+ int stride = tiles->strides[tile];
+ ccl_global float ccl_restrict_ptr center_buffer = (ccl_global float*) tiles->buffers[tile];
+ center_buffer += (y*stride + x + offset)*buffer_pass_stride;
+ center_buffer += buffer_denoising_offset + 14;
+
+ int buffer_w = align_up(rect.z - rect.x, 4);
+ int idx = (y-rect.y)*buffer_w + (x - rect.x);
+ unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f);
+ unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f);
+
+ float varA = center_buffer[2];
+ float varB = center_buffer[5];
+ int odd_sample = (sample+1)/2;
+ int even_sample = sample/2;
+ if(use_split_variance) {
+ varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample);
+ varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample);
+ }
+ varA /= (odd_sample - 1);
+ varB /= (even_sample - 1);
+
+ sampleVariance[idx] = 0.5f*(varA + varB) / sample;
+ sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample*sample);
+ bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) * (unfilteredA[idx] - unfilteredB[idx]);
+}
+
+/* Load a regular feature from the render buffers into the denoise buffer.
+ * Parameters:
+ * - sample: The sample amount in the buffer, used to normalize the buffer.
+ * - m_offset, v_offset: Render Buffer Pass offsets of mean and variance of the feature.
+ * - x, y: Current pixel
+ * - mean, variance: Target denoise buffers.
+ * - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive).
+ */
+ccl_device void kernel_filter_get_feature(int sample,
+ ccl_global TilesInfo *tiles,
+ int m_offset, int v_offset,
+ int x, int y,
+ ccl_global float *mean,
+ ccl_global float *variance,
+ int4 rect, int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+ int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
+ int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
+ int tile = ytile*3+xtile;
+ ccl_global float *center_buffer = ((ccl_global float*) tiles->buffers[tile]) + (tiles->offsets[tile] + y*tiles->strides[tile] + x)*buffer_pass_stride + buffer_denoising_offset;
+
+ int buffer_w = align_up(rect.z - rect.x, 4);
+ int idx = (y-rect.y)*buffer_w + (x - rect.x);
+
+ mean[idx] = center_buffer[m_offset] / sample;
+ if(use_split_variance) {
+ variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1)));
+ }
+ else {
+ variance[idx] = center_buffer[v_offset] / (sample * (sample-1));
+ }
+}
+
+/* Combine A/B buffers.
+ * Calculates the combined mean and the buffer variance. */
+ccl_device void kernel_filter_combine_halves(int x, int y,
+ ccl_global float *mean,
+ ccl_global float *variance,
+ ccl_global float *a,
+ ccl_global float *b,
+ int4 rect, int r)
+{
+ int buffer_w = align_up(rect.z - rect.x, 4);
+ int idx = (y-rect.y)*buffer_w + (x - rect.x);
+
+ if(mean) mean[idx] = 0.5f * (a[idx]+b[idx]);
+ if(variance) {
+ if(r == 0) variance[idx] = 0.25f * (a[idx]-b[idx])*(a[idx]-b[idx]);
+ else {
+ variance[idx] = 0.0f;
+ float values[25];
+ int numValues = 0;
+ for(int py = max(y-r, rect.y); py < min(y+r+1, rect.w); py++) {
+ for(int px = max(x-r, rect.x); px < min(x+r+1, rect.z); px++) {
+ int pidx = (py-rect.y)*buffer_w + (px-rect.x);
+ values[numValues++] = 0.25f * (a[pidx]-b[pidx])*(a[pidx]-b[pidx]);
+ }
+ }
+ /* Insertion-sort the variances (fast enough for 25 elements). */
+ for(int i = 1; i < numValues; i++) {
+ float v = values[i];
+ int j;
+ for(j = i-1; j >= 0 && values[j] > v; j--)
+ values[j+1] = values[j];
+ values[j+1] = v;
+ }
+ variance[idx] = values[(7*numValues)/8];
+ }
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h
new file mode 100644
index 00000000000..02f3802fa0c
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_reconstruction.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
+ int storage_stride,
+ int dx, int dy,
+ int w, int h,
+ int pass_stride,
+ ccl_global float ccl_restrict_ptr buffer,
+ ccl_global float *color_pass,
+ ccl_global float *variance_pass,
+ ccl_global float ccl_restrict_ptr transform,
+ ccl_global int *rank,
+ float weight,
+ ccl_global float *XtWX,
+ ccl_global float3 *XtWY,
+ int localIdx)
+{
+ int p_offset = y *w + x;
+ int q_offset = (y+dy)*w + (x+dx);
+
+#ifdef __KERNEL_CPU__
+ const int stride = 1;
+ (void)storage_stride;
+ (void)localIdx;
+ float design_row[DENOISE_FEATURES+1];
+#elif defined(__KERNEL_CUDA__)
+ const int stride = storage_stride;
+ ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE];
+ ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1);
+#else
+ const int stride = storage_stride;
+ float design_row[DENOISE_FEATURES+1];
+#endif
+
+ float3 p_color = filter_get_pixel_color(color_pass + p_offset, pass_stride);
+ float3 q_color = filter_get_pixel_color(color_pass + q_offset, pass_stride);
+
+ float p_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + p_offset, pass_stride));
+ float q_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + q_offset, pass_stride));
+
+ if(average(fabs(p_color - q_color)) > 3.0f*(p_std_dev + q_std_dev + 1e-3f)) {
+ return;
+ }
+
+ filter_get_design_row_transform(make_int2(x, y), buffer + p_offset,
+ make_int2(x+dx, y+dy), buffer + q_offset,
+ pass_stride, *rank, design_row, transform, stride);
+
+ math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride);
+ math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride);
+}
+
+ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
+ ccl_global float *buffer,
+ ccl_global int *rank,
+ int storage_stride,
+ ccl_global float *XtWX,
+ ccl_global float3 *XtWY,
+ int4 buffer_params,
+ int sample)
+{
+#ifdef __KERNEL_CPU__
+ const int stride = 1;
+ (void)storage_stride;
+#else
+ const int stride = storage_stride;
+#endif
+
+ math_trimatrix_vec3_solve(XtWX, XtWY, (*rank)+1, stride);
+
+ float3 final_color = XtWY[0];
+
+ ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
+ final_color *= sample;
+ if(buffer_params.w) {
+ final_color.x += combined_buffer[buffer_params.w+0];
+ final_color.y += combined_buffer[buffer_params.w+1];
+ final_color.z += combined_buffer[buffer_params.w+2];
+ }
+ combined_buffer[0] = final_color.x;
+ combined_buffer[1] = final_color.y;
+ combined_buffer[2] = final_color.z;
+}
+
+#undef STORAGE_TYPE
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h
new file mode 100644
index 00000000000..139dc402d21
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_transform.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
+ int x, int y, int4 rect,
+ int pass_stride,
+ float *transform, int *rank,
+ int radius, float pca_threshold)
+{
+ int buffer_w = align_up(rect.z - rect.x, 4);
+
+ float features[DENOISE_FEATURES];
+
+ /* Temporary storage, used in different steps of the algorithm. */
+ float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES];
+ float tempvector[2*DENOISE_FEATURES];
+ float ccl_restrict_ptr pixel_buffer;
+ int2 pixel;
+
+
+
+
+ /* === Calculate denoising window. === */
+ int2 low = make_int2(max(rect.x, x - radius),
+ max(rect.y, y - radius));
+ int2 high = make_int2(min(rect.z, x + radius + 1),
+ min(rect.w, y + radius + 1));
+
+
+
+
+ /* === Shift feature passes to have mean 0. === */
+ float feature_means[DENOISE_FEATURES];
+ math_vector_zero(feature_means, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW {
+ filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
+ math_vector_add(feature_means, features, DENOISE_FEATURES);
+ } END_FOR_PIXEL_WINDOW
+
+ float pixel_scale = 1.0f / ((high.y - low.y) * (high.x - low.x));
+ math_vector_scale(feature_means, pixel_scale, DENOISE_FEATURES);
+
+ /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
+ float *feature_scale = tempvector;
+ math_vector_zero(feature_scale, DENOISE_FEATURES);
+
+ FOR_PIXEL_WINDOW {
+ filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_max(feature_scale, features, DENOISE_FEATURES);
+ } END_FOR_PIXEL_WINDOW
+
+ filter_calculate_scale(feature_scale);
+
+
+ /* === Generate the feature transformation. ===
+ * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space
+ * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
+ float* feature_matrix = tempmatrix;
+ math_matrix_zero(feature_matrix, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW {
+ filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_mul(features, feature_scale, DENOISE_FEATURES);
+ math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+ } END_FOR_PIXEL_WINDOW
+
+ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
+ *rank = 0;
+ if(pca_threshold < 0.0f) {
+ float threshold_energy = 0.0f;
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+ }
+ threshold_energy *= 1.0f - (-pca_threshold);
+
+ float reduced_energy = 0.0f;
+ for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
+ if(i >= 2 && reduced_energy >= threshold_energy)
+ break;
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ reduced_energy += s;
+ }
+ }
+ else {
+ for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ if(i >= 2 && sqrtf(s) < pca_threshold)
+ break;
+ }
+ }
+
+ /* Bake the feature scaling into the transformation matrix. */
+ for(int i = 0; i < (*rank); i++) {
+ math_vector_mul(transform + i*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES);
+ }
+ math_matrix_transpose(transform, DENOISE_FEATURES, 1);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h
new file mode 100644
index 00000000000..68304e14143
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_transform_gpu.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer,
+ int x, int y, int4 rect,
+ int pass_stride,
+ ccl_global float *transform,
+ ccl_global int *rank,
+ int radius, float pca_threshold,
+ int transform_stride, int localIdx)
+{
+ int buffer_w = align_up(rect.z - rect.x, 4);
+
+#ifdef __KERNEL_CUDA__
+ ccl_local float shared_features[DENOISE_FEATURES*CCL_MAX_LOCAL_SIZE];
+ ccl_local_param float *features = shared_features + localIdx*DENOISE_FEATURES;
+#else
+ float features[DENOISE_FEATURES];
+#endif
+
+ /* === Calculate denoising window. === */
+ int2 low = make_int2(max(rect.x, x - radius),
+ max(rect.y, y - radius));
+ int2 high = make_int2(min(rect.z, x + radius + 1),
+ min(rect.w, y + radius + 1));
+ ccl_global float ccl_restrict_ptr pixel_buffer;
+ int2 pixel;
+
+
+
+
+ /* === Shift feature passes to have mean 0. === */
+ float feature_means[DENOISE_FEATURES];
+ math_vector_zero(feature_means, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW {
+ filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
+ math_vector_add(feature_means, features, DENOISE_FEATURES);
+ } END_FOR_PIXEL_WINDOW
+
+ float pixel_scale = 1.0f / ((high.y - low.y) * (high.x - low.x));
+ math_vector_scale(feature_means, pixel_scale, DENOISE_FEATURES);
+
+ /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
+ float feature_scale[DENOISE_FEATURES];
+ math_vector_zero(feature_scale, DENOISE_FEATURES);
+
+ FOR_PIXEL_WINDOW {
+ filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_max(feature_scale, features, DENOISE_FEATURES);
+ } END_FOR_PIXEL_WINDOW
+
+ filter_calculate_scale(feature_scale);
+
+
+
+ /* === Generate the feature transformation. ===
+ * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space
+ * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
+ float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
+ math_matrix_zero(feature_matrix, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW {
+ filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_mul(features, feature_scale, DENOISE_FEATURES);
+ math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+ } END_FOR_PIXEL_WINDOW
+
+ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride);
+ *rank = 0;
+ if(pca_threshold < 0.0f) {
+ float threshold_energy = 0.0f;
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+ }
+ threshold_energy *= 1.0f - (-pca_threshold);
+
+ float reduced_energy = 0.0f;
+ for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
+ if(i >= 2 && reduced_energy >= threshold_energy)
+ break;
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ reduced_energy += s;
+ }
+ }
+ else {
+ for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ if(i >= 2 && sqrtf(s) < pca_threshold)
+ break;
+ }
+ }
+
+ math_matrix_transpose(transform, DENOISE_FEATURES, transform_stride);
+
+ /* Bake the feature scaling into the transformation matrix. */
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ for(int j = 0; j < (*rank); j++) {
+ transform[(i*DENOISE_FEATURES + j)*transform_stride] *= feature_scale[i];
+ }
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h
new file mode 100644
index 00000000000..ed3a92f6241
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_transform_sse.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
+ int x, int y, int4 rect,
+ int pass_stride,
+ float *transform, int *rank,
+ int radius, float pca_threshold)
+{
+ int buffer_w = align_up(rect.z - rect.x, 4);
+
+ __m128 features[DENOISE_FEATURES];
+ float ccl_restrict_ptr pixel_buffer;
+ int2 pixel;
+
+ int2 low = make_int2(max(rect.x, x - radius),
+ max(rect.y, y - radius));
+ int2 high = make_int2(min(rect.z, x + radius + 1),
+ min(rect.w, y + radius + 1));
+
+ __m128 feature_means[DENOISE_FEATURES];
+ math_vector_zero_sse(feature_means, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW_SSE {
+ filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride);
+ math_vector_add_sse(feature_means, DENOISE_FEATURES, features);
+ } END_FOR_PIXEL_WINDOW_SSE
+
+ __m128 pixel_scale = _mm_set1_ps(1.0f / ((high.y - low.y) * (high.x - low.x)));
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale);
+ }
+
+ __m128 feature_scale[DENOISE_FEATURES];
+ math_vector_zero_sse(feature_scale, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW_SSE {
+ filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_max_sse(feature_scale, features, DENOISE_FEATURES);
+ } END_FOR_PIXEL_WINDOW_SSE
+
+ filter_calculate_scale_sse(feature_scale);
+
+ __m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
+ math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW_SSE {
+ filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale);
+ math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f));
+ } END_FOR_PIXEL_WINDOW_SSE
+
+ float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
+ math_matrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse);
+
+ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
+
+ *rank = 0;
+ if(pca_threshold < 0.0f) {
+ float threshold_energy = 0.0f;
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+ }
+ threshold_energy *= 1.0f - (-pca_threshold);
+
+ float reduced_energy = 0.0f;
+ for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
+ if(i >= 2 && reduced_energy >= threshold_energy)
+ break;
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ reduced_energy += s;
+ }
+ }
+ else {
+ for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ if(i >= 2 && sqrtf(s) < pca_threshold)
+ break;
+ }
+ }
+
+ math_matrix_transpose(transform, DENOISE_FEATURES, 1);
+
+ /* Bake the feature scaling into the transformation matrix. */
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ math_vector_scale(transform + i*DENOISE_FEATURES, _mm_cvtss_f32(feature_scale[i]), *rank);
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h
index 47778553b94..105aee8da15 100644
--- a/intern/cycles/kernel/geom/geom_triangle.h
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -76,7 +76,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3
/* Interpolate smooth vertex normal from vertices */
-ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v)
+ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v)
{
/* load triangle vertices */
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
@@ -84,7 +84,9 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo
float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
- return normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
+ float3 N = safe_normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
+
+ return is_zero(N)? Ng: N;
}
/* Ray differentials on triangle */
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h
index 06c0fb2fbca..84a988f1dbc 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@@ -50,30 +50,20 @@ void kernel_tex_copy(KernelGlobals *kg,
#define KERNEL_ARCH cpu
#include "kernel/kernels/cpu/kernel_cpu.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-# define KERNEL_ARCH cpu_sse2
-# include "kernel/kernels/cpu/kernel_cpu.h"
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-# define KERNEL_ARCH cpu_sse3
-# include "kernel/kernels/cpu/kernel_cpu.h"
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-# define KERNEL_ARCH cpu_sse41
-# include "kernel/kernels/cpu/kernel_cpu.h"
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-# define KERNEL_ARCH cpu_avx
-# include "kernel/kernels/cpu/kernel_cpu.h"
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-# define KERNEL_ARCH cpu_avx2
-# include "kernel/kernels/cpu/kernel_cpu.h"
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/kernel_cpu.h"
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 823d30dde78..06728415c15 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -222,6 +222,12 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
L->path_total_shaded = make_float3(0.0f, 0.0f, 0.0f);
L->shadow_color = make_float3(0.0f, 0.0f, 0.0f);
#endif
+
+#ifdef __DENOISING_FEATURES__
+ L->denoising_normal = make_float3(0.0f, 0.0f, 0.0f);
+ L->denoising_albedo = make_float3(0.0f, 0.0f, 0.0f);
+ L->denoising_depth = 0.0f;
+#endif /* __DENOISING_FEATURES__ */
}
ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput,
@@ -277,15 +283,15 @@ ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 thro
}
ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
+ ccl_addr_space PathState *state,
float3 throughput,
float3 alpha,
float3 bsdf,
- float3 ao,
- int bounce)
+ float3 ao)
{
#ifdef __PASSES__
if(L->use_light_pass) {
- if(bounce == 0) {
+ if(state->bounce == 0) {
/* directly visible lighting */
L->direct_diffuse += throughput*bsdf*ao;
L->ao += alpha*throughput*ao;
@@ -302,31 +308,43 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
}
#ifdef __SHADOW_TRICKS__
- float3 light = throughput * bsdf;
- L->path_total += light;
- L->path_total_shaded += ao * light;
+ if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+ float3 light = throughput * bsdf;
+ L->path_total += light;
+ L->path_total_shaded += ao * light;
+ }
#endif
}
ccl_device_inline void path_radiance_accum_total_ao(
PathRadiance *L,
+ ccl_addr_space PathState *state,
float3 throughput,
float3 bsdf)
{
#ifdef __SHADOW_TRICKS__
- L->path_total += throughput * bsdf;
+ if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+ L->path_total += throughput * bsdf;
+ }
#else
(void) L;
+ (void) state;
(void) throughput;
(void) bsdf;
#endif
}
-ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 throughput, BsdfEval *bsdf_eval, float3 shadow, float shadow_fac, int bounce, bool is_lamp)
+ccl_device_inline void path_radiance_accum_light(PathRadiance *L,
+ ccl_addr_space PathState *state,
+ float3 throughput,
+ BsdfEval *bsdf_eval,
+ float3 shadow,
+ float shadow_fac,
+ bool is_lamp)
{
#ifdef __PASSES__
if(L->use_light_pass) {
- if(bounce == 0) {
+ if(state->bounce == 0) {
/* directly visible lighting */
L->direct_diffuse += throughput*bsdf_eval->diffuse*shadow;
L->direct_glossy += throughput*bsdf_eval->glossy*shadow;
@@ -352,21 +370,27 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through
}
#ifdef __SHADOW_TRICKS__
- float3 light = throughput * bsdf_eval->sum_no_mis;
- L->path_total += light;
- L->path_total_shaded += shadow * light;
+ if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+ float3 light = throughput * bsdf_eval->sum_no_mis;
+ L->path_total += light;
+ L->path_total_shaded += shadow * light;
+ }
#endif
}
ccl_device_inline void path_radiance_accum_total_light(
PathRadiance *L,
+ ccl_addr_space PathState *state,
float3 throughput,
const BsdfEval *bsdf_eval)
{
#ifdef __SHADOW_TRICKS__
- L->path_total += throughput * bsdf_eval->sum_no_mis;
+ if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+ L->path_total += throughput * bsdf_eval->sum_no_mis;
+ }
#else
(void) L;
+ (void) state;
(void) throughput;
(void) bsdf_eval;
#endif
@@ -393,11 +417,17 @@ ccl_device_inline void path_radiance_accum_background(PathRadiance *L,
}
#ifdef __SHADOW_TRICKS__
- L->path_total += throughput * value;
- if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) {
- L->path_total_shaded += throughput * value;
+ if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+ L->path_total += throughput * value;
+ if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) {
+ L->path_total_shaded += throughput * value;
+ }
}
#endif
+
+#ifdef __DENOISING_FEATURES__
+ L->denoising_albedo += state->denoising_feature_weight * value;
+#endif /* __DENOISING_FEATURES__ */
}
ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
@@ -555,6 +585,38 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
return L_sum;
}
+ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg, PathRadiance *L, float3 *noisy, float3 *clean)
+{
+#ifdef __PASSES__
+ kernel_assert(L->use_light_pass);
+
+ *clean = L->emission + L->background;
+ *noisy = L->direct_scatter + L->indirect_scatter;
+
+# define ADD_COMPONENT(flag, component) \
+ if(kernel_data.film.denoising_flags & flag) \
+ *clean += component; \
+ else \
+ *noisy += component;
+
+ ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_DIR, L->direct_diffuse);
+ ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_IND, L->indirect_diffuse);
+ ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_DIR, L->direct_glossy);
+ ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_IND, L->indirect_glossy);
+ ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_DIR, L->direct_transmission);
+ ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_IND, L->indirect_transmission);
+ ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_DIR, L->direct_subsurface);
+ ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_IND, L->indirect_subsurface);
+# undef ADD_COMPONENT
+#else
+ *noisy = L->emission;
+ *clean = make_float3(0.0f, 0.0f, 0.0f);
+#endif
+
+ *noisy = ensure_finite3(*noisy);
+ *clean = ensure_finite3(*clean);
+}
+
ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample, int num_samples)
{
float fac = 1.0f/num_samples;
@@ -595,12 +657,12 @@ ccl_device_inline float path_radiance_sum_shadow(const PathRadiance *L)
/* Calculate final light sum and transparency for shadow catcher object. */
ccl_device_inline float3 path_radiance_sum_shadowcatcher(KernelGlobals *kg,
const PathRadiance *L,
- ccl_addr_space float* L_transparent)
+ float* alpha)
{
const float shadow = path_radiance_sum_shadow(L);
float3 L_sum;
if(kernel_data.background.transparent) {
- *L_transparent = shadow;
+ *alpha = 1.0f-shadow;
L_sum = make_float3(0.0f, 0.0f, 0.0f);
}
else {
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index 21da180bb8e..7595e74e2d5 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -42,6 +42,8 @@
#include "util/util_types.h"
#include "util/util_texture.h"
+#define ccl_restrict_ptr const * __restrict
+
#define ccl_addr_space
#define ccl_local_id(d) 0
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index c375d17a95f..80d7401fbcf 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -55,6 +55,10 @@
#define ccl_restrict __restrict__
#define ccl_align(n) __align__(n)
+#define ccl_restrict_ptr const * __restrict__
+#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH)
+
+
/* No assert supported for CUDA */
#define kernel_assert(cond)
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index c2263ac0d49..15cf4b81b21 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -50,6 +50,8 @@
# define ccl_addr_space
#endif
+#define ccl_restrict_ptr const * __restrict__
+
#define ccl_local_id(d) get_local_id(d)
#define ccl_global_id(d) get_global_id(d)
diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h
index a2909cec1a1..9baa9d54957 100644
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -102,7 +102,7 @@ ccl_device_inline float area_light_sample(float3 P,
float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f);
cu = clamp(cu, -1.0f, 1.0f);
/* Compute xu. */
- float xu = -(cu * z0) / sqrtf(1.0f - cu * cu);
+ float xu = -(cu * z0) / max(sqrtf(1.0f - cu * cu), 1e-7f);
xu = clamp(xu, x0, x1);
/* Compute yv. */
float z0sq = z0 * z0;
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index ed523696571..8ab4c724829 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -60,6 +60,135 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sa
#endif /* __SPLIT_KERNEL__ */
}
+#ifdef __DENOISING_FEATURES__
+ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer, int sample, float value)
+{
+ kernel_write_pass_float(buffer, sample, value);
+
+ /* The online one-pass variance update that's used for the megakernel can't easily be implemented
+ * with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. */
+# ifdef __SPLIT_KERNEL__
+ kernel_write_pass_float(buffer+1, sample, value*value);
+# else
+ if(sample == 0) {
+ kernel_write_pass_float(buffer+1, sample, 0.0f);
+ }
+ else {
+ float new_mean = buffer[0] * (1.0f / (sample + 1));
+ float old_mean = (buffer[0] - value) * (1.0f / sample);
+ kernel_write_pass_float(buffer+1, sample, (value - new_mean) * (value - old_mean));
+ }
+# endif
+}
+
+# if defined(__SPLIT_KERNEL__)
+# define kernel_write_pass_float3_unaligned kernel_write_pass_float3
+# else
+ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, int sample, float3 value)
+{
+ buffer[0] = (sample == 0)? value.x: buffer[0] + value.x;
+ buffer[1] = (sample == 0)? value.y: buffer[1] + value.y;
+ buffer[2] = (sample == 0)? value.z: buffer[2] + value.z;
+}
+# endif
+
+ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, int sample, float3 value)
+{
+ kernel_write_pass_float3_unaligned(buffer, sample, value);
+# ifdef __SPLIT_KERNEL__
+ kernel_write_pass_float3_unaligned(buffer+3, sample, value*value);
+# else
+ if(sample == 0) {
+ kernel_write_pass_float3_unaligned(buffer+3, sample, make_float3(0.0f, 0.0f, 0.0f));
+ }
+ else {
+ float3 sum = make_float3(buffer[0], buffer[1], buffer[2]);
+ float3 new_mean = sum * (1.0f / (sample + 1));
+ float3 old_mean = (sum - value) * (1.0f / sample);
+ kernel_write_pass_float3_unaligned(buffer+3, sample, (value - new_mean) * (value - old_mean));
+ }
+# endif
+}
+
+ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_global float *buffer,
+ int sample, float path_total, float path_total_shaded)
+{
+ if(kernel_data.film.pass_denoising_data == 0)
+ return;
+
+ buffer += (sample & 1)? DENOISING_PASS_SHADOW_B : DENOISING_PASS_SHADOW_A;
+
+ path_total = ensure_finite(path_total);
+ path_total_shaded = ensure_finite(path_total_shaded);
+
+ kernel_write_pass_float(buffer, sample/2, path_total);
+ kernel_write_pass_float(buffer+1, sample/2, path_total_shaded);
+
+ float value = path_total_shaded / max(path_total, 1e-7f);
+# ifdef __SPLIT_KERNEL__
+ kernel_write_pass_float(buffer+2, sample/2, value*value);
+# else
+ if(sample < 2) {
+ kernel_write_pass_float(buffer+2, sample/2, 0.0f);
+ }
+ else {
+ float old_value = (buffer[1] - path_total_shaded) / max(buffer[0] - path_total, 1e-7f);
+ float new_value = buffer[1] / max(buffer[0], 1e-7f);
+ kernel_write_pass_float(buffer+2, sample, (value - new_value) * (value - old_value));
+ }
+# endif
+}
+#endif /* __DENOISING_FEATURES__ */
+
+ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
+ ShaderData *sd,
+ ccl_global PathState *state,
+ PathRadiance *L)
+{
+#ifdef __DENOISING_FEATURES__
+ if(state->denoising_feature_weight == 0.0f) {
+ return;
+ }
+
+ L->denoising_depth += ensure_finite(state->denoising_feature_weight * sd->ray_length);
+
+ float3 normal = make_float3(0.0f, 0.0f, 0.0f);
+ float3 albedo = make_float3(0.0f, 0.0f, 0.0f);
+ float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f;
+
+ for(int i = 0; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
+
+ if(!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
+ continue;
+
+ /* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. */
+ normal += sc->N * sc->sample_weight;
+ sum_weight += sc->sample_weight;
+ if(!bsdf_is_specular_like(sc)) {
+ albedo += sc->weight;
+ sum_nonspecular_weight += sc->sample_weight;
+ }
+ }
+
+ /* Wait for next bounce if 75% or more sample weight belongs to specular-like closures. */
+ if((sum_weight == 0.0f) || (sum_nonspecular_weight*4.0f > sum_weight)) {
+ if(sum_weight != 0.0f) {
+ normal /= sum_weight;
+ }
+ L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal);
+ L->denoising_albedo += ensure_finite3(state->denoising_feature_weight * albedo);
+
+ state->denoising_feature_weight = 0.0f;
+ }
+#else
+ (void) kg;
+ (void) sd;
+ (void) state;
+ (void) L;
+#endif /* __DENOISING_FEATURES__ */
+}
+
ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L,
ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput)
{
@@ -199,5 +328,79 @@ ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global f
#endif
}
+ccl_device_inline void kernel_write_result(KernelGlobals *kg, ccl_global float *buffer,
+ int sample, PathRadiance *L, float alpha, bool is_shadow_catcher)
+{
+ if(L) {
+ float3 L_sum;
+#ifdef __SHADOW_TRICKS__
+ if(is_shadow_catcher) {
+ L_sum = path_radiance_sum_shadowcatcher(kg, L, &alpha);
+ }
+ else
+#endif /* __SHADOW_TRICKS__ */
+ {
+ L_sum = path_radiance_clamp_and_sum(kg, L);
+ }
+
+ kernel_write_pass_float4(buffer, sample, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha));
+
+ kernel_write_light_passes(kg, buffer, L, sample);
+
+#ifdef __DENOISING_FEATURES__
+ if(kernel_data.film.pass_denoising_data) {
+# ifdef __SHADOW_TRICKS__
+ kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, average(L->path_total), average(L->path_total_shaded));
+# else
+ kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f);
+# endif
+ if(kernel_data.film.pass_denoising_clean) {
+ float3 noisy, clean;
+ path_radiance_split_denoising(kg, L, &noisy, &clean);
+ kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
+ sample, noisy);
+ kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean,
+ sample, clean);
+ }
+ else {
+ kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
+ sample, L_sum);
+ }
+
+ kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL,
+ sample, L->denoising_normal);
+ kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO,
+ sample, L->denoising_albedo);
+ kernel_write_pass_float_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH,
+ sample, L->denoising_depth);
+ }
+#endif /* __DENOISING_FEATURES__ */
+ }
+ else {
+ kernel_write_pass_float4(buffer, sample, make_float4(0.0f, 0.0f, 0.0f, 0.0f));
+
+#ifdef __DENOISING_FEATURES__
+ if(kernel_data.film.pass_denoising_data) {
+ kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f);
+
+ kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
+ sample, make_float3(0.0f, 0.0f, 0.0f));
+
+ kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL,
+ sample, make_float3(0.0f, 0.0f, 0.0f));
+ kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO,
+ sample, make_float3(0.0f, 0.0f, 0.0f));
+ kernel_write_pass_float_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH,
+ sample, 0.0f);
+
+ if(kernel_data.film.pass_denoising_clean) {
+ kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean,
+ sample, make_float3(0.0f, 0.0f, 0.0f));
+ }
+ }
+#endif /* __DENOISING_FEATURES__ */
+ }
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 58da141aed3..0d31ae32aa6 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -90,10 +90,10 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
light_ray.dD = differential3_zero();
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
- path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
+ path_radiance_accum_ao(L, state, throughput, ao_alpha, ao_bsdf, ao_shadow);
}
else {
- path_radiance_accum_total_ao(L, throughput, ao_bsdf);
+ path_radiance_accum_total_ao(L, state, throughput, ao_bsdf);
}
}
}
@@ -366,6 +366,8 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
throughput /= probability;
}
+ kernel_update_denoising_features(kg, sd, state, L);
+
#ifdef __AO__
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
@@ -427,18 +429,19 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
}
-ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
- RNG *rng,
- int sample,
- Ray ray,
- ccl_global float *buffer)
+ccl_device_inline float kernel_path_integrate(KernelGlobals *kg,
+ RNG *rng,
+ int sample,
+ Ray ray,
+ ccl_global float *buffer,
+ PathRadiance *L,
+ bool *is_shadow_catcher)
{
/* initialize */
- PathRadiance L;
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
float L_transparent = 0.0f;
- path_radiance_init(&L, kernel_data.film.use_light_pass);
+ path_radiance_init(L, kernel_data.film.use_light_pass);
/* shader data memory used for both volumes and surfaces, saves stack space */
ShaderData sd;
@@ -517,7 +520,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
float3 emission;
if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission))
- path_radiance_accum_emission(&L, throughput, emission, state.bounce);
+ path_radiance_accum_emission(L, throughput, emission, state.bounce);
}
#endif /* __LAMP_MIS__ */
@@ -549,7 +552,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
/* emission */
if(volume_segment.closure_flag & SD_EMISSION)
- path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
+ path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
/* scattering */
VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
@@ -559,7 +562,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
/* direct light sampling */
kernel_branched_path_volume_connect_light(kg, rng, &sd,
- &emission_sd, throughput, &state, &L, all,
+ &emission_sd, throughput, &state, L, all,
&volume_ray, &volume_segment);
/* indirect sample. if we use distance sampling and take just
@@ -577,7 +580,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
kernel_volume_decoupled_free(kg, &volume_segment);
if(result == VOLUME_PATH_SCATTERED) {
- if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
+ if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
continue;
else
break;
@@ -591,15 +594,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
{
/* integrate along volume segment with distance sampling */
VolumeIntegrateResult result = kernel_volume_integrate(
- kg, &state, &sd, &volume_ray, &L, &throughput, rng, heterogeneous);
+ kg, &state, &sd, &volume_ray, L, &throughput, rng, heterogeneous);
# ifdef __VOLUME_SCATTER__
if(result == VOLUME_PATH_SCATTERED) {
/* direct lighting */
- kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L);
+ kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L);
/* indirect light bounce */
- if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
+ if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
continue;
else
break;
@@ -623,7 +626,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#ifdef __BACKGROUND__
/* sample background shader */
float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
- path_radiance_accum_background(&L, &state, throughput, L_background);
+ path_radiance_accum_background(L, &state, throughput, L_background);
#endif /* __BACKGROUND__ */
break;
@@ -640,10 +643,10 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#ifdef __SHADOW_TRICKS__
if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
if(state.flag & PATH_RAY_CAMERA) {
- state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
+ state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
state.catcher_object = sd.object;
if(!kernel_data.background.transparent) {
- L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
+ L->shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
}
}
}
@@ -677,7 +680,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#endif /* __HOLDOUT__ */
/* holdout mask objects do not write data passes */
- kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
+ kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput);
/* blurring of bsdf after bounces, for rays that have a small likelihood
* of following this particular path (diffuse, rough glossy) */
@@ -695,7 +698,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
if(sd.flag & SD_EMISSION) {
/* todo: is isect.t wrong here for transparent surfaces? */
float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
- path_radiance_accum_emission(&L, throughput, emission, state.bounce);
+ path_radiance_accum_emission(L, throughput, emission, state.bounce);
}
#endif /* __EMISSION__ */
@@ -715,10 +718,12 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
throughput /= probability;
}
+ kernel_update_denoising_features(kg, &sd, &state, L);
+
#ifdef __AO__
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
- kernel_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd));
+ kernel_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd));
}
#endif /* __AO__ */
@@ -729,7 +734,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
if(kernel_path_subsurface_scatter(kg,
&sd,
&emission_sd,
- &L,
+ L,
&state,
rng,
&ray,
@@ -742,15 +747,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#endif /* __SUBSURFACE__ */
/* direct lighting */
- kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L);
+ kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L);
/* compute direct lighting and next bounce */
- if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
+ if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
break;
}
#ifdef __SUBSURFACE__
- kernel_path_subsurface_accum_indirect(&ss_indirect, &L);
+ kernel_path_subsurface_accum_indirect(&ss_indirect, L);
/* Trace indirect subsurface rays by restarting the loop. this uses less
* stack memory than invoking kernel_path_indirect.
@@ -760,7 +765,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
&ss_indirect,
&state,
&ray,
- &L,
+ L,
&throughput);
}
else {
@@ -769,24 +774,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
}
#endif /* __SUBSURFACE__ */
- float3 L_sum;
#ifdef __SHADOW_TRICKS__
- if(state.flag & PATH_RAY_SHADOW_CATCHER) {
- L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent);
- }
- else
+ *is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER);
#endif /* __SHADOW_TRICKS__ */
- {
- L_sum = path_radiance_clamp_and_sum(kg, &L);
- }
-
- kernel_write_light_passes(kg, buffer, &L, sample);
#ifdef __KERNEL_DEBUG__
kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
#endif /* __KERNEL_DEBUG__ */
- return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
+ return 1.0f - L_transparent;
}
ccl_device void kernel_path_trace(KernelGlobals *kg,
@@ -807,15 +803,16 @@ ccl_device void kernel_path_trace(KernelGlobals *kg,
kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
/* integrate */
- float4 L;
-
- if(ray.t != 0.0f)
- L = kernel_path_integrate(kg, &rng, sample, ray, buffer);
- else
- L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ PathRadiance L;
+ bool is_shadow_catcher;
- /* accumulate result in output buffer */
- kernel_write_pass_float4(buffer, sample, L);
+ if(ray.t != 0.0f) {
+ float alpha = kernel_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher);
+ kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher);
+ }
+ else {
+ kernel_write_result(kg, buffer, sample, NULL, 0.0f, false);
+ }
path_rng_end(kg, rng_state, rng);
}
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
index ddcb57161ea..10816d3e5d1 100644
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -56,10 +56,10 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
light_ray.dD = differential3_zero();
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
- path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
+ path_radiance_accum_ao(L, state, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow);
}
else {
- path_radiance_accum_total_ao(L, throughput*num_samples_inv, ao_bsdf);
+ path_radiance_accum_total_ao(L, state, throughput*num_samples_inv, ao_bsdf);
}
}
}
@@ -72,14 +72,32 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd,
float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L)
{
+ float sum_sample_weight = 0.0f;
+#ifdef __DENOISING_FEATURES__
+ if(state->denoising_feature_weight > 0.0f) {
+ for(int i = 0; i < sd->num_closure; i++) {
+ const ShaderClosure *sc = &sd->closure[i];
+
+ /* transparency is not handled here, but in outer loop */
+ if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
+ continue;
+ }
+
+ sum_sample_weight += sc->sample_weight;
+ }
+ }
+ else {
+ sum_sample_weight = 1.0f;
+ }
+#endif /* __DENOISING_FEATURES__ */
+
for(int i = 0; i < sd->num_closure; i++) {
const ShaderClosure *sc = &sd->closure[i];
- if(!CLOSURE_IS_BSDF(sc->type))
- continue;
/* transparency is not handled here, but in outer loop */
- if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
+ if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
continue;
+ }
int num_samples;
@@ -111,7 +129,8 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
&tp,
&ps,
L,
- &bsdf_ray))
+ &bsdf_ray,
+ sum_sample_weight))
{
continue;
}
@@ -243,14 +262,19 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
}
#endif /* __SUBSURFACE__ */
-ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer)
+ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
+ RNG *rng,
+ int sample,
+ Ray ray,
+ ccl_global float *buffer,
+ PathRadiance *L,
+ bool *is_shadow_catcher)
{
/* initialize */
- PathRadiance L;
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
float L_transparent = 0.0f;
- path_radiance_init(&L, kernel_data.film.use_light_pass);
+ path_radiance_init(L, kernel_data.film.use_light_pass);
/* shader data memory used for both volumes and surfaces, saves stack space */
ShaderData sd;
@@ -330,7 +354,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
int all = kernel_data.integrator.sample_all_lights_direct;
kernel_branched_path_volume_connect_light(kg, rng, &sd,
- &emission_sd, throughput, &state, &L, all,
+ &emission_sd, throughput, &state, L, all,
&volume_ray, &volume_segment);
/* indirect light sampling */
@@ -362,7 +386,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
&sd,
&tp,
&ps,
- &L,
+ L,
&pray))
{
kernel_path_indirect(kg,
@@ -373,19 +397,19 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
tp*num_samples_inv,
num_samples,
&ps,
- &L);
+ L);
/* for render passes, sum and reset indirect light pass variables
* for the next samples */
- path_radiance_sum_indirect(&L);
- path_radiance_reset_indirect(&L);
+ path_radiance_sum_indirect(L);
+ path_radiance_reset_indirect(L);
}
}
}
/* emission and transmittance */
if(volume_segment.closure_flag & SD_EMISSION)
- path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
+ path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
throughput *= volume_segment.accum_transmittance;
/* free cached steps */
@@ -407,20 +431,20 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
path_state_branch(&ps, j, num_samples);
VolumeIntegrateResult result = kernel_volume_integrate(
- kg, &ps, &sd, &volume_ray, &L, &tp, rng, heterogeneous);
+ kg, &ps, &sd, &volume_ray, L, &tp, rng, heterogeneous);
#ifdef __VOLUME_SCATTER__
if(result == VOLUME_PATH_SCATTERED) {
/* todo: support equiangular, MIS and all light sampling.
* alternatively get decoupled ray marching working on the GPU */
- kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, &L);
+ kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, L);
if(kernel_path_volume_bounce(kg,
rng,
&sd,
&tp,
&ps,
- &L,
+ L,
&pray))
{
kernel_path_indirect(kg,
@@ -431,12 +455,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
tp,
num_samples,
&ps,
- &L);
+ L);
/* for render passes, sum and reset indirect light pass variables
* for the next samples */
- path_radiance_sum_indirect(&L);
- path_radiance_reset_indirect(&L);
+ path_radiance_sum_indirect(L);
+ path_radiance_reset_indirect(L);
}
}
#endif /* __VOLUME_SCATTER__ */
@@ -462,7 +486,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#ifdef __BACKGROUND__
/* sample background shader */
float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
- path_radiance_accum_background(&L, &state, throughput, L_background);
+ path_radiance_accum_background(L, &state, throughput, L_background);
#endif /* __BACKGROUND__ */
break;
@@ -476,10 +500,10 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#ifdef __SHADOW_TRICKS__
if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
if(state.flag & PATH_RAY_CAMERA) {
- state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
+ state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
state.catcher_object = sd.object;
if(!kernel_data.background.transparent) {
- L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
+ L->shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
}
}
}
@@ -509,13 +533,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#endif /* __HOLDOUT__ */
/* holdout mask objects do not write data passes */
- kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
+ kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput);
#ifdef __EMISSION__
/* emission */
if(sd.flag & SD_EMISSION) {
float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
- path_radiance_accum_emission(&L, throughput, emission, state.bounce);
+ path_radiance_accum_emission(L, throughput, emission, state.bounce);
}
#endif /* __EMISSION__ */
@@ -539,10 +563,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
}
}
+ kernel_update_denoising_features(kg, &sd, &state, L);
+
#ifdef __AO__
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
- kernel_branched_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput);
+ kernel_branched_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput);
}
#endif /* __AO__ */
@@ -550,7 +576,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
/* bssrdf scatter to a different location on the same object */
if(sd.flag & SD_BSSRDF) {
kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd,
- &L, &state, rng, &ray, throughput);
+ L, &state, rng, &ray, throughput);
}
#endif /* __SUBSURFACE__ */
@@ -563,13 +589,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
int all = (kernel_data.integrator.sample_all_lights_direct) ||
(state.flag & PATH_RAY_SHADOW_CATCHER);
kernel_branched_path_surface_connect_light(kg, rng,
- &sd, &emission_sd, &hit_state, throughput, 1.0f, &L, all);
+ &sd, &emission_sd, &hit_state, throughput, 1.0f, L, all);
}
#endif /* __EMISSION__ */
/* indirect light */
kernel_branched_path_surface_indirect_light(kg, rng,
- &sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, &L);
+ &sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, L);
/* continue in case of transparency */
throughput *= shader_bsdf_transparency(kg, &sd);
@@ -598,24 +624,15 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#endif /* __VOLUME__ */
}
- float3 L_sum;
#ifdef __SHADOW_TRICKS__
- if(state.flag & PATH_RAY_SHADOW_CATCHER) {
- L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent);
- }
- else
+ *is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER);
#endif /* __SHADOW_TRICKS__ */
- {
- L_sum = path_radiance_clamp_and_sum(kg, &L);
- }
-
- kernel_write_light_passes(kg, buffer, &L, sample);
#ifdef __KERNEL_DEBUG__
kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
#endif /* __KERNEL_DEBUG__ */
- return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
+ return 1.0f - L_transparent;
}
ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
@@ -636,15 +653,16 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
/* integrate */
- float4 L;
-
- if(ray.t != 0.0f)
- L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer);
- else
- L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ PathRadiance L;
+ bool is_shadow_catcher;
- /* accumulate result in output buffer */
- kernel_write_pass_float4(buffer, sample, L);
+ if(ray.t != 0.0f) {
+ float alpha = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher);
+ kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher);
+ }
+ else {
+ kernel_write_result(kg, buffer, sample, NULL, 0.0f, false);
+ }
path_rng_end(kg, rng_state, rng);
}
@@ -654,4 +672,3 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
#endif /* __BRANCHED_PATH__ */
CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index c0cd2a63120..0fa77d9e8bd 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -35,6 +35,16 @@ ccl_device_inline void path_state_init(KernelGlobals *kg,
state->transmission_bounce = 0;
state->transparent_bounce = 0;
+#ifdef __DENOISING_FEATURES__
+ if(kernel_data.film.pass_denoising_data) {
+ state->flag |= PATH_RAY_STORE_SHADOW_INFO;
+ state->denoising_feature_weight = 1.0f;
+ }
+ else {
+ state->denoising_feature_weight = 0.0f;
+ }
+#endif /* __DENOISING_FEATURES__ */
+
state->min_ray_pdf = FLT_MAX;
state->ray_pdf = 0.0f;
#ifdef __LAMP_MIS__
@@ -128,6 +138,10 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta
/* random number generator next bounce */
state->rng_offset += PRNG_BOUNCE_NUM;
+
+ if((state->denoising_feature_weight == 0.0f) && !(state->flag & PATH_RAY_SHADOW_CATCHER)) {
+ state->flag &= ~PATH_RAY_STORE_SHADOW_INFO;
+ }
}
ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *state)
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
index bd4ba775b4d..e676ea0f3ae 100644
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -70,10 +70,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
else {
- path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light);
+ path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light);
}
}
}
@@ -107,10 +107,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
else {
- path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light);
+ path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light);
}
}
}
@@ -133,10 +133,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, is_lamp);
}
else {
- path_radiance_accum_total_light(L, throughput*num_samples_adjust, &L_light);
+ path_radiance_accum_total_light(L, state, throughput*num_samples_adjust, &L_light);
}
}
}
@@ -155,7 +155,8 @@ ccl_device bool kernel_branched_path_surface_bounce(
ccl_addr_space float3 *throughput,
ccl_addr_space PathState *state,
PathRadiance *L,
- ccl_addr_space Ray *ray)
+ ccl_addr_space Ray *ray,
+ float sum_sample_weight)
{
/* sample BSDF */
float bsdf_pdf;
@@ -175,6 +176,10 @@ ccl_device bool kernel_branched_path_surface_bounce(
/* modify throughput */
path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
+#ifdef __DENOISING_FEATURES__
+ state->denoising_feature_weight *= sc->sample_weight / (sum_sample_weight * num_samples);
+#endif
+
/* modify path state */
path_state_next(kg, state, label);
@@ -257,10 +262,10 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
}
else {
- path_radiance_accum_total_light(L, throughput, &L_light);
+ path_radiance_accum_total_light(L, state, throughput, &L_light);
}
}
}
diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h
index 371f2c1c7cb..dcedf51e479 100644
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ b/intern/cycles/kernel/kernel_path_volume.h
@@ -55,7 +55,7 @@ ccl_device_inline void kernel_path_volume_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
}
}
}
@@ -184,7 +184,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
}
}
@@ -233,7 +233,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
}
}
@@ -271,7 +271,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
- path_radiance_accum_light(L, tp, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, tp, &L_light, shadow, 1.0f, is_lamp);
}
}
}
diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h
index 9a2b0884a7e..cbb2442d1dc 100644
--- a/intern/cycles/kernel/kernel_projection.h
+++ b/intern/cycles/kernel/kernel_projection.h
@@ -57,6 +57,9 @@ ccl_device float3 spherical_to_direction(float theta, float phi)
ccl_device float2 direction_to_equirectangular_range(float3 dir, float4 range)
{
+ if(is_zero(dir))
+ return make_float2(0.0f, 0.0f);
+
float u = (atan2f(dir.y, dir.x) - range.y) / range.x;
float v = (acosf(dir.z / len(dir)) - range.w) / range.z;
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index 8c0c5e90a3e..c66f52255f0 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -99,7 +99,7 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
/* smooth normal */
if(sd->shader & SHADER_SMOOTH_NORMAL)
- sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
+ sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
#ifdef __DPDU__
/* dPdu/dPdv */
@@ -186,7 +186,7 @@ void shader_setup_from_subsurface(
sd->N = Ng;
if(sd->shader & SHADER_SMOOTH_NORMAL)
- sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
+ sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
# ifdef __DPDU__
/* dPdu/dPdv */
@@ -300,7 +300,7 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
if(sd->type & PRIMITIVE_TRIANGLE) {
/* smooth normal */
if(sd->shader & SHADER_SMOOTH_NORMAL) {
- sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
+ sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
#ifdef __INSTANCING__
if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 9b354457b91..dd1fa1b82f7 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -173,6 +173,8 @@ CCL_NAMESPACE_BEGIN
#define __PATCH_EVAL__
#define __SHADOW_TRICKS__
+#define __DENOISING_FEATURES__
+
#ifdef __KERNEL_SHADING__
# define __SVM__
# define __EMISSION__
@@ -314,31 +316,32 @@ enum SamplingPattern {
/* these flags values correspond to raytypes in osl.cpp, so keep them in sync! */
enum PathRayFlag {
- PATH_RAY_CAMERA = 1,
- PATH_RAY_REFLECT = 2,
- PATH_RAY_TRANSMIT = 4,
- PATH_RAY_DIFFUSE = 8,
- PATH_RAY_GLOSSY = 16,
- PATH_RAY_SINGULAR = 32,
- PATH_RAY_TRANSPARENT = 64,
-
- PATH_RAY_SHADOW_OPAQUE = 128,
- PATH_RAY_SHADOW_TRANSPARENT = 256,
+ PATH_RAY_CAMERA = (1 << 0),
+ PATH_RAY_REFLECT = (1 << 1),
+ PATH_RAY_TRANSMIT = (1 << 2),
+ PATH_RAY_DIFFUSE = (1 << 3),
+ PATH_RAY_GLOSSY = (1 << 4),
+ PATH_RAY_SINGULAR = (1 << 5),
+ PATH_RAY_TRANSPARENT = (1 << 6),
+
+ PATH_RAY_SHADOW_OPAQUE = (1 << 7),
+ PATH_RAY_SHADOW_TRANSPARENT = (1 << 8),
PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT),
- PATH_RAY_CURVE = 512, /* visibility flag to define curve segments */
- PATH_RAY_VOLUME_SCATTER = 1024, /* volume scattering */
+ PATH_RAY_CURVE = (1 << 9), /* visibility flag to define curve segments */
+ PATH_RAY_VOLUME_SCATTER = (1 << 10), /* volume scattering */
/* Special flag to tag unaligned BVH nodes. */
- PATH_RAY_NODE_UNALIGNED = 2048,
+ PATH_RAY_NODE_UNALIGNED = (1 << 11),
- PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024|2048),
+ PATH_RAY_ALL_VISIBILITY = ((1 << 12)-1),
- PATH_RAY_MIS_SKIP = 4096,
- PATH_RAY_DIFFUSE_ANCESTOR = 8192,
- PATH_RAY_SINGLE_PASS_DONE = 16384,
- PATH_RAY_SHADOW_CATCHER = 32768,
- PATH_RAY_SHADOW_CATCHER_ONLY = 65536,
+ PATH_RAY_MIS_SKIP = (1 << 12),
+ PATH_RAY_DIFFUSE_ANCESTOR = (1 << 13),
+ PATH_RAY_SINGLE_PASS_DONE = (1 << 14),
+ PATH_RAY_SHADOW_CATCHER = (1 << 15),
+ PATH_RAY_SHADOW_CATCHER_ONLY = (1 << 16),
+ PATH_RAY_STORE_SHADOW_INFO = (1 << 17),
};
/* Closure Label */
@@ -394,6 +397,22 @@ typedef enum PassType {
#define PASS_ALL (~0)
+typedef enum DenoisingPassOffsets {
+ DENOISING_PASS_NORMAL = 0,
+ DENOISING_PASS_NORMAL_VAR = 3,
+ DENOISING_PASS_ALBEDO = 6,
+ DENOISING_PASS_ALBEDO_VAR = 9,
+ DENOISING_PASS_DEPTH = 12,
+ DENOISING_PASS_DEPTH_VAR = 13,
+ DENOISING_PASS_SHADOW_A = 14,
+ DENOISING_PASS_SHADOW_B = 17,
+ DENOISING_PASS_COLOR = 20,
+ DENOISING_PASS_COLOR_VAR = 23,
+
+ DENOISING_PASS_SIZE_BASE = 26,
+ DENOISING_PASS_SIZE_CLEAN = 3,
+} DenoisingPassOffsets;
+
typedef enum BakePassFilter {
BAKE_FILTER_NONE = 0,
BAKE_FILTER_DIRECT = (1 << 0),
@@ -427,6 +446,18 @@ typedef enum BakePassFilterCombos {
BAKE_FILTER_SUBSURFACE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_SUBSURFACE),
} BakePassFilterCombos;
+typedef enum DenoiseFlag {
+ DENOISING_CLEAN_DIFFUSE_DIR = (1 << 0),
+ DENOISING_CLEAN_DIFFUSE_IND = (1 << 1),
+ DENOISING_CLEAN_GLOSSY_DIR = (1 << 2),
+ DENOISING_CLEAN_GLOSSY_IND = (1 << 3),
+ DENOISING_CLEAN_TRANSMISSION_DIR = (1 << 4),
+ DENOISING_CLEAN_TRANSMISSION_IND = (1 << 5),
+ DENOISING_CLEAN_SUBSURFACE_DIR = (1 << 6),
+ DENOISING_CLEAN_SUBSURFACE_IND = (1 << 7),
+ DENOISING_CLEAN_ALL_PASSES = (1 << 8)-1,
+} DenoiseFlag;
+
typedef ccl_addr_space struct PathRadiance {
#ifdef __PASSES__
int use_light_pass;
@@ -482,6 +513,12 @@ typedef ccl_addr_space struct PathRadiance {
/* Color of the background on which shadow is alpha-overed. */
float3 shadow_color;
#endif
+
+#ifdef __DENOISING_FEATURES__
+ float3 denoising_normal;
+ float3 denoising_albedo;
+ float denoising_depth;
+#endif /* __DENOISING_FEATURES__ */
} PathRadiance;
typedef struct BsdfEval {
@@ -724,12 +761,13 @@ typedef struct AttributeDescriptor {
#define SHADER_CLOSURE_BASE \
float3 weight; \
ClosureType type; \
- float sample_weight \
+ float sample_weight; \
+ float3 N
typedef ccl_addr_space struct ccl_align(16) ShaderClosure {
SHADER_CLOSURE_BASE;
- float data[14]; /* pad to 80 bytes */
+ float data[10]; /* pad to 80 bytes */
} ShaderClosure;
/* Shader Context
@@ -960,6 +998,10 @@ typedef struct PathState {
int transmission_bounce;
int transparent_bounce;
+#ifdef __DENOISING_FEATURES__
+ float denoising_feature_weight;
+#endif /* __DENOISING_FEATURES__ */
+
/* multiple importance sampling */
float min_ray_pdf; /* smallest bounce pdf over entire path up to now */
float ray_pdf; /* last bounce pdf */
@@ -1137,6 +1179,11 @@ typedef struct KernelFilm {
float mist_inv_depth;
float mist_falloff;
+ int pass_denoising_data;
+ int pass_denoising_clean;
+ int denoising_flags;
+ int pad;
+
#ifdef __KERNEL_DEBUG__
int pass_bvh_traversed_nodes;
int pass_bvh_traversed_instances;
diff --git a/intern/cycles/kernel/kernels/cpu/filter.cpp b/intern/cycles/kernel/kernels/cpu/filter.cpp
new file mode 100644
index 00000000000..2ff1a392dc3
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CPU kernel entry points */
+
+/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
+ * one with SSE2 intrinsics.
+ */
+#if defined(__x86_64__) || defined(_M_X64)
+# define __KERNEL_SSE2__
+#endif
+
+/* When building kernel for native machine detect kernel features from the flags
+ * set by compiler.
+ */
+#ifdef WITH_KERNEL_NATIVE
+# ifdef __SSE2__
+# ifndef __KERNEL_SSE2__
+# define __KERNEL_SSE2__
+# endif
+# endif
+# ifdef __SSE3__
+# define __KERNEL_SSE3__
+# endif
+# ifdef __SSSE3__
+# define __KERNEL_SSSE3__
+# endif
+# ifdef __SSE4_1__
+# define __KERNEL_SSE41__
+# endif
+# ifdef __AVX__
+# define __KERNEL_SSE__
+# define __KERNEL_AVX__
+# endif
+# ifdef __AVX2__
+# define __KERNEL_SSE__
+# define __KERNEL_AVX2__
+# endif
+#endif
+
+/* quiet unused define warnings */
+#if defined(__KERNEL_SSE2__)
+ /* do nothing */
+#endif
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
new file mode 100644
index 00000000000..4a9e6047ecf
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
new file mode 100644
index 00000000000..c22ec576254
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# define __KERNEL_AVX2__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
new file mode 100644
index 00000000000..10007ee2635
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common declaration part of all CPU kernels. */
+
+void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
+ TilesInfo *tiles,
+ int x,
+ int y,
+ float *unfilteredA,
+ float *unfilteredB,
+ float *sampleV,
+ float *sampleVV,
+ float *bufferV,
+ int* prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
+ TilesInfo *tiles,
+ int m_offset,
+ int v_offset,
+ int x,
+ int y,
+ float *mean,
+ float *variance,
+ int* prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
+ float *mean,
+ float *variance,
+ float *a,
+ float *b,
+ int* prefilter_rect,
+ int r);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
+ int x,
+ int y,
+ int storage_ofs,
+ float *transform,
+ int *rank,
+ int* rect,
+ int pass_stride,
+ int radius,
+ float pca_threshold);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
+ int dy,
+ float *weightImage,
+ float *variance,
+ float *differenceImage,
+ int* rect,
+ int w,
+ int channel_offset,
+ float a,
+ float k_2);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage,
+ float *outImage,
+ int* rect,
+ int w,
+ int f);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage,
+ float *outImage,
+ int* rect,
+ int w,
+ int f);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
+ int dy,
+ float *differenceImage,
+ float *image,
+ float *outImage,
+ float *accumImage,
+ int* rect,
+ int w,
+ int f);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
+ int dy,
+ float *differenceImage,
+ float *buffer,
+ float *color_pass,
+ float *variance_pass,
+ float *transform,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int *rect,
+ int *filter_rect,
+ int w,
+ int h,
+ int f,
+ int pass_stride);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *outImage,
+ float *accumImage,
+ int* rect,
+ int w);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
+ int y,
+ int storage_ofs,
+ int w,
+ int h,
+ float *buffer,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int *buffer_params,
+ int sample);
+
+#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
new file mode 100644
index 00000000000..3b71e50ca3b
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
@@ -0,0 +1,259 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common implementation part of all CPU kernels.
+ *
+ * The idea is that particular .cpp files sets needed optimization flags and
+ * simply includes this file without worry of copying actual implementation over.
+ */
+
+#include "kernel/kernel_compat_cpu.h"
+
+#include "kernel/filter/filter_kernel.h"
+
+#ifdef KERNEL_STUB
+# include "util/util_debug.h"
+# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+
+/* Denoise filter */
+
+void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
+ TilesInfo *tiles,
+ int x,
+ int y,
+ float *unfilteredA,
+ float *unfilteredB,
+ float *sampleVariance,
+ float *sampleVarianceV,
+ float *bufferVariance,
+ int* prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow);
+#else
+ kernel_filter_divide_shadow(sample, tiles,
+ x, y,
+ unfilteredA,
+ unfilteredB,
+ sampleVariance,
+ sampleVarianceV,
+ bufferVariance,
+ load_int4(prefilter_rect),
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
+ TilesInfo *tiles,
+ int m_offset,
+ int v_offset,
+ int x,
+ int y,
+ float *mean, float *variance,
+ int* prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_get_feature);
+#else
+ kernel_filter_get_feature(sample, tiles,
+ m_offset, v_offset,
+ x, y,
+ mean, variance,
+ load_int4(prefilter_rect),
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
+ float *mean,
+ float *variance,
+ float *a,
+ float *b,
+ int* prefilter_rect,
+ int r)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_combine_halves);
+#else
+ kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
+ int x,
+ int y,
+ int storage_ofs,
+ float *transform,
+ int *rank,
+ int* prefilter_rect,
+ int pass_stride,
+ int radius,
+ float pca_threshold)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_construct_transform);
+#else
+ rank += storage_ofs;
+ transform += storage_ofs*TRANSFORM_SIZE;
+ kernel_filter_construct_transform(buffer,
+ x, y,
+ load_int4(prefilter_rect),
+ pass_stride,
+ transform,
+ rank,
+ radius,
+ pca_threshold);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
+ int dy,
+ float *weightImage,
+ float *variance,
+ float *differenceImage,
+ int *rect,
+ int w,
+ int channel_offset,
+ float a,
+ float k_2)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference);
+#else
+ kernel_filter_nlm_calc_difference(dx, dy, weightImage, variance, differenceImage, load_int4(rect), w, channel_offset, a, k_2);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage,
+ float *outImage,
+ int *rect,
+ int w,
+ int f)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur);
+#else
+ kernel_filter_nlm_blur(differenceImage, outImage, load_int4(rect), w, f);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage,
+ float *outImage,
+ int *rect,
+ int w,
+ int f)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight);
+#else
+ kernel_filter_nlm_calc_weight(differenceImage, outImage, load_int4(rect), w, f);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
+ int dy,
+ float *differenceImage,
+ float *image,
+ float *outImage,
+ float *accumImage,
+ int *rect,
+ int w,
+ int f)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
+#else
+ kernel_filter_nlm_update_output(dx, dy, differenceImage, image, outImage, accumImage, load_int4(rect), w, f);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
+ int dy,
+ float *differenceImage,
+ float *buffer,
+ float *color_pass,
+ float *variance_pass,
+ float *transform,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int *rect,
+ int *filter_rect,
+ int w,
+ int h,
+ int f,
+ int pass_stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
+#else
+ kernel_filter_nlm_construct_gramian(dx, dy, differenceImage, buffer, color_pass, variance_pass, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *outImage,
+ float *accumImage,
+ int *rect,
+ int w)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize);
+#else
+ kernel_filter_nlm_normalize(outImage, accumImage, load_int4(rect), w);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
+ int y,
+ int storage_ofs,
+ int w,
+ int h,
+ float *buffer,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int *buffer_params,
+ int sample)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_finalize);
+#else
+ XtWX += storage_ofs*XTWX_SIZE;
+ XtWY += storage_ofs*XTWY_SIZE;
+ rank += storage_ofs;
+ kernel_filter_finalize(x, y, w, h, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample);
+#endif
+}
+
+#undef KERNEL_STUB
+#undef STUB_ASSERT
+#undef KERNEL_ARCH
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
new file mode 100644
index 00000000000..f7c9935f1d0
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
new file mode 100644
index 00000000000..070b95a3505
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
new file mode 100644
index 00000000000..1a7b2040da1
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
index 2600d977972..a645fb4d8dd 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
@@ -17,21 +17,23 @@
/* Optimized CPU kernel entry points. This file is compiled with AVX
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-#endif
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_avx
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
index dba15d037ac..6bbb87727b9 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
@@ -18,21 +18,23 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-# define __KERNEL_AVX2__
-#endif
-
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_avx2
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# define __KERNEL_AVX2__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
index 39c9a9cf33c..9895080d328 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
@@ -89,6 +89,4 @@ DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update)
-void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func));
-
#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index 8c05dd1d9ef..b9d82781840 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -57,6 +57,11 @@
# include "kernel/split/kernel_buffer_update.h"
#endif
+#ifdef KERNEL_STUB
+# include "util/util_debug.h"
+# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
+#endif
+
CCL_NAMESPACE_BEGIN
#ifndef __SPLIT_KERNEL__
@@ -71,7 +76,10 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
int offset,
int stride)
{
-#ifdef __BRANCHED_PATH__
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, path_trace);
+#else
+# ifdef __BRANCHED_PATH__
if(kernel_data.integrator.branched) {
kernel_branched_path_trace(kg,
buffer,
@@ -82,10 +90,11 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
stride);
}
else
-#endif
+# endif
{
kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
}
+#endif /* KERNEL_STUB */
}
/* Film */
@@ -98,6 +107,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
int offset,
int stride)
{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, convert_to_byte);
+#else
kernel_film_convert_to_byte(kg,
rgba,
buffer,
@@ -105,6 +117,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
x, y,
offset,
stride);
+#endif /* KERNEL_STUB */
}
void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
@@ -115,6 +128,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
int offset,
int stride)
{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, convert_to_half_float);
+#else
kernel_film_convert_to_half_float(kg,
rgba,
buffer,
@@ -122,6 +138,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
x, y,
offset,
stride);
+#endif /* KERNEL_STUB */
}
/* Shader Evaluate */
@@ -136,9 +153,12 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
int offset,
int sample)
{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, shader);
+#else
if(type >= SHADER_EVAL_BAKE) {
kernel_assert(output_luma == NULL);
-#ifdef __BAKING__
+# ifdef __BAKING__
kernel_bake_evaluate(kg,
input,
output,
@@ -147,7 +167,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
i,
offset,
sample);
-#endif
+# endif
}
else {
kernel_shader_evaluate(kg,
@@ -158,17 +178,26 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
i,
sample);
}
+#endif /* KERNEL_STUB */
}
#else /* __SPLIT_KERNEL__ */
/* Split Kernel Path Tracing */
-#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
+#ifdef KERNEL_STUB
+# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
+ void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
+ { \
+ STUB_ASSERT(KERNEL_ARCH, name); \
+ }
+#else
+# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
{ \
kernel_##name(kg); \
}
+#endif /* KERNEL_STUB */
#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
@@ -194,42 +223,10 @@ DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
-
-void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func))
-{
-#define REGISTER_NAME_STRING(name) #name
-#define REGISTER_EVAL_NAME(name) REGISTER_NAME_STRING(name)
-#define REGISTER(name) reg(REGISTER_EVAL_NAME(KERNEL_FUNCTION_FULL_NAME(name)), (void*)KERNEL_FUNCTION_FULL_NAME(name));
-
- REGISTER(path_trace);
- REGISTER(convert_to_byte);
- REGISTER(convert_to_half_float);
- REGISTER(shader);
-
- REGISTER(data_init);
- REGISTER(path_init);
- REGISTER(scene_intersect);
- REGISTER(lamp_emission);
- REGISTER(do_volume);
- REGISTER(queue_enqueue);
- REGISTER(indirect_background);
- REGISTER(shader_setup);
- REGISTER(shader_sort);
- REGISTER(shader_eval);
- REGISTER(holdout_emission_blurring_pathtermination_ao);
- REGISTER(subsurface_scatter);
- REGISTER(direct_lighting);
- REGISTER(shadow_blocked_ao);
- REGISTER(shadow_blocked_dl);
- REGISTER(next_iteration_setup);
- REGISTER(indirect_subsurface);
- REGISTER(buffer_update);
-
-#undef REGISTER
-#undef REGISTER_EVAL_NAME
-#undef REGISTER_NAME_STRING
-}
-
#endif /* __SPLIT_KERNEL__ */
+#undef KERNEL_STUB
+#undef STUB_ASSERT
+#undef KERNEL_ARCH
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
index 27a746a0799..6ba3425a343 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
@@ -17,22 +17,25 @@
/* Optimized CPU kernel entry points. This file is compiled with AVX
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-#endif
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_avx
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
index 364d279a189..76b2d77ebb8 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
@@ -18,23 +18,25 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-# define __KERNEL_AVX2__
-#endif
-
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_avx2
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# define __KERNEL_AVX2__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
index 0afb481296f..b468b6f44c8 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
@@ -18,17 +18,19 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-#endif
-
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse2
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
index 13d00813591..3e5792d0b17 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
@@ -18,19 +18,21 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-#endif
-
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse3
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
index a4312071edc..3629f21cd29 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
@@ -18,20 +18,22 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-#endif
-
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse41
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
index 1acfaa91ac9..57530c88710 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
@@ -18,15 +18,17 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-#endif
-
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse2
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
index f7b6a2e21fe..c607753bc4b 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
@@ -18,17 +18,19 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-#endif
-
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse3
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
index 1900c6e3012..a278554731c 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
@@ -18,18 +18,20 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-#endif
-
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse41
-# include "kernel/kernels/cpu//kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu
new file mode 100644
index 00000000000..50f73f9728d
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cuda/filter.cu
@@ -0,0 +1,235 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CUDA kernel entry points */
+
+#ifdef __CUDA_ARCH__
+
+#include "kernel_config.h"
+
+#include "kernel/kernel_compat_cuda.h"
+
+#include "kernel/filter/filter_kernel.h"
+
+/* kernels */
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_divide_shadow(int sample,
+ TilesInfo *tiles,
+ float *unfilteredA,
+ float *unfilteredB,
+ float *sampleVariance,
+ float *sampleVarianceV,
+ float *bufferVariance,
+ int4 prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+ int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
+ int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_divide_shadow(sample,
+ tiles,
+ x, y,
+ unfilteredA,
+ unfilteredB,
+ sampleVariance,
+ sampleVarianceV,
+ bufferVariance,
+ prefilter_rect,
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_get_feature(int sample,
+ TilesInfo *tiles,
+ int m_offset,
+ int v_offset,
+ float *mean,
+ float *variance,
+ int4 prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+ int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
+ int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_get_feature(sample,
+ tiles,
+ m_offset, v_offset,
+ x, y,
+ mean, variance,
+ prefilter_rect,
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r)
+{
+ int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
+ int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
+ float *transform, int *rank,
+ int4 filter_area, int4 rect,
+ int radius, float pca_threshold,
+ int pass_stride)
+{
+ int x = blockDim.x*blockIdx.x + threadIdx.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < filter_area.z && y < filter_area.w) {
+ int *l_rank = rank + y*filter_area.z + x;
+ float *l_transform = transform + y*filter_area.z + x;
+ kernel_filter_construct_transform(buffer,
+ x + filter_area.x, y + filter_area.y,
+ rect, pass_stride,
+ l_transform, l_rank,
+ radius, pca_threshold,
+ filter_area.z*filter_area.w,
+ threadIdx.y*blockDim.x + threadIdx.x);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_calc_difference(int dx, int dy,
+ float ccl_restrict_ptr weightImage,
+ float ccl_restrict_ptr varianceImage,
+ float *differenceImage,
+ int4 rect, int w,
+ int channel_offset,
+ float a, float k_2) {
+ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_calc_difference(x, y, dx, dy, weightImage, varianceImage, differenceImage, rect, w, channel_offset, a, k_2);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) {
+ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_blur(x, y, differenceImage, outImage, rect, w, f);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) {
+ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_calc_weight(x, y, differenceImage, outImage, rect, w, f);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_update_output(int dx, int dy,
+ float ccl_restrict_ptr differenceImage,
+ float ccl_restrict_ptr image,
+ float *outImage, float *accumImage,
+ int4 rect, int w,
+ int f) {
+ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_update_output(x, y, dx, dy, differenceImage, image, outImage, accumImage, rect, w, f);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w) {
+ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_normalize(x, y, outImage, accumImage, rect, w);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
+ float ccl_restrict_ptr differenceImage,
+ float ccl_restrict_ptr buffer,
+ float *color_pass,
+ float *variance_pass,
+ float const* __restrict__ transform,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int4 rect,
+ int4 filter_rect,
+ int w, int h, int f,
+ int pass_stride) {
+ int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x);
+ int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y);
+ if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
+ kernel_filter_nlm_construct_gramian(x, y,
+ dx, dy,
+ differenceImage,
+ buffer,
+ color_pass, variance_pass,
+ transform, rank,
+ XtWX, XtWY,
+ rect, filter_rect,
+ w, h, f,
+ pass_stride,
+ threadIdx.y*blockDim.x + threadIdx.x);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_finalize(int w, int h,
+ float *buffer, int *rank,
+ float *XtWX, float3 *XtWY,
+ int4 filter_area, int4 buffer_params,
+ int sample) {
+ int x = blockDim.x*blockIdx.x + threadIdx.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < filter_area.z && y < filter_area.w) {
+ int storage_ofs = y*filter_area.z+x;
+ rank += storage_ofs;
+ XtWX += storage_ofs;
+ XtWY += storage_ofs;
+ kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
+ }
+}
+
+#endif
+
diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl
new file mode 100644
index 00000000000..3d82bff9892
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/filter.cl
@@ -0,0 +1,262 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* OpenCL kernel entry points */
+
+#include "kernel/kernel_compat_opencl.h"
+
+#include "kernel/filter/filter_kernel.h"
+
+/* kernels */
+
+__kernel void kernel_ocl_filter_divide_shadow(int sample,
+ ccl_global TilesInfo *tiles,
+ ccl_global float *unfilteredA,
+ ccl_global float *unfilteredB,
+ ccl_global float *sampleVariance,
+ ccl_global float *sampleVarianceV,
+ ccl_global float *bufferVariance,
+ int4 prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ char use_split_variance)
+{
+ int x = prefilter_rect.x + get_global_id(0);
+ int y = prefilter_rect.y + get_global_id(1);
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_divide_shadow(sample,
+ tiles,
+ x, y,
+ unfilteredA,
+ unfilteredB,
+ sampleVariance,
+ sampleVarianceV,
+ bufferVariance,
+ prefilter_rect,
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+ }
+}
+
+__kernel void kernel_ocl_filter_get_feature(int sample,
+ ccl_global TilesInfo *tiles,
+ int m_offset,
+ int v_offset,
+ ccl_global float *mean,
+ ccl_global float *variance,
+ int4 prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ char use_split_variance)
+{
+ int x = prefilter_rect.x + get_global_id(0);
+ int y = prefilter_rect.y + get_global_id(1);
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_get_feature(sample,
+ tiles,
+ m_offset, v_offset,
+ x, y,
+ mean, variance,
+ prefilter_rect,
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+ }
+}
+
+__kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean,
+ ccl_global float *variance,
+ ccl_global float *a,
+ ccl_global float *b,
+ int4 prefilter_rect,
+ int r)
+{
+ int x = prefilter_rect.x + get_global_id(0);
+ int y = prefilter_rect.y + get_global_id(1);
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
+ }
+}
+
+__kernel void kernel_ocl_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer,
+ ccl_global float *transform,
+ ccl_global int *rank,
+ int4 filter_area,
+ int4 rect,
+ int pass_stride,
+ int radius,
+ float pca_threshold)
+{
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+ if(x < filter_area.z && y < filter_area.w) {
+ ccl_global int *l_rank = rank + y*filter_area.z + x;
+ ccl_global float *l_transform = transform + y*filter_area.z + x;
+ kernel_filter_construct_transform(buffer,
+ x + filter_area.x, y + filter_area.y,
+ rect, pass_stride,
+ l_transform, l_rank,
+ radius, pca_threshold,
+ filter_area.z*filter_area.w,
+ get_local_id(1)*get_local_size(0) + get_local_id(0));
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_calc_difference(int dx,
+ int dy,
+ ccl_global float ccl_restrict_ptr weightImage,
+ ccl_global float ccl_restrict_ptr varianceImage,
+ ccl_global float *differenceImage,
+ int4 rect,
+ int w,
+ int channel_offset,
+ float a,
+ float k_2) {
+ int x = get_global_id(0) + rect.x;
+ int y = get_global_id(1) + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_calc_difference(x, y, dx, dy, weightImage, varianceImage, differenceImage, rect, w, channel_offset, a, k_2);
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_blur(ccl_global float ccl_restrict_ptr differenceImage,
+ ccl_global float *outImage,
+ int4 rect,
+ int w,
+ int f) {
+ int x = get_global_id(0) + rect.x;
+ int y = get_global_id(1) + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_blur(x, y, differenceImage, outImage, rect, w, f);
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_calc_weight(ccl_global float ccl_restrict_ptr differenceImage,
+ ccl_global float *outImage,
+ int4 rect,
+ int w,
+ int f) {
+ int x = get_global_id(0) + rect.x;
+ int y = get_global_id(1) + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_calc_weight(x, y, differenceImage, outImage, rect, w, f);
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_update_output(int dx,
+ int dy,
+ ccl_global float ccl_restrict_ptr differenceImage,
+ ccl_global float ccl_restrict_ptr image,
+ ccl_global float *outImage,
+ ccl_global float *accumImage,
+ int4 rect,
+ int w,
+ int f) {
+ int x = get_global_id(0) + rect.x;
+ int y = get_global_id(1) + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_update_output(x, y, dx, dy, differenceImage, image, outImage, accumImage, rect, w, f);
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *outImage,
+ ccl_global float ccl_restrict_ptr accumImage,
+ int4 rect,
+ int w) {
+ int x = get_global_id(0) + rect.x;
+ int y = get_global_id(1) + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_normalize(x, y, outImage, accumImage, rect, w);
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_construct_gramian(int dx,
+ int dy,
+ ccl_global float ccl_restrict_ptr differenceImage,
+ ccl_global float ccl_restrict_ptr buffer,
+ ccl_global float *color_pass,
+ ccl_global float *variance_pass,
+ ccl_global float ccl_restrict_ptr transform,
+ ccl_global int *rank,
+ ccl_global float *XtWX,
+ ccl_global float3 *XtWY,
+ int4 rect,
+ int4 filter_rect,
+ int w,
+ int h,
+ int f,
+ int pass_stride) {
+ int x = get_global_id(0) + max(0, rect.x-filter_rect.x);
+ int y = get_global_id(1) + max(0, rect.y-filter_rect.y);
+ if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
+ kernel_filter_nlm_construct_gramian(x, y,
+ dx, dy,
+ differenceImage,
+ buffer,
+ color_pass, variance_pass,
+ transform, rank,
+ XtWX, XtWY,
+ rect, filter_rect,
+ w, h, f,
+ pass_stride,
+ get_local_id(1)*get_local_size(0) + get_local_id(0));
+ }
+}
+
+__kernel void kernel_ocl_filter_finalize(int w,
+ int h,
+ ccl_global float *buffer,
+ ccl_global int *rank,
+ ccl_global float *XtWX,
+ ccl_global float3 *XtWY,
+ int4 filter_area,
+ int4 buffer_params,
+ int sample) {
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+ if(x < filter_area.z && y < filter_area.w) {
+ int storage_ofs = y*filter_area.z+x;
+ rank += storage_ofs;
+ XtWX += storage_ofs;
+ XtWY += storage_ofs;
+ kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
+ }
+}
+
+__kernel void kernel_ocl_filter_set_tiles(ccl_global TilesInfo* tiles,
+ ccl_global float *buffer_1,
+ ccl_global float *buffer_2,
+ ccl_global float *buffer_3,
+ ccl_global float *buffer_4,
+ ccl_global float *buffer_5,
+ ccl_global float *buffer_6,
+ ccl_global float *buffer_7,
+ ccl_global float *buffer_8,
+ ccl_global float *buffer_9)
+{
+ if((get_global_id(0) == 0) && (get_global_id(1) == 0)) {
+ tiles->buffers[0] = buffer_1;
+ tiles->buffers[1] = buffer_2;
+ tiles->buffers[2] = buffer_3;
+ tiles->buffers[3] = buffer_4;
+ tiles->buffers[4] = buffer_5;
+ tiles->buffers[5] = buffer_6;
+ tiles->buffers[6] = buffer_7;
+ tiles->buffers[7] = buffer_8;
+ tiles->buffers[8] = buffer_9;
+ }
+}
diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h
index c7bc1b4df0a..dc74a2ada53 100644
--- a/intern/cycles/kernel/split/kernel_branched.h
+++ b/intern/cycles/kernel/split/kernel_branched.h
@@ -76,6 +76,26 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
RNG rng = kernel_split_state.rng[ray_index];
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
float3 throughput = branched_state->throughput;
+ ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
+
+ float sum_sample_weight = 0.0f;
+#ifdef __DENOISING_FEATURES__
+ if(ps->denoising_feature_weight > 0.0f) {
+ for(int i = 0; i < sd->num_closure; i++) {
+ const ShaderClosure *sc = &sd->closure[i];
+
+ /* transparency is not handled here, but in outer loop */
+ if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
+ continue;
+ }
+
+ sum_sample_weight += sc->sample_weight;
+ }
+ }
+ else {
+ sum_sample_weight = 1.0f;
+ }
+#endif /* __DENOISING_FEATURES__ */
for(int i = branched_state->next_closure; i < sd->num_closure; i++) {
const ShaderClosure *sc = &sd->closure[i];
@@ -103,7 +123,6 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
RNG bsdf_rng = cmj_hash(rng, i);
for(int j = branched_state->next_sample; j < num_samples; j++) {
- ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
if(reset_path_state) {
*ps = branched_state->path_state;
}
@@ -122,7 +141,8 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
tp,
ps,
L,
- bsdf_ray))
+ bsdf_ray,
+ sum_sample_weight))
{
continue;
}
diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h
index 859c221d976..1f6dce0253c 100644
--- a/intern/cycles/kernel/split/kernel_buffer_update.h
+++ b/intern/cycles/kernel/split/kernel_buffer_update.h
@@ -111,24 +111,15 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
- float3 L_sum;
-#ifdef __SHADOW_TRICKS__
- if(state->flag & PATH_RAY_SHADOW_CATCHER) {
- L_sum = path_radiance_sum_shadowcatcher(kg, L, L_transparent);
- }
- else
-#endif /* __SHADOW_TRICKS__ */
- {
- L_sum = path_radiance_clamp_and_sum(kg, L);
- }
kernel_write_light_passes(kg, buffer, L, sample);
#ifdef __KERNEL_DEBUG__
kernel_write_debug_passes(kg, buffer, state, debug_data, sample);
#endif
- float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
/* accumulate result in output buffer */
- kernel_write_pass_float4(buffer, sample, L_rad);
+ bool is_shadow_catcher = (state->flag & PATH_RAY_SHADOW_CATCHER);
+ kernel_write_result(kg, buffer, sample, L, 1.0f - (*L_transparent), is_shadow_catcher);
+
path_rng_end(kg, rng_state, rng);
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
index 87498910d38..670a557f084 100644
--- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
+++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
@@ -125,7 +125,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
#ifdef __SHADOW_TRICKS__
if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) {
if(state->flag & PATH_RAY_CAMERA) {
- state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
+ state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
state->catcher_object = sd->object;
if(!kernel_data.background.transparent) {
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
@@ -246,6 +246,8 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
kernel_split_state.throughput[ray_index] = throughput/probability;
}
}
+
+ kernel_update_denoising_features(kg, sd, state, L);
}
}
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
index 452b6e45a36..386fbbc4d09 100644
--- a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
@@ -89,10 +89,10 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
&shadow))
{
/* accumulate */
- path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+ path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
}
else {
- path_radiance_accum_total_light(L, throughput, &L_light);
+ path_radiance_accum_total_light(L, state, throughput, &L_light);
}
}
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index 407f8e784c0..7918c640175 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -444,6 +444,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
if(bsdf) {
+ bsdf->N = N;
sd->flag |= bsdf_transparent_setup(bsdf);
}
break;
@@ -704,6 +705,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
if(bsdf) {
+ bsdf->N = N;
/* todo: giving a fixed weight here will cause issues when
* mixing multiple BSDFS. energy will not be conserved and
* the throughput can blow up after multiple bounces. we
diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h
index c94fa130af7..656357be52d 100644
--- a/intern/cycles/kernel/svm/svm_displace.h
+++ b/intern/cycles/kernel/svm/svm_displace.h
@@ -63,8 +63,13 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
strength = max(strength, 0.0f);
/* compute and output perturbed normal */
- float3 normal_out = normalize(absdet*normal_in - distance*signf(det)*surfgrad);
- normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in);
+ float3 normal_out = safe_normalize(absdet*normal_in - distance*signf(det)*surfgrad);
+ if(is_zero(normal_out)) {
+ normal_out = normal_in;
+ }
+ else {
+ normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in);
+ }
if(use_object_space) {
object_normal_transform(kg, sd, &normal_out);
diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h
index 4a09d9f6653..cce4e89e715 100644
--- a/intern/cycles/kernel/svm/svm_geometry.h
+++ b/intern/cycles/kernel/svm/svm_geometry.h
@@ -37,6 +37,7 @@ ccl_device_inline void svm_node_geometry(KernelGlobals *kg,
#ifdef __UV__
case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break;
#endif
+ default: data = make_float3(0.0f, 0.0f, 0.0f);
}
stack_store_float3(stack, out_offset, data);
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index 328ff79223b..8e45dbfa5ff 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -317,8 +317,8 @@ ccl_device void svm_node_tex_environment(KernelGlobals *kg, ShaderData *sd, floa
float3 co = stack_load_float3(stack, co_offset);
float2 uv;
- co = normalize(co);
-
+ co = safe_normalize(co);
+
if(projection == 0)
uv = direction_to_equirectangular(co);
else
diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h
index cc9b840b13f..d859cae1708 100644
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -402,7 +402,6 @@ typedef enum ClosureType {
CLOSURE_BSDF_DIFFUSE_TOON_ID,
/* Glossy */
- CLOSURE_BSDF_GLOSSY_ID,
CLOSURE_BSDF_REFLECTION_ID,
CLOSURE_BSDF_MICROFACET_GGX_ID,
CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID,
@@ -423,14 +422,13 @@ typedef enum ClosureType {
CLOSURE_BSDF_HAIR_REFLECTION_ID,
/* Transmission */
- CLOSURE_BSDF_TRANSMISSION_ID,
CLOSURE_BSDF_TRANSLUCENT_ID,
CLOSURE_BSDF_REFRACTION_ID,
CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID,
CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID,
+ CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID,
CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID,
CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID,
- CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID,
CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID,
CLOSURE_BSDF_SHARP_GLASS_ID,
CLOSURE_BSDF_HAIR_TRANSMISSION_ID,
@@ -465,13 +463,16 @@ typedef enum ClosureType {
/* watch this, being lazy with memory usage */
#define CLOSURE_IS_BSDF(type) (type <= CLOSURE_BSDF_TRANSPARENT_ID)
#define CLOSURE_IS_BSDF_DIFFUSE(type) (type >= CLOSURE_BSDF_DIFFUSE_ID && type <= CLOSURE_BSDF_DIFFUSE_TOON_ID)
-#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_GLOSSY_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID)
-#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSMISSION_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
+#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID)
+#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSLUCENT_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
#define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID || type == CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID)
+#define CLOSURE_IS_BSDF_TRANSPARENT(type) (type == CLOSURE_BSDF_TRANSPARENT_ID)
#define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type >= CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID)
#define CLOSURE_IS_BSDF_MULTISCATTER(type) (type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID ||\
type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID || \
type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID)
+#define CLOSURE_IS_BSDF_MICROFACET(type) ((type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) ||\
+ (type >= CLOSURE_BSDF_REFRACTION_ID && type <= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID))
#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_BURLEY_ID)
#define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID)
#define CLOSURE_IS_VOLUME(type) (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
@@ -480,7 +481,7 @@ typedef enum ClosureType {
#define CLOSURE_IS_BACKGROUND(type) (type == CLOSURE_BACKGROUND_ID)
#define CLOSURE_IS_AMBIENT_OCCLUSION(type) (type == CLOSURE_AMBIENT_OCCLUSION_ID)
#define CLOSURE_IS_PHASE(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
-#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID)
+#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID)
#define CLOSURE_IS_PRINCIPLED(type) (type == CLOSURE_BSDF_PRINCIPLED_ID)
#define CLOSURE_WEIGHT_CUTOFF 1e-5f