git.blender.org/blender.git
author    Lukas Stockner <lukas.stockner@freenet.de>  2017-05-07 15:40:58 +0300
committer Lukas Stockner <lukas.stockner@freenet.de>  2017-05-07 15:40:58 +0300
commit    43b374e8c5430488a302298b1026faa1c3a231e9 (patch)
tree      42e619a9fa08d02cef515b6315ce34dd7fd062b2 /intern/cycles/kernel/kernels
parent    bca697834728fd12c84941aa2a428abfe2090b27 (diff)
Cycles: Implement denoising option for reducing noise in the rendered image
This commit contains the first part of the new Cycles denoising option, which
filters the resulting image using information gathered during rendering to get
rid of noise while preserving visual features as well as possible.

To use the option, enable it in the render layer options. The default settings
fit a wide range of scenes, but the user can tweak individual settings to
control the tradeoff between a noise-free image, image details, and calculation
time.

Note that the denoiser may still change in the future and that some features
are not implemented yet. The most important missing feature is animation
denoising, which uses information from multiple frames at once to produce a
flicker-free and smoother result. These features will be added in the future.

Finally, thanks to all the people who supported this project:

- Google (through the GSoC) and Theory Studios for sponsoring the development
- The authors of the papers I used for implementing the denoiser (more details
  on them will be included in the technical docs)
- The other Cycles devs for feedback on the code, especially Sergey for
  mentoring the GSoC project and Brecht for the code review!
- And of course the users who helped with testing, reported bugs and things
  that could and/or should work better!
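For orientation: the new filter kernels below (filter_nlm_calc_difference,
filter_nlm_blur, filter_nlm_calc_weight, filter_nlm_update_output, ...) are
built on non-local means filtering, which averages each pixel with other
pixels whose surrounding patches look similar. The stand-alone C++ sketch
below shows only that core idea under simplified assumptions (grayscale
image, no feature passes, no per-pixel variance); the production kernels
additionally prefilter feature passes and fit a first-order regression per
pixel (the XtWX/XtWY gramian seen in the diff). It is illustrative only, not
the actual kernel code.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

/* Denoise one pixel by averaging pixels with similar surroundings (NL-means).
 * `radius` is the search window, `f` the patch half-size, `k2` the filter
 * strength. Assumes (px, py) is at least `f` away from the image border. */
static float nlm_pixel(const std::vector<float> &img, int w, int h,
                       int px, int py, int radius, int f, float k2)
{
  float accum = 0.0f, weight_sum = 0.0f;
  for(int qy = std::max(py - radius, f); qy < std::min(py + radius + 1, h - f); qy++) {
    for(int qx = std::max(px - radius, f); qx < std::min(px + radius + 1, w - f); qx++) {
      /* Mean squared difference of the patches around p and q. */
      float d = 0.0f;
      for(int dy = -f; dy <= f; dy++)
        for(int dx = -f; dx <= f; dx++) {
          float diff = img[(py+dy)*w + (px+dx)] - img[(qy+dy)*w + (qx+dx)];
          d += diff*diff;
        }
      d /= (2*f + 1)*(2*f + 1);
      float weight = expf(-d/k2);  /* similar patches get weight near one */
      accum += weight*img[qy*w + qx];
      weight_sum += weight;
    }
  }
  return weight_sum > 0.0f ? accum/weight_sum : img[py*w + px];
}

int main()
{
  const int w = 16, h = 16;
  std::vector<float> img(w*h, 0.5f);
  img[8*w + 8] = 1.0f;  /* a lone "firefly" to smooth out */
  printf("before: %f  after: %f\n", img[8*w + 8],
         nlm_pixel(img, w, h, 8, 8, 4, 1, 0.05f));
}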
Diffstat (limited to 'intern/cycles/kernel/kernels')
-rw-r--r--  intern/cycles/kernel/kernels/cpu/filter.cpp              61
-rw-r--r--  intern/cycles/kernel/kernels/cpu/filter_avx.cpp          39
-rw-r--r--  intern/cycles/kernel/kernels/cpu/filter_avx2.cpp         40
-rw-r--r--  intern/cycles/kernel/kernels/cpu/filter_cpu.h           132
-rw-r--r--  intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h      259
-rw-r--r--  intern/cycles/kernel/kernels/cpu/filter_sse2.cpp         34
-rw-r--r--  intern/cycles/kernel/kernels/cpu/filter_sse3.cpp         36
-rw-r--r--  intern/cycles/kernel/kernels/cpu/filter_sse41.cpp        37
-rw-r--r--  intern/cycles/kernel/kernels/cpu/kernel_avx.cpp          30
-rw-r--r--  intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp         32
-rw-r--r--  intern/cycles/kernel/kernels/cpu/kernel_cpu.h             2
-rw-r--r--  intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h       79
-rw-r--r--  intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp    29
-rw-r--r--  intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp   32
-rw-r--r--  intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp   20
-rw-r--r--  intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp   24
-rw-r--r--  intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp  26
-rw-r--r--  intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp         20
-rw-r--r--  intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp         24
-rw-r--r--  intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp        26
-rw-r--r--  intern/cycles/kernel/kernels/cuda/filter.cu             235
-rw-r--r--  intern/cycles/kernel/kernels/opencl/filter.cl           262
22 files changed, 1315 insertions, 164 deletions
diff --git a/intern/cycles/kernel/kernels/cpu/filter.cpp b/intern/cycles/kernel/kernels/cpu/filter.cpp
new file mode 100644
index 00000000000..2ff1a392dc3
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CPU kernel entry points */
+
+/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
+ * one with SSE2 intrinsics.
+ */
+#if defined(__x86_64__) || defined(_M_X64)
+# define __KERNEL_SSE2__
+#endif
+
+/* When building the kernel for the native machine, detect kernel features
+ * from the flags set by the compiler.
+ */
+#ifdef WITH_KERNEL_NATIVE
+# ifdef __SSE2__
+# ifndef __KERNEL_SSE2__
+# define __KERNEL_SSE2__
+# endif
+# endif
+# ifdef __SSE3__
+# define __KERNEL_SSE3__
+# endif
+# ifdef __SSSE3__
+# define __KERNEL_SSSE3__
+# endif
+# ifdef __SSE4_1__
+# define __KERNEL_SSE41__
+# endif
+# ifdef __AVX__
+# define __KERNEL_SSE__
+# define __KERNEL_AVX__
+# endif
+# ifdef __AVX2__
+# define __KERNEL_SSE__
+# define __KERNEL_AVX2__
+# endif
+#endif
+
+/* quiet unused define warnings */
+#if defined(__KERNEL_SSE2__)
+ /* do nothing */
+#endif
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
new file mode 100644
index 00000000000..4a9e6047ecf
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
new file mode 100644
index 00000000000..c22ec576254
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# define __KERNEL_AVX2__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
new file mode 100644
index 00000000000..10007ee2635
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common declaration part of all CPU kernels. */
+
+void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
+ TilesInfo *tiles,
+ int x,
+ int y,
+ float *unfilteredA,
+ float *unfilteredB,
+ float *sampleVariance,
+ float *sampleVarianceV,
+ float *bufferVariance,
+ int* prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
+ TilesInfo *tiles,
+ int m_offset,
+ int v_offset,
+ int x,
+ int y,
+ float *mean,
+ float *variance,
+ int* prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
+ float *mean,
+ float *variance,
+ float *a,
+ float *b,
+ int* prefilter_rect,
+ int r);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
+ int x,
+ int y,
+ int storage_ofs,
+ float *transform,
+ int *rank,
+ int* rect,
+ int pass_stride,
+ int radius,
+ float pca_threshold);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
+ int dy,
+ float *weightImage,
+ float *variance,
+ float *differenceImage,
+ int* rect,
+ int w,
+ int channel_offset,
+ float a,
+ float k_2);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage,
+ float *outImage,
+ int* rect,
+ int w,
+ int f);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage,
+ float *outImage,
+ int* rect,
+ int w,
+ int f);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
+ int dy,
+ float *differenceImage,
+ float *image,
+ float *outImage,
+ float *accumImage,
+ int* rect,
+ int w,
+ int f);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
+ int dy,
+ float *differenceImage,
+ float *buffer,
+ float *color_pass,
+ float *variance_pass,
+ float *transform,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int *rect,
+ int *filter_rect,
+ int w,
+ int h,
+ int f,
+ int pass_stride);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *outImage,
+ float *accumImage,
+ int* rect,
+ int w);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
+ int y,
+ int storage_ofs,
+ int w,
+ int h,
+ float *buffer,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int *buffer_params,
+ int sample);
+
+#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
new file mode 100644
index 00000000000..3b71e50ca3b
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
@@ -0,0 +1,259 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common implementation part of all CPU kernels.
+ *
+ * The idea is that the individual .cpp files set the needed optimization flags
+ * and simply include this file, without worrying about copying the actual
+ * implementation over.
+ */
+
+#include "kernel/kernel_compat_cpu.h"
+
+#include "kernel/filter/filter_kernel.h"
+
+#ifdef KERNEL_STUB
+# include "util/util_debug.h"
+# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+
+/* Denoise filter */
+
+void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
+ TilesInfo *tiles,
+ int x,
+ int y,
+ float *unfilteredA,
+ float *unfilteredB,
+ float *sampleVariance,
+ float *sampleVarianceV,
+ float *bufferVariance,
+ int* prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow);
+#else
+ kernel_filter_divide_shadow(sample, tiles,
+ x, y,
+ unfilteredA,
+ unfilteredB,
+ sampleVariance,
+ sampleVarianceV,
+ bufferVariance,
+ load_int4(prefilter_rect),
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
+ TilesInfo *tiles,
+ int m_offset,
+ int v_offset,
+ int x,
+ int y,
+ float *mean, float *variance,
+ int* prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_get_feature);
+#else
+ kernel_filter_get_feature(sample, tiles,
+ m_offset, v_offset,
+ x, y,
+ mean, variance,
+ load_int4(prefilter_rect),
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
+ float *mean,
+ float *variance,
+ float *a,
+ float *b,
+ int* prefilter_rect,
+ int r)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_combine_halves);
+#else
+ kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
+ int x,
+ int y,
+ int storage_ofs,
+ float *transform,
+ int *rank,
+ int* prefilter_rect,
+ int pass_stride,
+ int radius,
+ float pca_threshold)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_construct_transform);
+#else
+ rank += storage_ofs;
+ transform += storage_ofs*TRANSFORM_SIZE;
+ kernel_filter_construct_transform(buffer,
+ x, y,
+ load_int4(prefilter_rect),
+ pass_stride,
+ transform,
+ rank,
+ radius,
+ pca_threshold);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
+ int dy,
+ float *weightImage,
+ float *variance,
+ float *differenceImage,
+ int *rect,
+ int w,
+ int channel_offset,
+ float a,
+ float k_2)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference);
+#else
+ kernel_filter_nlm_calc_difference(dx, dy, weightImage, variance, differenceImage, load_int4(rect), w, channel_offset, a, k_2);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage,
+ float *outImage,
+ int *rect,
+ int w,
+ int f)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur);
+#else
+ kernel_filter_nlm_blur(differenceImage, outImage, load_int4(rect), w, f);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage,
+ float *outImage,
+ int *rect,
+ int w,
+ int f)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight);
+#else
+ kernel_filter_nlm_calc_weight(differenceImage, outImage, load_int4(rect), w, f);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
+ int dy,
+ float *differenceImage,
+ float *image,
+ float *outImage,
+ float *accumImage,
+ int *rect,
+ int w,
+ int f)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
+#else
+ kernel_filter_nlm_update_output(dx, dy, differenceImage, image, outImage, accumImage, load_int4(rect), w, f);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
+ int dy,
+ float *differenceImage,
+ float *buffer,
+ float *color_pass,
+ float *variance_pass,
+ float *transform,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int *rect,
+ int *filter_rect,
+ int w,
+ int h,
+ int f,
+ int pass_stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
+#else
+ kernel_filter_nlm_construct_gramian(dx, dy, differenceImage, buffer, color_pass, variance_pass, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *outImage,
+ float *accumImage,
+ int *rect,
+ int w)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize);
+#else
+ kernel_filter_nlm_normalize(outImage, accumImage, load_int4(rect), w);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
+ int y,
+ int storage_ofs,
+ int w,
+ int h,
+ float *buffer,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int *buffer_params,
+ int sample)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_finalize);
+#else
+ XtWX += storage_ofs*XTWX_SIZE;
+ XtWY += storage_ofs*XTWY_SIZE;
+ rank += storage_ofs;
+ kernel_filter_finalize(x, y, w, h, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample);
+#endif
+}
+
+#undef KERNEL_STUB
+#undef STUB_ASSERT
+#undef KERNEL_ARCH
+
+CCL_NAMESPACE_END
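
The KERNEL_STUB/STUB_ASSERT mechanism above deserves a quick illustration:
when an instruction set is not compiled in, the entry points still exist so
the code links, but they assert the moment they are called. A minimal
stand-alone C++ sketch of the same idea (the function name is hypothetical):

#include <cassert>
#include <cstdio>

/* Uncomment to build the stub variant, as when
 * WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 is not defined. */
/* #define KERNEL_STUB */

#define STUB_ASSERT(arch, name) \
  assert(!(#name " kernel stub for architecture " #arch " was called!"))

void cpu_avx2_filter_blur(float *img, int n)
{
#ifdef KERNEL_STUB
  (void)img; (void)n;
  STUB_ASSERT(cpu_avx2, filter_blur);  /* always fires: !"string" is false */
#else
  for(int i = 0; i < n; i++) img[i] *= 0.5f;  /* the real implementation */
#endif
}

int main()
{
  float img[2] = {1.0f, 2.0f};
  cpu_avx2_filter_blur(img, 2);
  printf("%f\n", img[0]);
}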
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
new file mode 100644
index 00000000000..f7c9935f1d0
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
new file mode 100644
index 00000000000..070b95a3505
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
new file mode 100644
index 00000000000..1a7b2040da1
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE4.1
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
index 2600d977972..a645fb4d8dd 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
@@ -17,21 +17,23 @@
/* Optimized CPU kernel entry points. This file is compiled with AVX
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without them for other CPUs. */
-
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-#endif
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_avx
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
index dba15d037ac..6bbb87727b9 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
@@ -18,21 +18,23 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without them for other CPUs. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-# define __KERNEL_AVX2__
-#endif
-
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_avx2
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# define __KERNEL_AVX2__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
index 39c9a9cf33c..9895080d328 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
@@ -89,6 +89,4 @@ DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update)
-void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func));
-
#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index 8c05dd1d9ef..b9d82781840 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -57,6 +57,11 @@
# include "kernel/split/kernel_buffer_update.h"
#endif
+#ifdef KERNEL_STUB
+# include "util/util_debug.h"
+# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
+#endif
+
CCL_NAMESPACE_BEGIN
#ifndef __SPLIT_KERNEL__
@@ -71,7 +76,10 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
int offset,
int stride)
{
-#ifdef __BRANCHED_PATH__
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, path_trace);
+#else
+# ifdef __BRANCHED_PATH__
if(kernel_data.integrator.branched) {
kernel_branched_path_trace(kg,
buffer,
@@ -82,10 +90,11 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
stride);
}
else
-#endif
+# endif
{
kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
}
+#endif /* KERNEL_STUB */
}
/* Film */
@@ -98,6 +107,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
int offset,
int stride)
{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, convert_to_byte);
+#else
kernel_film_convert_to_byte(kg,
rgba,
buffer,
@@ -105,6 +117,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
x, y,
offset,
stride);
+#endif /* KERNEL_STUB */
}
void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
@@ -115,6 +128,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
int offset,
int stride)
{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, convert_to_half_float);
+#else
kernel_film_convert_to_half_float(kg,
rgba,
buffer,
@@ -122,6 +138,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
x, y,
offset,
stride);
+#endif /* KERNEL_STUB */
}
/* Shader Evaluate */
@@ -136,9 +153,12 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
int offset,
int sample)
{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, shader);
+#else
if(type >= SHADER_EVAL_BAKE) {
kernel_assert(output_luma == NULL);
-#ifdef __BAKING__
+# ifdef __BAKING__
kernel_bake_evaluate(kg,
input,
output,
@@ -147,7 +167,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
i,
offset,
sample);
-#endif
+# endif
}
else {
kernel_shader_evaluate(kg,
@@ -158,17 +178,26 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
i,
sample);
}
+#endif /* KERNEL_STUB */
}
#else /* __SPLIT_KERNEL__ */
/* Split Kernel Path Tracing */
-#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
+#ifdef KERNEL_STUB
+# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
+ void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
+ { \
+ STUB_ASSERT(KERNEL_ARCH, name); \
+ }
+#else
+# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
{ \
kernel_##name(kg); \
}
+#endif /* KERNEL_STUB */
#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
@@ -194,42 +223,10 @@ DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
-
-void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func))
-{
-#define REGISTER_NAME_STRING(name) #name
-#define REGISTER_EVAL_NAME(name) REGISTER_NAME_STRING(name)
-#define REGISTER(name) reg(REGISTER_EVAL_NAME(KERNEL_FUNCTION_FULL_NAME(name)), (void*)KERNEL_FUNCTION_FULL_NAME(name));
-
- REGISTER(path_trace);
- REGISTER(convert_to_byte);
- REGISTER(convert_to_half_float);
- REGISTER(shader);
-
- REGISTER(data_init);
- REGISTER(path_init);
- REGISTER(scene_intersect);
- REGISTER(lamp_emission);
- REGISTER(do_volume);
- REGISTER(queue_enqueue);
- REGISTER(indirect_background);
- REGISTER(shader_setup);
- REGISTER(shader_sort);
- REGISTER(shader_eval);
- REGISTER(holdout_emission_blurring_pathtermination_ao);
- REGISTER(subsurface_scatter);
- REGISTER(direct_lighting);
- REGISTER(shadow_blocked_ao);
- REGISTER(shadow_blocked_dl);
- REGISTER(next_iteration_setup);
- REGISTER(indirect_subsurface);
- REGISTER(buffer_update);
-
-#undef REGISTER
-#undef REGISTER_EVAL_NAME
-#undef REGISTER_NAME_STRING
-}
-
#endif /* __SPLIT_KERNEL__ */
+#undef KERNEL_STUB
+#undef STUB_ASSERT
+#undef KERNEL_ARCH
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
index 27a746a0799..6ba3425a343 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
@@ -17,22 +17,25 @@
/* Optimized CPU kernel entry points. This file is compiled with AVX
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without them for other CPUs. */
-
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-#endif
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_avx
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
index 364d279a189..76b2d77ebb8 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
@@ -18,23 +18,25 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without them for other CPUs. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-# define __KERNEL_AVX2__
-#endif
-
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_avx2
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# define __KERNEL_AVX2__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
index 0afb481296f..b468b6f44c8 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
@@ -18,17 +18,19 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without them for other CPUs. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-#endif
-
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse2
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
index 13d00813591..3e5792d0b17 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
@@ -18,19 +18,21 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without them for other CPUs. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-#endif
-
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse3
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
index a4312071edc..3629f21cd29 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
@@ -18,20 +18,22 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without them for other CPUs. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-#endif
-
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse41
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
index 1acfaa91ac9..57530c88710 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
@@ -18,15 +18,17 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without them for other CPUs. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-#endif
-
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse2
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
index f7b6a2e21fe..c607753bc4b 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
@@ -18,17 +18,19 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without them for other CPUs. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-#endif
-
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse3
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
index 1900c6e3012..a278554731c 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
@@ -18,18 +18,20 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without them for other CPUs. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-#endif
-
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse41
-# include "kernel/kernels/cpu//kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu
new file mode 100644
index 00000000000..50f73f9728d
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cuda/filter.cu
@@ -0,0 +1,235 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CUDA kernel entry points */
+
+#ifdef __CUDA_ARCH__
+
+#include "kernel_config.h"
+
+#include "kernel/kernel_compat_cuda.h"
+
+#include "kernel/filter/filter_kernel.h"
+
+/* kernels */
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_divide_shadow(int sample,
+ TilesInfo *tiles,
+ float *unfilteredA,
+ float *unfilteredB,
+ float *sampleVariance,
+ float *sampleVarianceV,
+ float *bufferVariance,
+ int4 prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+ int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
+ int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_divide_shadow(sample,
+ tiles,
+ x, y,
+ unfilteredA,
+ unfilteredB,
+ sampleVariance,
+ sampleVarianceV,
+ bufferVariance,
+ prefilter_rect,
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_get_feature(int sample,
+ TilesInfo *tiles,
+ int m_offset,
+ int v_offset,
+ float *mean,
+ float *variance,
+ int4 prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+ int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
+ int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_get_feature(sample,
+ tiles,
+ m_offset, v_offset,
+ x, y,
+ mean, variance,
+ prefilter_rect,
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r)
+{
+ int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
+ int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
+ float *transform, int *rank,
+ int4 filter_area, int4 rect,
+ int radius, float pca_threshold,
+ int pass_stride)
+{
+ int x = blockDim.x*blockIdx.x + threadIdx.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < filter_area.z && y < filter_area.w) {
+ int *l_rank = rank + y*filter_area.z + x;
+ float *l_transform = transform + y*filter_area.z + x;
+ kernel_filter_construct_transform(buffer,
+ x + filter_area.x, y + filter_area.y,
+ rect, pass_stride,
+ l_transform, l_rank,
+ radius, pca_threshold,
+ filter_area.z*filter_area.w,
+ threadIdx.y*blockDim.x + threadIdx.x);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_calc_difference(int dx, int dy,
+ float ccl_restrict_ptr weightImage,
+ float ccl_restrict_ptr varianceImage,
+ float *differenceImage,
+ int4 rect, int w,
+ int channel_offset,
+ float a, float k_2) {
+ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_calc_difference(x, y, dx, dy, weightImage, varianceImage, differenceImage, rect, w, channel_offset, a, k_2);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) {
+ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_blur(x, y, differenceImage, outImage, rect, w, f);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) {
+ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_calc_weight(x, y, differenceImage, outImage, rect, w, f);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_update_output(int dx, int dy,
+ float ccl_restrict_ptr differenceImage,
+ float ccl_restrict_ptr image,
+ float *outImage, float *accumImage,
+ int4 rect, int w,
+ int f) {
+ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_update_output(x, y, dx, dy, differenceImage, image, outImage, accumImage, rect, w, f);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w) {
+ int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_normalize(x, y, outImage, accumImage, rect, w);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
+ float ccl_restrict_ptr differenceImage,
+ float ccl_restrict_ptr buffer,
+ float *color_pass,
+ float *variance_pass,
+ float const* __restrict__ transform,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int4 rect,
+ int4 filter_rect,
+ int w, int h, int f,
+ int pass_stride) {
+ int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x);
+ int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y);
+ if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
+ kernel_filter_nlm_construct_gramian(x, y,
+ dx, dy,
+ differenceImage,
+ buffer,
+ color_pass, variance_pass,
+ transform, rank,
+ XtWX, XtWY,
+ rect, filter_rect,
+ w, h, f,
+ pass_stride,
+ threadIdx.y*blockDim.x + threadIdx.x);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_finalize(int w, int h,
+ float *buffer, int *rank,
+ float *XtWX, float3 *XtWY,
+ int4 filter_area, int4 buffer_params,
+ int sample) {
+ int x = blockDim.x*blockIdx.x + threadIdx.x;
+ int y = blockDim.y*blockIdx.y + threadIdx.y;
+ if(x < filter_area.z && y < filter_area.w) {
+ int storage_ofs = y*filter_area.z+x;
+ rank += storage_ofs;
+ XtWX += storage_ofs;
+ XtWY += storage_ofs;
+ kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
+ }
+}
+
+#endif
+
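Every CUDA kernel above follows the same indexing discipline: the launch grid
is rounded up to whole blocks, each thread derives its pixel from block and
thread indices, and a bounds check discards the threads that fall outside the
processed rectangle. This host-side C++ loop emulates that arithmetic to show
why the `x < rect.z && y < rect.w` guard is required:

#include <cstdio>

int main()
{
  const int block = 16;               /* blockDim.x on the device */
  const int rect_x = 3, rect_z = 37;  /* rect = (x0, y0, x1, y1): 34 px wide */
  const int width = rect_z - rect_x;
  const int blocks = (width + block - 1)/block;  /* round up: 3 blocks */

  int processed = 0;
  for(int b = 0; b < blocks; b++) {    /* plays the role of blockIdx.x */
    for(int t = 0; t < block; t++) {   /* plays the role of threadIdx.x */
      int x = rect_x + b*block + t;    /* same formula as the kernels */
      if(x < rect_z)                   /* the guard: 48 threads, 34 pixels */
        processed++;
    }
  }
  printf("launched %d threads, processed %d pixels\n", blocks*block, processed);
}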
diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl
new file mode 100644
index 00000000000..3d82bff9892
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/filter.cl
@@ -0,0 +1,262 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* OpenCL kernel entry points */
+
+#include "kernel/kernel_compat_opencl.h"
+
+#include "kernel/filter/filter_kernel.h"
+
+/* kernels */
+
+__kernel void kernel_ocl_filter_divide_shadow(int sample,
+ ccl_global TilesInfo *tiles,
+ ccl_global float *unfilteredA,
+ ccl_global float *unfilteredB,
+ ccl_global float *sampleVariance,
+ ccl_global float *sampleVarianceV,
+ ccl_global float *bufferVariance,
+ int4 prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ char use_split_variance)
+{
+ int x = prefilter_rect.x + get_global_id(0);
+ int y = prefilter_rect.y + get_global_id(1);
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_divide_shadow(sample,
+ tiles,
+ x, y,
+ unfilteredA,
+ unfilteredB,
+ sampleVariance,
+ sampleVarianceV,
+ bufferVariance,
+ prefilter_rect,
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+ }
+}
+
+__kernel void kernel_ocl_filter_get_feature(int sample,
+ ccl_global TilesInfo *tiles,
+ int m_offset,
+ int v_offset,
+ ccl_global float *mean,
+ ccl_global float *variance,
+ int4 prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ char use_split_variance)
+{
+ int x = prefilter_rect.x + get_global_id(0);
+ int y = prefilter_rect.y + get_global_id(1);
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_get_feature(sample,
+ tiles,
+ m_offset, v_offset,
+ x, y,
+ mean, variance,
+ prefilter_rect,
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+ }
+}
+
+__kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean,
+ ccl_global float *variance,
+ ccl_global float *a,
+ ccl_global float *b,
+ int4 prefilter_rect,
+ int r)
+{
+ int x = prefilter_rect.x + get_global_id(0);
+ int y = prefilter_rect.y + get_global_id(1);
+ if(x < prefilter_rect.z && y < prefilter_rect.w) {
+ kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
+ }
+}
+
+__kernel void kernel_ocl_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer,
+ ccl_global float *transform,
+ ccl_global int *rank,
+ int4 filter_area,
+ int4 rect,
+ int pass_stride,
+ int radius,
+ float pca_threshold)
+{
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+ if(x < filter_area.z && y < filter_area.w) {
+ ccl_global int *l_rank = rank + y*filter_area.z + x;
+ ccl_global float *l_transform = transform + y*filter_area.z + x;
+ kernel_filter_construct_transform(buffer,
+ x + filter_area.x, y + filter_area.y,
+ rect, pass_stride,
+ l_transform, l_rank,
+ radius, pca_threshold,
+ filter_area.z*filter_area.w,
+ get_local_id(1)*get_local_size(0) + get_local_id(0));
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_calc_difference(int dx,
+ int dy,
+ ccl_global float ccl_restrict_ptr weightImage,
+ ccl_global float ccl_restrict_ptr varianceImage,
+ ccl_global float *differenceImage,
+ int4 rect,
+ int w,
+ int channel_offset,
+ float a,
+ float k_2) {
+ int x = get_global_id(0) + rect.x;
+ int y = get_global_id(1) + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_calc_difference(x, y, dx, dy, weightImage, varianceImage, differenceImage, rect, w, channel_offset, a, k_2);
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_blur(ccl_global float ccl_restrict_ptr differenceImage,
+ ccl_global float *outImage,
+ int4 rect,
+ int w,
+ int f) {
+ int x = get_global_id(0) + rect.x;
+ int y = get_global_id(1) + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_blur(x, y, differenceImage, outImage, rect, w, f);
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_calc_weight(ccl_global float ccl_restrict_ptr differenceImage,
+ ccl_global float *outImage,
+ int4 rect,
+ int w,
+ int f) {
+ int x = get_global_id(0) + rect.x;
+ int y = get_global_id(1) + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_calc_weight(x, y, differenceImage, outImage, rect, w, f);
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_update_output(int dx,
+ int dy,
+ ccl_global float ccl_restrict_ptr differenceImage,
+ ccl_global float ccl_restrict_ptr image,
+ ccl_global float *outImage,
+ ccl_global float *accumImage,
+ int4 rect,
+ int w,
+ int f) {
+ int x = get_global_id(0) + rect.x;
+ int y = get_global_id(1) + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_update_output(x, y, dx, dy, differenceImage, image, outImage, accumImage, rect, w, f);
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *outImage,
+ ccl_global float ccl_restrict_ptr accumImage,
+ int4 rect,
+ int w) {
+ int x = get_global_id(0) + rect.x;
+ int y = get_global_id(1) + rect.y;
+ if(x < rect.z && y < rect.w) {
+ kernel_filter_nlm_normalize(x, y, outImage, accumImage, rect, w);
+ }
+}
+
+__kernel void kernel_ocl_filter_nlm_construct_gramian(int dx,
+ int dy,
+ ccl_global float ccl_restrict_ptr differenceImage,
+ ccl_global float ccl_restrict_ptr buffer,
+ ccl_global float *color_pass,
+ ccl_global float *variance_pass,
+ ccl_global float ccl_restrict_ptr transform,
+ ccl_global int *rank,
+ ccl_global float *XtWX,
+ ccl_global float3 *XtWY,
+ int4 rect,
+ int4 filter_rect,
+ int w,
+ int h,
+ int f,
+ int pass_stride) {
+ int x = get_global_id(0) + max(0, rect.x-filter_rect.x);
+ int y = get_global_id(1) + max(0, rect.y-filter_rect.y);
+ if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
+ kernel_filter_nlm_construct_gramian(x, y,
+ dx, dy,
+ differenceImage,
+ buffer,
+ color_pass, variance_pass,
+ transform, rank,
+ XtWX, XtWY,
+ rect, filter_rect,
+ w, h, f,
+ pass_stride,
+ get_local_id(1)*get_local_size(0) + get_local_id(0));
+ }
+}
+
+__kernel void kernel_ocl_filter_finalize(int w,
+ int h,
+ ccl_global float *buffer,
+ ccl_global int *rank,
+ ccl_global float *XtWX,
+ ccl_global float3 *XtWY,
+ int4 filter_area,
+ int4 buffer_params,
+ int sample) {
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+ if(x < filter_area.z && y < filter_area.w) {
+ int storage_ofs = y*filter_area.z+x;
+ rank += storage_ofs;
+ XtWX += storage_ofs;
+ XtWY += storage_ofs;
+ kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
+ }
+}
+
+__kernel void kernel_ocl_filter_set_tiles(ccl_global TilesInfo* tiles,
+ ccl_global float *buffer_1,
+ ccl_global float *buffer_2,
+ ccl_global float *buffer_3,
+ ccl_global float *buffer_4,
+ ccl_global float *buffer_5,
+ ccl_global float *buffer_6,
+ ccl_global float *buffer_7,
+ ccl_global float *buffer_8,
+ ccl_global float *buffer_9)
+{
+ if((get_global_id(0) == 0) && (get_global_id(1) == 0)) {
+ tiles->buffers[0] = buffer_1;
+ tiles->buffers[1] = buffer_2;
+ tiles->buffers[2] = buffer_3;
+ tiles->buffers[3] = buffer_4;
+ tiles->buffers[4] = buffer_5;
+ tiles->buffers[5] = buffer_6;
+ tiles->buffers[6] = buffer_7;
+ tiles->buffers[7] = buffer_8;
+ tiles->buffers[8] = buffer_9;
+ }
+}
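
kernel_ocl_filter_set_tiles exists because OpenCL, unlike CUDA, offers no way
to pass an array of device pointers as a single kernel argument: the nine
buffers (presumably the 3x3 neighborhood of the tile being denoised) are bound
as nine separate arguments, and a single work-item stores them into the
TilesInfo struct once. A host-side C++ sketch of that pointer-gathering step,
with a simplified stand-in for TilesInfo:

#include <cstdio>

struct TilesDemo {
  float *buffers[9];  /* stand-in for TilesInfo::buffers */
};

/* Stand-in for the device kernel: one work-item copies the nine
 * individually-bound pointers into the shared struct. */
static void set_tiles(TilesDemo *tiles, float *bufs[9])
{
  for(int i = 0; i < 9; i++)
    tiles->buffers[i] = bufs[i];
}

int main()
{
  float storage[9][4] = {{0.0f}};
  float *bufs[9];
  for(int i = 0; i < 9; i++)
    bufs[i] = storage[i];

  TilesDemo tiles;
  set_tiles(&tiles, bufs);
  tiles.buffers[4][0] = 1.0f;  /* center tile in the assumed 3x3 layout */
  printf("%f\n", storage[4][0]);
}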