Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'intern/cycles/kernel/kernels/cpu')
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter.cpp61
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_avx.cpp39
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_avx2.cpp40
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_cpu.h132
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h259
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_sse2.cpp34
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_sse3.cpp36
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_sse41.cpp37
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_avx.cpp30
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp32
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_cpu.h2
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h79
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp29
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp32
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp20
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp24
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp26
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp20
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp24
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp26
20 files changed, 818 insertions, 164 deletions
diff --git a/intern/cycles/kernel/kernels/cpu/filter.cpp b/intern/cycles/kernel/kernels/cpu/filter.cpp
new file mode 100644
index 00000000000..2ff1a392dc3
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CPU kernel entry points */
+
+/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
+ * one with SSE2 intrinsics.
+ */
+#if defined(__x86_64__) || defined(_M_X64)
+# define __KERNEL_SSE2__
+#endif
+
+/* When building kernel for native machine detect kernel features from the flags
+ * set by compiler.
+ */
+#ifdef WITH_KERNEL_NATIVE
+# ifdef __SSE2__
+# ifndef __KERNEL_SSE2__
+# define __KERNEL_SSE2__
+# endif
+# endif
+# ifdef __SSE3__
+# define __KERNEL_SSE3__
+# endif
+# ifdef __SSSE3__
+# define __KERNEL_SSSE3__
+# endif
+# ifdef __SSE4_1__
+# define __KERNEL_SSE41__
+# endif
+# ifdef __AVX__
+# define __KERNEL_SSE__
+# define __KERNEL_AVX__
+# endif
+# ifdef __AVX2__
+# define __KERNEL_SSE__
+# define __KERNEL_AVX2__
+# endif
+#endif
+
+/* quiet unused define warnings */
+#if defined(__KERNEL_SSE2__)
+ /* do nothing */
+#endif
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
new file mode 100644
index 00000000000..4a9e6047ecf
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
new file mode 100644
index 00000000000..c22ec576254
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# define __KERNEL_AVX2__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
new file mode 100644
index 00000000000..10007ee2635
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common declaration part of all CPU kernels. */
+
+void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
+ TilesInfo *tiles,
+ int x,
+ int y,
+ float *unfilteredA,
+ float *unfilteredB,
+ float *sampleV,
+ float *sampleVV,
+ float *bufferV,
+ int* prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
+ TilesInfo *tiles,
+ int m_offset,
+ int v_offset,
+ int x,
+ int y,
+ float *mean,
+ float *variance,
+ int* prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
+ float *mean,
+ float *variance,
+ float *a,
+ float *b,
+ int* prefilter_rect,
+ int r);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
+ int x,
+ int y,
+ int storage_ofs,
+ float *transform,
+ int *rank,
+ int* rect,
+ int pass_stride,
+ int radius,
+ float pca_threshold);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
+ int dy,
+ float *weightImage,
+ float *variance,
+ float *differenceImage,
+ int* rect,
+ int w,
+ int channel_offset,
+ float a,
+ float k_2);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage,
+ float *outImage,
+ int* rect,
+ int w,
+ int f);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage,
+ float *outImage,
+ int* rect,
+ int w,
+ int f);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
+ int dy,
+ float *differenceImage,
+ float *image,
+ float *outImage,
+ float *accumImage,
+ int* rect,
+ int w,
+ int f);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
+ int dy,
+ float *differenceImage,
+ float *buffer,
+ float *color_pass,
+ float *variance_pass,
+ float *transform,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int *rect,
+ int *filter_rect,
+ int w,
+ int h,
+ int f,
+ int pass_stride);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *outImage,
+ float *accumImage,
+ int* rect,
+ int w);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
+ int y,
+ int storage_ofs,
+ int w,
+ int h,
+ float *buffer,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int *buffer_params,
+ int sample);
+
+#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
new file mode 100644
index 00000000000..3b71e50ca3b
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
@@ -0,0 +1,259 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common implementation part of all CPU kernels.
+ *
+ * The idea is that particular .cpp files sets needed optimization flags and
+ * simply includes this file without worry of copying actual implementation over.
+ */
+
+#include "kernel/kernel_compat_cpu.h"
+
+#include "kernel/filter/filter_kernel.h"
+
+#ifdef KERNEL_STUB
+# include "util/util_debug.h"
+# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+
+/* Denoise filter */
+
+void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
+ TilesInfo *tiles,
+ int x,
+ int y,
+ float *unfilteredA,
+ float *unfilteredB,
+ float *sampleVariance,
+ float *sampleVarianceV,
+ float *bufferVariance,
+ int* prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow);
+#else
+ kernel_filter_divide_shadow(sample, tiles,
+ x, y,
+ unfilteredA,
+ unfilteredB,
+ sampleVariance,
+ sampleVarianceV,
+ bufferVariance,
+ load_int4(prefilter_rect),
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
+ TilesInfo *tiles,
+ int m_offset,
+ int v_offset,
+ int x,
+ int y,
+ float *mean, float *variance,
+ int* prefilter_rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_get_feature);
+#else
+ kernel_filter_get_feature(sample, tiles,
+ m_offset, v_offset,
+ x, y,
+ mean, variance,
+ load_int4(prefilter_rect),
+ buffer_pass_stride,
+ buffer_denoising_offset,
+ use_split_variance);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
+ float *mean,
+ float *variance,
+ float *a,
+ float *b,
+ int* prefilter_rect,
+ int r)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_combine_halves);
+#else
+ kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
+ int x,
+ int y,
+ int storage_ofs,
+ float *transform,
+ int *rank,
+ int* prefilter_rect,
+ int pass_stride,
+ int radius,
+ float pca_threshold)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_construct_transform);
+#else
+ rank += storage_ofs;
+ transform += storage_ofs*TRANSFORM_SIZE;
+ kernel_filter_construct_transform(buffer,
+ x, y,
+ load_int4(prefilter_rect),
+ pass_stride,
+ transform,
+ rank,
+ radius,
+ pca_threshold);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
+ int dy,
+ float *weightImage,
+ float *variance,
+ float *differenceImage,
+ int *rect,
+ int w,
+ int channel_offset,
+ float a,
+ float k_2)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference);
+#else
+ kernel_filter_nlm_calc_difference(dx, dy, weightImage, variance, differenceImage, load_int4(rect), w, channel_offset, a, k_2);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage,
+ float *outImage,
+ int *rect,
+ int w,
+ int f)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur);
+#else
+ kernel_filter_nlm_blur(differenceImage, outImage, load_int4(rect), w, f);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage,
+ float *outImage,
+ int *rect,
+ int w,
+ int f)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight);
+#else
+ kernel_filter_nlm_calc_weight(differenceImage, outImage, load_int4(rect), w, f);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
+ int dy,
+ float *differenceImage,
+ float *image,
+ float *outImage,
+ float *accumImage,
+ int *rect,
+ int w,
+ int f)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
+#else
+ kernel_filter_nlm_update_output(dx, dy, differenceImage, image, outImage, accumImage, load_int4(rect), w, f);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
+ int dy,
+ float *differenceImage,
+ float *buffer,
+ float *color_pass,
+ float *variance_pass,
+ float *transform,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int *rect,
+ int *filter_rect,
+ int w,
+ int h,
+ int f,
+ int pass_stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
+#else
+ kernel_filter_nlm_construct_gramian(dx, dy, differenceImage, buffer, color_pass, variance_pass, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *outImage,
+ float *accumImage,
+ int *rect,
+ int w)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize);
+#else
+ kernel_filter_nlm_normalize(outImage, accumImage, load_int4(rect), w);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
+ int y,
+ int storage_ofs,
+ int w,
+ int h,
+ float *buffer,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int *buffer_params,
+ int sample)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, filter_finalize);
+#else
+ XtWX += storage_ofs*XTWX_SIZE;
+ XtWY += storage_ofs*XTWY_SIZE;
+ rank += storage_ofs;
+ kernel_filter_finalize(x, y, w, h, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample);
+#endif
+}
+
+#undef KERNEL_STUB
+#undef STUB_ASSERT
+#undef KERNEL_ARCH
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
new file mode 100644
index 00000000000..f7c9935f1d0
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
new file mode 100644
index 00000000000..070b95a3505
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
new file mode 100644
index 00000000000..1a7b2040da1
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
index 2600d977972..a645fb4d8dd 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
@@ -17,21 +17,23 @@
/* Optimized CPU kernel entry points. This file is compiled with AVX
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-#endif
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_avx
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
index dba15d037ac..6bbb87727b9 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
@@ -18,21 +18,23 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-# define __KERNEL_AVX2__
-#endif
-
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_avx2
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# define __KERNEL_AVX2__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
index 39c9a9cf33c..9895080d328 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
@@ -89,6 +89,4 @@ DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update)
-void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func));
-
#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index 8c05dd1d9ef..b9d82781840 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -57,6 +57,11 @@
# include "kernel/split/kernel_buffer_update.h"
#endif
+#ifdef KERNEL_STUB
+# include "util/util_debug.h"
+# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
+#endif
+
CCL_NAMESPACE_BEGIN
#ifndef __SPLIT_KERNEL__
@@ -71,7 +76,10 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
int offset,
int stride)
{
-#ifdef __BRANCHED_PATH__
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, path_trace);
+#else
+# ifdef __BRANCHED_PATH__
if(kernel_data.integrator.branched) {
kernel_branched_path_trace(kg,
buffer,
@@ -82,10 +90,11 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
stride);
}
else
-#endif
+# endif
{
kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
}
+#endif /* KERNEL_STUB */
}
/* Film */
@@ -98,6 +107,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
int offset,
int stride)
{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, convert_to_byte);
+#else
kernel_film_convert_to_byte(kg,
rgba,
buffer,
@@ -105,6 +117,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
x, y,
offset,
stride);
+#endif /* KERNEL_STUB */
}
void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
@@ -115,6 +128,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
int offset,
int stride)
{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, convert_to_half_float);
+#else
kernel_film_convert_to_half_float(kg,
rgba,
buffer,
@@ -122,6 +138,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
x, y,
offset,
stride);
+#endif /* KERNEL_STUB */
}
/* Shader Evaluate */
@@ -136,9 +153,12 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
int offset,
int sample)
{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, shader);
+#else
if(type >= SHADER_EVAL_BAKE) {
kernel_assert(output_luma == NULL);
-#ifdef __BAKING__
+# ifdef __BAKING__
kernel_bake_evaluate(kg,
input,
output,
@@ -147,7 +167,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
i,
offset,
sample);
-#endif
+# endif
}
else {
kernel_shader_evaluate(kg,
@@ -158,17 +178,26 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
i,
sample);
}
+#endif /* KERNEL_STUB */
}
#else /* __SPLIT_KERNEL__ */
/* Split Kernel Path Tracing */
-#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
+#ifdef KERNEL_STUB
+# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
+ void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
+ { \
+ STUB_ASSERT(KERNEL_ARCH, name); \
+ }
+#else
+# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
{ \
kernel_##name(kg); \
}
+#endif /* KERNEL_STUB */
#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
@@ -194,42 +223,10 @@ DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
-
-void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func))
-{
-#define REGISTER_NAME_STRING(name) #name
-#define REGISTER_EVAL_NAME(name) REGISTER_NAME_STRING(name)
-#define REGISTER(name) reg(REGISTER_EVAL_NAME(KERNEL_FUNCTION_FULL_NAME(name)), (void*)KERNEL_FUNCTION_FULL_NAME(name));
-
- REGISTER(path_trace);
- REGISTER(convert_to_byte);
- REGISTER(convert_to_half_float);
- REGISTER(shader);
-
- REGISTER(data_init);
- REGISTER(path_init);
- REGISTER(scene_intersect);
- REGISTER(lamp_emission);
- REGISTER(do_volume);
- REGISTER(queue_enqueue);
- REGISTER(indirect_background);
- REGISTER(shader_setup);
- REGISTER(shader_sort);
- REGISTER(shader_eval);
- REGISTER(holdout_emission_blurring_pathtermination_ao);
- REGISTER(subsurface_scatter);
- REGISTER(direct_lighting);
- REGISTER(shadow_blocked_ao);
- REGISTER(shadow_blocked_dl);
- REGISTER(next_iteration_setup);
- REGISTER(indirect_subsurface);
- REGISTER(buffer_update);
-
-#undef REGISTER
-#undef REGISTER_EVAL_NAME
-#undef REGISTER_NAME_STRING
-}
-
#endif /* __SPLIT_KERNEL__ */
+#undef KERNEL_STUB
+#undef STUB_ASSERT
+#undef KERNEL_ARCH
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
index 27a746a0799..6ba3425a343 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
@@ -17,22 +17,25 @@
/* Optimized CPU kernel entry points. This file is compiled with AVX
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-#endif
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_avx
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
index 364d279a189..76b2d77ebb8 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
@@ -18,23 +18,25 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-# define __KERNEL_AVX2__
-#endif
-
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_avx2
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# define __KERNEL_AVX2__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
index 0afb481296f..b468b6f44c8 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
@@ -18,17 +18,19 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-#endif
-
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse2
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
index 13d00813591..3e5792d0b17 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
@@ -18,19 +18,21 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-#endif
-
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse3
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
index a4312071edc..3629f21cd29 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
@@ -18,20 +18,22 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-#endif
-
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse41
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
index 1acfaa91ac9..57530c88710 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
@@ -18,15 +18,17 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-#endif
-
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse2
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
index f7b6a2e21fe..c607753bc4b 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
@@ -18,17 +18,19 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-#endif
-
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse3
-# include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
index 1900c6e3012..a278554731c 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
@@ -18,18 +18,20 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-#endif
-
#include "util/util_optimization.h"
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-# include "kernel/kernel.h"
-# define KERNEL_ARCH cpu_sse41
-# include "kernel/kernels/cpu//kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"