git.blender.org/blender.git
Diffstat (limited to 'intern/cycles/kernel/device/cpu')
-rw-r--r--  intern/cycles/kernel/device/cpu/compat.h            101
-rw-r--r--  intern/cycles/kernel/device/cpu/globals.h            61
-rw-r--r--  intern/cycles/kernel/device/cpu/image.h             657
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel.cpp           94
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel.h             62
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_arch.h       113
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_arch_impl.h  235
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_avx.cpp       39
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_avx2.cpp      40
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_sse2.cpp      34
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_sse3.cpp      36
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_sse41.cpp     37
12 files changed, 1509 insertions, 0 deletions
diff --git a/intern/cycles/kernel/device/cpu/compat.h b/intern/cycles/kernel/device/cpu/compat.h
new file mode 100644
index 00000000000..bfd936c7bbd
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/compat.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#define __KERNEL_CPU__
+
+/* The release kernel has too many false-positive maybe-uninitialized warnings,
+ * which makes it easy to miss actual warnings.
+ */
+#if (defined(__GNUC__) && !defined(__clang__)) && defined(NDEBUG)
+# pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+# pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+
+#include "util/util_half.h"
+#include "util/util_math.h"
+#include "util/util_simd.h"
+#include "util/util_texture.h"
+#include "util/util_types.h"
+
+#define ccl_addr_space
+
+/* On x86_64, glibc versions before 2.16 have an expf that is much slower than
+ * the double-precision exp, so route expf through exp there.
+ */
+#if !defined(__KERNEL_GPU__) && defined(__x86_64__) && \
+ defined(__GNU_LIBRARY__) && defined(__GLIBC__) && defined(__GLIBC_MINOR__) && \
+ (__GLIBC__ <= 2 && __GLIBC_MINOR__ < 16)
+# define expf(x) ((float)exp((double)(x)))
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/* Assertions inside the kernel only work for the CPU device, so we wrap them
+ * in a macro which is empty for other devices. */
+
+#define kernel_assert(cond) assert(cond)
+
+/* Texture types to be compatible with CUDA textures. These are really just
+ * simple arrays, and after inlining, a fetch hopefully reduces to a simple
+ * pointer lookup. */
+template<typename T> struct texture {
+ ccl_always_inline const T &fetch(int index) const
+ {
+ kernel_assert(index >= 0 && index < width);
+ return data[index];
+ }
+
+ T *data;
+ int width;
+};
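+
+/* A minimal usage sketch (illustrative values, not part of this kernel):
+ *
+ *   texture<float> tex;
+ *   tex.data = values;        // hypothetical array of 256 floats
+ *   tex.width = 256;
+ *   float f = tex.fetch(17);  // asserts 0 <= 17 < 256, then reads data[17]
+ */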
+
+/* SIMD vector types and debug printing helpers. */
+
+#ifdef __KERNEL_SSE2__
+typedef vector3<sseb> sse3b;
+typedef vector3<ssef> sse3f;
+typedef vector3<ssei> sse3i;
+
+ccl_device_inline void print_sse3b(const char *label, sse3b &a)
+{
+ print_sseb(label, a.x);
+ print_sseb(label, a.y);
+ print_sseb(label, a.z);
+}
+
+ccl_device_inline void print_sse3f(const char *label, sse3f &a)
+{
+ print_ssef(label, a.x);
+ print_ssef(label, a.y);
+ print_ssef(label, a.z);
+}
+
+ccl_device_inline void print_sse3i(const char *label, sse3i &a)
+{
+ print_ssei(label, a.x);
+ print_ssei(label, a.y);
+ print_ssei(label, a.z);
+}
+
+# if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
+typedef vector3<avxf> avx3f;
+# endif
+
+#endif
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/cpu/globals.h b/intern/cycles/kernel/device/cpu/globals.h
new file mode 100644
index 00000000000..98b036e269d
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/globals.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#pragma once
+
+#include "kernel/kernel_profiling.h"
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* On the CPU, we pass the KernelGlobals struct to nearly every function in the
+ * kernel to access constant data. The data is all stored as "textures", but
+ * these are really just standard arrays. We can't actually use globals because
+ * multiple renders may be running inside the same process. */
+
+#ifdef __OSL__
+struct OSLGlobals;
+struct OSLThreadData;
+struct OSLShadingSystem;
+#endif
+
+typedef struct KernelGlobals {
+#define KERNEL_TEX(type, name) texture<type> name;
+#include "kernel/kernel_textures.h"
+
+ KernelData __data;
+
+#ifdef __OSL__
+ /* On the CPU, we also have the OSL globals here. Most data structures are
+ * shared with SVM; the difference is in the shaders and object/mesh
+ * attributes. */
+ OSLGlobals *osl;
+ OSLShadingSystem *osl_ss;
+ OSLThreadData *osl_tdata;
+#endif
+
+ /* **** Run-time data **** */
+
+ ProfilingState profiler;
+} KernelGlobals;
+
+/* Abstraction macros */
+#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
+#define kernel_tex_array(tex) (kg->tex.data)
+#define kernel_data (kg->__data)
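+
+/* Illustration of the expansion above (hypothetical entry name): an entry
+ *   KERNEL_TEX(float4, __example_tex)
+ * in kernel/kernel_textures.h becomes the member
+ *   texture<float4> __example_tex;
+ * inside KernelGlobals, and kernel_tex_fetch(__example_tex, i) expands to
+ * (kg->__example_tex.fetch(i)), the bounds-checked array read from compat.h. */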
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/cpu/image.h b/intern/cycles/kernel/device/cpu/image.h
new file mode 100644
index 00000000000..57e81ab186d
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/image.h
@@ -0,0 +1,657 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_NANOVDB
+# define NANOVDB_USE_INTRINSICS
+# include <nanovdb/NanoVDB.h>
+# include <nanovdb/util/SampleFromVoxels.h>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/* Make template functions private so symbols don't conflict between kernels with different
+ * instruction sets. */
+namespace {
+
+#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
+ { \
+ u[0] = (((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f); \
+ u[1] = ((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f); \
+ u[2] = ((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f); \
+ u[3] = (1.0f / 6.0f) * t * t * t; \
+ } \
+ (void)0
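+
+/* In closed form these are the uniform cubic B-spline basis functions:
+ *   u[0] = (1 - t)^3 / 6
+ *   u[1] = (3t^3 - 6t^2 + 4) / 6
+ *   u[2] = (-3t^3 + 3t^2 + 3t + 1) / 6
+ *   u[3] = t^3 / 6
+ * They are non-negative on [0, 1] and sum to 1, so the interpolated result is
+ * a convex combination of the sampled texels. */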
+
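+/* Split x into an integer part *ix (float_to_int truncation, shifted down by
+ * one for negative x) and return the fractional remainder x - *ix. */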
+ccl_device_inline float frac(float x, int *ix)
+{
+ int i = float_to_int(x) - ((x < 0.0f) ? 1 : 0);
+ *ix = i;
+ return x - (float)i;
+}
+
+template<typename T> struct TextureInterpolator {
+
+ static ccl_always_inline float4 read(float4 r)
+ {
+ return r;
+ }
+
+ static ccl_always_inline float4 read(uchar4 r)
+ {
+ float f = 1.0f / 255.0f;
+ return make_float4(r.x * f, r.y * f, r.z * f, r.w * f);
+ }
+
+ static ccl_always_inline float4 read(uchar r)
+ {
+ float f = r * (1.0f / 255.0f);
+ return make_float4(f, f, f, 1.0f);
+ }
+
+ static ccl_always_inline float4 read(float r)
+ {
+ /* TODO(dingto): Optimize this, so interpolation
+ * happens on float instead of float4 */
+ return make_float4(r, r, r, 1.0f);
+ }
+
+ static ccl_always_inline float4 read(half4 r)
+ {
+ return half4_to_float4(r);
+ }
+
+ static ccl_always_inline float4 read(half r)
+ {
+ float f = half_to_float(r);
+ return make_float4(f, f, f, 1.0f);
+ }
+
+ static ccl_always_inline float4 read(uint16_t r)
+ {
+ float f = r * (1.0f / 65535.0f);
+ return make_float4(f, f, f, 1.0f);
+ }
+
+ static ccl_always_inline float4 read(ushort4 r)
+ {
+ float f = 1.0f / 65535.0f;
+ return make_float4(r.x * f, r.y * f, r.z * f, r.w * f);
+ }
+
+ static ccl_always_inline float4 read(const T *data, int x, int y, int width, int height)
+ {
+ if (x < 0 || y < 0 || x >= width || y >= height) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ return read(data[y * width + x]);
+ }
+
+ static ccl_always_inline int wrap_periodic(int x, int width)
+ {
+ x %= width;
+ if (x < 0)
+ x += width;
+ return x;
+ }
+
+ static ccl_always_inline int wrap_clamp(int x, int width)
+ {
+ return clamp(x, 0, width - 1);
+ }
+
+ /* ******** 2D interpolation ******** */
+
+ static ccl_always_inline float4 interp_closest(const TextureInfo &info, float x, float y)
+ {
+ const T *data = (const T *)info.data;
+ const int width = info.width;
+ const int height = info.height;
+ int ix, iy;
+ frac(x * (float)width, &ix);
+ frac(y * (float)height, &iy);
+ switch (info.extension) {
+ case EXTENSION_REPEAT:
+ ix = wrap_periodic(ix, width);
+ iy = wrap_periodic(iy, height);
+ break;
+ case EXTENSION_CLIP:
+ if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ ATTR_FALLTHROUGH;
+ case EXTENSION_EXTEND:
+ ix = wrap_clamp(ix, width);
+ iy = wrap_clamp(iy, height);
+ break;
+ default:
+ kernel_assert(0);
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ return read(data[ix + iy * width]);
+ }
+
+ static ccl_always_inline float4 interp_linear(const TextureInfo &info, float x, float y)
+ {
+ const T *data = (const T *)info.data;
+ const int width = info.width;
+ const int height = info.height;
+ int ix, iy, nix, niy;
+ const float tx = frac(x * (float)width - 0.5f, &ix);
+ const float ty = frac(y * (float)height - 0.5f, &iy);
+ switch (info.extension) {
+ case EXTENSION_REPEAT:
+ ix = wrap_periodic(ix, width);
+ iy = wrap_periodic(iy, height);
+ nix = wrap_periodic(ix + 1, width);
+ niy = wrap_periodic(iy + 1, height);
+ break;
+ case EXTENSION_CLIP:
+ nix = ix + 1;
+ niy = iy + 1;
+ break;
+ case EXTENSION_EXTEND:
+ nix = wrap_clamp(ix + 1, width);
+ niy = wrap_clamp(iy + 1, height);
+ ix = wrap_clamp(ix, width);
+ iy = wrap_clamp(iy, height);
+ break;
+ default:
+ kernel_assert(0);
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
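+ /* Bilinear blend of the four neighboring texels, weighted by tx and ty. */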
+ return (1.0f - ty) * (1.0f - tx) * read(data, ix, iy, width, height) +
+ (1.0f - ty) * tx * read(data, nix, iy, width, height) +
+ ty * (1.0f - tx) * read(data, ix, niy, width, height) +
+ ty * tx * read(data, nix, niy, width, height);
+ }
+
+ static ccl_always_inline float4 interp_cubic(const TextureInfo &info, float x, float y)
+ {
+ const T *data = (const T *)info.data;
+ const int width = info.width;
+ const int height = info.height;
+ int ix, iy, nix, niy;
+ const float tx = frac(x * (float)width - 0.5f, &ix);
+ const float ty = frac(y * (float)height - 0.5f, &iy);
+ int pix, piy, nnix, nniy;
+ switch (info.extension) {
+ case EXTENSION_REPEAT:
+ ix = wrap_periodic(ix, width);
+ iy = wrap_periodic(iy, height);
+ pix = wrap_periodic(ix - 1, width);
+ piy = wrap_periodic(iy - 1, height);
+ nix = wrap_periodic(ix + 1, width);
+ niy = wrap_periodic(iy + 1, height);
+ nnix = wrap_periodic(ix + 2, width);
+ nniy = wrap_periodic(iy + 2, height);
+ break;
+ case EXTENSION_CLIP:
+ pix = ix - 1;
+ piy = iy - 1;
+ nix = ix + 1;
+ niy = iy + 1;
+ nnix = ix + 2;
+ nniy = iy + 2;
+ break;
+ case EXTENSION_EXTEND:
+ pix = wrap_clamp(ix - 1, width);
+ piy = wrap_clamp(iy - 1, height);
+ nix = wrap_clamp(ix + 1, width);
+ niy = wrap_clamp(iy + 1, height);
+ nnix = wrap_clamp(ix + 2, width);
+ nniy = wrap_clamp(iy + 2, height);
+ ix = wrap_clamp(ix, width);
+ iy = wrap_clamp(iy, height);
+ break;
+ default:
+ kernel_assert(0);
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ const int xc[4] = {pix, ix, nix, nnix};
+ const int yc[4] = {piy, iy, niy, nniy};
+ float u[4], v[4];
+ /* Helper macros to keep the code a reasonable size; let the compiler inline
+ * all the matrix multiplications.
+ */
+#define DATA(x, y) (read(data, xc[x], yc[y], width, height))
+#define TERM(col) \
+ (v[col] * \
+ (u[0] * DATA(0, col) + u[1] * DATA(1, col) + u[2] * DATA(2, col) + u[3] * DATA(3, col)))
+
+ SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+ SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+
+ /* Actual interpolation. */
+ return TERM(0) + TERM(1) + TERM(2) + TERM(3);
+#undef TERM
+#undef DATA
+ }
+
+ static ccl_always_inline float4 interp(const TextureInfo &info, float x, float y)
+ {
+ if (UNLIKELY(!info.data)) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ switch (info.interpolation) {
+ case INTERPOLATION_CLOSEST:
+ return interp_closest(info, x, y);
+ case INTERPOLATION_LINEAR:
+ return interp_linear(info, x, y);
+ default:
+ return interp_cubic(info, x, y);
+ }
+ }
+
+ /* ******** 3D interpolation ******** */
+
+ static ccl_always_inline float4 interp_3d_closest(const TextureInfo &info,
+ float x,
+ float y,
+ float z)
+ {
+ int width = info.width;
+ int height = info.height;
+ int depth = info.depth;
+ int ix, iy, iz;
+
+ frac(x * (float)width, &ix);
+ frac(y * (float)height, &iy);
+ frac(z * (float)depth, &iz);
+
+ switch (info.extension) {
+ case EXTENSION_REPEAT:
+ ix = wrap_periodic(ix, width);
+ iy = wrap_periodic(iy, height);
+ iz = wrap_periodic(iz, depth);
+ break;
+ case EXTENSION_CLIP:
+ if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ ATTR_FALLTHROUGH;
+ case EXTENSION_EXTEND:
+ ix = wrap_clamp(ix, width);
+ iy = wrap_clamp(iy, height);
+ iz = wrap_clamp(iz, depth);
+ break;
+ default:
+ kernel_assert(0);
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+
+ const T *data = (const T *)info.data;
+ return read(data[ix + iy * width + iz * width * height]);
+ }
+
+ static ccl_always_inline float4 interp_3d_linear(const TextureInfo &info,
+ float x,
+ float y,
+ float z)
+ {
+ int width = info.width;
+ int height = info.height;
+ int depth = info.depth;
+ int ix, iy, iz;
+ int nix, niy, niz;
+
+ float tx = frac(x * (float)width - 0.5f, &ix);
+ float ty = frac(y * (float)height - 0.5f, &iy);
+ float tz = frac(z * (float)depth - 0.5f, &iz);
+
+ switch (info.extension) {
+ case EXTENSION_REPEAT:
+ ix = wrap_periodic(ix, width);
+ iy = wrap_periodic(iy, height);
+ iz = wrap_periodic(iz, depth);
+
+ nix = wrap_periodic(ix + 1, width);
+ niy = wrap_periodic(iy + 1, height);
+ niz = wrap_periodic(iz + 1, depth);
+ break;
+ case EXTENSION_CLIP:
+ if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ ATTR_FALLTHROUGH;
+ case EXTENSION_EXTEND:
+ nix = wrap_clamp(ix + 1, width);
+ niy = wrap_clamp(iy + 1, height);
+ niz = wrap_clamp(iz + 1, depth);
+
+ ix = wrap_clamp(ix, width);
+ iy = wrap_clamp(iy, height);
+ iz = wrap_clamp(iz, depth);
+ break;
+ default:
+ kernel_assert(0);
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+
+ const T *data = (const T *)info.data;
+ float4 r;
+
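+ /* Trilinear blend of the eight neighboring voxels, weighted by tx, ty, tz. */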
+ r = (1.0f - tz) * (1.0f - ty) * (1.0f - tx) *
+ read(data[ix + iy * width + iz * width * height]);
+ r += (1.0f - tz) * (1.0f - ty) * tx * read(data[nix + iy * width + iz * width * height]);
+ r += (1.0f - tz) * ty * (1.0f - tx) * read(data[ix + niy * width + iz * width * height]);
+ r += (1.0f - tz) * ty * tx * read(data[nix + niy * width + iz * width * height]);
+
+ r += tz * (1.0f - ty) * (1.0f - tx) * read(data[ix + iy * width + niz * width * height]);
+ r += tz * (1.0f - ty) * tx * read(data[nix + iy * width + niz * width * height]);
+ r += tz * ty * (1.0f - tx) * read(data[ix + niy * width + niz * width * height]);
+ r += tz * ty * tx * read(data[nix + niy * width + niz * width * height]);
+
+ return r;
+ }
+
+ /* TODO(sergey): For some unspeakable reason both GCC 6 and Clang 3.9 cause a
+ * stack overflow in this function unless it is inlined.
+ *
+ * This only happens for the AVX2 kernel with global __KERNEL_SSE__
+ * vectorization enabled.
+ */
+#if defined(__GNUC__) || defined(__clang__)
+ static ccl_always_inline
+#else
+ static ccl_never_inline
+#endif
+ float4
+ interp_3d_cubic(const TextureInfo &info, float x, float y, float z)
+ {
+ int width = info.width;
+ int height = info.height;
+ int depth = info.depth;
+ int ix, iy, iz;
+ int nix, niy, niz;
+ /* Tricubic b-spline interpolation. */
+ const float tx = frac(x * (float)width - 0.5f, &ix);
+ const float ty = frac(y * (float)height - 0.5f, &iy);
+ const float tz = frac(z * (float)depth - 0.5f, &iz);
+ int pix, piy, piz, nnix, nniy, nniz;
+
+ switch (info.extension) {
+ case EXTENSION_REPEAT:
+ ix = wrap_periodic(ix, width);
+ iy = wrap_periodic(iy, height);
+ iz = wrap_periodic(iz, depth);
+
+ pix = wrap_periodic(ix - 1, width);
+ piy = wrap_periodic(iy - 1, height);
+ piz = wrap_periodic(iz - 1, depth);
+
+ nix = wrap_periodic(ix + 1, width);
+ niy = wrap_periodic(iy + 1, height);
+ niz = wrap_periodic(iz + 1, depth);
+
+ nnix = wrap_periodic(ix + 2, width);
+ nniy = wrap_periodic(iy + 2, height);
+ nniz = wrap_periodic(iz + 2, depth);
+ break;
+ case EXTENSION_CLIP:
+ if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ ATTR_FALLTHROUGH;
+ case EXTENSION_EXTEND:
+ pix = wrap_clamp(ix - 1, width);
+ piy = wrap_clamp(iy - 1, height);
+ piz = wrap_clamp(iz - 1, depth);
+
+ nix = wrap_clamp(ix + 1, width);
+ niy = wrap_clamp(iy + 1, height);
+ niz = wrap_clamp(iz + 1, depth);
+
+ nnix = wrap_clamp(ix + 2, width);
+ nniy = wrap_clamp(iy + 2, height);
+ nniz = wrap_clamp(iz + 2, depth);
+
+ ix = wrap_clamp(ix, width);
+ iy = wrap_clamp(iy, height);
+ iz = wrap_clamp(iz, depth);
+ break;
+ default:
+ kernel_assert(0);
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+
+ const int xc[4] = {pix, ix, nix, nnix};
+ const int yc[4] = {width * piy, width * iy, width * niy, width * nniy};
+ const int zc[4] = {
+ width * height * piz, width * height * iz, width * height * niz, width * height * nniz};
+ float u[4], v[4], w[4];
+
+ /* Helper macros to keep the code a reasonable size; let the compiler inline
+ * all the matrix multiplications.
+ */
+#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]]))
+#define COL_TERM(col, row) \
+ (v[col] * (u[0] * DATA(0, col, row) + u[1] * DATA(1, col, row) + u[2] * DATA(2, col, row) + \
+ u[3] * DATA(3, col, row)))
+#define ROW_TERM(row) \
+ (w[row] * (COL_TERM(0, row) + COL_TERM(1, row) + COL_TERM(2, row) + COL_TERM(3, row)))
+
+ SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+ SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+ SET_CUBIC_SPLINE_WEIGHTS(w, tz);
+
+ /* Actual interpolation. */
+ const T *data = (const T *)info.data;
+ return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
+
+#undef COL_TERM
+#undef ROW_TERM
+#undef DATA
+ }
+
+ static ccl_always_inline float4
+ interp_3d(const TextureInfo &info, float x, float y, float z, InterpolationType interp)
+ {
+ if (UNLIKELY(!info.data))
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+ switch ((interp == INTERPOLATION_NONE) ? info.interpolation : interp) {
+ case INTERPOLATION_CLOSEST:
+ return interp_3d_closest(info, x, y, z);
+ case INTERPOLATION_LINEAR:
+ return interp_3d_linear(info, x, y, z);
+ default:
+ return interp_3d_cubic(info, x, y, z);
+ }
+ }
+};
+
+#ifdef WITH_NANOVDB
+template<typename T> struct NanoVDBInterpolator {
+
+ typedef typename nanovdb::NanoGrid<T>::AccessorType AccessorType;
+
+ static ccl_always_inline float4 read(float r)
+ {
+ return make_float4(r, r, r, 1.0f);
+ }
+
+ static ccl_always_inline float4 read(nanovdb::Vec3f r)
+ {
+ return make_float4(r[0], r[1], r[2], 1.0f);
+ }
+
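+ /* The integer template argument of nanovdb::SampleFromVoxels selects the
+ * interpolation order: 0 = nearest voxel, 1 = trilinear. */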
+ static ccl_always_inline float4 interp_3d_closest(const AccessorType &acc,
+ float x,
+ float y,
+ float z)
+ {
+ const nanovdb::Vec3f xyz(x, y, z);
+ return read(nanovdb::SampleFromVoxels<AccessorType, 0, false>(acc)(xyz));
+ }
+
+ static ccl_always_inline float4 interp_3d_linear(const AccessorType &acc,
+ float x,
+ float y,
+ float z)
+ {
+ const nanovdb::Vec3f xyz(x - 0.5f, y - 0.5f, z - 0.5f);
+ return read(nanovdb::SampleFromVoxels<AccessorType, 1, false>(acc)(xyz));
+ }
+
+# if defined(__GNUC__) || defined(__clang__)
+ static ccl_always_inline
+# else
+ static ccl_never_inline
+# endif
+ float4
+ interp_3d_cubic(const AccessorType &acc, float x, float y, float z)
+ {
+ int ix, iy, iz;
+ int nix, niy, niz;
+ int pix, piy, piz;
+ int nnix, nniy, nniz;
+ /* Tricubic b-spline interpolation. */
+ const float tx = frac(x - 0.5f, &ix);
+ const float ty = frac(y - 0.5f, &iy);
+ const float tz = frac(z - 0.5f, &iz);
+ pix = ix - 1;
+ piy = iy - 1;
+ piz = iz - 1;
+ nix = ix + 1;
+ niy = iy + 1;
+ niz = iz + 1;
+ nnix = ix + 2;
+ nniy = iy + 2;
+ nniz = iz + 2;
+
+ const int xc[4] = {pix, ix, nix, nnix};
+ const int yc[4] = {piy, iy, niy, nniy};
+ const int zc[4] = {piz, iz, niz, nniz};
+ float u[4], v[4], w[4];
+
+ /* Helper macros to keep the code a reasonable size; let the compiler inline
+ * all the matrix multiplications.
+ */
+# define DATA(x, y, z) (read(acc.getValue(nanovdb::Coord(xc[x], yc[y], zc[z]))))
+# define COL_TERM(col, row) \
+ (v[col] * (u[0] * DATA(0, col, row) + u[1] * DATA(1, col, row) + u[2] * DATA(2, col, row) + \
+ u[3] * DATA(3, col, row)))
+# define ROW_TERM(row) \
+ (w[row] * (COL_TERM(0, row) + COL_TERM(1, row) + COL_TERM(2, row) + COL_TERM(3, row)))
+
+ SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+ SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+ SET_CUBIC_SPLINE_WEIGHTS(w, tz);
+
+ /* Actual interpolation. */
+ return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
+
+# undef COL_TERM
+# undef ROW_TERM
+# undef DATA
+ }
+
+ static ccl_always_inline float4
+ interp_3d(const TextureInfo &info, float x, float y, float z, InterpolationType interp)
+ {
+ using namespace nanovdb;
+
+ NanoGrid<T> *const grid = (NanoGrid<T> *)info.data;
+ AccessorType acc = grid->getAccessor();
+
+ switch ((interp == INTERPOLATION_NONE) ? info.interpolation : interp) {
+ case INTERPOLATION_CLOSEST:
+ return interp_3d_closest(acc, x, y, z);
+ case INTERPOLATION_LINEAR:
+ return interp_3d_linear(acc, x, y, z);
+ default:
+ return interp_3d_cubic(acc, x, y, z);
+ }
+ }
+};
+#endif
+
+#undef SET_CUBIC_SPLINE_WEIGHTS
+
+ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y)
+{
+ const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+
+ switch (info.data_type) {
+ case IMAGE_DATA_TYPE_HALF:
+ return TextureInterpolator<half>::interp(info, x, y);
+ case IMAGE_DATA_TYPE_BYTE:
+ return TextureInterpolator<uchar>::interp(info, x, y);
+ case IMAGE_DATA_TYPE_USHORT:
+ return TextureInterpolator<uint16_t>::interp(info, x, y);
+ case IMAGE_DATA_TYPE_FLOAT:
+ return TextureInterpolator<float>::interp(info, x, y);
+ case IMAGE_DATA_TYPE_HALF4:
+ return TextureInterpolator<half4>::interp(info, x, y);
+ case IMAGE_DATA_TYPE_BYTE4:
+ return TextureInterpolator<uchar4>::interp(info, x, y);
+ case IMAGE_DATA_TYPE_USHORT4:
+ return TextureInterpolator<ushort4>::interp(info, x, y);
+ case IMAGE_DATA_TYPE_FLOAT4:
+ return TextureInterpolator<float4>::interp(info, x, y);
+ default:
+ assert(0);
+ return make_float4(
+ TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B, TEX_IMAGE_MISSING_A);
+ }
+}
+
+ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg,
+ int id,
+ float3 P,
+ InterpolationType interp)
+{
+ const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+
+ if (info.use_transform_3d) {
+ P = transform_point(&info.transform_3d, P);
+ }
+
+ switch (info.data_type) {
+ case IMAGE_DATA_TYPE_HALF:
+ return TextureInterpolator<half>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_BYTE:
+ return TextureInterpolator<uchar>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_USHORT:
+ return TextureInterpolator<uint16_t>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_FLOAT:
+ return TextureInterpolator<float>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_HALF4:
+ return TextureInterpolator<half4>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_BYTE4:
+ return TextureInterpolator<uchar4>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_USHORT4:
+ return TextureInterpolator<ushort4>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_FLOAT4:
+ return TextureInterpolator<float4>::interp_3d(info, P.x, P.y, P.z, interp);
+#ifdef WITH_NANOVDB
+ case IMAGE_DATA_TYPE_NANOVDB_FLOAT:
+ return NanoVDBInterpolator<float>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_NANOVDB_FLOAT3:
+ return NanoVDBInterpolator<nanovdb::Vec3f>::interp_3d(info, P.x, P.y, P.z, interp);
+#endif
+ default:
+ assert(0);
+ return make_float4(
+ TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B, TEX_IMAGE_MISSING_A);
+ }
+}
+
+} /* Namespace. */
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/cpu/kernel.cpp b/intern/cycles/kernel/device/cpu/kernel.cpp
new file mode 100644
index 00000000000..ac1cdf5fffe
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel.cpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CPU kernel entry points */
+
+/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
+ * one with SSE2 intrinsics.
+ */
+#if defined(__x86_64__) || defined(_M_X64)
+# define __KERNEL_SSE2__
+#endif
+
+/* When building the kernel for the native machine, detect kernel features from
+ * the flags set by the compiler.
+ */
+#ifdef WITH_KERNEL_NATIVE
+# ifdef __SSE2__
+# ifndef __KERNEL_SSE2__
+# define __KERNEL_SSE2__
+# endif
+# endif
+# ifdef __SSE3__
+# define __KERNEL_SSE3__
+# endif
+# ifdef __SSSE3__
+# define __KERNEL_SSSE3__
+# endif
+# ifdef __SSE4_1__
+# define __KERNEL_SSE41__
+# endif
+# ifdef __AVX__
+# define __KERNEL_SSE__
+# define __KERNEL_AVX__
+# endif
+# ifdef __AVX2__
+# define __KERNEL_SSE__
+# define __KERNEL_AVX2__
+# endif
+#endif
+
+/* quiet unused define warnings */
+#if defined(__KERNEL_SSE2__)
+/* do nothing */
+#endif
+
+#include "kernel/device/cpu/kernel.h"
+#define KERNEL_ARCH cpu
+#include "kernel/device/cpu/kernel_arch_impl.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Memory Copy */
+
+void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t)
+{
+ if (strcmp(name, "__data") == 0) {
+ kg->__data = *(KernelData *)host;
+ }
+ else {
+ assert(0);
+ }
+}
+
+void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size)
+{
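+ /* The empty if (0) branch anchors the else-if chain generated by the
+ * KERNEL_TEX expansions below. */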
+ if (0) {
+ }
+
+#define KERNEL_TEX(type, tname) \
+ else if (strcmp(name, #tname) == 0) \
+ { \
+ kg->tname.data = (type *)mem; \
+ kg->tname.width = size; \
+ }
+#include "kernel/kernel_textures.h"
+ else {
+ assert(0);
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/cpu/kernel.h b/intern/cycles/kernel/device/cpu/kernel.h
new file mode 100644
index 00000000000..ae2a841835a
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+/* CPU Kernel Interface */
+
+#include "util/util_types.h"
+
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+#define KERNEL_NAME_JOIN(x, y, z) x##_##y##_##z
+#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
+#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
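+
+/* Example expansion, assuming KERNEL_ARCH is cpu_avx2:
+ *   KERNEL_FUNCTION_FULL_NAME(integrator_megakernel)
+ * becomes
+ *   kernel_cpu_avx2_integrator_megakernel
+ */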
+
+struct IntegratorStateCPU;
+struct KernelGlobals;
+struct KernelData;
+
+KernelGlobals *kernel_globals_create();
+void kernel_globals_free(KernelGlobals *kg);
+
+void *kernel_osl_memory(const KernelGlobals *kg);
+bool kernel_osl_use(const KernelGlobals *kg);
+
+void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size);
+void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size);
+
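+/* kernel_arch.h declares the entry points for one architecture and ends with
+ * #undef KERNEL_ARCH, so the macro can be redefined before each include. */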
+#define KERNEL_ARCH cpu
+#include "kernel/device/cpu/kernel_arch.h"
+
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/device/cpu/kernel_arch.h"
+
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/device/cpu/kernel_arch.h"
+
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/device/cpu/kernel_arch.h"
+
+#define KERNEL_ARCH cpu_avx
+#include "kernel/device/cpu/kernel_arch.h"
+
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/device/cpu/kernel_arch.h"
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch.h b/intern/cycles/kernel/device/cpu/kernel_arch.h
new file mode 100644
index 00000000000..81f328c710b
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_arch.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common declaration part of all CPU kernels. */
+
+/* --------------------------------------------------------------------
+ * Integrator.
+ */
+
+#define KERNEL_INTEGRATOR_FUNCTION(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \
+ IntegratorStateCPU *state)
+
+#define KERNEL_INTEGRATOR_SHADE_FUNCTION(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \
+ IntegratorStateCPU *state, \
+ ccl_global float *render_buffer)
+
+#define KERNEL_INTEGRATOR_INIT_FUNCTION(name) \
+ bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \
+ IntegratorStateCPU *state, \
+ KernelWorkTile *tile, \
+ ccl_global float *render_buffer)
+
+KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_camera);
+KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_bake);
+KERNEL_INTEGRATOR_FUNCTION(intersect_closest);
+KERNEL_INTEGRATOR_FUNCTION(intersect_shadow);
+KERNEL_INTEGRATOR_FUNCTION(intersect_subsurface);
+KERNEL_INTEGRATOR_FUNCTION(intersect_volume_stack);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_background);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_light);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_shadow);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_surface);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_volume);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(megakernel);
+
+#undef KERNEL_INTEGRATOR_FUNCTION
+#undef KERNEL_INTEGRATOR_INIT_FUNCTION
+#undef KERNEL_INTEGRATOR_SHADE_FUNCTION
+
+/* --------------------------------------------------------------------
+ * Shader evaluation.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg,
+ const KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset);
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg,
+ const KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset);
+
+/* --------------------------------------------------------------------
+ * Adaptive sampling.
+ */
+
+bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
+ const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int y,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride);
+
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int y,
+ int start_x,
+ int width,
+ int offset,
+ int stride);
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int start_y,
+ int height,
+ int offset,
+ int stride);
+
+/* --------------------------------------------------------------------
+ * Cryptomatte.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int pixel_index);
+
+/* --------------------------------------------------------------------
+ * Bake.
+ */
+/* TODO(sergey): Needs to be re-implemented. Or not? Brecht did it already :) */
+
+void KERNEL_FUNCTION_FULL_NAME(bake)(
+ const KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride);
+
+#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
new file mode 100644
index 00000000000..1432abfd330
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
@@ -0,0 +1,235 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common implementation part of all CPU kernels.
+ *
+ * The idea is that a particular .cpp file sets the needed optimization flags
+ * and simply includes this file, without having to copy the actual
+ * implementation over.
+ */
+
+#pragma once
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+
+#ifndef KERNEL_STUB
+# include "kernel/device/cpu/globals.h"
+# include "kernel/device/cpu/image.h"
+
+# include "kernel/integrator/integrator_state.h"
+# include "kernel/integrator/integrator_state_flow.h"
+# include "kernel/integrator/integrator_state_util.h"
+
+# include "kernel/integrator/integrator_init_from_camera.h"
+# include "kernel/integrator/integrator_init_from_bake.h"
+# include "kernel/integrator/integrator_intersect_closest.h"
+# include "kernel/integrator/integrator_intersect_shadow.h"
+# include "kernel/integrator/integrator_intersect_subsurface.h"
+# include "kernel/integrator/integrator_intersect_volume_stack.h"
+# include "kernel/integrator/integrator_shade_background.h"
+# include "kernel/integrator/integrator_shade_light.h"
+# include "kernel/integrator/integrator_shade_shadow.h"
+# include "kernel/integrator/integrator_shade_surface.h"
+# include "kernel/integrator/integrator_shade_volume.h"
+# include "kernel/integrator/integrator_megakernel.h"
+
+# include "kernel/kernel_film.h"
+# include "kernel/kernel_adaptive_sampling.h"
+# include "kernel/kernel_bake.h"
+# include "kernel/kernel_id_passes.h"
+
+#else
+# define STUB_ASSERT(arch, name) \
+ assert(!(#name " kernel stub for architecture " #arch " was called!"))
+#endif /* KERNEL_STUB */
+// clang-format on
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * Integrator.
+ */
+
+#ifdef KERNEL_STUB
+# define KERNEL_INVOKE(name, ...) (STUB_ASSERT(KERNEL_ARCH, name), 0)
+#else
+# define KERNEL_INVOKE(name, ...) integrator_##name(__VA_ARGS__)
+#endif
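+
+/* In the stub case the comma expression evaluates the assert and yields 0, so
+ * the init kernels below still return false. */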
+
+#define DEFINE_INTEGRATOR_KERNEL(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *kg, \
+ IntegratorStateCPU *state) \
+ { \
+ KERNEL_INVOKE(name, kg, state); \
+ }
+
+#define DEFINE_INTEGRATOR_SHADE_KERNEL(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \
+ const KernelGlobals *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \
+ { \
+ KERNEL_INVOKE(name, kg, state, render_buffer); \
+ }
+
+/* TODO: Either use something like get_work_pixel(), or simplify the tile that
+ * is passed here, so that it does not contain unused fields. */
+#define DEFINE_INTEGRATOR_INIT_KERNEL(name) \
+ bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *kg, \
+ IntegratorStateCPU *state, \
+ KernelWorkTile *tile, \
+ ccl_global float *render_buffer) \
+ { \
+ return KERNEL_INVOKE( \
+ name, kg, state, tile, render_buffer, tile->x, tile->y, tile->start_sample); \
+ }
+
+DEFINE_INTEGRATOR_INIT_KERNEL(init_from_camera)
+DEFINE_INTEGRATOR_INIT_KERNEL(init_from_bake)
+DEFINE_INTEGRATOR_KERNEL(intersect_closest)
+DEFINE_INTEGRATOR_KERNEL(intersect_shadow)
+DEFINE_INTEGRATOR_KERNEL(intersect_subsurface)
+DEFINE_INTEGRATOR_KERNEL(intersect_volume_stack)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_background)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_light)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_shadow)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_surface)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_volume)
+DEFINE_INTEGRATOR_SHADE_KERNEL(megakernel)
+
+/* --------------------------------------------------------------------
+ * Shader evaluation.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg,
+ const KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, shader_eval_displace);
+#else
+ kernel_displace_evaluate(kg, input, output, offset);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg,
+ const KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, shader_eval_background);
+#else
+ kernel_background_evaluate(kg, input, output, offset);
+#endif
+}
+
+/* --------------------------------------------------------------------
+ * Adaptive sampling.
+ */
+
+bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
+ const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int y,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_convergence_check);
+ return false;
+#else
+ return kernel_adaptive_sampling_convergence_check(
+ kg, render_buffer, x, y, threshold, reset, offset, stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int y,
+ int start_x,
+ int width,
+ int offset,
+ int stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_x);
+#else
+ kernel_adaptive_sampling_filter_x(kg, render_buffer, y, start_x, width, offset, stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int start_y,
+ int height,
+ int offset,
+ int stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_y);
+#else
+ kernel_adaptive_sampling_filter_y(kg, render_buffer, x, start_y, height, offset, stride);
+#endif
+}
+
+/* --------------------------------------------------------------------
+ * Cryptomatte.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int pixel_index)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, cryptomatte_postprocess);
+#else
+ kernel_cryptomatte_post(kg, render_buffer, pixel_index);
+#endif
+}
+
+/* --------------------------------------------------------------------
+ * Bake.
+ */
+/* TODO(sergey): Needs to be re-implemented. Or not? Brecht did it already :) */
+
+void KERNEL_FUNCTION_FULL_NAME(bake)(
+ const KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride)
+{
+#if 0
+# ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, bake);
+# else
+# ifdef __BAKING__
+ kernel_bake_evaluate(kg, buffer, sample, x, y, offset, stride);
+# endif
+# endif /* KERNEL_STUB */
+#endif
+}
+
+#undef KERNEL_INVOKE
+#undef DEFINE_INTEGRATOR_KERNEL
+#undef DEFINE_INTEGRATOR_SHADE_KERNEL
+#undef DEFINE_INTEGRATOR_INIT_KERNEL
+
+#undef KERNEL_STUB
+#undef STUB_ASSERT
+#undef KERNEL_ARCH
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/cpu/kernel_avx.cpp b/intern/cycles/kernel/device/cpu/kernel_avx.cpp
new file mode 100644
index 00000000000..220768036ab
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_avx.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them, for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug T36316. */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/device/cpu/kernel.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/device/cpu/kernel_avx2.cpp b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
new file mode 100644
index 00000000000..90c05113cbe
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them, for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug T36316. */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# define __KERNEL_AVX2__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/device/cpu/kernel.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/device/cpu/kernel_sse2.cpp b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp
new file mode 100644
index 00000000000..fb85ef5b0d0
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them, for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug T36316. */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/device/cpu/kernel.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/device/cpu/kernel_sse3.cpp b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp
new file mode 100644
index 00000000000..87baf04258a
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them, for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug T36316. */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/device/cpu/kernel.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/device/cpu/kernel_sse41.cpp b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp
new file mode 100644
index 00000000000..bb421d58815
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE4.1
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them, for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug T36316. */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/device/cpu/kernel.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/device/cpu/kernel_arch_impl.h"