git.blender.org/blender.git
Diffstat (limited to 'intern/cycles/kernel/device/cpu')
-rw-r--r--  intern/cycles/kernel/device/cpu/compat.h            101
-rw-r--r--  intern/cycles/kernel/device/cpu/globals.h            61
-rw-r--r--  intern/cycles/kernel/device/cpu/image.h             657
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel.cpp           94
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel.h             62
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_arch.h       113
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_arch_impl.h  235
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_avx.cpp       39
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_avx2.cpp      40
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_sse2.cpp      34
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_sse3.cpp      36
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_sse41.cpp     37
12 files changed, 1509 insertions, 0 deletions
diff --git a/intern/cycles/kernel/device/cpu/compat.h b/intern/cycles/kernel/device/cpu/compat.h
new file mode 100644
index 00000000000..bfd936c7bbd
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/compat.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#define __KERNEL_CPU__
+
+/* The release kernel has too many false-positive maybe-uninitialized warnings,
+ * which makes it easy to miss actual warnings.
+ */
+#if (defined(__GNUC__) && !defined(__clang__)) && defined(NDEBUG)
+# pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+# pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+
+#include "util/util_half.h"
+#include "util/util_math.h"
+#include "util/util_simd.h"
+#include "util/util_texture.h"
+#include "util/util_types.h"
+
+#define ccl_addr_space
+
+/* On x86_64, glibc versions before 2.16 have an expf that is much slower than
+ * the double-precision exp, so route expf through exp there.
+ */
+#if !defined(__KERNEL_GPU__) && defined(__x86_64__) && \
+ defined(__GNU_LIBRARY__) && defined(__GLIBC__) && defined(__GLIBC_MINOR__) && \
+ (__GLIBC__ <= 2 && __GLIBC_MINOR__ < 16)
+# define expf(x) ((float)exp((double)(x)))
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/* Assertions inside the kernel only work for the CPU device, so we wrap them
+ * in a macro which is empty for other devices. */
+
+#define kernel_assert(cond) assert(cond)
+
+/* Texture types to be compatible with CUDA textures. These are really just
+ * simple arrays, and after inlining, a fetch hopefully reduces to a simple
+ * pointer lookup. */
+template<typename T> struct texture {
+ ccl_always_inline const T &fetch(int index) const
+ {
+ kernel_assert(index >= 0 && index < width);
+ return data[index];
+ }
+
+ T *data;
+ int width;
+};
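+
+/* A minimal usage sketch (illustrative values, not part of this kernel):
+ *
+ *   texture<float> tex;
+ *   tex.data = values;        // hypothetical array of 256 floats
+ *   tex.width = 256;
+ *   float f = tex.fetch(17);  // asserts 0 <= 17 < 256, then reads data[17]
+ */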
+
+/* SIMD vector types and debug printing helpers. */
+
+#ifdef __KERNEL_SSE2__
+typedef vector3<sseb> sse3b;
+typedef vector3<ssef> sse3f;
+typedef vector3<ssei> sse3i;
+
+ccl_device_inline void print_sse3b(const char *label, sse3b &a)
+{
+ print_sseb(label, a.x);
+ print_sseb(label, a.y);
+ print_sseb(label, a.z);
+}
+
+ccl_device_inline void print_sse3f(const char *label, sse3f &a)
+{
+ print_ssef(label, a.x);
+ print_ssef(label, a.y);
+ print_ssef(label, a.z);
+}
+
+ccl_device_inline void print_sse3i(const char *label, sse3i &a)
+{
+ print_ssei(label, a.x);
+ print_ssei(label, a.y);
+ print_ssei(label, a.z);
+}
+
+# if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
+typedef vector3<avxf> avx3f;
+# endif
+
+#endif
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/cpu/globals.h b/intern/cycles/kernel/device/cpu/globals.h
new file mode 100644
index 00000000000..98b036e269d
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/globals.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#pragma once
+
+#include "kernel/kernel_profiling.h"
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* On the CPU, we pass the KernelGlobals struct to nearly every function in the
+ * kernel to access constant data. The data is all stored as "textures", but
+ * these are really just standard arrays. We can't actually use globals because
+ * multiple renders may be running inside the same process. */
+
+#ifdef __OSL__
+struct OSLGlobals;
+struct OSLThreadData;
+struct OSLShadingSystem;
+#endif
+
+typedef struct KernelGlobals {
+#define KERNEL_TEX(type, name) texture<type> name;
+#include "kernel/kernel_textures.h"
+
+ KernelData __data;
+
+#ifdef __OSL__
+ /* On the CPU, we also have the OSL globals here. Most data structures are
+ * shared with SVM; the difference is in the shaders and object/mesh
+ * attributes. */
+ OSLGlobals *osl;
+ OSLShadingSystem *osl_ss;
+ OSLThreadData *osl_tdata;
+#endif
+
+ /* **** Run-time data **** */
+
+ ProfilingState profiler;
+} KernelGlobals;
+
+/* Abstraction macros */
+#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
+#define kernel_tex_array(tex) (kg->tex.data)
+#define kernel_data (kg->__data)
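+
+/* Illustration of the expansion above (hypothetical entry name): an entry
+ *   KERNEL_TEX(float4, __example_tex)
+ * in kernel/kernel_textures.h becomes the member
+ *   texture<float4> __example_tex;
+ * inside KernelGlobals, and kernel_tex_fetch(__example_tex, i) expands to
+ * (kg->__example_tex.fetch(i)), the bounds-checked array read from compat.h. */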
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/cpu/image.h b/intern/cycles/kernel/device/cpu/image.h
new file mode 100644
index 00000000000..57e81ab186d
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/image.h
@@ -0,0 +1,657 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_NANOVDB
+# define NANOVDB_USE_INTRINSICS
+# include <nanovdb/NanoVDB.h>
+# include <nanovdb/util/SampleFromVoxels.h>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/* Make template functions private so symbols don't conflict between kernels with different
+ * instruction sets. */
+namespace {
+
+#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
+ { \
+ u[0] = (((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f); \
+ u[1] = ((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f); \
+ u[2] = ((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f); \
+ u[3] = (1.0f / 6.0f) * t * t * t; \
+ } \
+ (void)0
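+
+/* In closed form these are the uniform cubic B-spline basis functions:
+ *   u[0] = (1 - t)^3 / 6
+ *   u[1] = (3t^3 - 6t^2 + 4) / 6
+ *   u[2] = (-3t^3 + 3t^2 + 3t + 1) / 6
+ *   u[3] = t^3 / 6
+ * They are non-negative on [0, 1] and sum to 1, so the interpolated result is
+ * a convex combination of the sampled texels. */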
+
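+/* Split x into an integer part *ix (float_to_int truncation, shifted down by
+ * one for negative x) and return the fractional remainder x - *ix. */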
+ccl_device_inline float frac(float x, int *ix)
+{
+ int i = float_to_int(x) - ((x < 0.0f) ? 1 : 0);
+ *ix = i;
+ return x - (float)i;
+}
+
+template<typename T> struct TextureInterpolator {
+
+ static ccl_always_inline float4 read(float4 r)
+ {
+ return r;
+ }
+
+ static ccl_always_inline float4 read(uchar4 r)
+ {
+ float f = 1.0f / 255.0f;
+ return make_float4(r.x * f, r.y * f, r.z * f, r.w * f);
+ }
+
+ static ccl_always_inline float4 read(uchar r)
+ {
+ float f = r * (1.0f / 255.0f);
+ return make_float4(f, f, f, 1.0f);
+ }
+
+ static ccl_always_inline float4 read(float r)
+ {
+ /* TODO(dingto): Optimize this, so interpolation
+ * happens on float instead of float4 */
+ return make_float4(r, r, r, 1.0f);
+ }
+
+ static ccl_always_inline float4 read(half4 r)
+ {
+ return half4_to_float4(r);
+ }
+
+ static ccl_always_inline float4 read(half r)
+ {
+ float f = half_to_float(r);
+ return make_float4(f, f, f, 1.0f);
+ }
+
+ static ccl_always_inline float4 read(uint16_t r)
+ {
+ float f = r * (1.0f / 65535.0f);
+ return make_float4(f, f, f, 1.0f);
+ }
+
+ static ccl_always_inline float4 read(ushort4 r)
+ {
+ float f = 1.0f / 65535.0f;
+ return make_float4(r.x * f, r.y * f, r.z * f, r.w * f);
+ }
+
+ static ccl_always_inline float4 read(const T *data, int x, int y, int width, int height)
+ {
+ if (x < 0 || y < 0 || x >= width || y >= height) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ return read(data[y * width + x]);
+ }
+
+ static ccl_always_inline int wrap_periodic(int x, int width)
+ {
+ x %= width;
+ if (x < 0)
+ x += width;
+ return x;
+ }
+
+ static ccl_always_inline int wrap_clamp(int x, int width)
+ {
+ return clamp(x, 0, width - 1);
+ }
+
+ /* ******** 2D interpolation ******** */
+
+ static ccl_always_inline float4 interp_closest(const TextureInfo &info, float x, float y)
+ {
+ const T *data = (const T *)info.data;
+ const int width = info.width;
+ const int height = info.height;
+ int ix, iy;
+ frac(x * (float)width, &ix);
+ frac(y * (float)height, &iy);
+ switch (info.extension) {
+ case EXTENSION_REPEAT:
+ ix = wrap_periodic(ix, width);
+ iy = wrap_periodic(iy, height);
+ break;
+ case EXTENSION_CLIP:
+ if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ ATTR_FALLTHROUGH;
+ case EXTENSION_EXTEND:
+ ix = wrap_clamp(ix, width);
+ iy = wrap_clamp(iy, height);
+ break;
+ default:
+ kernel_assert(0);
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ return read(data[ix + iy * width]);
+ }
+
+ static ccl_always_inline float4 interp_linear(const TextureInfo &info, float x, float y)
+ {
+ const T *data = (const T *)info.data;
+ const int width = info.width;
+ const int height = info.height;
+ int ix, iy, nix, niy;
+ const float tx = frac(x * (float)width - 0.5f, &ix);
+ const float ty = frac(y * (float)height - 0.5f, &iy);
+ switch (info.extension) {
+ case EXTENSION_REPEAT:
+ ix = wrap_periodic(ix, width);
+ iy = wrap_periodic(iy, height);
+ nix = wrap_periodic(ix + 1, width);
+ niy = wrap_periodic(iy + 1, height);
+ break;
+ case EXTENSION_CLIP:
+ nix = ix + 1;
+ niy = iy + 1;
+ break;
+ case EXTENSION_EXTEND:
+ nix = wrap_clamp(ix + 1, width);
+ niy = wrap_clamp(iy + 1, height);
+ ix = wrap_clamp(ix, width);
+ iy = wrap_clamp(iy, height);
+ break;
+ default:
+ kernel_assert(0);
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
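+ /* Bilinear blend of the four neighboring texels, weighted by tx and ty. */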
+ return (1.0f - ty) * (1.0f - tx) * read(data, ix, iy, width, height) +
+ (1.0f - ty) * tx * read(data, nix, iy, width, height) +
+ ty * (1.0f - tx) * read(data, ix, niy, width, height) +
+ ty * tx * read(data, nix, niy, width, height);
+ }
+
+ static ccl_always_inline float4 interp_cubic(const TextureInfo &info, float x, float y)
+ {
+ const T *data = (const T *)info.data;
+ const int width = info.width;
+ const int height = info.height;
+ int ix, iy, nix, niy;
+ const float tx = frac(x * (float)width - 0.5f, &ix);
+ const float ty = frac(y * (float)height - 0.5f, &iy);
+ int pix, piy, nnix, nniy;
+ switch (info.extension) {
+ case EXTENSION_REPEAT:
+ ix = wrap_periodic(ix, width);
+ iy = wrap_periodic(iy, height);
+ pix = wrap_periodic(ix - 1, width);
+ piy = wrap_periodic(iy - 1, height);
+ nix = wrap_periodic(ix + 1, width);
+ niy = wrap_periodic(iy + 1, height);
+ nnix = wrap_periodic(ix + 2, width);
+ nniy = wrap_periodic(iy + 2, height);
+ break;
+ case EXTENSION_CLIP:
+ pix = ix - 1;
+ piy = iy - 1;
+ nix = ix + 1;
+ niy = iy + 1;
+ nnix = ix + 2;
+ nniy = iy + 2;
+ break;
+ case EXTENSION_EXTEND:
+ pix = wrap_clamp(ix - 1, width);
+ piy = wrap_clamp(iy - 1, height);
+ nix = wrap_clamp(ix + 1, width);
+ niy = wrap_clamp(iy + 1, height);
+ nnix = wrap_clamp(ix + 2, width);
+ nniy = wrap_clamp(iy + 2, height);
+ ix = wrap_clamp(ix, width);
+ iy = wrap_clamp(iy, height);
+ break;
+ default:
+ kernel_assert(0);
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ const int xc[4] = {pix, ix, nix, nnix};
+ const int yc[4] = {piy, iy, niy, nniy};
+ float u[4], v[4];
+ /* Helper macros to keep the code a reasonable size; let the compiler inline
+ * all the matrix multiplications.
+ */
+#define DATA(x, y) (read(data, xc[x], yc[y], width, height))
+#define TERM(col) \
+ (v[col] * \
+ (u[0] * DATA(0, col) + u[1] * DATA(1, col) + u[2] * DATA(2, col) + u[3] * DATA(3, col)))
+
+ SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+ SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+
+ /* Actual interpolation. */
+ return TERM(0) + TERM(1) + TERM(2) + TERM(3);
+#undef TERM
+#undef DATA
+ }
+
+ static ccl_always_inline float4 interp(const TextureInfo &info, float x, float y)
+ {
+ if (UNLIKELY(!info.data)) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ switch (info.interpolation) {
+ case INTERPOLATION_CLOSEST:
+ return interp_closest(info, x, y);
+ case INTERPOLATION_LINEAR:
+ return interp_linear(info, x, y);
+ default:
+ return interp_cubic(info, x, y);
+ }
+ }
+
+ /* ******** 3D interpolation ******** */
+
+ static ccl_always_inline float4 interp_3d_closest(const TextureInfo &info,
+ float x,
+ float y,
+ float z)
+ {
+ int width = info.width;
+ int height = info.height;
+ int depth = info.depth;
+ int ix, iy, iz;
+
+ frac(x * (float)width, &ix);
+ frac(y * (float)height, &iy);
+ frac(z * (float)depth, &iz);
+
+ switch (info.extension) {
+ case EXTENSION_REPEAT:
+ ix = wrap_periodic(ix, width);
+ iy = wrap_periodic(iy, height);
+ iz = wrap_periodic(iz, depth);
+ break;
+ case EXTENSION_CLIP:
+ if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ ATTR_FALLTHROUGH;
+ case EXTENSION_EXTEND:
+ ix = wrap_clamp(ix, width);
+ iy = wrap_clamp(iy, height);
+ iz = wrap_clamp(iz, depth);
+ break;
+ default:
+ kernel_assert(0);
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+
+ const T *data = (const T *)info.data;
+ return read(data[ix + iy * width + iz * width * height]);
+ }
+
+ static ccl_always_inline float4 interp_3d_linear(const TextureInfo &info,
+ float x,
+ float y,
+ float z)
+ {
+ int width = info.width;
+ int height = info.height;
+ int depth = info.depth;
+ int ix, iy, iz;
+ int nix, niy, niz;
+
+ float tx = frac(x * (float)width - 0.5f, &ix);
+ float ty = frac(y * (float)height - 0.5f, &iy);
+ float tz = frac(z * (float)depth - 0.5f, &iz);
+
+ switch (info.extension) {
+ case EXTENSION_REPEAT:
+ ix = wrap_periodic(ix, width);
+ iy = wrap_periodic(iy, height);
+ iz = wrap_periodic(iz, depth);
+
+ nix = wrap_periodic(ix + 1, width);
+ niy = wrap_periodic(iy + 1, height);
+ niz = wrap_periodic(iz + 1, depth);
+ break;
+ case EXTENSION_CLIP:
+ if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ ATTR_FALLTHROUGH;
+ case EXTENSION_EXTEND:
+ nix = wrap_clamp(ix + 1, width);
+ niy = wrap_clamp(iy + 1, height);
+ niz = wrap_clamp(iz + 1, depth);
+
+ ix = wrap_clamp(ix, width);
+ iy = wrap_clamp(iy, height);
+ iz = wrap_clamp(iz, depth);
+ break;
+ default:
+ kernel_assert(0);
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+
+ const T *data = (const T *)info.data;
+ float4 r;
+
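+ /* Trilinear blend of the eight neighboring voxels, weighted by tx, ty, tz. */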
+ r = (1.0f - tz) * (1.0f - ty) * (1.0f - tx) *
+ read(data[ix + iy * width + iz * width * height]);
+ r += (1.0f - tz) * (1.0f - ty) * tx * read(data[nix + iy * width + iz * width * height]);
+ r += (1.0f - tz) * ty * (1.0f - tx) * read(data[ix + niy * width + iz * width * height]);
+ r += (1.0f - tz) * ty * tx * read(data[nix + niy * width + iz * width * height]);
+
+ r += tz * (1.0f - ty) * (1.0f - tx) * read(data[ix + iy * width + niz * width * height]);
+ r += tz * (1.0f - ty) * tx * read(data[nix + iy * width + niz * width * height]);
+ r += tz * ty * (1.0f - tx) * read(data[ix + niy * width + niz * width * height]);
+ r += tz * ty * tx * read(data[nix + niy * width + niz * width * height]);
+
+ return r;
+ }
+
+ /* TODO(sergey): For some unspeakable reason both GCC 6 and Clang 3.9 cause a
+ * stack overflow in this function unless it is inlined.
+ *
+ * This only happens for the AVX2 kernel with global __KERNEL_SSE__
+ * vectorization enabled.
+ */
+#if defined(__GNUC__) || defined(__clang__)
+ static ccl_always_inline
+#else
+ static ccl_never_inline
+#endif
+ float4
+ interp_3d_cubic(const TextureInfo &info, float x, float y, float z)
+ {
+ int width = info.width;
+ int height = info.height;
+ int depth = info.depth;
+ int ix, iy, iz;
+ int nix, niy, niz;
+ /* Tricubic b-spline interpolation. */
+ const float tx = frac(x * (float)width - 0.5f, &ix);
+ const float ty = frac(y * (float)height - 0.5f, &iy);
+ const float tz = frac(z * (float)depth - 0.5f, &iz);
+ int pix, piy, piz, nnix, nniy, nniz;
+
+ switch (info.extension) {
+ case EXTENSION_REPEAT:
+ ix = wrap_periodic(ix, width);
+ iy = wrap_periodic(iy, height);
+ iz = wrap_periodic(iz, depth);
+
+ pix = wrap_periodic(ix - 1, width);
+ piy = wrap_periodic(iy - 1, height);
+ piz = wrap_periodic(iz - 1, depth);
+
+ nix = wrap_periodic(ix + 1, width);
+ niy = wrap_periodic(iy + 1, height);
+ niz = wrap_periodic(iz + 1, depth);
+
+ nnix = wrap_periodic(ix + 2, width);
+ nniy = wrap_periodic(iy + 2, height);
+ nniz = wrap_periodic(iz + 2, depth);
+ break;
+ case EXTENSION_CLIP:
+ if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ ATTR_FALLTHROUGH;
+ case EXTENSION_EXTEND:
+ pix = wrap_clamp(ix - 1, width);
+ piy = wrap_clamp(iy - 1, height);
+ piz = wrap_clamp(iz - 1, depth);
+
+ nix = wrap_clamp(ix + 1, width);
+ niy = wrap_clamp(iy + 1, height);
+ niz = wrap_clamp(iz + 1, depth);
+
+ nnix = wrap_clamp(ix + 2, width);
+ nniy = wrap_clamp(iy + 2, height);
+ nniz = wrap_clamp(iz + 2, depth);
+
+ ix = wrap_clamp(ix, width);
+ iy = wrap_clamp(iy, height);
+ iz = wrap_clamp(iz, depth);
+ break;
+ default:
+ kernel_assert(0);
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+
+ const int xc[4] = {pix, ix, nix, nnix};
+ const int yc[4] = {width * piy, width * iy, width * niy, width * nniy};
+ const int zc[4] = {
+ width * height * piz, width * height * iz, width * height * niz, width * height * nniz};
+ float u[4], v[4], w[4];
+
+ /* Helper macros to keep the code a reasonable size; let the compiler inline
+ * all the matrix multiplications.
+ */
+#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]]))
+#define COL_TERM(col, row) \
+ (v[col] * (u[0] * DATA(0, col, row) + u[1] * DATA(1, col, row) + u[2] * DATA(2, col, row) + \
+ u[3] * DATA(3, col, row)))
+#define ROW_TERM(row) \
+ (w[row] * (COL_TERM(0, row) + COL_TERM(1, row) + COL_TERM(2, row) + COL_TERM(3, row)))
+
+ SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+ SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+ SET_CUBIC_SPLINE_WEIGHTS(w, tz);
+
+ /* Actual interpolation. */
+ const T *data = (const T *)info.data;
+ return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
+
+#undef COL_TERM
+#undef ROW_TERM
+#undef DATA
+ }
+
+ static ccl_always_inline float4
+ interp_3d(const TextureInfo &info, float x, float y, float z, InterpolationType interp)
+ {
+ if (UNLIKELY(!info.data))
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+ switch ((interp == INTERPOLATION_NONE) ? info.interpolation : interp) {
+ case INTERPOLATION_CLOSEST:
+ return interp_3d_closest(info, x, y, z);
+ case INTERPOLATION_LINEAR:
+ return interp_3d_linear(info, x, y, z);
+ default:
+ return interp_3d_cubic(info, x, y, z);
+ }
+ }
+};
+
+#ifdef WITH_NANOVDB
+template<typename T> struct NanoVDBInterpolator {
+
+ typedef typename nanovdb::NanoGrid<T>::AccessorType AccessorType;
+
+ static ccl_always_inline float4 read(float r)
+ {
+ return make_float4(r, r, r, 1.0f);
+ }
+
+ static ccl_always_inline float4 read(nanovdb::Vec3f r)
+ {
+ return make_float4(r[0], r[1], r[2], 1.0f);
+ }
+
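+ /* The integer template argument of nanovdb::SampleFromVoxels selects the
+ * interpolation order: 0 = nearest voxel, 1 = trilinear. */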
+ static ccl_always_inline float4 interp_3d_closest(const AccessorType &acc,
+ float x,
+ float y,
+ float z)
+ {
+ const nanovdb::Vec3f xyz(x, y, z);
+ return read(nanovdb::SampleFromVoxels<AccessorType, 0, false>(acc)(xyz));
+ }
+
+ static ccl_always_inline float4 interp_3d_linear(const AccessorType &acc,
+ float x,
+ float y,
+ float z)
+ {
+ const nanovdb::Vec3f xyz(x - 0.5f, y - 0.5f, z - 0.5f);
+ return read(nanovdb::SampleFromVoxels<AccessorType, 1, false>(acc)(xyz));
+ }
+
+# if defined(__GNUC__) || defined(__clang__)
+ static ccl_always_inline
+# else
+ static ccl_never_inline
+# endif
+ float4
+ interp_3d_cubic(const AccessorType &acc, float x, float y, float z)
+ {
+ int ix, iy, iz;
+ int nix, niy, niz;
+ int pix, piy, piz;
+ int nnix, nniy, nniz;
+ /* Tricubic b-spline interpolation. */
+ const float tx = frac(x - 0.5f, &ix);
+ const float ty = frac(y - 0.5f, &iy);
+ const float tz = frac(z - 0.5f, &iz);
+ pix = ix - 1;
+ piy = iy - 1;
+ piz = iz - 1;
+ nix = ix + 1;
+ niy = iy + 1;
+ niz = iz + 1;
+ nnix = ix + 2;
+ nniy = iy + 2;
+ nniz = iz + 2;
+
+ const int xc[4] = {pix, ix, nix, nnix};
+ const int yc[4] = {piy, iy, niy, nniy};
+ const int zc[4] = {piz, iz, niz, nniz};
+ float u[4], v[4], w[4];
+
+ /* Helper macros to keep the code a reasonable size; let the compiler inline
+ * all the matrix multiplications.
+ */
+# define DATA(x, y, z) (read(acc.getValue(nanovdb::Coord(xc[x], yc[y], zc[z]))))
+# define COL_TERM(col, row) \
+ (v[col] * (u[0] * DATA(0, col, row) + u[1] * DATA(1, col, row) + u[2] * DATA(2, col, row) + \
+ u[3] * DATA(3, col, row)))
+# define ROW_TERM(row) \
+ (w[row] * (COL_TERM(0, row) + COL_TERM(1, row) + COL_TERM(2, row) + COL_TERM(3, row)))
+
+ SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+ SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+ SET_CUBIC_SPLINE_WEIGHTS(w, tz);
+
+ /* Actual interpolation. */
+ return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
+
+# undef COL_TERM
+# undef ROW_TERM
+# undef DATA
+ }
+
+ static ccl_always_inline float4
+ interp_3d(const TextureInfo &info, float x, float y, float z, InterpolationType interp)
+ {
+ using namespace nanovdb;
+
+ NanoGrid<T> *const grid = (NanoGrid<T> *)info.data;
+ AccessorType acc = grid->getAccessor();
+
+ switch ((interp == INTERPOLATION_NONE) ? info.interpolation : interp) {
+ case INTERPOLATION_CLOSEST:
+ return interp_3d_closest(acc, x, y, z);
+ case INTERPOLATION_LINEAR:
+ return interp_3d_linear(acc, x, y, z);
+ default:
+ return interp_3d_cubic(acc, x, y, z);
+ }
+ }
+};
+#endif
+
+#undef SET_CUBIC_SPLINE_WEIGHTS
+
+ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y)
+{
+ const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+
+ switch (info.data_type) {
+ case IMAGE_DATA_TYPE_HALF:
+ return TextureInterpolator<half>::interp(info, x, y);
+ case IMAGE_DATA_TYPE_BYTE:
+ return TextureInterpolator<uchar>::interp(info, x, y);
+ case IMAGE_DATA_TYPE_USHORT:
+ return TextureInterpolator<uint16_t>::interp(info, x, y);
+ case IMAGE_DATA_TYPE_FLOAT:
+ return TextureInterpolator<float>::interp(info, x, y);
+ case IMAGE_DATA_TYPE_HALF4:
+ return TextureInterpolator<half4>::interp(info, x, y);
+ case IMAGE_DATA_TYPE_BYTE4:
+ return TextureInterpolator<uchar4>::interp(info, x, y);
+ case IMAGE_DATA_TYPE_USHORT4:
+ return TextureInterpolator<ushort4>::interp(info, x, y);
+ case IMAGE_DATA_TYPE_FLOAT4:
+ return TextureInterpolator<float4>::interp(info, x, y);
+ default:
+ assert(0);
+ return make_float4(
+ TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B, TEX_IMAGE_MISSING_A);
+ }
+}
+
+ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg,
+ int id,
+ float3 P,
+ InterpolationType interp)
+{
+ const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+
+ if (info.use_transform_3d) {
+ P = transform_point(&info.transform_3d, P);
+ }
+
+ switch (info.data_type) {
+ case IMAGE_DATA_TYPE_HALF:
+ return TextureInterpolator<half>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_BYTE:
+ return TextureInterpolator<uchar>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_USHORT:
+ return TextureInterpolator<uint16_t>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_FLOAT:
+ return TextureInterpolator<float>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_HALF4:
+ return TextureInterpolator<half4>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_BYTE4:
+ return TextureInterpolator<uchar4>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_USHORT4:
+ return TextureInterpolator<ushort4>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_FLOAT4:
+ return TextureInterpolator<float4>::interp_3d(info, P.x, P.y, P.z, interp);
+#ifdef WITH_NANOVDB
+ case IMAGE_DATA_TYPE_NANOVDB_FLOAT:
+ return NanoVDBInterpolator<float>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_NANOVDB_FLOAT3:
+ return NanoVDBInterpolator<nanovdb::Vec3f>::interp_3d(info, P.x, P.y, P.z, interp);
+#endif
+ default:
+ assert(0);
+ return make_float4(
+ TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B, TEX_IMAGE_MISSING_A);
+ }
+}
+
+} /* Namespace. */
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/cpu/kernel.cpp b/intern/cycles/kernel/device/cpu/kernel.cpp
new file mode 100644
index 00000000000..ac1cdf5fffe
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel.cpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CPU kernel entry points */
+
+/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
+ * one with SSE2 intrinsics.
+ */
+#if defined(__x86_64__) || defined(_M_X64)
+# define __KERNEL_SSE2__
+#endif
+
+/* When building the kernel for the native machine, detect kernel features from
+ * the flags set by the compiler.
+ */
+#ifdef WITH_KERNEL_NATIVE
+# ifdef __SSE2__
+# ifndef __KERNEL_SSE2__
+# define __KERNEL_SSE2__
+# endif
+# endif
+# ifdef __SSE3__
+# define __KERNEL_SSE3__
+# endif
+# ifdef __SSSE3__
+# define __KERNEL_SSSE3__
+# endif
+# ifdef __SSE4_1__
+# define __KERNEL_SSE41__
+# endif
+# ifdef __AVX__
+# define __KERNEL_SSE__
+# define __KERNEL_AVX__
+# endif
+# ifdef __AVX2__
+# define __KERNEL_SSE__
+# define __KERNEL_AVX2__
+# endif
+#endif
+
+/* quiet unused define warnings */
+#if defined(__KERNEL_SSE2__)
+/* do nothing */
+#endif
+
+#include "kernel/device/cpu/kernel.h"
+#define KERNEL_ARCH cpu
+#include "kernel/device/cpu/kernel_arch_impl.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Memory Copy */
+
+void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t)
+{
+ if (strcmp(name, "__data") == 0) {
+ kg->__data = *(KernelData *)host;
+ }
+ else {
+ assert(0);
+ }
+}
+
+void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size)
+{
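+ /* The empty if (0) branch anchors the else-if chain generated by the
+ * KERNEL_TEX expansions below. */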
+ if (0) {
+ }
+
+#define KERNEL_TEX(type, tname) \
+ else if (strcmp(name, #tname) == 0) \
+ { \
+ kg->tname.data = (type *)mem; \
+ kg->tname.width = size; \
+ }
+#include "kernel/kernel_textures.h"
+ else {
+ assert(0);
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/cpu/kernel.h b/intern/cycles/kernel/device/cpu/kernel.h
new file mode 100644
index 00000000000..ae2a841835a
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+/* CPU Kernel Interface */
+
+#include "util/util_types.h"
+
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+#define KERNEL_NAME_JOIN(x, y, z) x##_##y##_##z
+#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
+#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
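+
+/* Example expansion, assuming KERNEL_ARCH is cpu_avx2:
+ *   KERNEL_FUNCTION_FULL_NAME(integrator_megakernel)
+ * becomes
+ *   kernel_cpu_avx2_integrator_megakernel
+ */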
+
+struct IntegratorStateCPU;
+struct KernelGlobals;
+struct KernelData;
+
+KernelGlobals *kernel_globals_create();
+void kernel_globals_free(KernelGlobals *kg);
+
+void *kernel_osl_memory(const KernelGlobals *kg);
+bool kernel_osl_use(const KernelGlobals *kg);
+
+void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size);
+void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size);
+
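+/* kernel_arch.h declares the entry points for one architecture and ends with
+ * #undef KERNEL_ARCH, so the macro can be redefined before each include. */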
+#define KERNEL_ARCH cpu
+#include "kernel/device/cpu/kernel_arch.h"
+
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/device/cpu/kernel_arch.h"
+
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/device/cpu/kernel_arch.h"
+
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/device/cpu/kernel_arch.h"
+
+#define KERNEL_ARCH cpu_avx
+#include "kernel/device/cpu/kernel_arch.h"
+
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/device/cpu/kernel_arch.h"
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch.h b/intern/cycles/kernel/device/cpu/kernel_arch.h
new file mode 100644
index 00000000000..81f328c710b
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_arch.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common declaration part of all CPU kernels. */
+
+/* --------------------------------------------------------------------
+ * Integrator.
+ */
+
+#define KERNEL_INTEGRATOR_FUNCTION(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \
+ IntegratorStateCPU *state)
+
+#define KERNEL_INTEGRATOR_SHADE_FUNCTION(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \
+ IntegratorStateCPU *state, \
+ ccl_global float *render_buffer)
+
+#define KERNEL_INTEGRATOR_INIT_FUNCTION(name) \
+ bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \
+ IntegratorStateCPU *state, \
+ KernelWorkTile *tile, \
+ ccl_global float *render_buffer)
+
+KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_camera);
+KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_bake);
+KERNEL_INTEGRATOR_FUNCTION(intersect_closest);
+KERNEL_INTEGRATOR_FUNCTION(intersect_shadow);
+KERNEL_INTEGRATOR_FUNCTION(intersect_subsurface);
+KERNEL_INTEGRATOR_FUNCTION(intersect_volume_stack);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_background);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_light);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_shadow);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_surface);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_volume);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(megakernel);
+
+#undef KERNEL_INTEGRATOR_FUNCTION
+#undef KERNEL_INTEGRATOR_INIT_FUNCTION
+#undef KERNEL_INTEGRATOR_SHADE_FUNCTION
+
+/* --------------------------------------------------------------------
+ * Shader evaluation.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg,
+ const KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset);
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg,
+ const KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset);
+
+/* --------------------------------------------------------------------
+ * Adaptive sampling.
+ */
+
+bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
+ const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int y,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride);
+
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int y,
+ int start_x,
+ int width,
+ int offset,
+ int stride);
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int start_y,
+ int height,
+ int offset,
+ int stride);
+
+/* --------------------------------------------------------------------
+ * Cryptomatte.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int pixel_index);
+
+/* --------------------------------------------------------------------
+ * Bake.
+ */
+/* TODO(sergey): Needs to be re-implemented. Or not? Brecht did it already :) */
+
+void KERNEL_FUNCTION_FULL_NAME(bake)(
+ const KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride);
+
+#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
new file mode 100644
index 00000000000..1432abfd330
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
@@ -0,0 +1,235 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common implementation part of all CPU kernels.
+ *
+ * The idea is that a particular .cpp file sets the needed optimization flags
+ * and simply includes this file, without having to copy the actual
+ * implementation over.
+ */
+
+#pragma once
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+
+#ifndef KERNEL_STUB
+# include "kernel/device/cpu/globals.h"
+# include "kernel/device/cpu/image.h"
+
+# include "kernel/integrator/integrator_state.h"
+# include "kernel/integrator/integrator_state_flow.h"
+# include "kernel/integrator/integrator_state_util.h"
+
+# include "kernel/integrator/integrator_init_from_camera.h"
+# include "kernel/integrator/integrator_init_from_bake.h"
+# include "kernel/integrator/integrator_intersect_closest.h"
+# include "kernel/integrator/integrator_intersect_shadow.h"
+# include "kernel/integrator/integrator_intersect_subsurface.h"
+# include "kernel/integrator/integrator_intersect_volume_stack.h"
+# include "kernel/integrator/integrator_shade_background.h"
+# include "kernel/integrator/integrator_shade_light.h"
+# include "kernel/integrator/integrator_shade_shadow.h"
+# include "kernel/integrator/integrator_shade_surface.h"
+# include "kernel/integrator/integrator_shade_volume.h"
+# include "kernel/integrator/integrator_megakernel.h"
+
+# include "kernel/kernel_film.h"
+# include "kernel/kernel_adaptive_sampling.h"
+# include "kernel/kernel_bake.h"
+# include "kernel/kernel_id_passes.h"
+
+#else
+# define STUB_ASSERT(arch, name) \
+ assert(!(#name " kernel stub for architecture " #arch " was called!"))
+#endif /* KERNEL_STUB */
+// clang-format on
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * Integrator.
+ */
+
+#ifdef KERNEL_STUB
+# define KERNEL_INVOKE(name, ...) (STUB_ASSERT(KERNEL_ARCH, name), 0)
+#else
+# define KERNEL_INVOKE(name, ...) integrator_##name(__VA_ARGS__)
+#endif
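+
+/* In the stub case the comma expression evaluates the assert and yields 0, so
+ * the init kernels below still return false. */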
+
+#define DEFINE_INTEGRATOR_KERNEL(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *kg, \
+ IntegratorStateCPU *state) \
+ { \
+ KERNEL_INVOKE(name, kg, state); \
+ }
+
+#define DEFINE_INTEGRATOR_SHADE_KERNEL(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \
+ const KernelGlobals *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \
+ { \
+ KERNEL_INVOKE(name, kg, state, render_buffer); \
+ }
+
+/* TODO: Either use something like get_work_pixel(), or simplify the tile that
+ * is passed here, so that it does not contain unused fields. */
+#define DEFINE_INTEGRATOR_INIT_KERNEL(name) \
+ bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *kg, \
+ IntegratorStateCPU *state, \
+ KernelWorkTile *tile, \
+ ccl_global float *render_buffer) \
+ { \
+ return KERNEL_INVOKE( \
+ name, kg, state, tile, render_buffer, tile->x, tile->y, tile->start_sample); \
+ }
+
+DEFINE_INTEGRATOR_INIT_KERNEL(init_from_camera)
+DEFINE_INTEGRATOR_INIT_KERNEL(init_from_bake)
+DEFINE_INTEGRATOR_KERNEL(intersect_closest)
+DEFINE_INTEGRATOR_KERNEL(intersect_shadow)
+DEFINE_INTEGRATOR_KERNEL(intersect_subsurface)
+DEFINE_INTEGRATOR_KERNEL(intersect_volume_stack)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_background)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_light)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_shadow)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_surface)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_volume)
+DEFINE_INTEGRATOR_SHADE_KERNEL(megakernel)
+
+/* --------------------------------------------------------------------
+ * Shader evaluation.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg,
+ const KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, shader_eval_displace);
+#else
+ kernel_displace_evaluate(kg, input, output, offset);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg,
+ const KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, shader_eval_background);
+#else
+ kernel_background_evaluate(kg, input, output, offset);
+#endif
+}
+
+/* --------------------------------------------------------------------
+ * Adaptive sampling.
+ */
+
+bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
+ const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int y,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_convergence_check);
+ return false;
+#else
+ return kernel_adaptive_sampling_convergence_check(
+ kg, render_buffer, x, y, threshold, reset, offset, stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int y,
+ int start_x,
+ int width,
+ int offset,
+ int stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_x);
+#else
+ kernel_adaptive_sampling_filter_x(kg, render_buffer, y, start_x, width, offset, stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int start_y,
+ int height,
+ int offset,
+ int stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_y);
+#else
+ kernel_adaptive_sampling_filter_y(kg, render_buffer, x, start_y, height, offset, stride);
+#endif
+}
+
+/* --------------------------------------------------------------------
+ * Cryptomatte.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int pixel_index)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, cryptomatte_postprocess);
+#else
+ kernel_cryptomatte_post(kg, render_buffer, pixel_index);
+#endif
+}
+
+/* --------------------------------------------------------------------
+ * Bake.
+ */
+/* TODO(sergey): Needs to be re-implemented. Or not? Brecht did it already :) */
+
+void KERNEL_FUNCTION_FULL_NAME(bake)(
+ const KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride)
+{
+#if 0
+# ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, bake);
+# else
+# ifdef __BAKING__
+ kernel_bake_evaluate(kg, buffer, sample, x, y, offset, stride);
+# endif
+# endif /* KERNEL_STUB */
+#endif
+}
+
+#undef KERNEL_INVOKE
+#undef DEFINE_INTEGRATOR_KERNEL
+#undef DEFINE_INTEGRATOR_SHADE_KERNEL
+#undef DEFINE_INTEGRATOR_INIT_KERNEL
+
+#undef KERNEL_STUB
+#undef STUB_ASSERT
+#undef KERNEL_ARCH
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/cpu/kernel_avx.cpp b/intern/cycles/kernel/device/cpu/kernel_avx.cpp
new file mode 100644
index 00000000000..220768036ab
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_avx.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them, for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug T36316. */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/device/cpu/kernel.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/device/cpu/kernel_avx2.cpp b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
new file mode 100644
index 00000000000..90c05113cbe
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them, for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug T36316. */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# define __KERNEL_AVX2__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/device/cpu/kernel.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/device/cpu/kernel_sse2.cpp b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp
new file mode 100644
index 00000000000..fb85ef5b0d0
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them, for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug T36316. */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/device/cpu/kernel.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/device/cpu/kernel_sse3.cpp b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp
new file mode 100644
index 00000000000..87baf04258a
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them, for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug T36316. */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/device/cpu/kernel.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/device/cpu/kernel_sse41.cpp b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp
new file mode 100644
index 00000000000..bb421d58815
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE4.1
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them, for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug T36316. */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/device/cpu/kernel.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/device/cpu/kernel_arch_impl.h"