git.blender.org/blender.git
author     Brecht Van Lommel <brecht@blender.org>  2021-09-20 18:59:20 +0300
committer  Brecht Van Lommel <brecht@blender.org>  2021-09-21 15:55:54 +0300
commit     08031197250aeecbaca3803254e6f25b8c7b7b37 (patch)
tree       6fe7ab045f0dc0a423d6557c4073f34309ef4740 /intern/cycles/kernel/device/cpu
parent     fa6b1007bad065440950cd67deb16a04f368856f (diff)
Cycles: merge of cycles-x branch, a major update to the renderer

This includes much improved GPU rendering performance, viewport interactivity,
a new shadow catcher, revamped sampling settings, subsurface scattering
anisotropy, new GPU volume sampling, an improved PMJ sampling pattern, and more.

Some features have also been removed or changed, breaking backwards
compatibility. This includes the removal of the OpenCL backend, for which
alternatives are under development.

Release notes and code docs:
https://wiki.blender.org/wiki/Reference/Release_Notes/3.0/Cycles
https://wiki.blender.org/wiki/Source/Render/Cycles

Credits:
* Sergey Sharybin
* Brecht Van Lommel
* Patrick Mours (OptiX backend)
* Christophe Hery (subsurface scattering anisotropy)
* William Leeson (PMJ sampling pattern)
* Alaska (various fixes and tweaks)
* Thomas Dinges (various fixes)

For the full commit history, see the cycles-x branch. This squashes together
all the changes, since intermediate changes would often fail to build or pass
tests.

Ref T87839, T87837, T87836
Fixes T90734, T89353, T80267, T77185, T69800
Diffstat (limited to 'intern/cycles/kernel/device/cpu')
-rw-r--r--  intern/cycles/kernel/device/cpu/compat.h             101
-rw-r--r--  intern/cycles/kernel/device/cpu/globals.h             61
-rw-r--r--  intern/cycles/kernel/device/cpu/image.h              657
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel.cpp            94
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel.h              62
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_arch.h        113
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_arch_impl.h   235
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_avx.cpp        39
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_avx2.cpp       40
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_sse2.cpp       34
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_sse3.cpp       36
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_sse41.cpp      37
12 files changed, 1509 insertions(+), 0 deletions(-)
diff --git a/intern/cycles/kernel/device/cpu/compat.h b/intern/cycles/kernel/device/cpu/compat.h
new file mode 100644
index 00000000000..bfd936c7bbd
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/compat.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#define __KERNEL_CPU__
+
+/* The release kernel has too many false-positive maybe-uninitialized warnings,
+ * which make it easy to miss actual warnings.
+ */
+#if (defined(__GNUC__) && !defined(__clang__)) && defined(NDEBUG)
+# pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+# pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+
+#include "util/util_half.h"
+#include "util/util_math.h"
+#include "util/util_simd.h"
+#include "util/util_texture.h"
+#include "util/util_types.h"
+
+#define ccl_addr_space
+
+/* On x86-64, glibc versions before 2.16 have an issue where expf is much
+ * slower than the double-precision exp; work around it by calling exp.
+ */
+#if !defined(__KERNEL_GPU__) && defined(__x86_64__) && \
+    defined(__GNU_LIBRARY__) && defined(__GLIBC__) && defined(__GLIBC_MINOR__) && \
+    (__GLIBC__ <= 2 && __GLIBC_MINOR__ < 16)
+# define expf(x) ((float)exp((double)(x)))
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/* Assertions inside the kernel only work for the CPU device, so we wrap them
+ * in a macro which is empty for other devices. */
+
+#define kernel_assert(cond) assert(cond)
+
+/* Texture types to be compatible with CUDA textures. These are really just
+ * simple arrays, and after inlining the fetch hopefully reduces to a simple
+ * pointer lookup. */
+template<typename T> struct texture {
+ ccl_always_inline const T &fetch(int index) const
+ {
+ kernel_assert(index >= 0 && index < width);
+ return data[index];
+ }
+
+ T *data;
+ int width;
+};
+
+/* SIMD vector types and debug print helpers for the SSE/AVX kernels. */
+
+#ifdef __KERNEL_SSE2__
+typedef vector3<sseb> sse3b;
+typedef vector3<ssef> sse3f;
+typedef vector3<ssei> sse3i;
+
+ccl_device_inline void print_sse3b(const char *label, sse3b &a)
+{
+ print_sseb(label, a.x);
+ print_sseb(label, a.y);
+ print_sseb(label, a.z);
+}
+
+ccl_device_inline void print_sse3f(const char *label, sse3f &a)
+{
+ print_ssef(label, a.x);
+ print_ssef(label, a.y);
+ print_ssef(label, a.z);
+}
+
+ccl_device_inline void print_sse3i(const char *label, sse3i &a)
+{
+ print_ssei(label, a.x);
+ print_ssei(label, a.y);
+ print_ssei(label, a.z);
+}
+
+# if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
+typedef vector3<avxf> avx3f;
+# endif
+
+#endif
+
+CCL_NAMESPACE_END
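
A note on the texture<T> wrapper above: fetch() is just a bounds-asserted array
lookup, so after inlining it should compile down to a plain indexed load. Below
is a minimal standalone sketch of the same idea, with kernel_assert and
ccl_always_inline replaced by standard C++; everything here is illustrative,
not Cycles API.

#include <cassert>

/* Illustrative stand-in for the Cycles texture<T> wrapper. */
template<typename T> struct texture {
  const T &fetch(int index) const
  {
    assert(index >= 0 && index < width); /* kernel_assert: CPU-only check. */
    return data[index];
  }

  T *data;
  int width;
};

int main()
{
  float values[3] = {0.25f, 0.5f, 0.75f};
  texture<float> tex = {values, 3};
  /* After inlining, this is just values[1]. */
  return tex.fetch(1) == 0.5f ? 0 : 1;
}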
diff --git a/intern/cycles/kernel/device/cpu/globals.h b/intern/cycles/kernel/device/cpu/globals.h
new file mode 100644
index 00000000000..98b036e269d
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/globals.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#pragma once
+
+#include "kernel/kernel_profiling.h"
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* On the CPU, the KernelGlobals struct is passed along to nearly everywhere in
+ * the kernel, to access constant data. The data is all stored as "textures",
+ * but these are really just standard arrays. We can't actually use globals,
+ * because multiple renders may be running inside the same process. */
+
+#ifdef __OSL__
+struct OSLGlobals;
+struct OSLThreadData;
+struct OSLShadingSystem;
+#endif
+
+typedef struct KernelGlobals {
+#define KERNEL_TEX(type, name) texture<type> name;
+#include "kernel/kernel_textures.h"
+
+ KernelData __data;
+
+#ifdef __OSL__
+  /* On the CPU, we also have the OSL globals here. Most data structures are
+   * shared with SVM; the difference is in the shaders and object/mesh attributes. */
+ OSLGlobals *osl;
+ OSLShadingSystem *osl_ss;
+ OSLThreadData *osl_tdata;
+#endif
+
+ /* **** Run-time data **** */
+
+ ProfilingState profiler;
+} KernelGlobals;
+
+/* Abstraction macros */
+#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
+#define kernel_tex_array(tex) (kg->tex.data)
+#define kernel_data (kg->__data)
+
+CCL_NAMESPACE_END
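
The KERNEL_TEX line in KernelGlobals is an X-macro: kernel/kernel_textures.h is
a flat list of KERNEL_TEX(type, name) entries, and each includer defines
KERNEL_TEX to stamp out whatever it needs, here one texture<type> member per
entry. A self-contained sketch of the pattern follows; the list contents are
invented for illustration.

#include <cstdio>

template<typename T> struct texture {
  T *data;
  int width;
};

/* Stand-in for kernel/kernel_textures.h: a flat list of entries. */
#define TEXTURE_LIST \
  KERNEL_TEX(float, __example_floats) \
  KERNEL_TEX(int, __example_ints)

struct KernelGlobalsSketch {
  /* Same expansion as in globals.h: one texture<type> member per entry. */
#define KERNEL_TEX(type, name) texture<type> name;
  TEXTURE_LIST
#undef KERNEL_TEX
};

int main()
{
  KernelGlobalsSketch kg = {};
  (void)kg.__example_floats;
  (void)kg.__example_ints;
  /* The same list can be expanded again for a different purpose. */
#define KERNEL_TEX(type, name) std::printf("member: %s\n", #name);
  TEXTURE_LIST
#undef KERNEL_TEX
  return 0;
}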
diff --git a/intern/cycles/kernel/device/cpu/image.h b/intern/cycles/kernel/device/cpu/image.h
new file mode 100644
index 00000000000..57e81ab186d
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/image.h
@@ -0,0 +1,657 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_NANOVDB
+# define NANOVDB_USE_INTRINSICS
+# include <nanovdb/NanoVDB.h>
+# include <nanovdb/util/SampleFromVoxels.h>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/* Make template functions private so symbols don't conflict between kernels with different
+ * instruction sets. */
+namespace {
+
+#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
+ { \
+ u[0] = (((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f); \
+ u[1] = ((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f); \
+ u[2] = ((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f); \
+ u[3] = (1.0f / 6.0f) * t * t * t; \
+ } \
+ (void)0
+
+ccl_device_inline float frac(float x, int *ix)
+{
+ int i = float_to_int(x) - ((x < 0.0f) ? 1 : 0);
+ *ix = i;
+ return x - (float)i;
+}
+
+template<typename T> struct TextureInterpolator {
+
+ static ccl_always_inline float4 read(float4 r)
+ {
+ return r;
+ }
+
+ static ccl_always_inline float4 read(uchar4 r)
+ {
+ float f = 1.0f / 255.0f;
+ return make_float4(r.x * f, r.y * f, r.z * f, r.w * f);
+ }
+
+ static ccl_always_inline float4 read(uchar r)
+ {
+ float f = r * (1.0f / 255.0f);
+ return make_float4(f, f, f, 1.0f);
+ }
+
+ static ccl_always_inline float4 read(float r)
+ {
+ /* TODO(dingto): Optimize this, so interpolation
+ * happens on float instead of float4 */
+ return make_float4(r, r, r, 1.0f);
+ }
+
+ static ccl_always_inline float4 read(half4 r)
+ {
+ return half4_to_float4(r);
+ }
+
+ static ccl_always_inline float4 read(half r)
+ {
+ float f = half_to_float(r);
+ return make_float4(f, f, f, 1.0f);
+ }
+
+ static ccl_always_inline float4 read(uint16_t r)
+ {
+ float f = r * (1.0f / 65535.0f);
+ return make_float4(f, f, f, 1.0f);
+ }
+
+ static ccl_always_inline float4 read(ushort4 r)
+ {
+ float f = 1.0f / 65535.0f;
+ return make_float4(r.x * f, r.y * f, r.z * f, r.w * f);
+ }
+
+ static ccl_always_inline float4 read(const T *data, int x, int y, int width, int height)
+ {
+ if (x < 0 || y < 0 || x >= width || y >= height) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ return read(data[y * width + x]);
+ }
+
+ static ccl_always_inline int wrap_periodic(int x, int width)
+ {
+ x %= width;
+ if (x < 0)
+ x += width;
+ return x;
+ }
+
+ static ccl_always_inline int wrap_clamp(int x, int width)
+ {
+ return clamp(x, 0, width - 1);
+ }
+
+ /* ******** 2D interpolation ******** */
+
+ static ccl_always_inline float4 interp_closest(const TextureInfo &info, float x, float y)
+ {
+ const T *data = (const T *)info.data;
+ const int width = info.width;
+ const int height = info.height;
+ int ix, iy;
+ frac(x * (float)width, &ix);
+ frac(y * (float)height, &iy);
+ switch (info.extension) {
+ case EXTENSION_REPEAT:
+ ix = wrap_periodic(ix, width);
+ iy = wrap_periodic(iy, height);
+ break;
+ case EXTENSION_CLIP:
+ if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ ATTR_FALLTHROUGH;
+ case EXTENSION_EXTEND:
+ ix = wrap_clamp(ix, width);
+ iy = wrap_clamp(iy, height);
+ break;
+ default:
+ kernel_assert(0);
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ return read(data[ix + iy * width]);
+ }
+
+ static ccl_always_inline float4 interp_linear(const TextureInfo &info, float x, float y)
+ {
+ const T *data = (const T *)info.data;
+ const int width = info.width;
+ const int height = info.height;
+ int ix, iy, nix, niy;
+ const float tx = frac(x * (float)width - 0.5f, &ix);
+ const float ty = frac(y * (float)height - 0.5f, &iy);
+ switch (info.extension) {
+ case EXTENSION_REPEAT:
+ ix = wrap_periodic(ix, width);
+ iy = wrap_periodic(iy, height);
+ nix = wrap_periodic(ix + 1, width);
+ niy = wrap_periodic(iy + 1, height);
+ break;
+ case EXTENSION_CLIP:
+ nix = ix + 1;
+ niy = iy + 1;
+ break;
+ case EXTENSION_EXTEND:
+ nix = wrap_clamp(ix + 1, width);
+ niy = wrap_clamp(iy + 1, height);
+ ix = wrap_clamp(ix, width);
+ iy = wrap_clamp(iy, height);
+ break;
+ default:
+ kernel_assert(0);
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ return (1.0f - ty) * (1.0f - tx) * read(data, ix, iy, width, height) +
+ (1.0f - ty) * tx * read(data, nix, iy, width, height) +
+ ty * (1.0f - tx) * read(data, ix, niy, width, height) +
+ ty * tx * read(data, nix, niy, width, height);
+ }
+
+ static ccl_always_inline float4 interp_cubic(const TextureInfo &info, float x, float y)
+ {
+ const T *data = (const T *)info.data;
+ const int width = info.width;
+ const int height = info.height;
+ int ix, iy, nix, niy;
+ const float tx = frac(x * (float)width - 0.5f, &ix);
+ const float ty = frac(y * (float)height - 0.5f, &iy);
+ int pix, piy, nnix, nniy;
+ switch (info.extension) {
+ case EXTENSION_REPEAT:
+ ix = wrap_periodic(ix, width);
+ iy = wrap_periodic(iy, height);
+ pix = wrap_periodic(ix - 1, width);
+ piy = wrap_periodic(iy - 1, height);
+ nix = wrap_periodic(ix + 1, width);
+ niy = wrap_periodic(iy + 1, height);
+ nnix = wrap_periodic(ix + 2, width);
+ nniy = wrap_periodic(iy + 2, height);
+ break;
+ case EXTENSION_CLIP:
+ pix = ix - 1;
+ piy = iy - 1;
+ nix = ix + 1;
+ niy = iy + 1;
+ nnix = ix + 2;
+ nniy = iy + 2;
+ break;
+ case EXTENSION_EXTEND:
+ pix = wrap_clamp(ix - 1, width);
+ piy = wrap_clamp(iy - 1, height);
+ nix = wrap_clamp(ix + 1, width);
+ niy = wrap_clamp(iy + 1, height);
+ nnix = wrap_clamp(ix + 2, width);
+ nniy = wrap_clamp(iy + 2, height);
+ ix = wrap_clamp(ix, width);
+ iy = wrap_clamp(iy, height);
+ break;
+ default:
+ kernel_assert(0);
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ const int xc[4] = {pix, ix, nix, nnix};
+ const int yc[4] = {piy, iy, niy, nniy};
+ float u[4], v[4];
+  /* Helper macros to keep the code a reasonable size, and to let the
+   * compiler inline all the matrix multiplications.
+   */
+#define DATA(x, y) (read(data, xc[x], yc[y], width, height))
+#define TERM(col) \
+ (v[col] * \
+ (u[0] * DATA(0, col) + u[1] * DATA(1, col) + u[2] * DATA(2, col) + u[3] * DATA(3, col)))
+
+ SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+ SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+
+ /* Actual interpolation. */
+ return TERM(0) + TERM(1) + TERM(2) + TERM(3);
+#undef TERM
+#undef DATA
+ }
+
+ static ccl_always_inline float4 interp(const TextureInfo &info, float x, float y)
+ {
+ if (UNLIKELY(!info.data)) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ switch (info.interpolation) {
+ case INTERPOLATION_CLOSEST:
+ return interp_closest(info, x, y);
+ case INTERPOLATION_LINEAR:
+ return interp_linear(info, x, y);
+ default:
+ return interp_cubic(info, x, y);
+ }
+ }
+
+ /* ******** 3D interpolation ******** */
+
+ static ccl_always_inline float4 interp_3d_closest(const TextureInfo &info,
+ float x,
+ float y,
+ float z)
+ {
+ int width = info.width;
+ int height = info.height;
+ int depth = info.depth;
+ int ix, iy, iz;
+
+ frac(x * (float)width, &ix);
+ frac(y * (float)height, &iy);
+ frac(z * (float)depth, &iz);
+
+ switch (info.extension) {
+ case EXTENSION_REPEAT:
+ ix = wrap_periodic(ix, width);
+ iy = wrap_periodic(iy, height);
+ iz = wrap_periodic(iz, depth);
+ break;
+ case EXTENSION_CLIP:
+ if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ ATTR_FALLTHROUGH;
+ case EXTENSION_EXTEND:
+ ix = wrap_clamp(ix, width);
+ iy = wrap_clamp(iy, height);
+ iz = wrap_clamp(iz, depth);
+ break;
+ default:
+ kernel_assert(0);
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+
+ const T *data = (const T *)info.data;
+ return read(data[ix + iy * width + iz * width * height]);
+ }
+
+ static ccl_always_inline float4 interp_3d_linear(const TextureInfo &info,
+ float x,
+ float y,
+ float z)
+ {
+ int width = info.width;
+ int height = info.height;
+ int depth = info.depth;
+ int ix, iy, iz;
+ int nix, niy, niz;
+
+ float tx = frac(x * (float)width - 0.5f, &ix);
+ float ty = frac(y * (float)height - 0.5f, &iy);
+ float tz = frac(z * (float)depth - 0.5f, &iz);
+
+ switch (info.extension) {
+ case EXTENSION_REPEAT:
+ ix = wrap_periodic(ix, width);
+ iy = wrap_periodic(iy, height);
+ iz = wrap_periodic(iz, depth);
+
+ nix = wrap_periodic(ix + 1, width);
+ niy = wrap_periodic(iy + 1, height);
+ niz = wrap_periodic(iz + 1, depth);
+ break;
+ case EXTENSION_CLIP:
+ if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ ATTR_FALLTHROUGH;
+ case EXTENSION_EXTEND:
+ nix = wrap_clamp(ix + 1, width);
+ niy = wrap_clamp(iy + 1, height);
+ niz = wrap_clamp(iz + 1, depth);
+
+ ix = wrap_clamp(ix, width);
+ iy = wrap_clamp(iy, height);
+ iz = wrap_clamp(iz, depth);
+ break;
+ default:
+ kernel_assert(0);
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+
+ const T *data = (const T *)info.data;
+ float4 r;
+
+ r = (1.0f - tz) * (1.0f - ty) * (1.0f - tx) *
+ read(data[ix + iy * width + iz * width * height]);
+ r += (1.0f - tz) * (1.0f - ty) * tx * read(data[nix + iy * width + iz * width * height]);
+ r += (1.0f - tz) * ty * (1.0f - tx) * read(data[ix + niy * width + iz * width * height]);
+ r += (1.0f - tz) * ty * tx * read(data[nix + niy * width + iz * width * height]);
+
+ r += tz * (1.0f - ty) * (1.0f - tx) * read(data[ix + iy * width + niz * width * height]);
+ r += tz * (1.0f - ty) * tx * read(data[nix + iy * width + niz * width * height]);
+ r += tz * ty * (1.0f - tx) * read(data[ix + niy * width + niz * width * height]);
+ r += tz * ty * tx * read(data[nix + niy * width + niz * width * height]);
+
+ return r;
+ }
+
+  /* TODO(sergey): For some unspeakable reason both GCC-6 and Clang-3.9 cause
+   * a stack overflow issue in this function unless it is inlined.
+   *
+   * This only happens for the AVX2 kernel with global __KERNEL_SSE__
+   * vectorization enabled.
+   */
+#if defined(__GNUC__) || defined(__clang__)
+ static ccl_always_inline
+#else
+ static ccl_never_inline
+#endif
+ float4
+ interp_3d_cubic(const TextureInfo &info, float x, float y, float z)
+ {
+ int width = info.width;
+ int height = info.height;
+ int depth = info.depth;
+ int ix, iy, iz;
+ int nix, niy, niz;
+ /* Tricubic b-spline interpolation. */
+ const float tx = frac(x * (float)width - 0.5f, &ix);
+ const float ty = frac(y * (float)height - 0.5f, &iy);
+ const float tz = frac(z * (float)depth - 0.5f, &iz);
+ int pix, piy, piz, nnix, nniy, nniz;
+
+ switch (info.extension) {
+ case EXTENSION_REPEAT:
+ ix = wrap_periodic(ix, width);
+ iy = wrap_periodic(iy, height);
+ iz = wrap_periodic(iz, depth);
+
+ pix = wrap_periodic(ix - 1, width);
+ piy = wrap_periodic(iy - 1, height);
+ piz = wrap_periodic(iz - 1, depth);
+
+ nix = wrap_periodic(ix + 1, width);
+ niy = wrap_periodic(iy + 1, height);
+ niz = wrap_periodic(iz + 1, depth);
+
+ nnix = wrap_periodic(ix + 2, width);
+ nniy = wrap_periodic(iy + 2, height);
+ nniz = wrap_periodic(iz + 2, depth);
+ break;
+ case EXTENSION_CLIP:
+ if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ ATTR_FALLTHROUGH;
+ case EXTENSION_EXTEND:
+ pix = wrap_clamp(ix - 1, width);
+ piy = wrap_clamp(iy - 1, height);
+ piz = wrap_clamp(iz - 1, depth);
+
+ nix = wrap_clamp(ix + 1, width);
+ niy = wrap_clamp(iy + 1, height);
+ niz = wrap_clamp(iz + 1, depth);
+
+ nnix = wrap_clamp(ix + 2, width);
+ nniy = wrap_clamp(iy + 2, height);
+ nniz = wrap_clamp(iz + 2, depth);
+
+ ix = wrap_clamp(ix, width);
+ iy = wrap_clamp(iy, height);
+ iz = wrap_clamp(iz, depth);
+ break;
+ default:
+ kernel_assert(0);
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+
+ const int xc[4] = {pix, ix, nix, nnix};
+ const int yc[4] = {width * piy, width * iy, width * niy, width * nniy};
+ const int zc[4] = {
+ width * height * piz, width * height * iz, width * height * niz, width * height * nniz};
+ float u[4], v[4], w[4];
+
+  /* Helper macros to keep the code a reasonable size, and to let the
+   * compiler inline all the matrix multiplications.
+   */
+#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]]))
+#define COL_TERM(col, row) \
+ (v[col] * (u[0] * DATA(0, col, row) + u[1] * DATA(1, col, row) + u[2] * DATA(2, col, row) + \
+ u[3] * DATA(3, col, row)))
+#define ROW_TERM(row) \
+ (w[row] * (COL_TERM(0, row) + COL_TERM(1, row) + COL_TERM(2, row) + COL_TERM(3, row)))
+
+ SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+ SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+ SET_CUBIC_SPLINE_WEIGHTS(w, tz);
+
+ /* Actual interpolation. */
+ const T *data = (const T *)info.data;
+ return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
+
+#undef COL_TERM
+#undef ROW_TERM
+#undef DATA
+ }
+
+ static ccl_always_inline float4
+ interp_3d(const TextureInfo &info, float x, float y, float z, InterpolationType interp)
+ {
+ if (UNLIKELY(!info.data))
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+ switch ((interp == INTERPOLATION_NONE) ? info.interpolation : interp) {
+ case INTERPOLATION_CLOSEST:
+ return interp_3d_closest(info, x, y, z);
+ case INTERPOLATION_LINEAR:
+ return interp_3d_linear(info, x, y, z);
+ default:
+ return interp_3d_cubic(info, x, y, z);
+ }
+ }
+};
+
+#ifdef WITH_NANOVDB
+template<typename T> struct NanoVDBInterpolator {
+
+ typedef typename nanovdb::NanoGrid<T>::AccessorType AccessorType;
+
+ static ccl_always_inline float4 read(float r)
+ {
+ return make_float4(r, r, r, 1.0f);
+ }
+
+ static ccl_always_inline float4 read(nanovdb::Vec3f r)
+ {
+ return make_float4(r[0], r[1], r[2], 1.0f);
+ }
+
+ static ccl_always_inline float4 interp_3d_closest(const AccessorType &acc,
+ float x,
+ float y,
+ float z)
+ {
+ const nanovdb::Vec3f xyz(x, y, z);
+ return read(nanovdb::SampleFromVoxels<AccessorType, 0, false>(acc)(xyz));
+ }
+
+ static ccl_always_inline float4 interp_3d_linear(const AccessorType &acc,
+ float x,
+ float y,
+ float z)
+ {
+ const nanovdb::Vec3f xyz(x - 0.5f, y - 0.5f, z - 0.5f);
+ return read(nanovdb::SampleFromVoxels<AccessorType, 1, false>(acc)(xyz));
+ }
+
+# if defined(__GNUC__) || defined(__clang__)
+ static ccl_always_inline
+# else
+ static ccl_never_inline
+# endif
+ float4
+ interp_3d_cubic(const AccessorType &acc, float x, float y, float z)
+ {
+ int ix, iy, iz;
+ int nix, niy, niz;
+ int pix, piy, piz;
+ int nnix, nniy, nniz;
+ /* Tricubic b-spline interpolation. */
+ const float tx = frac(x - 0.5f, &ix);
+ const float ty = frac(y - 0.5f, &iy);
+ const float tz = frac(z - 0.5f, &iz);
+ pix = ix - 1;
+ piy = iy - 1;
+ piz = iz - 1;
+ nix = ix + 1;
+ niy = iy + 1;
+ niz = iz + 1;
+ nnix = ix + 2;
+ nniy = iy + 2;
+ nniz = iz + 2;
+
+ const int xc[4] = {pix, ix, nix, nnix};
+ const int yc[4] = {piy, iy, niy, nniy};
+ const int zc[4] = {piz, iz, niz, nniz};
+ float u[4], v[4], w[4];
+
+  /* Helper macros to keep the code a reasonable size, and to let the
+   * compiler inline all the matrix multiplications.
+   */
+# define DATA(x, y, z) (read(acc.getValue(nanovdb::Coord(xc[x], yc[y], zc[z]))))
+# define COL_TERM(col, row) \
+ (v[col] * (u[0] * DATA(0, col, row) + u[1] * DATA(1, col, row) + u[2] * DATA(2, col, row) + \
+ u[3] * DATA(3, col, row)))
+# define ROW_TERM(row) \
+ (w[row] * (COL_TERM(0, row) + COL_TERM(1, row) + COL_TERM(2, row) + COL_TERM(3, row)))
+
+ SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+ SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+ SET_CUBIC_SPLINE_WEIGHTS(w, tz);
+
+ /* Actual interpolation. */
+ return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
+
+# undef COL_TERM
+# undef ROW_TERM
+# undef DATA
+ }
+
+ static ccl_always_inline float4
+ interp_3d(const TextureInfo &info, float x, float y, float z, InterpolationType interp)
+ {
+ using namespace nanovdb;
+
+ NanoGrid<T> *const grid = (NanoGrid<T> *)info.data;
+ AccessorType acc = grid->getAccessor();
+
+ switch ((interp == INTERPOLATION_NONE) ? info.interpolation : interp) {
+ case INTERPOLATION_CLOSEST:
+ return interp_3d_closest(acc, x, y, z);
+ case INTERPOLATION_LINEAR:
+ return interp_3d_linear(acc, x, y, z);
+ default:
+ return interp_3d_cubic(acc, x, y, z);
+ }
+ }
+};
+#endif
+
+#undef SET_CUBIC_SPLINE_WEIGHTS
+
+ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y)
+{
+ const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+
+ switch (info.data_type) {
+ case IMAGE_DATA_TYPE_HALF:
+ return TextureInterpolator<half>::interp(info, x, y);
+ case IMAGE_DATA_TYPE_BYTE:
+ return TextureInterpolator<uchar>::interp(info, x, y);
+ case IMAGE_DATA_TYPE_USHORT:
+ return TextureInterpolator<uint16_t>::interp(info, x, y);
+ case IMAGE_DATA_TYPE_FLOAT:
+ return TextureInterpolator<float>::interp(info, x, y);
+ case IMAGE_DATA_TYPE_HALF4:
+ return TextureInterpolator<half4>::interp(info, x, y);
+ case IMAGE_DATA_TYPE_BYTE4:
+ return TextureInterpolator<uchar4>::interp(info, x, y);
+ case IMAGE_DATA_TYPE_USHORT4:
+ return TextureInterpolator<ushort4>::interp(info, x, y);
+ case IMAGE_DATA_TYPE_FLOAT4:
+ return TextureInterpolator<float4>::interp(info, x, y);
+ default:
+ assert(0);
+ return make_float4(
+ TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B, TEX_IMAGE_MISSING_A);
+ }
+}
+
+ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg,
+ int id,
+ float3 P,
+ InterpolationType interp)
+{
+ const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+
+ if (info.use_transform_3d) {
+ P = transform_point(&info.transform_3d, P);
+ }
+
+ switch (info.data_type) {
+ case IMAGE_DATA_TYPE_HALF:
+ return TextureInterpolator<half>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_BYTE:
+ return TextureInterpolator<uchar>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_USHORT:
+ return TextureInterpolator<uint16_t>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_FLOAT:
+ return TextureInterpolator<float>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_HALF4:
+ return TextureInterpolator<half4>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_BYTE4:
+ return TextureInterpolator<uchar4>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_USHORT4:
+ return TextureInterpolator<ushort4>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_FLOAT4:
+ return TextureInterpolator<float4>::interp_3d(info, P.x, P.y, P.z, interp);
+#ifdef WITH_NANOVDB
+ case IMAGE_DATA_TYPE_NANOVDB_FLOAT:
+ return NanoVDBInterpolator<float>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_NANOVDB_FLOAT3:
+ return NanoVDBInterpolator<nanovdb::Vec3f>::interp_3d(info, P.x, P.y, P.z, interp);
+#endif
+ default:
+ assert(0);
+ return make_float4(
+ TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B, TEX_IMAGE_MISSING_A);
+ }
+}
+
+} /* Namespace. */
+
+CCL_NAMESPACE_END
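
The SET_CUBIC_SPLINE_WEIGHTS macro used throughout this file evaluates the four
uniform cubic B-spline basis functions: u[0] = (1-t)^3/6, u[1] = (3t^3-6t^2+4)/6,
u[2] = (-3t^3+3t^2+3t+1)/6, u[3] = t^3/6. For any t in [0, 1) the weights are
non-negative and sum to one, so the cubic interpolation stays inside the convex
hull of the 4x4 (or 4x4x4) sample neighborhood. A small standalone check of that
partition-of-unity property:

#include <cassert>
#include <cmath>

/* The same weights as SET_CUBIC_SPLINE_WEIGHTS, written as a function. */
static void cubic_spline_weights(float u[4], float t)
{
  u[0] = (((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f); /* (1-t)^3/6 */
  u[1] = ((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f);                  /* (3t^3-6t^2+4)/6 */
  u[2] = ((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f);          /* (-3t^3+3t^2+3t+1)/6 */
  u[3] = (1.0f / 6.0f) * t * t * t;                                    /* t^3/6 */
}

int main()
{
  for (float t = 0.0f; t < 1.0f; t += 0.125f) {
    float u[4];
    cubic_spline_weights(u, t);
    /* Partition of unity: the four basis weights sum to 1 for every t. */
    assert(std::fabs(u[0] + u[1] + u[2] + u[3] - 1.0f) < 1e-6f);
  }
  return 0;
}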
diff --git a/intern/cycles/kernel/device/cpu/kernel.cpp b/intern/cycles/kernel/device/cpu/kernel.cpp
new file mode 100644
index 00000000000..ac1cdf5fffe
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel.cpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CPU kernel entry points */
+
+/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
+ * one with SSE2 intrinsics.
+ */
+#if defined(__x86_64__) || defined(_M_X64)
+# define __KERNEL_SSE2__
+#endif
+
+/* When building the kernel for the native machine, detect kernel features
+ * from the flags set by the compiler.
+ */
+#ifdef WITH_KERNEL_NATIVE
+# ifdef __SSE2__
+# ifndef __KERNEL_SSE2__
+# define __KERNEL_SSE2__
+# endif
+# endif
+# ifdef __SSE3__
+# define __KERNEL_SSE3__
+# endif
+# ifdef __SSSE3__
+# define __KERNEL_SSSE3__
+# endif
+# ifdef __SSE4_1__
+# define __KERNEL_SSE41__
+# endif
+# ifdef __AVX__
+# define __KERNEL_SSE__
+# define __KERNEL_AVX__
+# endif
+# ifdef __AVX2__
+# define __KERNEL_SSE__
+# define __KERNEL_AVX2__
+# endif
+#endif
+
+/* quiet unused define warnings */
+#if defined(__KERNEL_SSE2__)
+/* do nothing */
+#endif
+
+#include "kernel/device/cpu/kernel.h"
+#define KERNEL_ARCH cpu
+#include "kernel/device/cpu/kernel_arch_impl.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Memory Copy */
+
+void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t)
+{
+ if (strcmp(name, "__data") == 0) {
+ kg->__data = *(KernelData *)host;
+ }
+ else {
+ assert(0);
+ }
+}
+
+void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size)
+{
+ if (0) {
+ }
+
+#define KERNEL_TEX(type, tname) \
+ else if (strcmp(name, #tname) == 0) \
+ { \
+ kg->tname.data = (type *)mem; \
+ kg->tname.width = size; \
+ }
+#include "kernel/kernel_textures.h"
+ else {
+ assert(0);
+ }
+}
+
+CCL_NAMESPACE_END
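
The `if (0) {}` in kernel_global_memory_copy() only exists to anchor an else-if
chain: each KERNEL_TEX expansion from kernel/kernel_textures.h contributes one
`else if (strcmp(name, ...) == 0)` branch, so the function dispatches on the
texture name. A condensed sketch of the same trick, with an invented two-entry
list:

#include <cstdio>
#include <cstring>

/* Invented two-entry stand-in for kernel/kernel_textures.h. */
#define TEXTURE_LIST \
  KERNEL_TEX(__tri_verts) \
  KERNEL_TEX(__objects)

/* Hypothetical analogue of kernel_global_memory_copy(): the if (0)
 * anchors the else-if chain that the macro expansion produces. */
static const char *match_texture(const char *name)
{
  if (0) {
  }
#define KERNEL_TEX(tname) \
  else if (std::strcmp(name, #tname) == 0) \
  { \
    return #tname; \
  }
  TEXTURE_LIST
#undef KERNEL_TEX
  else {
    return "unknown";
  }
}

int main()
{
  std::printf("%s\n", match_texture("__objects")); /* Prints: __objects */
  return 0;
}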
diff --git a/intern/cycles/kernel/device/cpu/kernel.h b/intern/cycles/kernel/device/cpu/kernel.h
new file mode 100644
index 00000000000..ae2a841835a
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+/* CPU Kernel Interface */
+
+#include "util/util_types.h"
+
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+#define KERNEL_NAME_JOIN(x, y, z) x##_##y##_##z
+#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
+#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
+
+struct IntegratorStateCPU;
+struct KernelGlobals;
+struct KernelData;
+
+KernelGlobals *kernel_globals_create();
+void kernel_globals_free(KernelGlobals *kg);
+
+void *kernel_osl_memory(const KernelGlobals *kg);
+bool kernel_osl_use(const KernelGlobals *kg);
+
+void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size);
+void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size);
+
+#define KERNEL_ARCH cpu
+#include "kernel/device/cpu/kernel_arch.h"
+
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/device/cpu/kernel_arch.h"
+
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/device/cpu/kernel_arch.h"
+
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/device/cpu/kernel_arch.h"
+
+#define KERNEL_ARCH cpu_avx
+#include "kernel/device/cpu/kernel_arch.h"
+
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/device/cpu/kernel_arch.h"
+
+CCL_NAMESPACE_END
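
Two techniques meet in this header: the declaration header kernel_arch.h is
included once per architecture with a different KERNEL_ARCH each time (the
header #undefs it at its end), and KERNEL_FUNCTION_FULL_NAME pastes that
architecture into each symbol name. The extra KERNEL_NAME_EVAL level is what
forces KERNEL_ARCH to be macro-expanded before the ## pasting happens. A
minimal sketch of the expansion, with an illustrative kernel name:

#include <cstdio>

#define KERNEL_NAME_JOIN(x, y, z) x##_##y##_##z
#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)

/* KERNEL_NAME_EVAL expands KERNEL_ARCH before KERNEL_NAME_JOIN pastes,
 * so this defines kernel_cpu_avx2_shade_surface (illustrative name). */
#define KERNEL_ARCH cpu_avx2
void KERNEL_FUNCTION_FULL_NAME(shade_surface)()
{
  std::printf("kernel_cpu_avx2_shade_surface called\n");
}
#undef KERNEL_ARCH

int main()
{
  kernel_cpu_avx2_shade_surface(); /* The pasted name is an ordinary symbol. */
  return 0;
}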
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch.h b/intern/cycles/kernel/device/cpu/kernel_arch.h
new file mode 100644
index 00000000000..81f328c710b
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_arch.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common declaration part of all CPU kernels. */
+
+/* --------------------------------------------------------------------
+ * Integrator.
+ */
+
+#define KERNEL_INTEGRATOR_FUNCTION(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \
+ IntegratorStateCPU *state)
+
+#define KERNEL_INTEGRATOR_SHADE_FUNCTION(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \
+ IntegratorStateCPU *state, \
+ ccl_global float *render_buffer)
+
+#define KERNEL_INTEGRATOR_INIT_FUNCTION(name) \
+ bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \
+ IntegratorStateCPU *state, \
+ KernelWorkTile *tile, \
+ ccl_global float *render_buffer)
+
+KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_camera);
+KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_bake);
+KERNEL_INTEGRATOR_FUNCTION(intersect_closest);
+KERNEL_INTEGRATOR_FUNCTION(intersect_shadow);
+KERNEL_INTEGRATOR_FUNCTION(intersect_subsurface);
+KERNEL_INTEGRATOR_FUNCTION(intersect_volume_stack);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_background);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_light);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_shadow);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_surface);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_volume);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(megakernel);
+
+#undef KERNEL_INTEGRATOR_FUNCTION
+#undef KERNEL_INTEGRATOR_INIT_FUNCTION
+#undef KERNEL_INTEGRATOR_SHADE_FUNCTION
+
+/* --------------------------------------------------------------------
+ * Shader evaluation.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg,
+ const KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset);
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg,
+ const KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset);
+
+/* --------------------------------------------------------------------
+ * Adaptive sampling.
+ */
+
+bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
+ const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int y,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride);
+
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int y,
+ int start_x,
+ int width,
+ int offset,
+ int stride);
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int start_y,
+ int height,
+ int offset,
+ int stride);
+
+/* --------------------------------------------------------------------
+ * Cryptomatte.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int pixel_index);
+
+/* --------------------------------------------------------------------
+ * Bake.
+ */
+/* TODO(sergey): Needs to be re-implemented. Or not? Brecht did it already :) */
+
+void KERNEL_FUNCTION_FULL_NAME(bake)(
+ const KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride);
+
+#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
new file mode 100644
index 00000000000..1432abfd330
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
@@ -0,0 +1,235 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common implementation part of all CPU kernels.
+ *
+ * The idea is that a particular .cpp file sets the needed optimization flags
+ * and simply includes this file, without having to copy the actual
+ * implementation over. */
+
+#pragma once
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+
+#ifndef KERNEL_STUB
+# include "kernel/device/cpu/globals.h"
+# include "kernel/device/cpu/image.h"
+
+# include "kernel/integrator/integrator_state.h"
+# include "kernel/integrator/integrator_state_flow.h"
+# include "kernel/integrator/integrator_state_util.h"
+
+# include "kernel/integrator/integrator_init_from_camera.h"
+# include "kernel/integrator/integrator_init_from_bake.h"
+# include "kernel/integrator/integrator_intersect_closest.h"
+# include "kernel/integrator/integrator_intersect_shadow.h"
+# include "kernel/integrator/integrator_intersect_subsurface.h"
+# include "kernel/integrator/integrator_intersect_volume_stack.h"
+# include "kernel/integrator/integrator_shade_background.h"
+# include "kernel/integrator/integrator_shade_light.h"
+# include "kernel/integrator/integrator_shade_shadow.h"
+# include "kernel/integrator/integrator_shade_surface.h"
+# include "kernel/integrator/integrator_shade_volume.h"
+# include "kernel/integrator/integrator_megakernel.h"
+
+# include "kernel/kernel_film.h"
+# include "kernel/kernel_adaptive_sampling.h"
+# include "kernel/kernel_bake.h"
+# include "kernel/kernel_id_passes.h"
+
+#else
+# define STUB_ASSERT(arch, name) \
+ assert(!(#name " kernel stub for architecture " #arch " was called!"))
+#endif /* KERNEL_STUB */
+// clang-format on
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * Integrator.
+ */
+
+#ifdef KERNEL_STUB
+# define KERNEL_INVOKE(name, ...) (STUB_ASSERT(KERNEL_ARCH, name), 0)
+#else
+# define KERNEL_INVOKE(name, ...) integrator_##name(__VA_ARGS__)
+#endif
+
+#define DEFINE_INTEGRATOR_KERNEL(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *kg, \
+ IntegratorStateCPU *state) \
+ { \
+ KERNEL_INVOKE(name, kg, state); \
+ }
+
+#define DEFINE_INTEGRATOR_SHADE_KERNEL(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \
+ const KernelGlobals *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \
+ { \
+ KERNEL_INVOKE(name, kg, state, render_buffer); \
+ }
+
+/* TODO: Either use something like get_work_pixel(), or simplify tile which is passed here, so
+ * that it does not contain unused fields. */
+#define DEFINE_INTEGRATOR_INIT_KERNEL(name) \
+ bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *kg, \
+ IntegratorStateCPU *state, \
+ KernelWorkTile *tile, \
+ ccl_global float *render_buffer) \
+ { \
+ return KERNEL_INVOKE( \
+ name, kg, state, tile, render_buffer, tile->x, tile->y, tile->start_sample); \
+ }
+
+DEFINE_INTEGRATOR_INIT_KERNEL(init_from_camera)
+DEFINE_INTEGRATOR_INIT_KERNEL(init_from_bake)
+DEFINE_INTEGRATOR_KERNEL(intersect_closest)
+DEFINE_INTEGRATOR_KERNEL(intersect_shadow)
+DEFINE_INTEGRATOR_KERNEL(intersect_subsurface)
+DEFINE_INTEGRATOR_KERNEL(intersect_volume_stack)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_background)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_light)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_shadow)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_surface)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_volume)
+DEFINE_INTEGRATOR_SHADE_KERNEL(megakernel)
+
+/* --------------------------------------------------------------------
+ * Shader evaluation.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg,
+ const KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, shader_eval_displace);
+#else
+ kernel_displace_evaluate(kg, input, output, offset);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg,
+ const KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, shader_eval_background);
+#else
+ kernel_background_evaluate(kg, input, output, offset);
+#endif
+}
+
+/* --------------------------------------------------------------------
+ * Adaptive sampling.
+ */
+
+bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
+ const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int y,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_convergence_check);
+ return false;
+#else
+ return kernel_adaptive_sampling_convergence_check(
+ kg, render_buffer, x, y, threshold, reset, offset, stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int y,
+ int start_x,
+ int width,
+ int offset,
+ int stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_x);
+#else
+ kernel_adaptive_sampling_filter_x(kg, render_buffer, y, start_x, width, offset, stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int start_y,
+ int height,
+ int offset,
+ int stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_y);
+#else
+ kernel_adaptive_sampling_filter_y(kg, render_buffer, x, start_y, height, offset, stride);
+#endif
+}
+
+/* --------------------------------------------------------------------
+ * Cryptomatte.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int pixel_index)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, cryptomatte_postprocess);
+#else
+ kernel_cryptomatte_post(kg, render_buffer, pixel_index);
+#endif
+}
+
+/* --------------------------------------------------------------------
+ * Bake.
+ */
+/* TODO(sergey): Needs to be re-implemented. Or not? Brecht did it already :) */
+
+void KERNEL_FUNCTION_FULL_NAME(bake)(
+ const KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride)
+{
+#if 0
+# ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, bake);
+# else
+# ifdef __BAKING__
+ kernel_bake_evaluate(kg, buffer, sample, x, y, offset, stride);
+# endif
+# endif /* KERNEL_STUB */
+#endif
+}
+
+#undef KERNEL_INVOKE
+#undef DEFINE_INTEGRATOR_KERNEL
+#undef DEFINE_INTEGRATOR_SHADE_KERNEL
+#undef DEFINE_INTEGRATOR_INIT_KERNEL
+
+#undef KERNEL_STUB
+#undef STUB_ASSERT
+#undef KERNEL_ARCH
+
+CCL_NAMESPACE_END
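
When a CPU microarchitecture kernel is not compiled in, the including .cpp file
defines KERNEL_STUB and every KERNEL_INVOKE collapses into an assertion rather
than a call, so the entry points still link but trap if ever reached. A
condensed, self-contained sketch of that dual expansion (toggle the define to
see both sides; names are illustrative):

#include <cassert>
#include <cstdio>

/* Uncomment to get the stub behaviour, mirroring #ifdef KERNEL_STUB above. */
/* #define KERNEL_STUB */

#ifdef KERNEL_STUB
# define KERNEL_INVOKE(name, ...) \
  (assert(!(#name " kernel stub was called!")), 0)
#else
# define KERNEL_INVOKE(name, ...) integrator_##name(__VA_ARGS__)
#endif

/* Illustrative integrator kernel. */
static int integrator_shade_surface(int state)
{
  std::printf("shading, state %d\n", state);
  return 0;
}

int main()
{
  /* With KERNEL_STUB defined, this asserts instead of calling. */
  return KERNEL_INVOKE(shade_surface, 42);
}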
diff --git a/intern/cycles/kernel/device/cpu/kernel_avx.cpp b/intern/cycles/kernel/device/cpu/kernel_avx.cpp
new file mode 100644
index 00000000000..220768036ab
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_avx.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug T36316. */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/device/cpu/kernel.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/device/cpu/kernel_avx2.cpp b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
new file mode 100644
index 00000000000..90c05113cbe
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug T36316. */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# define __KERNEL_AVX2__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/device/cpu/kernel.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/device/cpu/kernel_sse2.cpp b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp
new file mode 100644
index 00000000000..fb85ef5b0d0
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug T36316. */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/device/cpu/kernel.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/device/cpu/kernel_sse3.cpp b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp
new file mode 100644
index 00000000000..87baf04258a
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug T36316. */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/device/cpu/kernel.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/device/cpu/kernel_sse41.cpp b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp
new file mode 100644
index 00000000000..bb421d58815
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE4.1
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug T36316. */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/device/cpu/kernel.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/device/cpu/kernel_arch_impl.h"