git.blender.org/blender.git

author    Hans Goudey <h.goudey@me.com>  2021-10-25 20:48:15 +0300
committer Hans Goudey <h.goudey@me.com>  2021-10-25 21:13:54 +0300
commit    47d0b2f14e818ee8e0116df47421ba0a52596790 (patch)
tree      8637ba628347e1d1e67d8384e370c7ecaad16d23 /intern/cycles/kernel/device
parent    05f500580b6f0e5bdad5f432e1734b639b47816e (diff)
parent    972677b25e1d84e4c02d0e55b12ce87661faff5e (diff)

Merge branch 'master' into geometry-nodes-level-set-nodes

Diffstat (limited to 'intern/cycles/kernel/device')
-rw-r--r--  intern/cycles/kernel/device/cpu/compat.h | 99
-rw-r--r--  intern/cycles/kernel/device/cpu/globals.h | 63
-rw-r--r--  intern/cycles/kernel/device/cpu/image.h | 657
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel.cpp | 94
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel.h | 62
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_arch.h | 110
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_arch_impl.h | 243
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_avx.cpp | 39
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_avx2.cpp | 40
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_sse2.cpp | 34
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_sse3.cpp | 36
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_sse41.cpp | 37
-rw-r--r--  intern/cycles/kernel/device/cuda/compat.h | 141
-rw-r--r--  intern/cycles/kernel/device/cuda/config.h | 114
-rw-r--r--  intern/cycles/kernel/device/cuda/globals.h | 49
-rw-r--r--  intern/cycles/kernel/device/cuda/kernel.cu | 28
-rw-r--r--  intern/cycles/kernel/device/gpu/image.h | 278
-rw-r--r--  intern/cycles/kernel/device/gpu/kernel.h | 910
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_active_index.h | 87
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_prefix_sum.h | 52
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_reduce.h | 87
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_sorted_index.h | 63
-rw-r--r--  intern/cycles/kernel/device/hip/compat.h | 120
-rw-r--r--  intern/cycles/kernel/device/hip/config.h | 57
-rw-r--r--  intern/cycles/kernel/device/hip/globals.h | 49
-rw-r--r--  intern/cycles/kernel/device/hip/kernel.cpp | 28
-rw-r--r--  intern/cycles/kernel/device/metal/compat.h | 126
-rw-r--r--  intern/cycles/kernel/device/optix/compat.h | 133
-rw-r--r--  intern/cycles/kernel/device/optix/globals.h | 60
-rw-r--r--  intern/cycles/kernel/device/optix/kernel.cu | 406
-rw-r--r--  intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu | 29
31 files changed, 4331 insertions(+), 0 deletions(-)
diff --git a/intern/cycles/kernel/device/cpu/compat.h b/intern/cycles/kernel/device/cpu/compat.h
new file mode 100644
index 00000000000..888c0d5d872
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/compat.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#define __KERNEL_CPU__
+
+/* The release kernel produces too many false-positive maybe-uninitialized
+ * warnings, which makes it easy to miss actual warnings.
+ */
+#if (defined(__GNUC__) && !defined(__clang__)) && defined(NDEBUG)
+# pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+# pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+
+#include "util/util_half.h"
+#include "util/util_math.h"
+#include "util/util_simd.h"
+#include "util/util_texture.h"
+#include "util/util_types.h"
+
+/* On x86_64, versions of glibc before 2.16 have an issue where expf is
+ * much slower than the double-precision exp, so fall back to exp there.
+ */
+#if !defined(__KERNEL_GPU__) && defined(__x86_64__) && \
+ defined(__GNU_LIBRARY__) && defined(__GLIBC__) && defined(__GLIBC_MINOR__) && \
+ (__GLIBC__ <= 2 && __GLIBC_MINOR__ < 16)
+# define expf(x) ((float)exp((double)(x)))
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/* Assertions inside the kernel only work for the CPU device, so we wrap them
+ * in a macro which is empty for other devices. */
+
+#define kernel_assert(cond) assert(cond)
+
+/* Texture types to be compatible with CUDA textures. These are really just
+ * simple arrays, and after inlining, fetches hopefully reduce to simple
+ * pointer lookups. */
+template<typename T> struct texture {
+ ccl_always_inline const T &fetch(int index) const
+ {
+ kernel_assert(index >= 0 && index < width);
+ return data[index];
+ }
+
+ T *data;
+ int width;
+};
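+
+/* A minimal usage sketch (hypothetical values, not part of the kernel
+ * sources): the host fills in the struct, after which it behaves like a
+ * bounds-checked array:
+ *
+ *   static float pixels[4] = {0.1f, 0.2f, 0.3f, 0.4f};
+ *   texture<float> tex;
+ *   tex.data = pixels;
+ *   tex.width = 4;
+ *   float f = tex.fetch(2);  // 0.3f; tex.fetch(4) would trip kernel_assert
+ */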
+
+/* Macros to handle different memory storage on different devices */
+
+#ifdef __KERNEL_SSE2__
+typedef vector3<sseb> sse3b;
+typedef vector3<ssef> sse3f;
+typedef vector3<ssei> sse3i;
+
+ccl_device_inline void print_sse3b(const char *label, sse3b &a)
+{
+ print_sseb(label, a.x);
+ print_sseb(label, a.y);
+ print_sseb(label, a.z);
+}
+
+ccl_device_inline void print_sse3f(const char *label, sse3f &a)
+{
+ print_ssef(label, a.x);
+ print_ssef(label, a.y);
+ print_ssef(label, a.z);
+}
+
+ccl_device_inline void print_sse3i(const char *label, sse3i &a)
+{
+ print_ssei(label, a.x);
+ print_ssei(label, a.y);
+ print_ssei(label, a.z);
+}
+
+# if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
+typedef vector3<avxf> avx3f;
+# endif
+
+#endif
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/cpu/globals.h b/intern/cycles/kernel/device/cpu/globals.h
new file mode 100644
index 00000000000..fb9aae38cfc
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/globals.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#pragma once
+
+#include "kernel/kernel_profiling.h"
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in
+ * the kernel, to access constant data. These are all stored as "textures", but
+ * they are really just standard arrays. We can't actually use globals because
+ * multiple renders may be running inside the same process. */
+
+#ifdef __OSL__
+struct OSLGlobals;
+struct OSLThreadData;
+struct OSLShadingSystem;
+#endif
+
+typedef struct KernelGlobalsCPU {
+#define KERNEL_TEX(type, name) texture<type> name;
+#include "kernel/kernel_textures.h"
+
+ KernelData __data;
+
+#ifdef __OSL__
+  /* On the CPU, we also have the OSL globals here. Most data structures are
+   * shared with SVM; the difference is in the shaders and object/mesh
+   * attributes. */
+ OSLGlobals *osl;
+ OSLShadingSystem *osl_ss;
+ OSLThreadData *osl_tdata;
+#endif
+
+ /* **** Run-time data **** */
+
+ ProfilingState profiler;
+} KernelGlobalsCPU;
+
+typedef const KernelGlobalsCPU *ccl_restrict KernelGlobals;
+
+/* Abstraction macros */
+#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
+#define kernel_tex_array(tex) (kg->tex.data)
+#define kernel_data (kg->__data)
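+
+/* Sketch of how the pieces fit together, for a hypothetical entry
+ * KERNEL_TEX(float4, __example_tex) in kernel/kernel_textures.h:
+ *
+ *   the #include inside KernelGlobalsCPU expands to:  texture<float4> __example_tex;
+ *   kernel_tex_fetch(__example_tex, i)  expands to:   kg->__example_tex.fetch(i)
+ *
+ * i.e. a bounds-checked array lookup on the CPU, while GPU devices map the
+ * same macro to their own memory access primitives. */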
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/cpu/image.h b/intern/cycles/kernel/device/cpu/image.h
new file mode 100644
index 00000000000..93f956e354d
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/image.h
@@ -0,0 +1,657 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_NANOVDB
+# define NANOVDB_USE_INTRINSICS
+# include <nanovdb/NanoVDB.h>
+# include <nanovdb/util/SampleFromVoxels.h>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/* Make template functions private so symbols don't conflict between kernels with different
+ * instruction sets. */
+namespace {
+
+#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
+ { \
+ u[0] = (((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f); \
+ u[1] = ((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f); \
+ u[2] = ((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f); \
+ u[3] = (1.0f / 6.0f) * t * t * t; \
+ } \
+ (void)0
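+
+/* These are the uniform cubic B-spline basis weights: for any t in [0, 1)
+ * they are non-negative and sum to exactly 1 (at t = 0 they evaluate to
+ * {1/6, 2/3, 1/6, 0}), so cubic filtering preserves overall intensity. */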
+
+ccl_device_inline float frac(float x, int *ix)
+{
+ int i = float_to_int(x) - ((x < 0.0f) ? 1 : 0);
+ *ix = i;
+ return x - (float)i;
+}
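+
+/* frac() splits x into integer and fractional parts with flooring rather than
+ * truncation, e.g. frac(-1.25f, &ix) returns 0.75f with ix == -2, so negative
+ * texture coordinates keep wrapping consistently. */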
+
+template<typename T> struct TextureInterpolator {
+
+ static ccl_always_inline float4 read(float4 r)
+ {
+ return r;
+ }
+
+ static ccl_always_inline float4 read(uchar4 r)
+ {
+ float f = 1.0f / 255.0f;
+ return make_float4(r.x * f, r.y * f, r.z * f, r.w * f);
+ }
+
+ static ccl_always_inline float4 read(uchar r)
+ {
+ float f = r * (1.0f / 255.0f);
+ return make_float4(f, f, f, 1.0f);
+ }
+
+ static ccl_always_inline float4 read(float r)
+ {
+ /* TODO(dingto): Optimize this, so interpolation
+ * happens on float instead of float4 */
+ return make_float4(r, r, r, 1.0f);
+ }
+
+ static ccl_always_inline float4 read(half4 r)
+ {
+ return half4_to_float4_image(r);
+ }
+
+ static ccl_always_inline float4 read(half r)
+ {
+ float f = half_to_float_image(r);
+ return make_float4(f, f, f, 1.0f);
+ }
+
+ static ccl_always_inline float4 read(uint16_t r)
+ {
+ float f = r * (1.0f / 65535.0f);
+ return make_float4(f, f, f, 1.0f);
+ }
+
+ static ccl_always_inline float4 read(ushort4 r)
+ {
+ float f = 1.0f / 65535.0f;
+ return make_float4(r.x * f, r.y * f, r.z * f, r.w * f);
+ }
+
+ static ccl_always_inline float4 read(const T *data, int x, int y, int width, int height)
+ {
+ if (x < 0 || y < 0 || x >= width || y >= height) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ return read(data[y * width + x]);
+ }
+
+ static ccl_always_inline int wrap_periodic(int x, int width)
+ {
+ x %= width;
+ if (x < 0)
+ x += width;
+ return x;
+ }
+
+ static ccl_always_inline int wrap_clamp(int x, int width)
+ {
+ return clamp(x, 0, width - 1);
+ }
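+
+  /* Examples for width = 4: wrap_periodic(-1, 4) == 3 and
+   * wrap_periodic(5, 4) == 1 (EXTENSION_REPEAT), while wrap_clamp(-1, 4) == 0
+   * and wrap_clamp(5, 4) == 3 (EXTENSION_EXTEND). */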
+
+ /* ******** 2D interpolation ******** */
+
+ static ccl_always_inline float4 interp_closest(const TextureInfo &info, float x, float y)
+ {
+ const T *data = (const T *)info.data;
+ const int width = info.width;
+ const int height = info.height;
+ int ix, iy;
+ frac(x * (float)width, &ix);
+ frac(y * (float)height, &iy);
+ switch (info.extension) {
+ case EXTENSION_REPEAT:
+ ix = wrap_periodic(ix, width);
+ iy = wrap_periodic(iy, height);
+ break;
+ case EXTENSION_CLIP:
+ if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ ATTR_FALLTHROUGH;
+ case EXTENSION_EXTEND:
+ ix = wrap_clamp(ix, width);
+ iy = wrap_clamp(iy, height);
+ break;
+ default:
+ kernel_assert(0);
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ return read(data[ix + iy * width]);
+ }
+
+ static ccl_always_inline float4 interp_linear(const TextureInfo &info, float x, float y)
+ {
+ const T *data = (const T *)info.data;
+ const int width = info.width;
+ const int height = info.height;
+ int ix, iy, nix, niy;
+ const float tx = frac(x * (float)width - 0.5f, &ix);
+ const float ty = frac(y * (float)height - 0.5f, &iy);
+ switch (info.extension) {
+ case EXTENSION_REPEAT:
+ ix = wrap_periodic(ix, width);
+ iy = wrap_periodic(iy, height);
+ nix = wrap_periodic(ix + 1, width);
+ niy = wrap_periodic(iy + 1, height);
+ break;
+ case EXTENSION_CLIP:
+ nix = ix + 1;
+ niy = iy + 1;
+ break;
+ case EXTENSION_EXTEND:
+ nix = wrap_clamp(ix + 1, width);
+ niy = wrap_clamp(iy + 1, height);
+ ix = wrap_clamp(ix, width);
+ iy = wrap_clamp(iy, height);
+ break;
+ default:
+ kernel_assert(0);
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ return (1.0f - ty) * (1.0f - tx) * read(data, ix, iy, width, height) +
+ (1.0f - ty) * tx * read(data, nix, iy, width, height) +
+ ty * (1.0f - tx) * read(data, ix, niy, width, height) +
+ ty * tx * read(data, nix, niy, width, height);
+ }
+
+ static ccl_always_inline float4 interp_cubic(const TextureInfo &info, float x, float y)
+ {
+ const T *data = (const T *)info.data;
+ const int width = info.width;
+ const int height = info.height;
+ int ix, iy, nix, niy;
+ const float tx = frac(x * (float)width - 0.5f, &ix);
+ const float ty = frac(y * (float)height - 0.5f, &iy);
+ int pix, piy, nnix, nniy;
+ switch (info.extension) {
+ case EXTENSION_REPEAT:
+ ix = wrap_periodic(ix, width);
+ iy = wrap_periodic(iy, height);
+ pix = wrap_periodic(ix - 1, width);
+ piy = wrap_periodic(iy - 1, height);
+ nix = wrap_periodic(ix + 1, width);
+ niy = wrap_periodic(iy + 1, height);
+ nnix = wrap_periodic(ix + 2, width);
+ nniy = wrap_periodic(iy + 2, height);
+ break;
+ case EXTENSION_CLIP:
+ pix = ix - 1;
+ piy = iy - 1;
+ nix = ix + 1;
+ niy = iy + 1;
+ nnix = ix + 2;
+ nniy = iy + 2;
+ break;
+ case EXTENSION_EXTEND:
+ pix = wrap_clamp(ix - 1, width);
+ piy = wrap_clamp(iy - 1, height);
+ nix = wrap_clamp(ix + 1, width);
+ niy = wrap_clamp(iy + 1, height);
+ nnix = wrap_clamp(ix + 2, width);
+ nniy = wrap_clamp(iy + 2, height);
+ ix = wrap_clamp(ix, width);
+ iy = wrap_clamp(iy, height);
+ break;
+ default:
+ kernel_assert(0);
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ const int xc[4] = {pix, ix, nix, nnix};
+ const int yc[4] = {piy, iy, niy, nniy};
+ float u[4], v[4];
+    /* Helper macros to keep the code a reasonable size, letting the compiler
+     * inline all the matrix multiplications.
+     */
+#define DATA(x, y) (read(data, xc[x], yc[y], width, height))
+#define TERM(col) \
+ (v[col] * \
+ (u[0] * DATA(0, col) + u[1] * DATA(1, col) + u[2] * DATA(2, col) + u[3] * DATA(3, col)))
+
+ SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+ SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+
+ /* Actual interpolation. */
+ return TERM(0) + TERM(1) + TERM(2) + TERM(3);
+#undef TERM
+#undef DATA
+ }
+
+ static ccl_always_inline float4 interp(const TextureInfo &info, float x, float y)
+ {
+ if (UNLIKELY(!info.data)) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ switch (info.interpolation) {
+ case INTERPOLATION_CLOSEST:
+ return interp_closest(info, x, y);
+ case INTERPOLATION_LINEAR:
+ return interp_linear(info, x, y);
+ default:
+ return interp_cubic(info, x, y);
+ }
+ }
+
+ /* ******** 3D interpolation ******** */
+
+ static ccl_always_inline float4 interp_3d_closest(const TextureInfo &info,
+ float x,
+ float y,
+ float z)
+ {
+ int width = info.width;
+ int height = info.height;
+ int depth = info.depth;
+ int ix, iy, iz;
+
+ frac(x * (float)width, &ix);
+ frac(y * (float)height, &iy);
+ frac(z * (float)depth, &iz);
+
+ switch (info.extension) {
+ case EXTENSION_REPEAT:
+ ix = wrap_periodic(ix, width);
+ iy = wrap_periodic(iy, height);
+ iz = wrap_periodic(iz, depth);
+ break;
+ case EXTENSION_CLIP:
+ if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ ATTR_FALLTHROUGH;
+ case EXTENSION_EXTEND:
+ ix = wrap_clamp(ix, width);
+ iy = wrap_clamp(iy, height);
+ iz = wrap_clamp(iz, depth);
+ break;
+ default:
+ kernel_assert(0);
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+
+ const T *data = (const T *)info.data;
+ return read(data[ix + iy * width + iz * width * height]);
+ }
+
+ static ccl_always_inline float4 interp_3d_linear(const TextureInfo &info,
+ float x,
+ float y,
+ float z)
+ {
+ int width = info.width;
+ int height = info.height;
+ int depth = info.depth;
+ int ix, iy, iz;
+ int nix, niy, niz;
+
+ float tx = frac(x * (float)width - 0.5f, &ix);
+ float ty = frac(y * (float)height - 0.5f, &iy);
+ float tz = frac(z * (float)depth - 0.5f, &iz);
+
+ switch (info.extension) {
+ case EXTENSION_REPEAT:
+ ix = wrap_periodic(ix, width);
+ iy = wrap_periodic(iy, height);
+ iz = wrap_periodic(iz, depth);
+
+ nix = wrap_periodic(ix + 1, width);
+ niy = wrap_periodic(iy + 1, height);
+ niz = wrap_periodic(iz + 1, depth);
+ break;
+ case EXTENSION_CLIP:
+ if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ ATTR_FALLTHROUGH;
+ case EXTENSION_EXTEND:
+ nix = wrap_clamp(ix + 1, width);
+ niy = wrap_clamp(iy + 1, height);
+ niz = wrap_clamp(iz + 1, depth);
+
+ ix = wrap_clamp(ix, width);
+ iy = wrap_clamp(iy, height);
+ iz = wrap_clamp(iz, depth);
+ break;
+ default:
+ kernel_assert(0);
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+
+ const T *data = (const T *)info.data;
+ float4 r;
+
+ r = (1.0f - tz) * (1.0f - ty) * (1.0f - tx) *
+ read(data[ix + iy * width + iz * width * height]);
+ r += (1.0f - tz) * (1.0f - ty) * tx * read(data[nix + iy * width + iz * width * height]);
+ r += (1.0f - tz) * ty * (1.0f - tx) * read(data[ix + niy * width + iz * width * height]);
+ r += (1.0f - tz) * ty * tx * read(data[nix + niy * width + iz * width * height]);
+
+ r += tz * (1.0f - ty) * (1.0f - tx) * read(data[ix + iy * width + niz * width * height]);
+ r += tz * (1.0f - ty) * tx * read(data[nix + iy * width + niz * width * height]);
+ r += tz * ty * (1.0f - tx) * read(data[ix + niy * width + niz * width * height]);
+ r += tz * ty * tx * read(data[nix + niy * width + niz * width * height]);
+
+ return r;
+ }
+
+  /* TODO(sergey): For some unspeakable reason both GCC-6 and Clang-3.9 cause
+   * a stack overflow in this function unless it is inlined.
+   *
+   * This only happens for the AVX2 kernel with global __KERNEL_SSE__
+   * vectorization enabled.
+   */
+#if defined(__GNUC__) || defined(__clang__)
+ static ccl_always_inline
+#else
+ static ccl_never_inline
+#endif
+ float4
+ interp_3d_cubic(const TextureInfo &info, float x, float y, float z)
+ {
+ int width = info.width;
+ int height = info.height;
+ int depth = info.depth;
+ int ix, iy, iz;
+ int nix, niy, niz;
+ /* Tricubic b-spline interpolation. */
+ const float tx = frac(x * (float)width - 0.5f, &ix);
+ const float ty = frac(y * (float)height - 0.5f, &iy);
+ const float tz = frac(z * (float)depth - 0.5f, &iz);
+ int pix, piy, piz, nnix, nniy, nniz;
+
+ switch (info.extension) {
+ case EXTENSION_REPEAT:
+ ix = wrap_periodic(ix, width);
+ iy = wrap_periodic(iy, height);
+ iz = wrap_periodic(iz, depth);
+
+ pix = wrap_periodic(ix - 1, width);
+ piy = wrap_periodic(iy - 1, height);
+ piz = wrap_periodic(iz - 1, depth);
+
+ nix = wrap_periodic(ix + 1, width);
+ niy = wrap_periodic(iy + 1, height);
+ niz = wrap_periodic(iz + 1, depth);
+
+ nnix = wrap_periodic(ix + 2, width);
+ nniy = wrap_periodic(iy + 2, height);
+ nniz = wrap_periodic(iz + 2, depth);
+ break;
+ case EXTENSION_CLIP:
+ if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ ATTR_FALLTHROUGH;
+ case EXTENSION_EXTEND:
+ pix = wrap_clamp(ix - 1, width);
+ piy = wrap_clamp(iy - 1, height);
+ piz = wrap_clamp(iz - 1, depth);
+
+ nix = wrap_clamp(ix + 1, width);
+ niy = wrap_clamp(iy + 1, height);
+ niz = wrap_clamp(iz + 1, depth);
+
+ nnix = wrap_clamp(ix + 2, width);
+ nniy = wrap_clamp(iy + 2, height);
+ nniz = wrap_clamp(iz + 2, depth);
+
+ ix = wrap_clamp(ix, width);
+ iy = wrap_clamp(iy, height);
+ iz = wrap_clamp(iz, depth);
+ break;
+ default:
+ kernel_assert(0);
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+
+ const int xc[4] = {pix, ix, nix, nnix};
+ const int yc[4] = {width * piy, width * iy, width * niy, width * nniy};
+ const int zc[4] = {
+ width * height * piz, width * height * iz, width * height * niz, width * height * nniz};
+ float u[4], v[4], w[4];
+
+    /* Helper macros to keep the code a reasonable size, letting the compiler
+     * inline all the matrix multiplications.
+     */
+#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]]))
+#define COL_TERM(col, row) \
+ (v[col] * (u[0] * DATA(0, col, row) + u[1] * DATA(1, col, row) + u[2] * DATA(2, col, row) + \
+ u[3] * DATA(3, col, row)))
+#define ROW_TERM(row) \
+ (w[row] * (COL_TERM(0, row) + COL_TERM(1, row) + COL_TERM(2, row) + COL_TERM(3, row)))
+
+ SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+ SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+ SET_CUBIC_SPLINE_WEIGHTS(w, tz);
+
+ /* Actual interpolation. */
+ const T *data = (const T *)info.data;
+ return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
+
+#undef COL_TERM
+#undef ROW_TERM
+#undef DATA
+ }
+
+ static ccl_always_inline float4
+ interp_3d(const TextureInfo &info, float x, float y, float z, InterpolationType interp)
+ {
+ if (UNLIKELY(!info.data))
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+ switch ((interp == INTERPOLATION_NONE) ? info.interpolation : interp) {
+ case INTERPOLATION_CLOSEST:
+ return interp_3d_closest(info, x, y, z);
+ case INTERPOLATION_LINEAR:
+ return interp_3d_linear(info, x, y, z);
+ default:
+ return interp_3d_cubic(info, x, y, z);
+ }
+ }
+};
+
+#ifdef WITH_NANOVDB
+template<typename T> struct NanoVDBInterpolator {
+
+ typedef typename nanovdb::NanoGrid<T>::AccessorType AccessorType;
+
+ static ccl_always_inline float4 read(float r)
+ {
+ return make_float4(r, r, r, 1.0f);
+ }
+
+ static ccl_always_inline float4 read(nanovdb::Vec3f r)
+ {
+ return make_float4(r[0], r[1], r[2], 1.0f);
+ }
+
+ static ccl_always_inline float4 interp_3d_closest(const AccessorType &acc,
+ float x,
+ float y,
+ float z)
+ {
+ const nanovdb::Vec3f xyz(x, y, z);
+ return read(nanovdb::SampleFromVoxels<AccessorType, 0, false>(acc)(xyz));
+ }
+
+ static ccl_always_inline float4 interp_3d_linear(const AccessorType &acc,
+ float x,
+ float y,
+ float z)
+ {
+ const nanovdb::Vec3f xyz(x - 0.5f, y - 0.5f, z - 0.5f);
+ return read(nanovdb::SampleFromVoxels<AccessorType, 1, false>(acc)(xyz));
+ }
+
+# if defined(__GNUC__) || defined(__clang__)
+ static ccl_always_inline
+# else
+ static ccl_never_inline
+# endif
+ float4
+ interp_3d_cubic(const AccessorType &acc, float x, float y, float z)
+ {
+ int ix, iy, iz;
+ int nix, niy, niz;
+ int pix, piy, piz;
+ int nnix, nniy, nniz;
+ /* Tricubic b-spline interpolation. */
+ const float tx = frac(x - 0.5f, &ix);
+ const float ty = frac(y - 0.5f, &iy);
+ const float tz = frac(z - 0.5f, &iz);
+ pix = ix - 1;
+ piy = iy - 1;
+ piz = iz - 1;
+ nix = ix + 1;
+ niy = iy + 1;
+ niz = iz + 1;
+ nnix = ix + 2;
+ nniy = iy + 2;
+ nniz = iz + 2;
+
+ const int xc[4] = {pix, ix, nix, nnix};
+ const int yc[4] = {piy, iy, niy, nniy};
+ const int zc[4] = {piz, iz, niz, nniz};
+ float u[4], v[4], w[4];
+
+    /* Helper macros to keep the code a reasonable size, letting the compiler
+     * inline all the matrix multiplications.
+     */
+# define DATA(x, y, z) (read(acc.getValue(nanovdb::Coord(xc[x], yc[y], zc[z]))))
+# define COL_TERM(col, row) \
+ (v[col] * (u[0] * DATA(0, col, row) + u[1] * DATA(1, col, row) + u[2] * DATA(2, col, row) + \
+ u[3] * DATA(3, col, row)))
+# define ROW_TERM(row) \
+ (w[row] * (COL_TERM(0, row) + COL_TERM(1, row) + COL_TERM(2, row) + COL_TERM(3, row)))
+
+ SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+ SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+ SET_CUBIC_SPLINE_WEIGHTS(w, tz);
+
+ /* Actual interpolation. */
+ return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
+
+# undef COL_TERM
+# undef ROW_TERM
+# undef DATA
+ }
+
+ static ccl_always_inline float4
+ interp_3d(const TextureInfo &info, float x, float y, float z, InterpolationType interp)
+ {
+ using namespace nanovdb;
+
+ NanoGrid<T> *const grid = (NanoGrid<T> *)info.data;
+ AccessorType acc = grid->getAccessor();
+
+ switch ((interp == INTERPOLATION_NONE) ? info.interpolation : interp) {
+ case INTERPOLATION_CLOSEST:
+ return interp_3d_closest(acc, x, y, z);
+ case INTERPOLATION_LINEAR:
+ return interp_3d_linear(acc, x, y, z);
+ default:
+ return interp_3d_cubic(acc, x, y, z);
+ }
+ }
+};
+#endif
+
+#undef SET_CUBIC_SPLINE_WEIGHTS
+
+ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y)
+{
+ const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+
+ switch (info.data_type) {
+ case IMAGE_DATA_TYPE_HALF:
+ return TextureInterpolator<half>::interp(info, x, y);
+ case IMAGE_DATA_TYPE_BYTE:
+ return TextureInterpolator<uchar>::interp(info, x, y);
+ case IMAGE_DATA_TYPE_USHORT:
+ return TextureInterpolator<uint16_t>::interp(info, x, y);
+ case IMAGE_DATA_TYPE_FLOAT:
+ return TextureInterpolator<float>::interp(info, x, y);
+ case IMAGE_DATA_TYPE_HALF4:
+ return TextureInterpolator<half4>::interp(info, x, y);
+ case IMAGE_DATA_TYPE_BYTE4:
+ return TextureInterpolator<uchar4>::interp(info, x, y);
+ case IMAGE_DATA_TYPE_USHORT4:
+ return TextureInterpolator<ushort4>::interp(info, x, y);
+ case IMAGE_DATA_TYPE_FLOAT4:
+ return TextureInterpolator<float4>::interp(info, x, y);
+ default:
+ assert(0);
+ return make_float4(
+ TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B, TEX_IMAGE_MISSING_A);
+ }
+}
+
+ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg,
+ int id,
+ float3 P,
+ InterpolationType interp)
+{
+ const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+
+ if (info.use_transform_3d) {
+ P = transform_point(&info.transform_3d, P);
+ }
+
+ switch (info.data_type) {
+ case IMAGE_DATA_TYPE_HALF:
+ return TextureInterpolator<half>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_BYTE:
+ return TextureInterpolator<uchar>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_USHORT:
+ return TextureInterpolator<uint16_t>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_FLOAT:
+ return TextureInterpolator<float>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_HALF4:
+ return TextureInterpolator<half4>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_BYTE4:
+ return TextureInterpolator<uchar4>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_USHORT4:
+ return TextureInterpolator<ushort4>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_FLOAT4:
+ return TextureInterpolator<float4>::interp_3d(info, P.x, P.y, P.z, interp);
+#ifdef WITH_NANOVDB
+ case IMAGE_DATA_TYPE_NANOVDB_FLOAT:
+ return NanoVDBInterpolator<float>::interp_3d(info, P.x, P.y, P.z, interp);
+ case IMAGE_DATA_TYPE_NANOVDB_FLOAT3:
+ return NanoVDBInterpolator<nanovdb::Vec3f>::interp_3d(info, P.x, P.y, P.z, interp);
+#endif
+ default:
+ assert(0);
+ return make_float4(
+ TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B, TEX_IMAGE_MISSING_A);
+ }
+}
+
+} /* Namespace. */
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/cpu/kernel.cpp b/intern/cycles/kernel/device/cpu/kernel.cpp
new file mode 100644
index 00000000000..8519b77aa08
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel.cpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CPU kernel entry points */
+
+/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
+ * one with SSE2 intrinsics.
+ */
+#if defined(__x86_64__) || defined(_M_X64)
+# define __KERNEL_SSE2__
+#endif
+
+/* When building the kernel for the native machine, detect kernel features
+ * from the flags set by the compiler.
+ */
+#ifdef WITH_KERNEL_NATIVE
+# ifdef __SSE2__
+# ifndef __KERNEL_SSE2__
+# define __KERNEL_SSE2__
+# endif
+# endif
+# ifdef __SSE3__
+# define __KERNEL_SSE3__
+# endif
+# ifdef __SSSE3__
+# define __KERNEL_SSSE3__
+# endif
+# ifdef __SSE4_1__
+# define __KERNEL_SSE41__
+# endif
+# ifdef __AVX__
+# define __KERNEL_SSE__
+# define __KERNEL_AVX__
+# endif
+# ifdef __AVX2__
+# define __KERNEL_SSE__
+# define __KERNEL_AVX2__
+# endif
+#endif
+
+/* quiet unused define warnings */
+#if defined(__KERNEL_SSE2__)
+/* do nothing */
+#endif
+
+#include "kernel/device/cpu/kernel.h"
+#define KERNEL_ARCH cpu
+#include "kernel/device/cpu/kernel_arch_impl.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Memory Copy */
+
+void kernel_const_copy(KernelGlobalsCPU *kg, const char *name, void *host, size_t)
+{
+ if (strcmp(name, "__data") == 0) {
+ kg->__data = *(KernelData *)host;
+ }
+ else {
+ assert(0);
+ }
+}
+
+void kernel_global_memory_copy(KernelGlobalsCPU *kg, const char *name, void *mem, size_t size)
+{
+ if (0) {
+ }
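+
+  /* The empty if (0) above exists so that every KERNEL_TEX entry below can
+   * expand to a uniform "else if" clause. */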
+
+#define KERNEL_TEX(type, tname) \
+ else if (strcmp(name, #tname) == 0) \
+ { \
+ kg->tname.data = (type *)mem; \
+ kg->tname.width = size; \
+ }
+#include "kernel/kernel_textures.h"
+ else {
+ assert(0);
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/cpu/kernel.h b/intern/cycles/kernel/device/cpu/kernel.h
new file mode 100644
index 00000000000..28337a58898
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+/* CPU Kernel Interface */
+
+#include "util/util_types.h"
+
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+#define KERNEL_NAME_JOIN(x, y, z) x##_##y##_##z
+#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
+#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
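+
+/* Example expansion: with KERNEL_ARCH defined as cpu_avx2,
+ * KERNEL_FUNCTION_FULL_NAME(integrator_intersect_closest) pastes together to
+ * kernel_cpu_avx2_integrator_intersect_closest. */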
+
+struct IntegratorStateCPU;
+struct KernelGlobalsCPU;
+struct KernelData;
+
+KernelGlobalsCPU *kernel_globals_create();
+void kernel_globals_free(KernelGlobalsCPU *kg);
+
+void *kernel_osl_memory(const KernelGlobalsCPU *kg);
+bool kernel_osl_use(const KernelGlobalsCPU *kg);
+
+void kernel_const_copy(KernelGlobalsCPU *kg, const char *name, void *host, size_t size);
+void kernel_global_memory_copy(KernelGlobalsCPU *kg, const char *name, void *mem, size_t size);
+
+#define KERNEL_ARCH cpu
+#include "kernel/device/cpu/kernel_arch.h"
+
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/device/cpu/kernel_arch.h"
+
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/device/cpu/kernel_arch.h"
+
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/device/cpu/kernel_arch.h"
+
+#define KERNEL_ARCH cpu_avx
+#include "kernel/device/cpu/kernel_arch.h"
+
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/device/cpu/kernel_arch.h"
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch.h b/intern/cycles/kernel/device/cpu/kernel_arch.h
new file mode 100644
index 00000000000..432ac5e15a9
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_arch.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common declaration part of all CPU kernels. */
+
+/* --------------------------------------------------------------------
+ * Integrator.
+ */
+
+#define KERNEL_INTEGRATOR_FUNCTION(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *ccl_restrict kg, \
+ IntegratorStateCPU *state)
+
+#define KERNEL_INTEGRATOR_SHADE_FUNCTION(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *ccl_restrict kg, \
+ IntegratorStateCPU *state, \
+ ccl_global float *render_buffer)
+
+#define KERNEL_INTEGRATOR_INIT_FUNCTION(name) \
+ bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *ccl_restrict kg, \
+ IntegratorStateCPU *state, \
+ KernelWorkTile *tile, \
+ ccl_global float *render_buffer)
+
+KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_camera);
+KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_bake);
+KERNEL_INTEGRATOR_FUNCTION(intersect_closest);
+KERNEL_INTEGRATOR_FUNCTION(intersect_shadow);
+KERNEL_INTEGRATOR_FUNCTION(intersect_subsurface);
+KERNEL_INTEGRATOR_FUNCTION(intersect_volume_stack);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_background);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_light);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_shadow);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_surface);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_volume);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(megakernel);
+
+#undef KERNEL_INTEGRATOR_FUNCTION
+#undef KERNEL_INTEGRATOR_INIT_FUNCTION
+#undef KERNEL_INTEGRATOR_SHADE_FUNCTION
+
+/* --------------------------------------------------------------------
+ * Shader evaluation.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobalsCPU *kg,
+ const KernelShaderEvalInput *input,
+ float *output,
+ const int offset);
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobalsCPU *kg,
+ const KernelShaderEvalInput *input,
+ float *output,
+ const int offset);
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_curve_shadow_transparency)(
+ const KernelGlobalsCPU *kg,
+ const KernelShaderEvalInput *input,
+ float *output,
+ const int offset);
+
+/* --------------------------------------------------------------------
+ * Adaptive sampling.
+ */
+
+bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
+ const KernelGlobalsCPU *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int y,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride);
+
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobalsCPU *kg,
+ ccl_global float *render_buffer,
+ int y,
+ int start_x,
+ int width,
+ int offset,
+ int stride);
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobalsCPU *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int start_y,
+ int height,
+ int offset,
+ int stride);
+
+/* --------------------------------------------------------------------
+ * Cryptomatte.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU *kg,
+ ccl_global float *render_buffer,
+ int pixel_index);
+
+#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
new file mode 100644
index 00000000000..ba777062113
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
@@ -0,0 +1,243 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common implementation part of all CPU kernels.
+ *
+ * The idea is that particular .cpp files set the needed optimization flags and
+ * simply include this file, without worrying about copying the actual
+ * implementation over.
+ */
+
+#pragma once
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+
+#ifndef KERNEL_STUB
+# include "kernel/device/cpu/globals.h"
+# include "kernel/device/cpu/image.h"
+
+# include "kernel/integrator/integrator_state.h"
+# include "kernel/integrator/integrator_state_flow.h"
+# include "kernel/integrator/integrator_state_util.h"
+
+# include "kernel/integrator/integrator_init_from_camera.h"
+# include "kernel/integrator/integrator_init_from_bake.h"
+# include "kernel/integrator/integrator_intersect_closest.h"
+# include "kernel/integrator/integrator_intersect_shadow.h"
+# include "kernel/integrator/integrator_intersect_subsurface.h"
+# include "kernel/integrator/integrator_intersect_volume_stack.h"
+# include "kernel/integrator/integrator_shade_background.h"
+# include "kernel/integrator/integrator_shade_light.h"
+# include "kernel/integrator/integrator_shade_shadow.h"
+# include "kernel/integrator/integrator_shade_surface.h"
+# include "kernel/integrator/integrator_shade_volume.h"
+# include "kernel/integrator/integrator_megakernel.h"
+
+# include "kernel/kernel_film.h"
+# include "kernel/kernel_adaptive_sampling.h"
+# include "kernel/kernel_bake.h"
+# include "kernel/kernel_id_passes.h"
+
+#else
+# define STUB_ASSERT(arch, name) \
+ assert(!(#name " kernel stub for architecture " #arch " was called!"))
+#endif /* KERNEL_STUB */
+// clang-format on
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * Integrator.
+ */
+
+#ifdef KERNEL_STUB
+# define KERNEL_INVOKE(name, ...) (STUB_ASSERT(KERNEL_ARCH, name), 0)
+#else
+# define KERNEL_INVOKE(name, ...) integrator_##name(__VA_ARGS__)
+#endif
+
+/* TODO: Either use something like get_work_pixel(), or simplify tile which is passed here, so
+ * that it does not contain unused fields. */
+#define DEFINE_INTEGRATOR_INIT_KERNEL(name) \
+ bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \
+ IntegratorStateCPU *state, \
+ KernelWorkTile *tile, \
+ ccl_global float *render_buffer) \
+ { \
+ return KERNEL_INVOKE( \
+ name, kg, state, tile, render_buffer, tile->x, tile->y, tile->start_sample); \
+ }
+
+#define DEFINE_INTEGRATOR_KERNEL(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \
+ IntegratorStateCPU *state) \
+ { \
+ KERNEL_INVOKE(name, kg, state); \
+ }
+
+#define DEFINE_INTEGRATOR_SHADE_KERNEL(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \
+ const KernelGlobalsCPU *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \
+ { \
+ KERNEL_INVOKE(name, kg, state, render_buffer); \
+ }
+
+#define DEFINE_INTEGRATOR_SHADOW_KERNEL(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \
+ IntegratorStateCPU *state) \
+ { \
+ KERNEL_INVOKE(name, kg, &state->shadow); \
+ }
+
+#define DEFINE_INTEGRATOR_SHADOW_SHADE_KERNEL(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \
+ const KernelGlobalsCPU *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \
+ { \
+ KERNEL_INVOKE(name, kg, &state->shadow, render_buffer); \
+ }
+
+DEFINE_INTEGRATOR_INIT_KERNEL(init_from_camera)
+DEFINE_INTEGRATOR_INIT_KERNEL(init_from_bake)
+DEFINE_INTEGRATOR_KERNEL(intersect_closest)
+DEFINE_INTEGRATOR_KERNEL(intersect_subsurface)
+DEFINE_INTEGRATOR_KERNEL(intersect_volume_stack)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_background)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_light)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_surface)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_volume)
+DEFINE_INTEGRATOR_SHADE_KERNEL(megakernel)
+DEFINE_INTEGRATOR_SHADOW_KERNEL(intersect_shadow)
+DEFINE_INTEGRATOR_SHADOW_SHADE_KERNEL(shade_shadow)
+
+/* --------------------------------------------------------------------
+ * Shader evaluation.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobalsCPU *kg,
+ const KernelShaderEvalInput *input,
+ float *output,
+ const int offset)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, shader_eval_displace);
+#else
+ kernel_displace_evaluate(kg, input, output, offset);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobalsCPU *kg,
+ const KernelShaderEvalInput *input,
+ float *output,
+ const int offset)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, shader_eval_background);
+#else
+ kernel_background_evaluate(kg, input, output, offset);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_curve_shadow_transparency)(
+ const KernelGlobalsCPU *kg,
+ const KernelShaderEvalInput *input,
+ float *output,
+ const int offset)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, shader_eval_curve_shadow_transparency);
+#else
+ kernel_curve_shadow_transparency_evaluate(kg, input, output, offset);
+#endif
+}
+
+/* --------------------------------------------------------------------
+ * Adaptive sampling.
+ */
+
+bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
+ const KernelGlobalsCPU *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int y,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_convergence_check);
+ return false;
+#else
+ return kernel_adaptive_sampling_convergence_check(
+ kg, render_buffer, x, y, threshold, reset, offset, stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobalsCPU *kg,
+ ccl_global float *render_buffer,
+ int y,
+ int start_x,
+ int width,
+ int offset,
+ int stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_x);
+#else
+ kernel_adaptive_sampling_filter_x(kg, render_buffer, y, start_x, width, offset, stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobalsCPU *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int start_y,
+ int height,
+ int offset,
+ int stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_y);
+#else
+ kernel_adaptive_sampling_filter_y(kg, render_buffer, x, start_y, height, offset, stride);
+#endif
+}
+
+/* --------------------------------------------------------------------
+ * Cryptomatte.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU *kg,
+ ccl_global float *render_buffer,
+ int pixel_index)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, cryptomatte_postprocess);
+#else
+ kernel_cryptomatte_post(kg, render_buffer, pixel_index);
+#endif
+}
+
+#undef KERNEL_INVOKE
+#undef DEFINE_INTEGRATOR_KERNEL
+#undef DEFINE_INTEGRATOR_SHADE_KERNEL
+#undef DEFINE_INTEGRATOR_INIT_KERNEL
+
+#undef KERNEL_STUB
+#undef STUB_ASSERT
+#undef KERNEL_ARCH
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/cpu/kernel_avx.cpp b/intern/cycles/kernel/device/cpu/kernel_avx.cpp
new file mode 100644
index 00000000000..220768036ab
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_avx.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug T36316. */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/device/cpu/kernel.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/device/cpu/kernel_avx2.cpp b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
new file mode 100644
index 00000000000..90c05113cbe
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug T36316. */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# define __KERNEL_AVX2__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/device/cpu/kernel.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/device/cpu/kernel_sse2.cpp b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp
new file mode 100644
index 00000000000..fb85ef5b0d0
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug T36316. */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/device/cpu/kernel.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/device/cpu/kernel_sse3.cpp b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp
new file mode 100644
index 00000000000..87baf04258a
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug T36316. */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/device/cpu/kernel.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/device/cpu/kernel_sse41.cpp b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp
new file mode 100644
index 00000000000..bb421d58815
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE4.1
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+# define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug T36316. */
+# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# endif
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/device/cpu/kernel.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h
new file mode 100644
index 00000000000..8a50eb1a3d5
--- /dev/null
+++ b/intern/cycles/kernel/device/cuda/compat.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#define __KERNEL_GPU__
+#define __KERNEL_CUDA__
+#define CCL_NAMESPACE_BEGIN
+#define CCL_NAMESPACE_END
+
+#ifndef ATTR_FALLTHROUGH
+# define ATTR_FALLTHROUGH
+#endif
+
+/* Manual definitions so we can compile without the CUDA toolkit. */
+
+#ifdef __CUDACC_RTC__
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+#else
+# include <stdint.h>
+#endif
+
+#ifdef CYCLES_CUBIN_CC
+# define FLT_MIN 1.175494350822287507969e-38f
+# define FLT_MAX 340282346638528859811704183484516925440.0f
+# define FLT_EPSILON 1.192092896e-07F
+#endif
+
+/* Qualifiers */
+
+#define ccl_device __device__ __inline__
+#if __CUDA_ARCH__ < 500
+# define ccl_device_inline __device__ __forceinline__
+# define ccl_device_forceinline __device__ __forceinline__
+#else
+# define ccl_device_inline __device__ __inline__
+# define ccl_device_forceinline __device__ __forceinline__
+#endif
+#define ccl_device_noinline __device__ __noinline__
+#define ccl_device_noinline_cpu ccl_device
+#define ccl_global
+#define ccl_static_constant __constant__
+#define ccl_device_constant __constant__ __device__
+#define ccl_constant const
+#define ccl_gpu_shared __shared__
+#define ccl_private
+#define ccl_may_alias
+#define ccl_restrict __restrict__
+#define ccl_loop_no_unroll
+#define ccl_align(n) __align__(n)
+#define ccl_optional_struct_init
+
+/* No assert supported for CUDA */
+
+#define kernel_assert(cond)
+
+/* GPU thread, block, grid size and index */
+
+#define ccl_gpu_thread_idx_x (threadIdx.x)
+#define ccl_gpu_block_dim_x (blockDim.x)
+#define ccl_gpu_block_idx_x (blockIdx.x)
+#define ccl_gpu_grid_dim_x (gridDim.x)
+#define ccl_gpu_warp_size (warpSize)
+
+#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
+#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
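+
+/* For example, with ccl_gpu_block_dim_x == 256, the thread with
+ * ccl_gpu_block_idx_x == 2 and ccl_gpu_thread_idx_x == 5 gets
+ * ccl_gpu_global_id_x() == 2 * 256 + 5 == 517. */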
+
+/* GPU warp synchronization. */
+
+#define ccl_gpu_syncthreads() __syncthreads()
+#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
+#define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down_sync(mask, var, delta)
+#define ccl_gpu_popc(x) __popc(x)
+
+/* GPU texture objects */
+
+typedef unsigned long long CUtexObject;
+typedef CUtexObject ccl_gpu_tex_object;
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj,
+ const float x,
+ const float y)
+{
+ return tex2D<T>(texobj, x, y);
+}
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj,
+ const float x,
+ const float y,
+ const float z)
+{
+ return tex3D<T>(texobj, x, y, z);
+}
+
+/* Use fast math functions */
+
+#define cosf(x) __cosf(((float)(x)))
+#define sinf(x) __sinf(((float)(x)))
+#define powf(x, y) __powf(((float)(x)), ((float)(y)))
+#define tanf(x) __tanf(((float)(x)))
+#define logf(x) __logf(((float)(x)))
+#define expf(x) __expf(((float)(x)))
+
+/* Half */
+
+typedef unsigned short half;
+
+__device__ half __float2half(const float f)
+{
+ half val;
+ asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
+ return val;
+}
+
+__device__ float __half2float(const half h)
+{
+ float val;
+ asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h));
+ return val;
+}
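+
+/* Round-trip usage sketch:
+ *
+ *   half h = __float2half(0.5f);
+ *   float f = __half2float(h);  // 0.5f, exactly representable in fp16
+ */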
+
+/* Types */
+
+#include "util/util_half.h"
+#include "util/util_types.h"
diff --git a/intern/cycles/kernel/device/cuda/config.h b/intern/cycles/kernel/device/cuda/config.h
new file mode 100644
index 00000000000..46196dcdb51
--- /dev/null
+++ b/intern/cycles/kernel/device/cuda/config.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Device data taken from the CUDA occupancy calculator.
+ *
+ * Terminology
+ * - CUDA GPUs have multiple streaming multiprocessors
+ * - Each multiprocessor executes multiple thread blocks
+ * - Each thread block contains a number of threads, also known as the block size
+ * - Multiprocessors have a fixed number of registers, and the number of
+ *   registers used by each thread limits the number of threads per block.
+ */
+
+/* 3.0 and 3.5 */
+#if __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
+# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
+# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
+# define GPU_BLOCK_MAX_THREADS 1024
+# define GPU_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+# define GPU_KERNEL_BLOCK_NUM_THREADS 256
+# define GPU_KERNEL_MAX_REGISTERS 63
+
+/* 3.2 */
+#elif __CUDA_ARCH__ == 320
+# define GPU_MULTIPROCESSOR_MAX_REGISTERS 32768
+# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
+# define GPU_BLOCK_MAX_THREADS 1024
+# define GPU_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+# define GPU_KERNEL_BLOCK_NUM_THREADS 256
+# define GPU_KERNEL_MAX_REGISTERS 63
+
+/* 3.7 */
+#elif __CUDA_ARCH__ == 370
+# define GPU_MULTIPROCESSOR_MAX_REGISTERS 65536
+# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
+# define GPU_BLOCK_MAX_THREADS 1024
+# define GPU_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+# define GPU_KERNEL_BLOCK_NUM_THREADS 256
+# define GPU_KERNEL_MAX_REGISTERS 63
+
+/* 5.x, 6.x */
+#elif __CUDA_ARCH__ <= 699
+# define GPU_MULTIPROCESSOR_MAX_REGISTERS 65536
+# define GPU_MULTIPROCESSOR_MAX_BLOCKS 32
+# define GPU_BLOCK_MAX_THREADS 1024
+# define GPU_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+# define GPU_KERNEL_BLOCK_NUM_THREADS 256
+/* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of
+ * registers */
+# if __CUDACC_VER_MAJOR__ >= 9 && __CUDA_ARCH__ >= 600
+# define GPU_KERNEL_MAX_REGISTERS 64
+# else
+# define GPU_KERNEL_MAX_REGISTERS 48
+# endif
+
+/* 7.x, 8.x */
+#elif __CUDA_ARCH__ <= 899
+# define GPU_MULTIPROCESSOR_MAX_REGISTERS 65536
+# define GPU_MULTIPROCESSOR_MAX_BLOCKS 32
+# define GPU_BLOCK_MAX_THREADS 1024
+# define GPU_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+# define GPU_KERNEL_BLOCK_NUM_THREADS 512
+# define GPU_KERNEL_MAX_REGISTERS 96
+
+/* unknown architecture */
+#else
+# error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
+#endif
+
+/* Compute number of threads per block and minimum blocks per multiprocessor
+ * given the maximum number of registers per thread. */
+
+#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
+  extern "C" __global__ void __launch_bounds__(block_num_threads, \
+                                               GPU_MULTIPROCESSOR_MAX_REGISTERS / \
+                                               (block_num_threads * thread_num_registers))
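+
+/* For example, on a 5.x/6.x device with GPU_KERNEL_BLOCK_NUM_THREADS = 256 and
+ * GPU_KERNEL_MAX_REGISTERS = 64, this expands to
+ * __launch_bounds__(256, 65536 / (256 * 64)) == __launch_bounds__(256, 4),
+ * i.e. the compiler must allow at least 4 resident blocks per multiprocessor. */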
+
+/* sanity checks */
+
+#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
+# error "Maximum number of threads per block exceeded"
+#endif
+
+#if GPU_MULTIPROCESSOR_MAX_REGISTERS / (GPU_KERNEL_BLOCK_NUM_THREADS * GPU_KERNEL_MAX_REGISTERS) > \
+ GPU_MULTIPROCESSOR_MAX_BLOCKS
+# error "Maximum number of blocks per multiprocessor exceeded"
+#endif
+
+#if GPU_KERNEL_MAX_REGISTERS > GPU_THREAD_MAX_REGISTERS
+# error "Maximum number of registers per thread exceeded"
+#endif
diff --git a/intern/cycles/kernel/device/cuda/globals.h b/intern/cycles/kernel/device/cuda/globals.h
new file mode 100644
index 00000000000..2c187cf8a23
--- /dev/null
+++ b/intern/cycles/kernel/device/cuda/globals.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#pragma once
+
+#include "kernel/kernel_profiling.h"
+#include "kernel/kernel_types.h"
+
+#include "kernel/integrator/integrator_state.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Not actually used, just a NULL pointer that gets passed everywhere, which we
+ * hope gets optimized out by the compiler. */
+struct KernelGlobalsGPU {
+ int unused[1];
+};
+typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals;
+
+/* Global scene data and textures */
+__constant__ KernelData __data;
+#define KERNEL_TEX(type, name) const __constant__ __device__ type *name;
+#include "kernel/kernel_textures.h"
+
+/* Integrator state */
+__constant__ IntegratorStateGPU __integrator_state;
+
+/* Abstraction macros */
+#define kernel_data __data
+#define kernel_tex_fetch(t, index) t[(index)]
+#define kernel_tex_array(t) (t)
+#define kernel_integrator_state __integrator_state
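+
+/* For a (hypothetical) texture declared as KERNEL_TEX(float4, __example_data),
+ * kernel_tex_fetch(__example_data, i) expands to __example_data[(i)]. */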
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/cuda/kernel.cu b/intern/cycles/kernel/device/cuda/kernel.cu
new file mode 100644
index 00000000000..e26fe243642
--- /dev/null
+++ b/intern/cycles/kernel/device/cuda/kernel.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CUDA kernel entry points */
+
+#ifdef __CUDA_ARCH__
+
+# include "kernel/device/cuda/compat.h"
+# include "kernel/device/cuda/config.h"
+# include "kernel/device/cuda/globals.h"
+
+# include "kernel/device/gpu/image.h"
+# include "kernel/device/gpu/kernel.h"
+
+#endif
diff --git a/intern/cycles/kernel/device/gpu/image.h b/intern/cycles/kernel/device/gpu/image.h
new file mode 100644
index 00000000000..95a37c693ae
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/image.h
@@ -0,0 +1,278 @@
+/*
+ * Copyright 2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef WITH_NANOVDB
+# define NDEBUG /* Disable "assert" in device code */
+# define NANOVDB_USE_INTRINSICS
+# include "nanovdb/NanoVDB.h"
+# include "nanovdb/util/SampleFromVoxels.h"
+#endif
+
+/* w0, w1, w2, and w3 are the four cubic B-spline basis functions. */
+ccl_device float cubic_w0(float a)
+{
+ return (1.0f / 6.0f) * (a * (a * (-a + 3.0f) - 3.0f) + 1.0f);
+}
+ccl_device float cubic_w1(float a)
+{
+ return (1.0f / 6.0f) * (a * a * (3.0f * a - 6.0f) + 4.0f);
+}
+ccl_device float cubic_w2(float a)
+{
+ return (1.0f / 6.0f) * (a * (a * (-3.0f * a + 3.0f) + 3.0f) + 1.0f);
+}
+ccl_device float cubic_w3(float a)
+{
+ return (1.0f / 6.0f) * (a * a * a);
+}
+
+/* g0 and g1 are the two amplitude functions. */
+ccl_device float cubic_g0(float a)
+{
+ return cubic_w0(a) + cubic_w1(a);
+}
+ccl_device float cubic_g1(float a)
+{
+ return cubic_w2(a) + cubic_w3(a);
+}
+
+/* h0 and h1 are the two offset functions. */
+ccl_device float cubic_h0(float a)
+{
+ return (cubic_w1(a) / cubic_g0(a)) - 1.0f;
+}
+ccl_device float cubic_h1(float a)
+{
+ return (cubic_w3(a) / cubic_g1(a)) + 1.0f;
+}
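+
+/* With hardware linear filtering, one fetch at px + h0(fx) + 0.5 returns
+ * (w0(fx) * f[px - 1] + w1(fx) * f[px]) / g0(fx), so scaling by g0 recovers the
+ * first two cubic taps, and the fetch at px + h1(fx) + 0.5 covers the last two.
+ * This is why the lookups below need only 4 (bicubic) or 8 (tricubic) filtered
+ * reads instead of 16 or 64 point samples. */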
+
+/* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. */
+template<typename T>
+ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y)
+{
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
+
+ x = (x * info.width) - 0.5f;
+ y = (y * info.height) - 0.5f;
+
+ float px = floorf(x);
+ float py = floorf(y);
+ float fx = x - px;
+ float fy = y - py;
+
+ float g0x = cubic_g0(fx);
+ float g1x = cubic_g1(fx);
+ /* Note +0.5 offset to compensate for CUDA linear filtering convention. */
+ float x0 = (px + cubic_h0(fx) + 0.5f) / info.width;
+ float x1 = (px + cubic_h1(fx) + 0.5f) / info.width;
+ float y0 = (py + cubic_h0(fy) + 0.5f) / info.height;
+ float y1 = (py + cubic_h1(fy) + 0.5f) / info.height;
+
+ return cubic_g0(fy) * (g0x * ccl_gpu_tex_object_read_2D<T>(tex, x0, y0) +
+ g1x * ccl_gpu_tex_object_read_2D<T>(tex, x1, y0)) +
+ cubic_g1(fy) * (g0x * ccl_gpu_tex_object_read_2D<T>(tex, x0, y1) +
+ g1x * ccl_gpu_tex_object_read_2D<T>(tex, x1, y1));
+}
+
+/* Fast tricubic texture lookup using 8 trilinear lookups. */
+template<typename T>
+ccl_device_noinline T
+kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z)
+{
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
+
+ x = (x * info.width) - 0.5f;
+ y = (y * info.height) - 0.5f;
+ z = (z * info.depth) - 0.5f;
+
+ float px = floorf(x);
+ float py = floorf(y);
+ float pz = floorf(z);
+ float fx = x - px;
+ float fy = y - py;
+ float fz = z - pz;
+
+ float g0x = cubic_g0(fx);
+ float g1x = cubic_g1(fx);
+ float g0y = cubic_g0(fy);
+ float g1y = cubic_g1(fy);
+ float g0z = cubic_g0(fz);
+ float g1z = cubic_g1(fz);
+
+ /* Note +0.5 offset to compensate for CUDA linear filtering convention. */
+ float x0 = (px + cubic_h0(fx) + 0.5f) / info.width;
+ float x1 = (px + cubic_h1(fx) + 0.5f) / info.width;
+ float y0 = (py + cubic_h0(fy) + 0.5f) / info.height;
+ float y1 = (py + cubic_h1(fy) + 0.5f) / info.height;
+ float z0 = (pz + cubic_h0(fz) + 0.5f) / info.depth;
+ float z1 = (pz + cubic_h1(fz) + 0.5f) / info.depth;
+
+ return g0z * (g0y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y0, z0) +
+ g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y0, z0)) +
+ g1y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y1, z0) +
+ g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y1, z0))) +
+ g1z * (g0y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y0, z1) +
+ g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y0, z1)) +
+ g1y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y1, z1) +
+ g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y1, z1)));
+}
+
+#ifdef WITH_NANOVDB
+template<typename T, typename S>
+ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, float z)
+{
+ float px = floorf(x);
+ float py = floorf(y);
+ float pz = floorf(z);
+ float fx = x - px;
+ float fy = y - py;
+ float fz = z - pz;
+
+ float g0x = cubic_g0(fx);
+ float g1x = cubic_g1(fx);
+ float g0y = cubic_g0(fy);
+ float g1y = cubic_g1(fy);
+ float g0z = cubic_g0(fz);
+ float g1z = cubic_g1(fz);
+
+ float x0 = px + cubic_h0(fx);
+ float x1 = px + cubic_h1(fx);
+ float y0 = py + cubic_h0(fy);
+ float y1 = py + cubic_h1(fy);
+ float z0 = pz + cubic_h0(fz);
+ float z1 = pz + cubic_h1(fz);
+
+ using namespace nanovdb;
+
+ return g0z * (g0y * (g0x * s(Vec3f(x0, y0, z0)) + g1x * s(Vec3f(x1, y0, z0))) +
+ g1y * (g0x * s(Vec3f(x0, y1, z0)) + g1x * s(Vec3f(x1, y1, z0)))) +
+ g1z * (g0y * (g0x * s(Vec3f(x0, y0, z1)) + g1x * s(Vec3f(x1, y0, z1))) +
+ g1y * (g0x * s(Vec3f(x0, y1, z1)) + g1x * s(Vec3f(x1, y1, z1))));
+}
+
+template<typename T>
+ccl_device_noinline T kernel_tex_image_interp_nanovdb(
+ const TextureInfo &info, float x, float y, float z, uint interpolation)
+{
+ using namespace nanovdb;
+
+ NanoGrid<T> *const grid = (NanoGrid<T> *)info.data;
+ typedef typename nanovdb::NanoGrid<T>::AccessorType AccessorType;
+ AccessorType acc = grid->getAccessor();
+
+ switch (interpolation) {
+ case INTERPOLATION_CLOSEST:
+ return SampleFromVoxels<AccessorType, 0, false>(acc)(Vec3f(x, y, z));
+ case INTERPOLATION_LINEAR:
+ return SampleFromVoxels<AccessorType, 1, false>(acc)(Vec3f(x - 0.5f, y - 0.5f, z - 0.5f));
+ default:
+ SampleFromVoxels<AccessorType, 1, false> s(acc);
+ return kernel_tex_image_interp_tricubic_nanovdb<T>(s, x - 0.5f, y - 0.5f, z - 0.5f);
+ }
+}
+#endif
+
+ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y)
+{
+ const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+
+ /* float4, byte4, ushort4 and half4 */
+ const int texture_type = info.data_type;
+ if (texture_type == IMAGE_DATA_TYPE_FLOAT4 || texture_type == IMAGE_DATA_TYPE_BYTE4 ||
+ texture_type == IMAGE_DATA_TYPE_HALF4 || texture_type == IMAGE_DATA_TYPE_USHORT4) {
+ if (info.interpolation == INTERPOLATION_CUBIC) {
+ return kernel_tex_image_interp_bicubic<float4>(info, x, y);
+ }
+ else {
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
+ return ccl_gpu_tex_object_read_2D<float4>(tex, x, y);
+ }
+ }
+ /* float, byte and half */
+ else {
+ float f;
+
+ if (info.interpolation == INTERPOLATION_CUBIC) {
+ f = kernel_tex_image_interp_bicubic<float>(info, x, y);
+ }
+ else {
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
+ f = ccl_gpu_tex_object_read_2D<float>(tex, x, y);
+ }
+
+ return make_float4(f, f, f, 1.0f);
+ }
+}
+
+ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg,
+ int id,
+ float3 P,
+ InterpolationType interp)
+{
+ const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+
+ if (info.use_transform_3d) {
+ P = transform_point(&info.transform_3d, P);
+ }
+
+ const float x = P.x;
+ const float y = P.y;
+ const float z = P.z;
+
+ uint interpolation = (interp == INTERPOLATION_NONE) ? info.interpolation : interp;
+ const int texture_type = info.data_type;
+
+#ifdef WITH_NANOVDB
+ if (texture_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT) {
+ float f = kernel_tex_image_interp_nanovdb<float>(info, x, y, z, interpolation);
+ return make_float4(f, f, f, 1.0f);
+ }
+ if (texture_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
+ nanovdb::Vec3f f = kernel_tex_image_interp_nanovdb<nanovdb::Vec3f>(
+ info, x, y, z, interpolation);
+ return make_float4(f[0], f[1], f[2], 1.0f);
+ }
+#endif
+ if (texture_type == IMAGE_DATA_TYPE_FLOAT4 || texture_type == IMAGE_DATA_TYPE_BYTE4 ||
+ texture_type == IMAGE_DATA_TYPE_HALF4 || texture_type == IMAGE_DATA_TYPE_USHORT4) {
+ if (interpolation == INTERPOLATION_CUBIC) {
+ return kernel_tex_image_interp_tricubic<float4>(info, x, y, z);
+ }
+ else {
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
+ return ccl_gpu_tex_object_read_3D<float4>(tex, x, y, z);
+ }
+ }
+ else {
+ float f;
+
+ if (interpolation == INTERPOLATION_CUBIC) {
+ f = kernel_tex_image_interp_tricubic<float>(info, x, y, z);
+ }
+ else {
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
+ f = ccl_gpu_tex_object_read_3D<float>(tex, x, y, z);
+ }
+
+ return make_float4(f, f, f, 1.0f);
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
new file mode 100644
index 00000000000..335cb1ec0c0
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -0,0 +1,910 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Common GPU kernels. */
+
+#include "kernel/device/gpu/parallel_active_index.h"
+#include "kernel/device/gpu/parallel_prefix_sum.h"
+#include "kernel/device/gpu/parallel_sorted_index.h"
+
+#include "kernel/integrator/integrator_state.h"
+#include "kernel/integrator/integrator_state_flow.h"
+#include "kernel/integrator/integrator_state_util.h"
+
+#include "kernel/integrator/integrator_init_from_bake.h"
+#include "kernel/integrator/integrator_init_from_camera.h"
+#include "kernel/integrator/integrator_intersect_closest.h"
+#include "kernel/integrator/integrator_intersect_shadow.h"
+#include "kernel/integrator/integrator_intersect_subsurface.h"
+#include "kernel/integrator/integrator_intersect_volume_stack.h"
+#include "kernel/integrator/integrator_shade_background.h"
+#include "kernel/integrator/integrator_shade_light.h"
+#include "kernel/integrator/integrator_shade_shadow.h"
+#include "kernel/integrator/integrator_shade_surface.h"
+#include "kernel/integrator/integrator_shade_volume.h"
+
+#include "kernel/kernel_adaptive_sampling.h"
+#include "kernel/kernel_bake.h"
+#include "kernel/kernel_film.h"
+#include "kernel/kernel_work_stealing.h"
+
+/* --------------------------------------------------------------------
+ * Integrator.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_reset(int num_states)
+{
+ const int state = ccl_gpu_global_id_x();
+
+ if (state < num_states) {
+ INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
+ INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_init_from_camera(KernelWorkTile *tiles,
+ const int num_tiles,
+ float *render_buffer,
+ const int max_tile_work_size)
+{
+ const int work_index = ccl_gpu_global_id_x();
+
+ if (work_index >= max_tile_work_size * num_tiles) {
+ return;
+ }
+
+ const int tile_index = work_index / max_tile_work_size;
+ const int tile_work_index = work_index - tile_index * max_tile_work_size;
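+
+  /* For example (hypothetical sizes): with max_tile_work_size = 4096 and
+   * work_index = 10000, this yields tile_index = 2 and tile_work_index = 1808. */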
+
+ const KernelWorkTile *tile = &tiles[tile_index];
+
+ if (tile_work_index >= tile->work_size) {
+ return;
+ }
+
+ const int state = tile->path_index_offset + tile_work_index;
+
+ uint x, y, sample;
+ get_work_pixel(tile, tile_work_index, &x, &y, &sample);
+
+ integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample);
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_init_from_bake(KernelWorkTile *tiles,
+ const int num_tiles,
+ float *render_buffer,
+ const int max_tile_work_size)
+{
+ const int work_index = ccl_gpu_global_id_x();
+
+ if (work_index >= max_tile_work_size * num_tiles) {
+ return;
+ }
+
+ const int tile_index = work_index / max_tile_work_size;
+ const int tile_work_index = work_index - tile_index * max_tile_work_size;
+
+ const KernelWorkTile *tile = &tiles[tile_index];
+
+ if (tile_work_index >= tile->work_size) {
+ return;
+ }
+
+ const int state = tile->path_index_offset + tile_work_index;
+
+ uint x, y, sample;
+ get_work_pixel(tile, tile_work_index, &x, &y, &sample);
+
+ integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample);
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_intersect_closest(const int *path_index_array, const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_intersect_closest(NULL, state);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_intersect_shadow(const int *path_index_array, const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_intersect_shadow(NULL, state);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_intersect_subsurface(const int *path_index_array, const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_intersect_subsurface(NULL, state);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_intersect_volume_stack(const int *path_index_array, const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_intersect_volume_stack(NULL, state);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_background(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_background(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_light(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_light(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_shadow(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_shadow(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_surface(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_surface(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_surface_raytrace(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_surface_raytrace(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_volume(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_volume(NULL, state, render_buffer);
+ }
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_queued_paths_array(int num_states,
+ int *indices,
+ int *num_indices,
+ int kernel)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, [kernel](const int state) {
+ return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel);
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_queued_shadow_paths_array(int num_states,
+ int *indices,
+ int *num_indices,
+ int kernel)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, [kernel](const int state) {
+ return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel);
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_active_paths_array(int num_states, int *indices, int *num_indices)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, [](const int state) {
+ return (INTEGRATOR_STATE(state, path, queued_kernel) != 0);
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_terminated_paths_array(int num_states,
+ int *indices,
+ int *num_indices,
+ int indices_offset)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices + indices_offset, num_indices, [](const int state) {
+ return (INTEGRATOR_STATE(state, path, queued_kernel) == 0);
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_terminated_shadow_paths_array(int num_states,
+ int *indices,
+ int *num_indices,
+ int indices_offset)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices + indices_offset, num_indices, [](const int state) {
+ return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_sorted_paths_array(int num_states,
+ int num_states_limit,
+ int *indices,
+ int *num_indices,
+ int *key_counter,
+ int *key_prefix_sum,
+ int kernel)
+{
+ gpu_parallel_sorted_index_array<GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states,
+ num_states_limit,
+ indices,
+ num_indices,
+ key_counter,
+ key_prefix_sum,
+ [kernel](const int state) {
+ return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel) ?
+ INTEGRATOR_STATE(state, path, shader_sort_key) :
+ GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_compact_paths_array(int num_states,
+ int *indices,
+ int *num_indices,
+ int num_active_paths)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, [num_active_paths](const int state) {
+ return (state >= num_active_paths) && (INTEGRATOR_STATE(state, path, queued_kernel) != 0);
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_compact_states(const int *active_terminated_states,
+ const int active_states_offset,
+ const int terminated_states_offset,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int from_state = active_terminated_states[active_states_offset + global_index];
+ const int to_state = active_terminated_states[terminated_states_offset + global_index];
+
+ integrator_state_move(NULL, to_state, from_state);
+ }
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_compact_shadow_paths_array(int num_states,
+ int *indices,
+ int *num_indices,
+ int num_active_paths)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, [num_active_paths](const int state) {
+ return (state >= num_active_paths) &&
+ (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0);
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_compact_shadow_states(const int *active_terminated_states,
+ const int active_states_offset,
+ const int terminated_states_offset,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int from_state = active_terminated_states[active_states_offset + global_index];
+ const int to_state = active_terminated_states[terminated_states_offset + global_index];
+
+ integrator_shadow_state_move(NULL, to_state, from_state);
+ }
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_prefix_sum(int *counter, int *prefix_sum, int num_values)
+{
+ gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(
+ counter, prefix_sum, num_values);
+}
+
+/* --------------------------------------------------------------------
+ * Adaptive sampling.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_adaptive_sampling_convergence_check(float *render_buffer,
+ int sx,
+ int sy,
+ int sw,
+ int sh,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride,
+ uint *num_active_pixels)
+{
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / sw;
+ const int x = work_index - y * sw;
+
+ bool converged = true;
+
+ if (x < sw && y < sh) {
+ converged = kernel_adaptive_sampling_convergence_check(
+ nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride);
+ }
+
+ /* NOTE: All threads specified in the mask must execute the intrinsic. */
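+  /* ccl_gpu_ballot() packs one bit per lane, so a single atomic add of the
+   * population count per warp replaces up to 32 per-thread atomic adds. */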
+ const uint num_active_pixels_mask = ccl_gpu_ballot(!converged);
+ const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
+ if (lane_id == 0) {
+    atomic_fetch_and_add_uint32(num_active_pixels, ccl_gpu_popc(num_active_pixels_mask));
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_adaptive_sampling_filter_x(
+ float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+ const int y = ccl_gpu_global_id_x();
+
+ if (y < sh) {
+ kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_adaptive_sampling_filter_y(
+ float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+ const int x = ccl_gpu_global_id_x();
+
+ if (x < sw) {
+ kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride);
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Cryptomatte.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_cryptomatte_postprocess(float *render_buffer, int num_pixels)
+{
+ const int pixel_index = ccl_gpu_global_id_x();
+
+ if (pixel_index < num_pixels) {
+ kernel_cryptomatte_post(nullptr, render_buffer, pixel_index);
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Film.
+ */
+
+/* Common implementation for float destination. */
+template<typename Processor>
+ccl_device_inline void kernel_gpu_film_convert_common(const KernelFilmConvert *kfilm_convert,
+ float *pixels,
+ float *render_buffer,
+ int num_pixels,
+ int width,
+ int offset,
+ int stride,
+ int dst_offset,
+ int dst_stride,
+ const Processor &processor)
+{
+ const int render_pixel_index = ccl_gpu_global_id_x();
+ if (render_pixel_index >= num_pixels) {
+ return;
+ }
+
+ const int x = render_pixel_index % width;
+ const int y = render_pixel_index / width;
+
+ ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert->pass_stride +
+ y * stride * kfilm_convert->pass_stride;
+
+ ccl_global float *pixel = pixels +
+ (render_pixel_index + dst_offset) * kfilm_convert->pixel_stride;
+
+ processor(kfilm_convert, buffer, pixel);
+}
+
+/* Common implementation for half4 destination and 4-channel input pass. */
+template<typename Processor>
+ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba(
+ const KernelFilmConvert *kfilm_convert,
+ uchar4 *rgba,
+ float *render_buffer,
+ int num_pixels,
+ int width,
+ int offset,
+ int stride,
+ int rgba_offset,
+ int rgba_stride,
+ const Processor &processor)
+{
+ const int render_pixel_index = ccl_gpu_global_id_x();
+ if (render_pixel_index >= num_pixels) {
+ return;
+ }
+
+ const int x = render_pixel_index % width;
+ const int y = render_pixel_index / width;
+
+ ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert->pass_stride +
+ y * stride * kfilm_convert->pass_stride;
+
+ float pixel[4];
+ processor(kfilm_convert, buffer, pixel);
+
+ film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel);
+
+ ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x;
+ *out = float4_to_half4_display(make_float4(pixel[0], pixel[1], pixel[2], pixel[3]));
+}
+
+/* Common implementation for half4 destination and 3-channel input pass. */
+template<typename Processor>
+ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgb(
+ const KernelFilmConvert *kfilm_convert,
+ uchar4 *rgba,
+ float *render_buffer,
+ int num_pixels,
+ int width,
+ int offset,
+ int stride,
+ int rgba_offset,
+ int rgba_stride,
+ const Processor &processor)
+{
+ kernel_gpu_film_convert_half_rgba_common_rgba(
+ kfilm_convert,
+ rgba,
+ render_buffer,
+ num_pixels,
+ width,
+ offset,
+ stride,
+ rgba_offset,
+ rgba_stride,
+ [&processor](const KernelFilmConvert *kfilm_convert,
+ ccl_global const float *buffer,
+ float *pixel_rgba) {
+ processor(kfilm_convert, buffer, pixel_rgba);
+ pixel_rgba[3] = 1.0f;
+ });
+}
+
+/* Common implementation for half4 destination and single channel input pass. */
+template<typename Processor>
+ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_value(
+ const KernelFilmConvert *kfilm_convert,
+ uchar4 *rgba,
+ float *render_buffer,
+ int num_pixels,
+ int width,
+ int offset,
+ int stride,
+ int rgba_offset,
+ int rgba_stride,
+ const Processor &processor)
+{
+ kernel_gpu_film_convert_half_rgba_common_rgba(
+ kfilm_convert,
+ rgba,
+ render_buffer,
+ num_pixels,
+ width,
+ offset,
+ stride,
+ rgba_offset,
+ rgba_stride,
+ [&processor](const KernelFilmConvert *kfilm_convert,
+ ccl_global const float *buffer,
+ float *pixel_rgba) {
+ float value;
+ processor(kfilm_convert, buffer, &value);
+
+ pixel_rgba[0] = value;
+ pixel_rgba[1] = value;
+ pixel_rgba[2] = value;
+ pixel_rgba[3] = 1.0f;
+ });
+}
+
+#define KERNEL_FILM_CONVERT_PROC(name) \
+ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) name
+
+#define KERNEL_FILM_CONVERT_DEFINE(variant, channels) \
+ KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant) \
+ (const KernelFilmConvert kfilm_convert, \
+ float *pixels, \
+ float *render_buffer, \
+ int num_pixels, \
+ int width, \
+ int offset, \
+ int stride, \
+ int rgba_offset, \
+ int rgba_stride) \
+ { \
+ kernel_gpu_film_convert_common(&kfilm_convert, \
+ pixels, \
+ render_buffer, \
+ num_pixels, \
+ width, \
+ offset, \
+ stride, \
+ rgba_offset, \
+ rgba_stride, \
+ film_get_pass_pixel_##variant); \
+ } \
+ KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant##_half_rgba) \
+ (const KernelFilmConvert kfilm_convert, \
+ uchar4 *rgba, \
+ float *render_buffer, \
+ int num_pixels, \
+ int width, \
+ int offset, \
+ int stride, \
+ int rgba_offset, \
+ int rgba_stride) \
+ { \
+ kernel_gpu_film_convert_half_rgba_common_##channels(&kfilm_convert, \
+ rgba, \
+ render_buffer, \
+ num_pixels, \
+ width, \
+ offset, \
+ stride, \
+ rgba_offset, \
+ rgba_stride, \
+ film_get_pass_pixel_##variant); \
+ }
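+
+/* Each KERNEL_FILM_CONVERT_DEFINE(variant, channels) below generates two kernels:
+ * kernel_gpu_film_convert_<variant> for a float destination and
+ * kernel_gpu_film_convert_<variant>_half_rgba for a half4 destination, both
+ * forwarding to film_get_pass_pixel_<variant>. For example, the depth variant
+ * yields kernel_gpu_film_convert_depth and kernel_gpu_film_convert_depth_half_rgba. */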
+
+KERNEL_FILM_CONVERT_DEFINE(depth, value)
+KERNEL_FILM_CONVERT_DEFINE(mist, value)
+KERNEL_FILM_CONVERT_DEFINE(sample_count, value)
+KERNEL_FILM_CONVERT_DEFINE(float, value)
+
+KERNEL_FILM_CONVERT_DEFINE(light_path, rgb)
+KERNEL_FILM_CONVERT_DEFINE(float3, rgb)
+
+KERNEL_FILM_CONVERT_DEFINE(motion, rgba)
+KERNEL_FILM_CONVERT_DEFINE(cryptomatte, rgba)
+KERNEL_FILM_CONVERT_DEFINE(shadow_catcher, rgba)
+KERNEL_FILM_CONVERT_DEFINE(shadow_catcher_matte_with_shadow, rgba)
+KERNEL_FILM_CONVERT_DEFINE(combined, rgba)
+KERNEL_FILM_CONVERT_DEFINE(float4, rgba)
+
+#undef KERNEL_FILM_CONVERT_DEFINE
+#undef KERNEL_FILM_CONVERT_PROC
+
+/* --------------------------------------------------------------------
+ * Shader evaluation.
+ */
+
+/* Displacement */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_shader_eval_displace(KernelShaderEvalInput *input,
+ float *output,
+ const int offset,
+ const int work_size)
+{
+ int i = ccl_gpu_global_id_x();
+ if (i < work_size) {
+ kernel_displace_evaluate(NULL, input, output, offset + i);
+ }
+}
+
+/* Background */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_shader_eval_background(KernelShaderEvalInput *input,
+ float *output,
+ const int offset,
+ const int work_size)
+{
+ int i = ccl_gpu_global_id_x();
+ if (i < work_size) {
+ kernel_background_evaluate(NULL, input, output, offset + i);
+ }
+}
+
+/* Curve Shadow Transparency */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_shader_eval_curve_shadow_transparency(KernelShaderEvalInput *input,
+ float *output,
+ const int offset,
+ const int work_size)
+{
+ int i = ccl_gpu_global_id_x();
+ if (i < work_size) {
+ kernel_curve_shadow_transparency_evaluate(NULL, input, output, offset + i);
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Denoising.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_filter_color_preprocess(float *render_buffer,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int offset,
+ int stride,
+ int pass_stride,
+ int pass_denoised)
+{
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / width;
+ const int x = work_index - y * width;
+
+ if (x >= width || y >= height) {
+ return;
+ }
+
+ const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
+ float *buffer = render_buffer + render_pixel_index * pass_stride;
+
+ float *color_out = buffer + pass_denoised;
+ color_out[0] = clamp(color_out[0], 0.0f, 10000.0f);
+ color_out[1] = clamp(color_out[1], 0.0f, 10000.0f);
+ color_out[2] = clamp(color_out[2], 0.0f, 10000.0f);
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_filter_guiding_preprocess(float *guiding_buffer,
+ int guiding_pass_stride,
+ int guiding_pass_albedo,
+ int guiding_pass_normal,
+ const float *render_buffer,
+ int render_offset,
+ int render_stride,
+ int render_pass_stride,
+ int render_pass_sample_count,
+ int render_pass_denoising_albedo,
+ int render_pass_denoising_normal,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int num_samples)
+{
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / width;
+ const int x = work_index - y * width;
+
+ if (x >= width || y >= height) {
+ return;
+ }
+
+ const uint64_t guiding_pixel_index = x + y * width;
+ float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+
+ const uint64_t render_pixel_index = render_offset + (x + full_x) + (y + full_y) * render_stride;
+ const float *buffer = render_buffer + render_pixel_index * render_pass_stride;
+
+ float pixel_scale;
+ if (render_pass_sample_count == PASS_UNUSED) {
+ pixel_scale = 1.0f / num_samples;
+ }
+ else {
+ pixel_scale = 1.0f / __float_as_uint(buffer[render_pass_sample_count]);
+ }
+
+ /* Albedo pass. */
+ if (guiding_pass_albedo != PASS_UNUSED) {
+ kernel_assert(render_pass_denoising_albedo != PASS_UNUSED);
+
+    const float *albedo_in = buffer + render_pass_denoising_albedo;
+    float *albedo_out = guiding_pixel + guiding_pass_albedo;
+
+    albedo_out[0] = albedo_in[0] * pixel_scale;
+    albedo_out[1] = albedo_in[1] * pixel_scale;
+    albedo_out[2] = albedo_in[2] * pixel_scale;
+ }
+
+ /* Normal pass. */
+  if (guiding_pass_normal != PASS_UNUSED) {
+    kernel_assert(render_pass_denoising_normal != PASS_UNUSED);
+
+ const float *normal_in = buffer + render_pass_denoising_normal;
+ float *normal_out = guiding_pixel + guiding_pass_normal;
+
+ normal_out[0] = normal_in[0] * pixel_scale;
+ normal_out[1] = normal_in[1] * pixel_scale;
+ normal_out[2] = normal_in[2] * pixel_scale;
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_filter_guiding_set_fake_albedo(float *guiding_buffer,
+ int guiding_pass_stride,
+ int guiding_pass_albedo,
+ int width,
+ int height)
+{
+ kernel_assert(guiding_pass_albedo != PASS_UNUSED);
+
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / width;
+ const int x = work_index - y * width;
+
+ if (x >= width || y >= height) {
+ return;
+ }
+
+ const uint64_t guiding_pixel_index = x + y * width;
+ float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+
+ float *albedo_out = guiding_pixel + guiding_pass_albedo;
+
+ albedo_out[0] = 0.5f;
+ albedo_out[1] = 0.5f;
+ albedo_out[2] = 0.5f;
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_filter_color_postprocess(float *render_buffer,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int offset,
+ int stride,
+ int pass_stride,
+ int num_samples,
+ int pass_noisy,
+ int pass_denoised,
+ int pass_sample_count,
+ int num_components,
+ bool use_compositing)
+{
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / width;
+ const int x = work_index - y * width;
+
+ if (x >= width || y >= height) {
+ return;
+ }
+
+ const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
+ float *buffer = render_buffer + render_pixel_index * pass_stride;
+
+ float pixel_scale;
+ if (pass_sample_count == PASS_UNUSED) {
+ pixel_scale = num_samples;
+ }
+ else {
+ pixel_scale = __float_as_uint(buffer[pass_sample_count]);
+ }
+
+ float *denoised_pixel = buffer + pass_denoised;
+
+ denoised_pixel[0] *= pixel_scale;
+ denoised_pixel[1] *= pixel_scale;
+ denoised_pixel[2] *= pixel_scale;
+
+ if (num_components == 3) {
+ /* Pass without alpha channel. */
+ }
+ else if (!use_compositing) {
+    /* Currently compositing passes are either 3-component (derived by dividing light passes)
+     * or have no transparency (shadow catcher). Rely on this implicitly, as it simplifies
+     * the code and avoids extra memory allocation. */
+ const float *noisy_pixel = buffer + pass_noisy;
+ denoised_pixel[3] = noisy_pixel[3];
+ }
+ else {
+    /* Assign zero, which is the default alpha value for 3-component passes and
+     * denotes an opaque pixel for 4-component passes. */
+
+ denoised_pixel[3] = 0;
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Shadow catcher.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shadow_catcher_count_possible_splits(int num_states,
+ uint *num_possible_splits)
+{
+ const int state = ccl_gpu_global_id_x();
+
+ bool can_split = false;
+
+ if (state < num_states) {
+ can_split = kernel_shadow_catcher_path_can_split(nullptr, state);
+ }
+
+ /* NOTE: All threads specified in the mask must execute the intrinsic. */
+ const uint can_split_mask = ccl_gpu_ballot(can_split);
+ const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
+ if (lane_id == 0) {
+    atomic_fetch_and_add_uint32(num_possible_splits, ccl_gpu_popc(can_split_mask));
+ }
+}
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
new file mode 100644
index 00000000000..db4a4bf71e0
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Given an array of states, build an array of indices for which the states
+ * are active.
+ *
+ * Shared memory requirement is `sizeof(int) * (number_of_warps + 1)`. */
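+/* For example, with blocksize 512 and a warp size of 32 there are 16 warps, so
+ * (16 + 1) * sizeof(int) = 68 bytes of dynamic shared memory are required. */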
+
+#include "util/util_atomic.h"
+
+#ifdef __HIP__
+# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024
+#else
+# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
+#endif
+
+template<uint blocksize, typename IsActiveOp>
+__device__ void gpu_parallel_active_index_array(const uint num_states,
+ int *indices,
+ int *num_indices,
+ IsActiveOp is_active_op)
+{
+ extern ccl_gpu_shared int warp_offset[];
+
+ const uint thread_index = ccl_gpu_thread_idx_x;
+ const uint thread_warp = thread_index % ccl_gpu_warp_size;
+
+ const uint warp_index = thread_index / ccl_gpu_warp_size;
+ const uint num_warps = blocksize / ccl_gpu_warp_size;
+
+ /* Test if state corresponding to this thread is active. */
+ const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index;
+ const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
+
+ /* For each thread within a warp compute how many other active states precede it. */
+ const uint thread_mask = 0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp);
+ const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & thread_mask);
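+  /* E.g. for thread_warp == 5, thread_mask == 0x1F, so thread_offset counts the
+   * active states among lanes 0..4 preceding this thread within its warp. */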
+
+ /* Last thread in warp stores number of active states for each warp. */
+ if (thread_warp == ccl_gpu_warp_size - 1) {
+ warp_offset[warp_index] = thread_offset + is_active;
+ }
+
+ ccl_gpu_syncthreads();
+
+ /* Last thread in block converts per-warp sizes to offsets, increments global size of
+ * index array and gets offset to write to. */
+ if (thread_index == blocksize - 1) {
+ /* TODO: parallelize this. */
+ int offset = 0;
+ for (int i = 0; i < num_warps; i++) {
+ int num_active = warp_offset[i];
+ warp_offset[i] = offset;
+ offset += num_active;
+ }
+
+ const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
+ warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
+ }
+
+ ccl_gpu_syncthreads();
+
+ /* Write to index array. */
+ if (is_active) {
+ const uint block_offset = warp_offset[num_warps];
+ indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
new file mode 100644
index 00000000000..aabe6e2e27a
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Parallel prefix sum.
+ *
+ * TODO: actually make this work in parallel.
+ *
+ * This is used for an array whose size is the number of shaders in the scene,
+ * which is usually not huge, so this might not be a significant bottleneck. */
+
+#include "util/util_atomic.h"
+
+#ifdef __HIP__
+# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 1024
+#else
+# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
+#endif
+
+template<uint blocksize>
+__device__ void gpu_parallel_prefix_sum(int *counter, int *prefix_sum, const int num_values)
+{
+ if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) {
+ return;
+ }
+
+ int offset = 0;
+ for (int i = 0; i < num_values; i++) {
+ const int new_offset = offset + counter[i];
+ prefix_sum[i] = offset;
+ counter[i] = 0;
+ offset = new_offset;
+ }
+}
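+
+/* For example, counter = {3, 1, 2} produces prefix_sum = {0, 3, 4}, and counter
+ * is reset to all zeros for the next use. */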
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/gpu/parallel_reduce.h b/intern/cycles/kernel/device/gpu/parallel_reduce.h
new file mode 100644
index 00000000000..b60dceb2ed0
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/parallel_reduce.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Parallel sum of array input_data with size n into output_sum.
+ *
+ * Adapted from "Optimizing Parallel Reduction in GPU", Mark Harris.
+ *
+ * This version adds multiple elements per thread sequentially. This reduces
+ * the overall cost of the algorithm while keeping the work complexity O(n) and
+ * the step complexity O(log n). (Brent's Theorem optimization) */
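+/* Each block writes one partial sum to output_sum[block_index], so a second
+ * reduction pass (or a host-side sum over the per-block results) is still needed
+ * to obtain the final total. Shared memory requirement is
+ * blocksize * sizeof(OutputT). */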
+
+#ifdef __HIP__
+# define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 1024
+#else
+# define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 512
+#endif
+
+template<uint blocksize, typename InputT, typename OutputT, typename ConvertOp>
+__device__ void gpu_parallel_sum(
+ const InputT *input_data, const uint n, OutputT *output_sum, OutputT zero, ConvertOp convert)
+{
+ extern ccl_gpu_shared OutputT shared_data[];
+
+ const uint tid = ccl_gpu_thread_idx_x;
+  const uint gridsize = blocksize * ccl_gpu_grid_dim_x;
+
+ OutputT sum = zero;
+ for (uint i = ccl_gpu_block_idx_x * blocksize + tid; i < n; i += gridsize) {
+ sum += convert(input_data[i]);
+ }
+ shared_data[tid] = sum;
+
+ ccl_gpu_syncthreads();
+
+ if (blocksize >= 512 && tid < 256) {
+ shared_data[tid] = sum = sum + shared_data[tid + 256];
+ }
+
+ ccl_gpu_syncthreads();
+
+ if (blocksize >= 256 && tid < 128) {
+ shared_data[tid] = sum = sum + shared_data[tid + 128];
+ }
+
+ ccl_gpu_syncthreads();
+
+ if (blocksize >= 128 && tid < 64) {
+ shared_data[tid] = sum = sum + shared_data[tid + 64];
+ }
+
+ ccl_gpu_syncthreads();
+
+ if (blocksize >= 64 && tid < 32) {
+ shared_data[tid] = sum = sum + shared_data[tid + 32];
+ }
+
+ ccl_gpu_syncthreads();
+
+ if (tid < 32) {
+ for (int offset = ccl_gpu_warp_size / 2; offset > 0; offset /= 2) {
+      sum += ccl_gpu_shfl_down_sync(0xFFFFFFFF, sum, offset);
+ }
+ }
+
+ if (tid == 0) {
+ output_sum[ccl_gpu_block_idx_x] = sum;
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
new file mode 100644
index 00000000000..7570c5a6bbd
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Given an array of states, build an array of indices for which the states
+ * are active and sorted by a given key. The prefix sum of the number of active
+ * states per key must have already been computed.
+ *
+ * TODO: there may be ways to optimize this to avoid this many atomic ops? */
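+/* For example, with two keys where two states are queued under each key, the
+ * prefix sum gives key_prefix_sum = {0, 2}: states with key 0 claim indices 0..1
+ * and states with key 1 claim indices 2..3, so indices[] ends up grouped by key. */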
+
+#include "util/util_atomic.h"
+
+#ifdef __HIP__
+# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024
+#else
+# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
+#endif
+#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
+
+template<uint blocksize, typename GetKeyOp>
+__device__ void gpu_parallel_sorted_index_array(const uint num_states,
+ const int num_states_limit,
+ int *indices,
+ int *num_indices,
+ int *key_counter,
+ int *key_prefix_sum,
+ GetKeyOp get_key_op)
+{
+ const uint state_index = ccl_gpu_block_idx_x * blocksize + ccl_gpu_thread_idx_x;
+ const int key = (state_index < num_states) ? get_key_op(state_index) :
+ GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
+
+ if (key != GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY) {
+ const uint index = atomic_fetch_and_add_uint32(&key_prefix_sum[key], 1);
+ if (index < num_states_limit) {
+ /* Assign state index. */
+ indices[index] = state_index;
+ }
+ else {
+ /* Can't process this state now, increase the counter again so that
+ * it will be handled in another iteration. */
+ atomic_fetch_and_add_uint32(&key_counter[key], 1);
+ }
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/hip/compat.h b/intern/cycles/kernel/device/hip/compat.h
new file mode 100644
index 00000000000..089976d84e4
--- /dev/null
+++ b/intern/cycles/kernel/device/hip/compat.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#define __KERNEL_GPU__
+#define __KERNEL_HIP__
+#define CCL_NAMESPACE_BEGIN
+#define CCL_NAMESPACE_END
+
+#ifndef ATTR_FALLTHROUGH
+# define ATTR_FALLTHROUGH
+#endif
+
+#ifdef __HIPCC_RTC__
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+#else
+# include <stdint.h>
+#endif
+
+#ifdef CYCLES_HIPBIN_CC
+# define FLT_MIN 1.175494350822287507969e-38f
+# define FLT_MAX 340282346638528859811704183484516925440.0f
+# define FLT_EPSILON 1.192092896e-07F
+#endif
+
+/* Qualifiers */
+
+#define ccl_device __device__ __inline__
+#define ccl_device_inline __device__ __inline__
+#define ccl_device_forceinline __device__ __forceinline__
+#define ccl_device_noinline __device__ __noinline__
+#define ccl_device_noinline_cpu ccl_device
+#define ccl_global
+#define ccl_static_constant __constant__
+#define ccl_device_constant __constant__ __device__
+#define ccl_constant const
+#define ccl_gpu_shared __shared__
+#define ccl_private
+#define ccl_may_alias
+#define ccl_restrict __restrict__
+#define ccl_loop_no_unroll
+#define ccl_align(n) __align__(n)
+#define ccl_optional_struct_init
+
+#define kernel_assert(cond)
+
+/* Types */
+#ifdef __HIP__
+# include "hip/hip_fp16.h"
+# include "hip/hip_runtime.h"
+#endif
+
+#ifdef _MSC_VER
+# include <immintrin.h>
+#endif
+
+#define ccl_gpu_thread_idx_x (threadIdx.x)
+#define ccl_gpu_block_dim_x (blockDim.x)
+#define ccl_gpu_block_idx_x (blockIdx.x)
+#define ccl_gpu_grid_dim_x (gridDim.x)
+#define ccl_gpu_warp_size (warpSize)
+
+#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
+#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
+
+/* GPU warp synchronization */
+
+#define ccl_gpu_syncthreads() __syncthreads()
+#define ccl_gpu_ballot(predicate) __ballot(predicate)
+#define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down(var, delta)
+#define ccl_gpu_popc(x) __popc(x)
+
+/* GPU texture objects */
+typedef hipTextureObject_t ccl_gpu_tex_object;
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj,
+ const float x,
+ const float y)
+{
+ return tex2D<T>(texobj, x, y);
+}
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj,
+ const float x,
+ const float y,
+ const float z)
+{
+ return tex3D<T>(texobj, x, y, z);
+}
+
+/* Use fast math functions */
+
+#define cosf(x) __cosf(((float)(x)))
+#define sinf(x) __sinf(((float)(x)))
+#define powf(x, y) __powf(((float)(x)), ((float)(y)))
+#define tanf(x) __tanf(((float)(x)))
+#define logf(x) __logf(((float)(x)))
+#define expf(x) __expf(((float)(x)))
+
+/* Types */
+
+#include "util/util_half.h"
+#include "util/util_types.h"
diff --git a/intern/cycles/kernel/device/hip/config.h b/intern/cycles/kernel/device/hip/config.h
new file mode 100644
index 00000000000..2fde0d46015
--- /dev/null
+++ b/intern/cycles/kernel/device/hip/config.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Device data taken from the HIP occupancy calculator.
+ *
+ * Terminology
+ * - HIP GPUs have multiple streaming multiprocessors
+ * - Each multiprocessor executes multiple thread blocks
+ * - Each thread block contains a number of threads, also known as the block size
+ * - Multiprocessors have a fixed number of registers, and the number of registers
+ *   used by each thread limits the number of threads per block.
+ */
+
+/* Launch bound definitions. */
+#define GPU_MULTIPROCESSOR_MAX_REGISTERS 65536
+#define GPU_MULTIPROCESSOR_MAX_BLOCKS 64
+#define GPU_BLOCK_MAX_THREADS 1024
+#define GPU_THREAD_MAX_REGISTERS 255
+
+#define GPU_KERNEL_BLOCK_NUM_THREADS 1024
+#define GPU_KERNEL_MAX_REGISTERS 64
+
+/* Compute number of threads per block and minimum blocks per multiprocessor
+ * given the maximum number of registers per thread. */
+
+#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
+ extern "C" __global__ void __launch_bounds__(block_num_threads, \
+ GPU_MULTIPROCESSOR_MAX_REGISTERS / \
+ (block_num_threads * thread_num_registers))
+
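/* Editorial sketch (not part of this patch): with the defaults above, the
 * macro expands to __launch_bounds__(1024, 65536 / (1024 * 64)), i.e. at least
 * one block per multiprocessor, which caps the kernel at 64 registers per
 * thread. `kernel_gpu_example` is a hypothetical kernel name. */
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    kernel_gpu_example(float *out)
{
  out[ccl_gpu_global_id_x()] = 0.0f;
}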
+/* Sanity checks. */
+
+#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
+# error "Maximum number of threads per block exceeded"
+#endif
+
+#if GPU_MULTIPROCESSOR_MAX_REGISTERS / (GPU_KERNEL_BLOCK_NUM_THREADS * GPU_KERNEL_MAX_REGISTERS) > \
+ GPU_MULTIPROCESSOR_MAX_BLOCKS
+# error "Maximum number of blocks per multiprocessor exceeded"
+#endif
+
+#if GPU_KERNEL_MAX_REGISTERS > GPU_THREAD_MAX_REGISTERS
+# error "Maximum number of registers per thread exceeded"
+#endif
diff --git a/intern/cycles/kernel/device/hip/globals.h b/intern/cycles/kernel/device/hip/globals.h
new file mode 100644
index 00000000000..28e1cc4282f
--- /dev/null
+++ b/intern/cycles/kernel/device/hip/globals.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#pragma once
+
+#include "kernel/kernel_profiling.h"
+#include "kernel/kernel_types.h"
+
+#include "kernel/integrator/integrator_state.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Not actually used; just a NULL pointer that gets passed everywhere, which we
+ * hope gets optimized out by the compiler. */
+struct KernelGlobalsGPU {
+ int unused[1];
+};
+typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals;
+
+/* Global scene data and textures */
+__constant__ KernelData __data;
+#define KERNEL_TEX(type, name) __attribute__((used)) const __constant__ __device__ type *name;
+#include "kernel/kernel_textures.h"
+
+/* Integrator state */
+__constant__ IntegratorStateGPU __integrator_state;
+
+/* Abstraction macros */
+#define kernel_data __data
+#define kernel_tex_fetch(t, index) t[(index)]
+#define kernel_tex_array(t) (t)
+#define kernel_integrator_state __integrator_state
+
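/* Editorial sketch (not part of this patch): device code reads the global
 * tables declared above through these macros. `example_fetch_vertex` is a
 * hypothetical helper; `__tri_verts` is one of the tables declared via
 * kernel/kernel_textures.h. */
ccl_device_inline float4 example_fetch_vertex(const int index)
{
  return kernel_tex_fetch(__tri_verts, index);
}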
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/hip/kernel.cpp b/intern/cycles/kernel/device/hip/kernel.cpp
new file mode 100644
index 00000000000..c801320a2e1
--- /dev/null
+++ b/intern/cycles/kernel/device/hip/kernel.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* HIP kernel entry points */
+
+#ifdef __HIP_DEVICE_COMPILE__
+
+# include "kernel/device/hip/compat.h"
+# include "kernel/device/hip/config.h"
+# include "kernel/device/hip/globals.h"
+
+# include "kernel/device/gpu/image.h"
+# include "kernel/device/gpu/kernel.h"
+
+#endif
diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h
new file mode 100644
index 00000000000..77cea30914c
--- /dev/null
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#define __KERNEL_GPU__
+#define __KERNEL_METAL__
+#define CCL_NAMESPACE_BEGIN
+#define CCL_NAMESPACE_END
+
+#ifndef ATTR_FALLTHROUGH
+# define ATTR_FALLTHROUGH
+#endif
+
+#include <metal_atomic>
+#include <metal_pack>
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+#pragma clang diagnostic ignored "-Wunused-variable"
+#pragma clang diagnostic ignored "-Wsign-compare"
+
+/* Qualifiers */
+
+#define ccl_device
+#define ccl_device_inline ccl_device
+#define ccl_device_forceinline ccl_device
+#define ccl_device_noinline ccl_device __attribute__((noinline))
+#define ccl_device_noinline_cpu ccl_device
+#define ccl_global device
+#define ccl_static_constant static constant constexpr
+#define ccl_device_constant constant
+#define ccl_constant const device
+#define ccl_gpu_shared threadgroup
+#define ccl_private thread
+#define ccl_may_alias
+#define ccl_restrict __restrict
+#define ccl_loop_no_unroll
+#define ccl_align(n) alignas(n)
+#define ccl_optional_struct_init
+
+/* No assert supported for Metal */
+
+#define kernel_assert(cond)
+
+/* make_type definitions with Metal-style element initializers. */
+#ifdef make_float2
+# undef make_float2
+#endif
+#ifdef make_float3
+# undef make_float3
+#endif
+#ifdef make_float4
+# undef make_float4
+#endif
+#ifdef make_int2
+# undef make_int2
+#endif
+#ifdef make_int3
+# undef make_int3
+#endif
+#ifdef make_int4
+# undef make_int4
+#endif
+#ifdef make_uchar4
+# undef make_uchar4
+#endif
+
+#define make_float2(x, y) float2(x, y)
+#define make_float3(x, y, z) float3(x, y, z)
+#define make_float4(x, y, z, w) float4(x, y, z, w)
+#define make_int2(x, y) int2(x, y)
+#define make_int3(x, y, z) int3(x, y, z)
+#define make_int4(x, y, z, w) int4(x, y, z, w)
+#define make_uchar4(x, y, z, w) uchar4(x, y, z, w)
+
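/* Editorial sketch (not part of this patch): shared kernel code written with
 * the make_* constructors compiles unchanged under Metal, where make_float3
 * maps to the float3 constructor above. `example_midpoint` is a hypothetical
 * helper. */
ccl_device_inline float3 example_midpoint(const float3 a, const float3 b)
{
  return make_float3(0.5f * (a.x + b.x), 0.5f * (a.y + b.y), 0.5f * (a.z + b.z));
}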
+/* Math functions */
+
+#define __uint_as_float(x) as_type<float>(x)
+#define __float_as_uint(x) as_type<uint>(x)
+#define __int_as_float(x) as_type<float>(x)
+#define __float_as_int(x) as_type<int>(x)
+#define __float2half(x) half(x)
+#define powf(x, y) pow(float(x), float(y))
+#define fabsf(x) fabs(float(x))
+#define copysignf(x, y) copysign(float(x), float(y))
+#define asinf(x) asin(float(x))
+#define acosf(x) acos(float(x))
+#define atanf(x) atan(float(x))
+#define floorf(x) floor(float(x))
+#define ceilf(x) ceil(float(x))
+#define hypotf(x, y) hypot(float(x), float(y))
+#define atan2f(x, y) atan2(float(x), float(y))
+#define fmaxf(x, y) fmax(float(x), float(y))
+#define fminf(x, y) fmin(float(x), float(y))
+#define fmodf(x, y) fmod(float(x), float(y))
+#define sinhf(x) sinh(float(x))
+#define coshf(x) cosh(float(x))
+#define tanhf(x) tanh(float(x))
+
+/* Use native functions with possibly lower precision for performance;
+ * no issues found so far. */
+#define trigmode fast
+#define sinf(x) trigmode::sin(float(x))
+#define cosf(x) trigmode::cos(float(x))
+#define tanf(x) trigmode::tan(float(x))
+#define expf(x) trigmode::exp(float(x))
+#define sqrtf(x) trigmode::sqrt(float(x))
+#define logf(x) trigmode::log(float(x))
+
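/* Editorial note (not part of this patch): Metal also ships full-precision
 * math in the `precise` namespace, so a hypothetical full-precision build
 * could swap the alias, e.g.
 *
 *   #define trigmode precise
 *
 * as long as that happens before any of the wrapped calls are expanded. */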
+#define NULL 0
diff --git a/intern/cycles/kernel/device/optix/compat.h b/intern/cycles/kernel/device/optix/compat.h
new file mode 100644
index 00000000000..d27b7d55475
--- /dev/null
+++ b/intern/cycles/kernel/device/optix/compat.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright 2019, NVIDIA Corporation.
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#define OPTIX_DONT_INCLUDE_CUDA
+#include <optix.h>
+
+#define __KERNEL_GPU__
+#define __KERNEL_CUDA__ /* OptiX kernels are implicitly CUDA kernels too */
+#define __KERNEL_OPTIX__
+#define CCL_NAMESPACE_BEGIN
+#define CCL_NAMESPACE_END
+
+#ifndef ATTR_FALLTHROUGH
+# define ATTR_FALLTHROUGH
+#endif
+
+/* Manual definitions so we can compile without CUDA toolkit. */
+
+#ifdef __CUDACC_RTC__
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+#else
+# include <stdint.h>
+#endif
+
+#ifdef CYCLES_CUBIN_CC
+# define FLT_MIN 1.175494350822287507969e-38f
+# define FLT_MAX 340282346638528859811704183484516925440.0f
+# define FLT_EPSILON 1.192092896e-07F
+#endif
+
+#define ccl_device \
+ __device__ __forceinline__ // Function calls are bad for OptiX performance, so inline everything
+#define ccl_device_inline ccl_device
+#define ccl_device_forceinline ccl_device
+#define ccl_device_noinline __device__ __noinline__
+#define ccl_device_noinline_cpu ccl_device
+#define ccl_global
+#define ccl_static_constant __constant__
+#define ccl_device_constant __constant__ __device__
+#define ccl_constant const
+#define ccl_gpu_shared __shared__
+#define ccl_private
+#define ccl_may_alias
+#define ccl_restrict __restrict__
+#define ccl_loop_no_unroll
+#define ccl_align(n) __align__(n)
+
+/* Zero initialize structs to help the compiler figure out scoping */
+#define ccl_optional_struct_init = {}
+
+/* No assert supported for CUDA */
+
+#define kernel_assert(cond)
+
+/* GPU thread, block, grid size and index */
+
+#define ccl_gpu_thread_idx_x (threadIdx.x)
+#define ccl_gpu_block_dim_x (blockDim.x)
+#define ccl_gpu_block_idx_x (blockIdx.x)
+#define ccl_gpu_grid_dim_x (gridDim.x)
+#define ccl_gpu_warp_size (warpSize)
+
+#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
+#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
+
+/* GPU warp synchronization. */
+
+#define ccl_gpu_syncthreads() __syncthreads()
+#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
+#define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down_sync(mask, var, delta)
+#define ccl_gpu_popc(x) __popc(x)
+
+/* GPU texture objects */
+
+typedef unsigned long long CUtexObject;
+typedef CUtexObject ccl_gpu_tex_object;
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj,
+ const float x,
+ const float y)
+{
+ return tex2D<T>(texobj, x, y);
+}
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj,
+ const float x,
+ const float y,
+ const float z)
+{
+ return tex3D<T>(texobj, x, y, z);
+}
+
+/* Half */
+
+typedef unsigned short half;
+
+__device__ half __float2half(const float f)
+{
+ half val;
+ asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
+ return val;
+}
+
+__device__ float __half2float(const half h)
+{
+ float val;
+ asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h));
+ return val;
+}
+
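/* Editorial sketch (not part of this patch): round-tripping a float through
 * the PTX conversions above; `example_quantize` is a hypothetical helper that
 * exposes the precision loss of storing to half. */
__device__ float example_quantize(const float f)
{
  return __half2float(__float2half(f));
}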
+/* Types */
+
+#include "util/util_half.h"
+#include "util/util_types.h"
diff --git a/intern/cycles/kernel/device/optix/globals.h b/intern/cycles/kernel/device/optix/globals.h
new file mode 100644
index 00000000000..7b8ebfe50e6
--- /dev/null
+++ b/intern/cycles/kernel/device/optix/globals.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#pragma once
+
+#include "kernel/kernel_profiling.h"
+#include "kernel/kernel_types.h"
+
+#include "kernel/integrator/integrator_state.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Not actually used; just a NULL pointer that gets passed everywhere, which we
+ * hope gets optimized out by the compiler. */
+struct KernelGlobalsGPU {
+ int unused[1];
+};
+typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals;
+
+/* Launch parameters */
+struct KernelParamsOptiX {
+ /* Kernel arguments */
+ const int *path_index_array;
+ float *render_buffer;
+
+ /* Global scene data and textures */
+ KernelData data;
+#define KERNEL_TEX(type, name) const type *name;
+#include "kernel/kernel_textures.h"
+
+ /* Integrator state */
+ IntegratorStateGPU __integrator_state;
+};
+
+#ifdef __NVCC__
+extern "C" static __constant__ KernelParamsOptiX __params;
+#endif
+
+/* Abstraction macros */
+#define kernel_data __params.data
+#define kernel_tex_array(t) __params.t
+#define kernel_tex_fetch(t, index) __params.t[(index)]
+#define kernel_integrator_state __params.__integrator_state
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/optix/kernel.cu b/intern/cycles/kernel/device/optix/kernel.cu
new file mode 100644
index 00000000000..a3bafb9846c
--- /dev/null
+++ b/intern/cycles/kernel/device/optix/kernel.cu
@@ -0,0 +1,406 @@
+/*
+ * Copyright 2019, NVIDIA Corporation.
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// clang-format off
+#include "kernel/device/optix/compat.h"
+#include "kernel/device/optix/globals.h"
+
+#include "kernel/device/gpu/image.h" /* Texture lookup uses normal CUDA intrinsics. */
+
+#include "kernel/integrator/integrator_state.h"
+#include "kernel/integrator/integrator_state_flow.h"
+#include "kernel/integrator/integrator_state_util.h"
+
+#include "kernel/integrator/integrator_intersect_closest.h"
+#include "kernel/integrator/integrator_intersect_shadow.h"
+#include "kernel/integrator/integrator_intersect_subsurface.h"
+#include "kernel/integrator/integrator_intersect_volume_stack.h"
+
+// clang-format on
+
+template<typename T> ccl_device_forceinline T *get_payload_ptr_0()
+{
+ return pointer_unpack_from_uint<T>(optixGetPayload_0(), optixGetPayload_1());
+}
+template<typename T> ccl_device_forceinline T *get_payload_ptr_2()
+{
+ return pointer_unpack_from_uint<T>(optixGetPayload_2(), optixGetPayload_3());
+}
+
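/* Editorial sketch (not part of this patch): the matching pack side. A 64-bit
 * pointer is split across two 32-bit payload registers before optixTrace, and
 * the helpers above reassemble it inside the hit programs. The name and the
 * low/high word order here are illustrative assumptions. */
static __forceinline__ __device__ void example_pack_pointer(void *ptr,
                                                            unsigned int &p_lo,
                                                            unsigned int &p_hi)
{
  const unsigned long long bits = (unsigned long long)ptr;
  p_lo = (unsigned int)(bits & 0xFFFFFFFFull);
  p_hi = (unsigned int)(bits >> 32);
}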
+ccl_device_forceinline int get_object_id()
+{
+#ifdef __OBJECT_MOTION__
+ /* Always get the instance ID from the TLAS.
+ * There might be a motion transform node between TLAS and BLAS which does not have one. */
+ return optixGetInstanceIdFromHandle(optixGetTransformListHandle(0));
+#else
+ return optixGetInstanceId();
+#endif
+}
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_closest()
+{
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+ global_index;
+ integrator_intersect_closest(nullptr, path_index);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_shadow()
+{
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+ global_index;
+ integrator_intersect_shadow(nullptr, path_index);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_subsurface()
+{
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+ global_index;
+ integrator_intersect_subsurface(nullptr, path_index);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_volume_stack()
+{
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+ global_index;
+ integrator_intersect_volume_stack(nullptr, path_index);
+}
+
+extern "C" __global__ void __miss__kernel_optix_miss()
+{
+ /* 'kernel_path_lamp_emission' checks the intersection distance, so we need to set it even on a miss. */
+ optixSetPayload_0(__float_as_uint(optixGetRayTmax()));
+ optixSetPayload_5(PRIMITIVE_NONE);
+}
+
+extern "C" __global__ void __anyhit__kernel_optix_local_hit()
+{
+#ifdef __HAIR__
+ if (!optixIsTriangleHit()) {
+ /* Ignore curves. */
+ return optixIgnoreIntersection();
+ }
+#endif
+
+#ifdef __BVH_LOCAL__
+ const int object = get_object_id();
+ if (object != optixGetPayload_4() /* local_object */) {
+ /* Only intersect with matching object. */
+ return optixIgnoreIntersection();
+ }
+
+ const uint max_hits = optixGetPayload_5();
+ if (max_hits == 0) {
+ /* Special case for when no hit information is requested; just report that something was hit. */
+ optixSetPayload_5(true);
+ return optixTerminateRay();
+ }
+
+ int hit = 0;
+ uint *const lcg_state = get_payload_ptr_0<uint>();
+ LocalIntersection *const local_isect = get_payload_ptr_2<LocalIntersection>();
+
+ if (lcg_state) {
+ for (int i = min(max_hits, local_isect->num_hits) - 1; i >= 0; --i) {
+ if (optixGetRayTmax() == local_isect->hits[i].t) {
+ return optixIgnoreIntersection();
+ }
+ }
+
+ hit = local_isect->num_hits++;
+
+ if (local_isect->num_hits > max_hits) {
+ hit = lcg_step_uint(lcg_state) % local_isect->num_hits;
+ if (hit >= max_hits) {
+ return optixIgnoreIntersection();
+ }
+ }
+ }
+ else {
+ if (local_isect->num_hits && optixGetRayTmax() > local_isect->hits[0].t) {
+ /* Record closest intersection only.
+ * Do not terminate ray here, since there is no guarantee about distance ordering in any-hit.
+ */
+ return optixIgnoreIntersection();
+ }
+
+ local_isect->num_hits = 1;
+ }
+
+ const int prim = optixGetPrimitiveIndex();
+
+ Intersection *isect = &local_isect->hits[hit];
+ isect->t = optixGetRayTmax();
+ isect->prim = prim;
+ isect->object = get_object_id();
+ isect->type = kernel_tex_fetch(__objects, isect->object).primitive_type;
+
+ const float2 barycentrics = optixGetTriangleBarycentrics();
+ isect->u = 1.0f - barycentrics.y - barycentrics.x;
+ isect->v = barycentrics.x;
+
+ /* Record geometric normal. */
+ const uint tri_vindex = kernel_tex_fetch(__tri_vindex, prim).w;
+ const float3 tri_a = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 0));
+ const float3 tri_b = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 1));
+ const float3 tri_c = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 2));
+ local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));
+
+ /* Continue tracing (without this the trace call would return after the first hit). */
+ optixIgnoreIntersection();
+#endif
+}
+
+extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
+{
+#ifdef __SHADOW_RECORD_ALL__
+ int prim = optixGetPrimitiveIndex();
+ const uint object = get_object_id();
+# ifdef __VISIBILITY_FLAG__
+ const uint visibility = optixGetPayload_4();
+ if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
+ return optixIgnoreIntersection();
+ }
+# endif
+
+ float u = 0.0f, v = 0.0f;
+ int type = 0;
+ if (optixIsTriangleHit()) {
+ const float2 barycentrics = optixGetTriangleBarycentrics();
+ u = 1.0f - barycentrics.y - barycentrics.x;
+ v = barycentrics.x;
+ type = kernel_tex_fetch(__objects, object).primitive_type;
+ }
+# ifdef __HAIR__
+ else {
+ u = __uint_as_float(optixGetAttribute_0());
+ v = __uint_as_float(optixGetAttribute_1());
+
+ const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim);
+ type = segment.type;
+ prim = segment.prim;
+
+ /* Filter out curve endcaps. */
+ if (u == 0.0f || u == 1.0f) {
+ return optixIgnoreIntersection();
+ }
+ }
+# endif
+
+# ifndef __TRANSPARENT_SHADOWS__
+ /* No transparent shadows support compiled in, make opaque. */
+ optixSetPayload_5(true);
+ return optixTerminateRay();
+# else
+ const uint max_hits = optixGetPayload_3();
+ const uint num_hits_packed = optixGetPayload_2();
+ const uint num_recorded_hits = uint16_unpack_from_uint_0(num_hits_packed);
+ const uint num_hits = uint16_unpack_from_uint_1(num_hits_packed);
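  /* Editorial note (not part of this patch): two 16-bit counters share one
   * 32-bit payload register here. Conceptually (hypothetical helper names;
   * the exact bit order lives in the Cycles util headers and may differ):
   *   pack16(a, b)  -> (a << 16) | (b & 0xFFFF)
   *   unpack_hi(x)  -> x >> 16
   *   unpack_lo(x)  -> x & 0xFFFF */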
+
+ /* If no transparent shadows, all light is blocked and we can stop immediately. */
+ if (num_hits >= max_hits ||
+ !(intersection_get_shader_flags(NULL, prim, type) & SD_HAS_TRANSPARENT_SHADOW)) {
+ optixSetPayload_5(true);
+ return optixTerminateRay();
+ }
+
+ /* Always use baked shadow transparency for curves. */
+ if (type & PRIMITIVE_ALL_CURVE) {
+ float throughput = __uint_as_float(optixGetPayload_1());
+ throughput *= intersection_curve_shadow_transparency(nullptr, object, prim, u);
+ optixSetPayload_1(__float_as_uint(throughput));
+ optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits, num_hits + 1));
+
+ if (throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) {
+ optixSetPayload_4(true);
+ return optixTerminateRay();
+ }
+ else {
+ /* Continue tracing. */
+ optixIgnoreIntersection();
+ return;
+ }
+ }
+
+ /* Record transparent intersection. */
+ optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits + 1, num_hits + 1));
+
+ uint record_index = num_recorded_hits;
+
+ const IntegratorShadowState state = optixGetPayload_0();
+
+ const uint max_record_hits = min(max_hits, INTEGRATOR_SHADOW_ISECT_SIZE);
+ if (record_index >= max_record_hits) {
+ /* If maximum number of hits reached, find a hit to replace. */
+ float max_recorded_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, t);
+ uint max_recorded_hit = 0;
+
+ for (int i = 1; i < max_record_hits; i++) {
+ const float isect_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, i, t);
+ if (isect_t > max_recorded_t) {
+ max_recorded_t = isect_t;
+ max_recorded_hit = i;
+ }
+ }
+
+ if (optixGetRayTmax() >= max_recorded_t) {
+ /* Accept the hit, so that OptiX won't consider any more hits beyond the
+ * distance of the current hit. */
+ return;
+ }
+
+ record_index = max_recorded_hit;
+ }
+
+ INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, u) = u;
+ INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, v) = v;
+ INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, t) = optixGetRayTmax();
+ INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, prim) = prim;
+ INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, object) = object;
+ INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, type) = type;
+
+ /* Continue tracing. */
+ optixIgnoreIntersection();
+# endif /* __TRANSPARENT_SHADOWS__ */
+#endif /* __SHADOW_RECORD_ALL__ */
+}
+
+extern "C" __global__ void __anyhit__kernel_optix_volume_test()
+{
+#ifdef __HAIR__
+ if (!optixIsTriangleHit()) {
+ /* Ignore curves. */
+ return optixIgnoreIntersection();
+ }
+#endif
+
+ const uint object = get_object_id();
+#ifdef __VISIBILITY_FLAG__
+ const uint visibility = optixGetPayload_4();
+ if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
+ return optixIgnoreIntersection();
+ }
+#endif
+
+ if ((kernel_tex_fetch(__object_flag, object) & SD_OBJECT_HAS_VOLUME) == 0) {
+ return optixIgnoreIntersection();
+ }
+}
+
+extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
+{
+#ifdef __HAIR__
+ if (!optixIsTriangleHit()) {
+ /* Filter out curve endcaps. */
+ const float u = __uint_as_float(optixGetAttribute_0());
+ if (u == 0.0f || u == 1.0f) {
+ return optixIgnoreIntersection();
+ }
+ }
+#endif
+
+#ifdef __VISIBILITY_FLAG__
+ const uint object = get_object_id();
+ const uint visibility = optixGetPayload_4();
+ if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
+ return optixIgnoreIntersection();
+ }
+
+ /* Shadow ray early termination. */
+ if (visibility & PATH_RAY_SHADOW_OPAQUE) {
+ return optixTerminateRay();
+ }
+#endif
+}
+
+extern "C" __global__ void __closesthit__kernel_optix_hit()
+{
+ const int object = get_object_id();
+ const int prim = optixGetPrimitiveIndex();
+
+ optixSetPayload_0(__float_as_uint(optixGetRayTmax())); /* Intersection distance */
+ optixSetPayload_4(object);
+
+ if (optixIsTriangleHit()) {
+ const float2 barycentrics = optixGetTriangleBarycentrics();
+ optixSetPayload_1(__float_as_uint(1.0f - barycentrics.y - barycentrics.x));
+ optixSetPayload_2(__float_as_uint(barycentrics.x));
+ optixSetPayload_3(prim);
+ optixSetPayload_5(kernel_tex_fetch(__objects, object).primitive_type);
+ }
+ else {
+ const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim);
+ optixSetPayload_1(optixGetAttribute_0()); /* Same as 'optixGetCurveParameter()' */
+ optixSetPayload_2(optixGetAttribute_1());
+ optixSetPayload_3(segment.prim);
+ optixSetPayload_5(segment.type);
+ }
+}
+
+#ifdef __HAIR__
+ccl_device_inline void optix_intersection_curve(const int prim, const int type)
+{
+ const int object = get_object_id();
+
+# ifdef __VISIBILITY_FLAG__
+ const uint visibility = optixGetPayload_4();
+ if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
+ return;
+ }
+# endif
+
+ float3 P = optixGetObjectRayOrigin();
+ float3 dir = optixGetObjectRayDirection();
+
+ /* The direction is not normalized by default, but the curve intersection routine expects a normalized direction. */
+ float len;
+ dir = normalize_len(dir, &len);
+
+# ifdef __OBJECT_MOTION__
+ const float time = optixGetRayTime();
+# else
+ const float time = 0.0f;
+# endif
+
+ Intersection isect;
+ isect.t = optixGetRayTmax();
+ /* Transform maximum distance into object space. */
+ if (isect.t != FLT_MAX)
+ isect.t *= len;
+
+ if (curve_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) {
+ optixReportIntersection(isect.t / len,
+ type & PRIMITIVE_ALL,
+ __float_as_int(isect.u), /* Attribute_0 */
+ __float_as_int(isect.v)); /* Attribute_1 */
+ }
+}
+
+extern "C" __global__ void __intersection__curve_ribbon()
+{
+ const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, optixGetPrimitiveIndex());
+ const int prim = segment.prim;
+ const int type = segment.type;
+ if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) {
+ optix_intersection_curve(prim, type);
+ }
+}
+#endif
diff --git a/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu
new file mode 100644
index 00000000000..bf787e29eaa
--- /dev/null
+++ b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2021, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Copy of the regular kernels with an additional shader ray-tracing kernel that
+ * takes much longer to compile. This is only loaded when needed by the scene. */
+
+#include "kernel/device/optix/kernel.cu"
+#include "kernel/integrator/integrator_shade_surface.h"
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_surface_raytrace()
+{
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+ global_index;
+ integrator_shade_surface_raytrace(nullptr, path_index, __params.render_buffer);
+}