From 08031197250aeecbaca3803254e6f25b8c7b7b37 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Mon, 20 Sep 2021 17:59:20 +0200 Subject: Cycles: merge of cycles-x branch, a major update to the renderer This includes much improved GPU rendering performance, viewport interactivity, new shadow catcher, revamped sampling settings, subsurface scattering anisotropy, new GPU volume sampling, improved PMJ sampling pattern, and more. Some features have also been removed or changed, breaking backwards compatibility. Including the removal of the OpenCL backend, for which alternatives are under development. Release notes and code docs: https://wiki.blender.org/wiki/Reference/Release_Notes/3.0/Cycles https://wiki.blender.org/wiki/Source/Render/Cycles Credits: * Sergey Sharybin * Brecht Van Lommel * Patrick Mours (OptiX backend) * Christophe Hery (subsurface scattering anisotropy) * William Leeson (PMJ sampling pattern) * Alaska (various fixes and tweaks) * Thomas Dinges (various fixes) For the full commit history, see the cycles-x branch. This squashes together all the changes since intermediate changes would often fail building or tests. Ref T87839, T87837, T87836 Fixes T90734, T89353, T80267, T80267, T77185, T69800 --- intern/cycles/kernel/device/cpu/compat.h | 101 +++ intern/cycles/kernel/device/cpu/globals.h | 61 ++ intern/cycles/kernel/device/cpu/image.h | 657 ++++++++++++++++ intern/cycles/kernel/device/cpu/kernel.cpp | 94 +++ intern/cycles/kernel/device/cpu/kernel.h | 62 ++ intern/cycles/kernel/device/cpu/kernel_arch.h | 113 +++ intern/cycles/kernel/device/cpu/kernel_arch_impl.h | 235 ++++++ intern/cycles/kernel/device/cpu/kernel_avx.cpp | 39 + intern/cycles/kernel/device/cpu/kernel_avx2.cpp | 40 + intern/cycles/kernel/device/cpu/kernel_sse2.cpp | 34 + intern/cycles/kernel/device/cpu/kernel_sse3.cpp | 36 + intern/cycles/kernel/device/cpu/kernel_sse41.cpp | 37 + intern/cycles/kernel/device/cuda/compat.h | 135 ++++ intern/cycles/kernel/device/cuda/config.h | 114 +++ intern/cycles/kernel/device/cuda/globals.h | 48 ++ intern/cycles/kernel/device/cuda/kernel.cu | 28 + intern/cycles/kernel/device/gpu/image.h | 278 +++++++ intern/cycles/kernel/device/gpu/kernel.h | 843 +++++++++++++++++++++ .../kernel/device/gpu/parallel_active_index.h | 83 ++ .../cycles/kernel/device/gpu/parallel_prefix_sum.h | 46 ++ intern/cycles/kernel/device/gpu/parallel_reduce.h | 83 ++ .../kernel/device/gpu/parallel_sorted_index.h | 49 ++ intern/cycles/kernel/device/optix/compat.h | 127 ++++ intern/cycles/kernel/device/optix/globals.h | 59 ++ intern/cycles/kernel/device/optix/kernel.cu | 347 +++++++++ .../kernel/device/optix/kernel_shader_raytrace.cu | 29 + 26 files changed, 3778 insertions(+) create mode 100644 intern/cycles/kernel/device/cpu/compat.h create mode 100644 intern/cycles/kernel/device/cpu/globals.h create mode 100644 intern/cycles/kernel/device/cpu/image.h create mode 100644 intern/cycles/kernel/device/cpu/kernel.cpp create mode 100644 intern/cycles/kernel/device/cpu/kernel.h create mode 100644 intern/cycles/kernel/device/cpu/kernel_arch.h create mode 100644 intern/cycles/kernel/device/cpu/kernel_arch_impl.h create mode 100644 intern/cycles/kernel/device/cpu/kernel_avx.cpp create mode 100644 intern/cycles/kernel/device/cpu/kernel_avx2.cpp create mode 100644 intern/cycles/kernel/device/cpu/kernel_sse2.cpp create mode 100644 intern/cycles/kernel/device/cpu/kernel_sse3.cpp create mode 100644 intern/cycles/kernel/device/cpu/kernel_sse41.cpp create mode 100644 intern/cycles/kernel/device/cuda/compat.h create mode 
100644 intern/cycles/kernel/device/cuda/config.h create mode 100644 intern/cycles/kernel/device/cuda/globals.h create mode 100644 intern/cycles/kernel/device/cuda/kernel.cu create mode 100644 intern/cycles/kernel/device/gpu/image.h create mode 100644 intern/cycles/kernel/device/gpu/kernel.h create mode 100644 intern/cycles/kernel/device/gpu/parallel_active_index.h create mode 100644 intern/cycles/kernel/device/gpu/parallel_prefix_sum.h create mode 100644 intern/cycles/kernel/device/gpu/parallel_reduce.h create mode 100644 intern/cycles/kernel/device/gpu/parallel_sorted_index.h create mode 100644 intern/cycles/kernel/device/optix/compat.h create mode 100644 intern/cycles/kernel/device/optix/globals.h create mode 100644 intern/cycles/kernel/device/optix/kernel.cu create mode 100644 intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu (limited to 'intern/cycles/kernel/device') diff --git a/intern/cycles/kernel/device/cpu/compat.h b/intern/cycles/kernel/device/cpu/compat.h new file mode 100644 index 00000000000..bfd936c7bbd --- /dev/null +++ b/intern/cycles/kernel/device/cpu/compat.h @@ -0,0 +1,101 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#define __KERNEL_CPU__ + +/* Release kernel has too much false-positive maybe-uninitialized warnings, + * which makes it possible to miss actual warnings. + */ +#if (defined(__GNUC__) && !defined(__clang__)) && defined(NDEBUG) +# pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +# pragma GCC diagnostic ignored "-Wuninitialized" +#endif + +#include "util/util_half.h" +#include "util/util_math.h" +#include "util/util_simd.h" +#include "util/util_texture.h" +#include "util/util_types.h" + +#define ccl_addr_space + +/* On x86_64, versions of glibc < 2.16 have an issue where expf is + * much slower than the double version. This was fixed in glibc 2.16. + */ +#if !defined(__KERNEL_GPU__) && defined(__x86_64__) && defined(__x86_64__) && \ + defined(__GNU_LIBRARY__) && defined(__GLIBC__) && defined(__GLIBC_MINOR__) && \ + (__GLIBC__ <= 2 && __GLIBC_MINOR__ < 16) +# define expf(x) ((float)exp((double)(x))) +#endif + +CCL_NAMESPACE_BEGIN + +/* Assertions inside the kernel only work for the CPU device, so we wrap it in + * a macro which is empty for other devices */ + +#define kernel_assert(cond) assert(cond) + +/* Texture types to be compatible with CUDA textures. These are really just + * simple arrays and after inlining fetch hopefully revert to being a simple + * pointer lookup. 
*/ +template<typename T> struct texture { + ccl_always_inline const T &fetch(int index) const + { + kernel_assert(index >= 0 && index < width); + return data[index]; + } + + T *data; + int width; +}; + +/* Macros to handle different memory storage on different devices */ + +#ifdef __KERNEL_SSE2__ +typedef vector3<sseb> sse3b; +typedef vector3<ssef> sse3f; +typedef vector3<ssei> sse3i; + +ccl_device_inline void print_sse3b(const char *label, sse3b &a) +{ + print_sseb(label, a.x); + print_sseb(label, a.y); + print_sseb(label, a.z); +} + +ccl_device_inline void print_sse3f(const char *label, sse3f &a) +{ + print_ssef(label, a.x); + print_ssef(label, a.y); + print_ssef(label, a.z); +} + +ccl_device_inline void print_sse3i(const char *label, sse3i &a) +{ + print_ssei(label, a.x); + print_ssei(label, a.y); + print_ssei(label, a.z); +} + +# if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__) +typedef vector3<avxf> avx3f; +# endif + +#endif + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/cpu/globals.h b/intern/cycles/kernel/device/cpu/globals.h new file mode 100644 index 00000000000..98b036e269d --- /dev/null +++ b/intern/cycles/kernel/device/cpu/globals.h @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Constant Globals */ + +#pragma once + +#include "kernel/kernel_profiling.h" +#include "kernel/kernel_types.h" + +CCL_NAMESPACE_BEGIN + +/* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in + * the kernel, to access constant data. These are all stored as "textures", but + * they are really just standard arrays. We can't actually use globals because + * multiple renders may be running inside the same process. */ + +#ifdef __OSL__ +struct OSLGlobals; +struct OSLThreadData; +struct OSLShadingSystem; +#endif + +typedef struct KernelGlobals { +#define KERNEL_TEX(type, name) texture<type> name; +#include "kernel/kernel_textures.h" + + KernelData __data; + +#ifdef __OSL__ + /* On the CPU, we also have the OSL globals here. Most data structures are shared + * with SVM, the difference is in the shaders and object/mesh attributes. */ + OSLGlobals *osl; + OSLShadingSystem *osl_ss; + OSLThreadData *osl_tdata; +#endif + + /* **** Run-time data **** */ + + ProfilingState profiler; +} KernelGlobals; + +/* Abstraction macros */ +#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index)) +#define kernel_tex_array(tex) (kg->tex.data) +#define kernel_data (kg->__data) + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/cpu/image.h b/intern/cycles/kernel/device/cpu/image.h new file mode 100644 index 00000000000..57e81ab186d --- /dev/null +++ b/intern/cycles/kernel/device/cpu/image.h @@ -0,0 +1,657 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
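As a usage sketch (not part of the patch itself): kernel code reads constant data through the abstraction macros defined just above, so the same call works for every entry of kernel/kernel_textures.h. The texture name __object_flag and its uint element type are illustrative assumptions here.

ccl_device_inline bool example_object_has_flag(const KernelGlobals *kg, int object, uint flag)
{
  /* On the CPU this expands to kg->__object_flag.fetch(object), a bounds-checked array read. */
  const uint object_flags = kernel_tex_fetch(__object_flag, object);
  return (object_flags & flag) != 0;
}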
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_NANOVDB +# define NANOVDB_USE_INTRINSICS +# include <nanovdb/NanoVDB.h> +# include <nanovdb/util/SampleFromVoxels.h> +#endif + +CCL_NAMESPACE_BEGIN + +/* Make template functions private so symbols don't conflict between kernels with different + * instruction sets. */ +namespace { + +#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \ + { \ + u[0] = (((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f); \ + u[1] = ((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f); \ + u[2] = ((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f); \ + u[3] = (1.0f / 6.0f) * t * t * t; \ + } \ + (void)0 + +ccl_device_inline float frac(float x, int *ix) +{ + int i = float_to_int(x) - ((x < 0.0f) ? 1 : 0); + *ix = i; + return x - (float)i; +} + +template<typename T> struct TextureInterpolator { + + static ccl_always_inline float4 read(float4 r) + { + return r; + } + + static ccl_always_inline float4 read(uchar4 r) + { + float f = 1.0f / 255.0f; + return make_float4(r.x * f, r.y * f, r.z * f, r.w * f); + } + + static ccl_always_inline float4 read(uchar r) + { + float f = r * (1.0f / 255.0f); + return make_float4(f, f, f, 1.0f); + } + + static ccl_always_inline float4 read(float r) + { + /* TODO(dingto): Optimize this, so interpolation + * happens on float instead of float4 */ + return make_float4(r, r, r, 1.0f); + } + + static ccl_always_inline float4 read(half4 r) + { + return half4_to_float4(r); + } + + static ccl_always_inline float4 read(half r) + { + float f = half_to_float(r); + return make_float4(f, f, f, 1.0f); + } + + static ccl_always_inline float4 read(uint16_t r) + { + float f = r * (1.0f / 65535.0f); + return make_float4(f, f, f, 1.0f); + } + + static ccl_always_inline float4 read(ushort4 r) + { + float f = 1.0f / 65535.0f; + return make_float4(r.x * f, r.y * f, r.z * f, r.w * f); + } + + static ccl_always_inline float4 read(const T *data, int x, int y, int width, int height) + { + if (x < 0 || y < 0 || x >= width || y >= height) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + return read(data[y * width + x]); + } + + static ccl_always_inline int wrap_periodic(int x, int width) + { + x %= width; + if (x < 0) + x += width; + return x; + } + + static ccl_always_inline int wrap_clamp(int x, int width) + { + return clamp(x, 0, width - 1); + } + + /* ******** 2D interpolation ******** */ + + static ccl_always_inline float4 interp_closest(const TextureInfo &info, float x, float y) + { + const T *data = (const T *)info.data; + const int width = info.width; + const int height = info.height; + int ix, iy; + frac(x * (float)width, &ix); + frac(y * (float)height, &iy); + switch (info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + break; + case EXTENSION_CLIP: + if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + ATTR_FALLTHROUGH; + case EXTENSION_EXTEND: + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + return read(data[ix + iy * width]); + } + + static ccl_always_inline float4 interp_linear(const TextureInfo &info,
float x, float y) + { + const T *data = (const T *)info.data; + const int width = info.width; + const int height = info.height; + int ix, iy, nix, niy; + const float tx = frac(x * (float)width - 0.5f, &ix); + const float ty = frac(y * (float)height - 0.5f, &iy); + switch (info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + nix = wrap_periodic(ix + 1, width); + niy = wrap_periodic(iy + 1, height); + break; + case EXTENSION_CLIP: + nix = ix + 1; + niy = iy + 1; + break; + case EXTENSION_EXTEND: + nix = wrap_clamp(ix + 1, width); + niy = wrap_clamp(iy + 1, height); + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + return (1.0f - ty) * (1.0f - tx) * read(data, ix, iy, width, height) + + (1.0f - ty) * tx * read(data, nix, iy, width, height) + + ty * (1.0f - tx) * read(data, ix, niy, width, height) + + ty * tx * read(data, nix, niy, width, height); + } + + static ccl_always_inline float4 interp_cubic(const TextureInfo &info, float x, float y) + { + const T *data = (const T *)info.data; + const int width = info.width; + const int height = info.height; + int ix, iy, nix, niy; + const float tx = frac(x * (float)width - 0.5f, &ix); + const float ty = frac(y * (float)height - 0.5f, &iy); + int pix, piy, nnix, nniy; + switch (info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + pix = wrap_periodic(ix - 1, width); + piy = wrap_periodic(iy - 1, height); + nix = wrap_periodic(ix + 1, width); + niy = wrap_periodic(iy + 1, height); + nnix = wrap_periodic(ix + 2, width); + nniy = wrap_periodic(iy + 2, height); + break; + case EXTENSION_CLIP: + pix = ix - 1; + piy = iy - 1; + nix = ix + 1; + niy = iy + 1; + nnix = ix + 2; + nniy = iy + 2; + break; + case EXTENSION_EXTEND: + pix = wrap_clamp(ix - 1, width); + piy = wrap_clamp(iy - 1, height); + nix = wrap_clamp(ix + 1, width); + niy = wrap_clamp(iy + 1, height); + nnix = wrap_clamp(ix + 2, width); + nniy = wrap_clamp(iy + 2, height); + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + const int xc[4] = {pix, ix, nix, nnix}; + const int yc[4] = {piy, iy, niy, nniy}; + float u[4], v[4]; + /* Some helper macro to keep code reasonable size, + * let compiler to inline all the matrix multiplications. + */ +#define DATA(x, y) (read(data, xc[x], yc[y], width, height)) +#define TERM(col) \ + (v[col] * \ + (u[0] * DATA(0, col) + u[1] * DATA(1, col) + u[2] * DATA(2, col) + u[3] * DATA(3, col))) + + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); + + /* Actual interpolation. 
*/ + return TERM(0) + TERM(1) + TERM(2) + TERM(3); +#undef TERM +#undef DATA + } + + static ccl_always_inline float4 interp(const TextureInfo &info, float x, float y) + { + if (UNLIKELY(!info.data)) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + switch (info.interpolation) { + case INTERPOLATION_CLOSEST: + return interp_closest(info, x, y); + case INTERPOLATION_LINEAR: + return interp_linear(info, x, y); + default: + return interp_cubic(info, x, y); + } + } + + /* ******** 3D interpolation ******** */ + + static ccl_always_inline float4 interp_3d_closest(const TextureInfo &info, + float x, + float y, + float z) + { + int width = info.width; + int height = info.height; + int depth = info.depth; + int ix, iy, iz; + + frac(x * (float)width, &ix); + frac(y * (float)height, &iy); + frac(z * (float)depth, &iz); + + switch (info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + iz = wrap_periodic(iz, depth); + break; + case EXTENSION_CLIP: + if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + ATTR_FALLTHROUGH; + case EXTENSION_EXTEND: + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + iz = wrap_clamp(iz, depth); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + + const T *data = (const T *)info.data; + return read(data[ix + iy * width + iz * width * height]); + } + + static ccl_always_inline float4 interp_3d_linear(const TextureInfo &info, + float x, + float y, + float z) + { + int width = info.width; + int height = info.height; + int depth = info.depth; + int ix, iy, iz; + int nix, niy, niz; + + float tx = frac(x * (float)width - 0.5f, &ix); + float ty = frac(y * (float)height - 0.5f, &iy); + float tz = frac(z * (float)depth - 0.5f, &iz); + + switch (info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + iz = wrap_periodic(iz, depth); + + nix = wrap_periodic(ix + 1, width); + niy = wrap_periodic(iy + 1, height); + niz = wrap_periodic(iz + 1, depth); + break; + case EXTENSION_CLIP: + if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + ATTR_FALLTHROUGH; + case EXTENSION_EXTEND: + nix = wrap_clamp(ix + 1, width); + niy = wrap_clamp(iy + 1, height); + niz = wrap_clamp(iz + 1, depth); + + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + iz = wrap_clamp(iz, depth); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + + const T *data = (const T *)info.data; + float4 r; + + r = (1.0f - tz) * (1.0f - ty) * (1.0f - tx) * + read(data[ix + iy * width + iz * width * height]); + r += (1.0f - tz) * (1.0f - ty) * tx * read(data[nix + iy * width + iz * width * height]); + r += (1.0f - tz) * ty * (1.0f - tx) * read(data[ix + niy * width + iz * width * height]); + r += (1.0f - tz) * ty * tx * read(data[nix + niy * width + iz * width * height]); + + r += tz * (1.0f - ty) * (1.0f - tx) * read(data[ix + iy * width + niz * width * height]); + r += tz * (1.0f - ty) * tx * read(data[nix + iy * width + niz * width * height]); + r += tz * ty * (1.0f - tx) * read(data[ix + niy * width + niz * width * height]); + r += tz * ty * tx * read(data[nix + niy * width + niz * width * height]); + + return r; + } + + /* TODO(sergey): For some unspeakable reason both GCC-6 and Clang-3.9 are + * causing stack overflow issue in this function unless it is inlined. 
+ * + * Only happens for AVX2 kernel and global __KERNEL_SSE__ vectorization + * enabled. + */ +#if defined(__GNUC__) || defined(__clang__) + static ccl_always_inline +#else + static ccl_never_inline +#endif + float4 + interp_3d_cubic(const TextureInfo &info, float x, float y, float z) + { + int width = info.width; + int height = info.height; + int depth = info.depth; + int ix, iy, iz; + int nix, niy, niz; + /* Tricubic b-spline interpolation. */ + const float tx = frac(x * (float)width - 0.5f, &ix); + const float ty = frac(y * (float)height - 0.5f, &iy); + const float tz = frac(z * (float)depth - 0.5f, &iz); + int pix, piy, piz, nnix, nniy, nniz; + + switch (info.extension) { + case EXTENSION_REPEAT: + ix = wrap_periodic(ix, width); + iy = wrap_periodic(iy, height); + iz = wrap_periodic(iz, depth); + + pix = wrap_periodic(ix - 1, width); + piy = wrap_periodic(iy - 1, height); + piz = wrap_periodic(iz - 1, depth); + + nix = wrap_periodic(ix + 1, width); + niy = wrap_periodic(iy + 1, height); + niz = wrap_periodic(iz + 1, depth); + + nnix = wrap_periodic(ix + 2, width); + nniy = wrap_periodic(iy + 2, height); + nniz = wrap_periodic(iz + 2, depth); + break; + case EXTENSION_CLIP: + if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + ATTR_FALLTHROUGH; + case EXTENSION_EXTEND: + pix = wrap_clamp(ix - 1, width); + piy = wrap_clamp(iy - 1, height); + piz = wrap_clamp(iz - 1, depth); + + nix = wrap_clamp(ix + 1, width); + niy = wrap_clamp(iy + 1, height); + niz = wrap_clamp(iz + 1, depth); + + nnix = wrap_clamp(ix + 2, width); + nniy = wrap_clamp(iy + 2, height); + nniz = wrap_clamp(iz + 2, depth); + + ix = wrap_clamp(ix, width); + iy = wrap_clamp(iy, height); + iz = wrap_clamp(iz, depth); + break; + default: + kernel_assert(0); + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + + const int xc[4] = {pix, ix, nix, nnix}; + const int yc[4] = {width * piy, width * iy, width * niy, width * nniy}; + const int zc[4] = { + width * height * piz, width * height * iz, width * height * niz, width * height * nniz}; + float u[4], v[4], w[4]; + + /* Some helper macro to keep code reasonable size, + * let compiler to inline all the matrix multiplications. + */ +#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]])) +#define COL_TERM(col, row) \ + (v[col] * (u[0] * DATA(0, col, row) + u[1] * DATA(1, col, row) + u[2] * DATA(2, col, row) + \ + u[3] * DATA(3, col, row))) +#define ROW_TERM(row) \ + (w[row] * (COL_TERM(0, row) + COL_TERM(1, row) + COL_TERM(2, row) + COL_TERM(3, row))) + + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); + SET_CUBIC_SPLINE_WEIGHTS(w, tz); + + /* Actual interpolation. */ + const T *data = (const T *)info.data; + return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3); + +#undef COL_TERM +#undef ROW_TERM +#undef DATA + } + + static ccl_always_inline float4 + interp_3d(const TextureInfo &info, float x, float y, float z, InterpolationType interp) + { + if (UNLIKELY(!info.data)) + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + + switch ((interp == INTERPOLATION_NONE) ? 
info.interpolation : interp) { + case INTERPOLATION_CLOSEST: + return interp_3d_closest(info, x, y, z); + case INTERPOLATION_LINEAR: + return interp_3d_linear(info, x, y, z); + default: + return interp_3d_cubic(info, x, y, z); + } + } +}; + +#ifdef WITH_NANOVDB +template<typename T> struct NanoVDBInterpolator { + + typedef typename nanovdb::NanoGrid<T>::AccessorType AccessorType; + + static ccl_always_inline float4 read(float r) + { + return make_float4(r, r, r, 1.0f); + } + + static ccl_always_inline float4 read(nanovdb::Vec3f r) + { + return make_float4(r[0], r[1], r[2], 1.0f); + } + + static ccl_always_inline float4 interp_3d_closest(const AccessorType &acc, + float x, + float y, + float z) + { + const nanovdb::Vec3f xyz(x, y, z); + return read(nanovdb::SampleFromVoxels<AccessorType, 0, false>(acc)(xyz)); + } + + static ccl_always_inline float4 interp_3d_linear(const AccessorType &acc, + float x, + float y, + float z) + { + const nanovdb::Vec3f xyz(x - 0.5f, y - 0.5f, z - 0.5f); + return read(nanovdb::SampleFromVoxels<AccessorType, 1, false>(acc)(xyz)); + } + +# if defined(__GNUC__) || defined(__clang__) + static ccl_always_inline +# else + static ccl_never_inline +# endif + float4 + interp_3d_cubic(const AccessorType &acc, float x, float y, float z) + { + int ix, iy, iz; + int nix, niy, niz; + int pix, piy, piz; + int nnix, nniy, nniz; + /* Tricubic b-spline interpolation. */ + const float tx = frac(x - 0.5f, &ix); + const float ty = frac(y - 0.5f, &iy); + const float tz = frac(z - 0.5f, &iz); + pix = ix - 1; + piy = iy - 1; + piz = iz - 1; + nix = ix + 1; + niy = iy + 1; + niz = iz + 1; + nnix = ix + 2; + nniy = iy + 2; + nniz = iz + 2; + + const int xc[4] = {pix, ix, nix, nnix}; + const int yc[4] = {piy, iy, niy, nniy}; + const int zc[4] = {piz, iz, niz, nniz}; + float u[4], v[4], w[4]; + + /* Some helper macro to keep code reasonable size, + * let compiler to inline all the matrix multiplications. + */ +# define DATA(x, y, z) (read(acc.getValue(nanovdb::Coord(xc[x], yc[y], zc[z])))) +# define COL_TERM(col, row) \ + (v[col] * (u[0] * DATA(0, col, row) + u[1] * DATA(1, col, row) + u[2] * DATA(2, col, row) + \ + u[3] * DATA(3, col, row))) +# define ROW_TERM(row) \ + (w[row] * (COL_TERM(0, row) + COL_TERM(1, row) + COL_TERM(2, row) + COL_TERM(3, row))) + + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); + SET_CUBIC_SPLINE_WEIGHTS(w, tz); + + /* Actual interpolation. */ + return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3); + +# undef COL_TERM +# undef ROW_TERM +# undef DATA + } + + static ccl_always_inline float4 + interp_3d(const TextureInfo &info, float x, float y, float z, InterpolationType interp) + { + using namespace nanovdb; + + NanoGrid<T> *const grid = (NanoGrid<T> *)info.data; + AccessorType acc = grid->getAccessor(); + + switch ((interp == INTERPOLATION_NONE) ?
info.interpolation : interp) { + case INTERPOLATION_CLOSEST: + return interp_3d_closest(acc, x, y, z); + case INTERPOLATION_LINEAR: + return interp_3d_linear(acc, x, y, z); + default: + return interp_3d_cubic(acc, x, y, z); + } + } +}; +#endif + +#undef SET_CUBIC_SPLINE_WEIGHTS + +ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y) +{ + const TextureInfo &info = kernel_tex_fetch(__texture_info, id); + + switch (info.data_type) { + case IMAGE_DATA_TYPE_HALF: + return TextureInterpolator<half>::interp(info, x, y); + case IMAGE_DATA_TYPE_BYTE: + return TextureInterpolator<uchar>::interp(info, x, y); + case IMAGE_DATA_TYPE_USHORT: + return TextureInterpolator<uint16_t>::interp(info, x, y); + case IMAGE_DATA_TYPE_FLOAT: + return TextureInterpolator<float>::interp(info, x, y); + case IMAGE_DATA_TYPE_HALF4: + return TextureInterpolator<half4>::interp(info, x, y); + case IMAGE_DATA_TYPE_BYTE4: + return TextureInterpolator<uchar4>::interp(info, x, y); + case IMAGE_DATA_TYPE_USHORT4: + return TextureInterpolator<ushort4>::interp(info, x, y); + case IMAGE_DATA_TYPE_FLOAT4: + return TextureInterpolator<float4>::interp(info, x, y); + default: + assert(0); + return make_float4( + TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B, TEX_IMAGE_MISSING_A); + } +} + +ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg, + int id, + float3 P, + InterpolationType interp) +{ + const TextureInfo &info = kernel_tex_fetch(__texture_info, id); + + if (info.use_transform_3d) { + P = transform_point(&info.transform_3d, P); + } + + switch (info.data_type) { + case IMAGE_DATA_TYPE_HALF: + return TextureInterpolator<half>::interp_3d(info, P.x, P.y, P.z, interp); + case IMAGE_DATA_TYPE_BYTE: + return TextureInterpolator<uchar>::interp_3d(info, P.x, P.y, P.z, interp); + case IMAGE_DATA_TYPE_USHORT: + return TextureInterpolator<uint16_t>::interp_3d(info, P.x, P.y, P.z, interp); + case IMAGE_DATA_TYPE_FLOAT: + return TextureInterpolator<float>::interp_3d(info, P.x, P.y, P.z, interp); + case IMAGE_DATA_TYPE_HALF4: + return TextureInterpolator<half4>::interp_3d(info, P.x, P.y, P.z, interp); + case IMAGE_DATA_TYPE_BYTE4: + return TextureInterpolator<uchar4>::interp_3d(info, P.x, P.y, P.z, interp); + case IMAGE_DATA_TYPE_USHORT4: + return TextureInterpolator<ushort4>::interp_3d(info, P.x, P.y, P.z, interp); + case IMAGE_DATA_TYPE_FLOAT4: + return TextureInterpolator<float4>::interp_3d(info, P.x, P.y, P.z, interp); +#ifdef WITH_NANOVDB + case IMAGE_DATA_TYPE_NANOVDB_FLOAT: + return NanoVDBInterpolator<float>::interp_3d(info, P.x, P.y, P.z, interp); + case IMAGE_DATA_TYPE_NANOVDB_FLOAT3: + return NanoVDBInterpolator<nanovdb::Vec3f>::interp_3d(info, P.x, P.y, P.z, interp); +#endif + default: + assert(0); + return make_float4( + TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B, TEX_IMAGE_MISSING_A); + } +} + +} /* Namespace. */ + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/cpu/kernel.cpp b/intern/cycles/kernel/device/cpu/kernel.cpp new file mode 100644 index 00000000000..ac1cdf5fffe --- /dev/null +++ b/intern/cycles/kernel/device/cpu/kernel.cpp @@ -0,0 +1,94 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CPU kernel entry points */ + +/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this + * one with SSE2 intrinsics. + */ +#if defined(__x86_64__) || defined(_M_X64) +# define __KERNEL_SSE2__ +#endif + +/* When building kernel for native machine detect kernel features from the flags + * set by compiler. + */ +#ifdef WITH_KERNEL_NATIVE +# ifdef __SSE2__ +# ifndef __KERNEL_SSE2__ +# define __KERNEL_SSE2__ +# endif +# endif +# ifdef __SSE3__ +# define __KERNEL_SSE3__ +# endif +# ifdef __SSSE3__ +# define __KERNEL_SSSE3__ +# endif +# ifdef __SSE4_1__ +# define __KERNEL_SSE41__ +# endif +# ifdef __AVX__ +# define __KERNEL_SSE__ +# define __KERNEL_AVX__ +# endif +# ifdef __AVX2__ +# define __KERNEL_SSE__ +# define __KERNEL_AVX2__ +# endif +#endif + +/* quiet unused define warnings */ +#if defined(__KERNEL_SSE2__) +/* do nothing */ +#endif + +#include "kernel/device/cpu/kernel.h" +#define KERNEL_ARCH cpu +#include "kernel/device/cpu/kernel_arch_impl.h" + +CCL_NAMESPACE_BEGIN + +/* Memory Copy */ + +void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t) +{ + if (strcmp(name, "__data") == 0) { + kg->__data = *(KernelData *)host; + } + else { + assert(0); + } +} + +void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size) +{ + if (0) { + } + +#define KERNEL_TEX(type, tname) \ + else if (strcmp(name, #tname) == 0) \ + { \ + kg->tname.data = (type *)mem; \ + kg->tname.width = size; \ + } +#include "kernel/kernel_textures.h" + else { + assert(0); + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/cpu/kernel.h b/intern/cycles/kernel/device/cpu/kernel.h new file mode 100644 index 00000000000..ae2a841835a --- /dev/null +++ b/intern/cycles/kernel/device/cpu/kernel.h @@ -0,0 +1,62 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
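The KERNEL_TEX X-macro trick used by KernelGlobals and kernel_global_memory_copy() above can be hard to read; here is a minimal standalone sketch of the same technique with made-up names (example_floats and example_ints are not Cycles textures):

#include <cassert>
#include <cstddef>
#include <cstring>

/* One central list; each entry is expanded differently depending on how
 * EXAMPLE_TEX is defined at the point where EXAMPLE_TEXTURES is used. */
#define EXAMPLE_TEXTURES \
  EXAMPLE_TEX(float, example_floats) \
  EXAMPLE_TEX(int, example_ints)

template<typename T> struct texture_ref {
  T *data = nullptr;
  size_t width = 0;
};

struct ExampleGlobals {
  /* Expands to: texture_ref<float> example_floats; texture_ref<int> example_ints; */
#define EXAMPLE_TEX(type, name) texture_ref<type> name;
  EXAMPLE_TEXTURES
#undef EXAMPLE_TEX
};

/* Expands to an else-if chain that binds host memory by texture name. */
void example_global_memory_copy(ExampleGlobals *g, const char *name, void *mem, size_t size)
{
  if (0) {
  }
#define EXAMPLE_TEX(type, tname) \
  else if (strcmp(name, #tname) == 0) { \
    g->tname.data = (type *)mem; \
    g->tname.width = size; \
  }
  EXAMPLE_TEXTURES
#undef EXAMPLE_TEX
  else {
    assert(!"unknown texture name");
  }
}

Cycles does the same with KERNEL_TEX and kernel/kernel_textures.h, so adding a texture only requires touching that one list.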
+ */ + +#pragma once + +/* CPU Kernel Interface */ + +#include "util/util_types.h" + +#include "kernel/kernel_types.h" + +CCL_NAMESPACE_BEGIN + +#define KERNEL_NAME_JOIN(x, y, z) x##_##y##_##z +#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name) +#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name) + +struct IntegratorStateCPU; +struct KernelGlobals; +struct KernelData; + +KernelGlobals *kernel_globals_create(); +void kernel_globals_free(KernelGlobals *kg); + +void *kernel_osl_memory(const KernelGlobals *kg); +bool kernel_osl_use(const KernelGlobals *kg); + +void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size); +void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size); + +#define KERNEL_ARCH cpu +#include "kernel/device/cpu/kernel_arch.h" + +#define KERNEL_ARCH cpu_sse2 +#include "kernel/device/cpu/kernel_arch.h" + +#define KERNEL_ARCH cpu_sse3 +#include "kernel/device/cpu/kernel_arch.h" + +#define KERNEL_ARCH cpu_sse41 +#include "kernel/device/cpu/kernel_arch.h" + +#define KERNEL_ARCH cpu_avx +#include "kernel/device/cpu/kernel_arch.h" + +#define KERNEL_ARCH cpu_avx2 +#include "kernel/device/cpu/kernel_arch.h" + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/cpu/kernel_arch.h b/intern/cycles/kernel/device/cpu/kernel_arch.h new file mode 100644 index 00000000000..81f328c710b --- /dev/null +++ b/intern/cycles/kernel/device/cpu/kernel_arch.h @@ -0,0 +1,113 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Templated common declaration part of all CPU kernels. */ + +/* -------------------------------------------------------------------- + * Integrator. 
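To make the naming scheme above concrete, here is how one declaration mangles for a single architecture (a sketch of the macro expansion, not additional code in the patch):

/* With KERNEL_ARCH defined to cpu_avx2:
 *
 *   KERNEL_FUNCTION_FULL_NAME(integrator_init_from_camera)
 *     -> KERNEL_NAME_EVAL(cpu_avx2, integrator_init_from_camera)
 *     -> KERNEL_NAME_JOIN(kernel, cpu_avx2, integrator_init_from_camera)
 *     -> kernel_cpu_avx2_integrator_init_from_camera
 *
 * so each #include of kernel_arch.h above declares one complete set of entry
 * points, e.g. kernel_cpu_sse2_integrator_shade_surface,
 * kernel_cpu_avx2_integrator_shade_surface, and so on. */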
+ */ + +#define KERNEL_INTEGRATOR_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \ + IntegratorStateCPU *state) + +#define KERNEL_INTEGRATOR_SHADE_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \ + IntegratorStateCPU *state, \ + ccl_global float *render_buffer) + +#define KERNEL_INTEGRATOR_INIT_FUNCTION(name) \ + bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \ + IntegratorStateCPU *state, \ + KernelWorkTile *tile, \ + ccl_global float *render_buffer) + +KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_camera); +KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_bake); +KERNEL_INTEGRATOR_FUNCTION(intersect_closest); +KERNEL_INTEGRATOR_FUNCTION(intersect_shadow); +KERNEL_INTEGRATOR_FUNCTION(intersect_subsurface); +KERNEL_INTEGRATOR_FUNCTION(intersect_volume_stack); +KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_background); +KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_light); +KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_shadow); +KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_surface); +KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_volume); +KERNEL_INTEGRATOR_SHADE_FUNCTION(megakernel); + +#undef KERNEL_INTEGRATOR_FUNCTION +#undef KERNEL_INTEGRATOR_INIT_FUNCTION +#undef KERNEL_INTEGRATOR_SHADE_FUNCTION + +/* -------------------------------------------------------------------- + * Shader evaluation. + */ + +void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg, + const KernelShaderEvalInput *input, + float4 *output, + const int offset); +void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg, + const KernelShaderEvalInput *input, + float4 *output, + const int offset); + +/* -------------------------------------------------------------------- + * Adaptive sampling. + */ + +bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)( + const KernelGlobals *kg, + ccl_global float *render_buffer, + int x, + int y, + float threshold, + bool reset, + int offset, + int stride); + +void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int y, + int start_x, + int width, + int offset, + int stride); +void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int x, + int start_y, + int height, + int offset, + int stride); + +/* -------------------------------------------------------------------- + * Cryptomatte. + */ + +void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int pixel_index); + +/* -------------------------------------------------------------------- + * Bake. + */ +/* TODO(sergey): Needs to be re-implemented. Or not? Brecht did it already :) */ + +void KERNEL_FUNCTION_FULL_NAME(bake)( + const KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride); + +#undef KERNEL_ARCH diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h new file mode 100644 index 00000000000..1432abfd330 --- /dev/null +++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h @@ -0,0 +1,235 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Templated common implementation part of all CPU kernels. + * + * The idea is that particular .cpp files sets needed optimization flags and + * simply includes this file without worry of copying actual implementation over. + */ + +#pragma once + +// clang-format off +#include "kernel/device/cpu/compat.h" + +#ifndef KERNEL_STUB +# include "kernel/device/cpu/globals.h" +# include "kernel/device/cpu/image.h" + +# include "kernel/integrator/integrator_state.h" +# include "kernel/integrator/integrator_state_flow.h" +# include "kernel/integrator/integrator_state_util.h" + +# include "kernel/integrator/integrator_init_from_camera.h" +# include "kernel/integrator/integrator_init_from_bake.h" +# include "kernel/integrator/integrator_intersect_closest.h" +# include "kernel/integrator/integrator_intersect_shadow.h" +# include "kernel/integrator/integrator_intersect_subsurface.h" +# include "kernel/integrator/integrator_intersect_volume_stack.h" +# include "kernel/integrator/integrator_shade_background.h" +# include "kernel/integrator/integrator_shade_light.h" +# include "kernel/integrator/integrator_shade_shadow.h" +# include "kernel/integrator/integrator_shade_surface.h" +# include "kernel/integrator/integrator_shade_volume.h" +# include "kernel/integrator/integrator_megakernel.h" + +# include "kernel/kernel_film.h" +# include "kernel/kernel_adaptive_sampling.h" +# include "kernel/kernel_bake.h" +# include "kernel/kernel_id_passes.h" + +#else +# define STUB_ASSERT(arch, name) \ + assert(!(#name " kernel stub for architecture " #arch " was called!")) +#endif /* KERNEL_STUB */ +// clang-format on + +CCL_NAMESPACE_BEGIN + +/* -------------------------------------------------------------------- + * Integrator. + */ + +#ifdef KERNEL_STUB +# define KERNEL_INVOKE(name, ...) (STUB_ASSERT(KERNEL_ARCH, name), 0) +#else +# define KERNEL_INVOKE(name, ...) integrator_##name(__VA_ARGS__) +#endif + +#define DEFINE_INTEGRATOR_KERNEL(name) \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *kg, \ + IntegratorStateCPU *state) \ + { \ + KERNEL_INVOKE(name, kg, state); \ + } + +#define DEFINE_INTEGRATOR_SHADE_KERNEL(name) \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \ + const KernelGlobals *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \ + { \ + KERNEL_INVOKE(name, kg, state, render_buffer); \ + } + +/* TODO: Either use something like get_work_pixel(), or simplify tile which is passed here, so + * that it does not contain unused fields. 
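As a concrete illustration of the wrapper macros above (expansion sketch only), DEFINE_INTEGRATOR_SHADE_KERNEL(shade_surface) compiled with KERNEL_ARCH defined to cpu_sse41 and without KERNEL_STUB produces roughly:

void kernel_cpu_sse41_integrator_shade_surface(const KernelGlobals *kg,
                                               IntegratorStateCPU *state,
                                               ccl_global float *render_buffer)
{
  /* KERNEL_INVOKE(shade_surface, ...) resolves to the real implementation
   * from kernel/integrator/integrator_shade_surface.h. */
  integrator_shade_surface(kg, state, render_buffer);
}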
*/ +#define DEFINE_INTEGRATOR_INIT_KERNEL(name) \ + bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *kg, \ + IntegratorStateCPU *state, \ + KernelWorkTile *tile, \ + ccl_global float *render_buffer) \ + { \ + return KERNEL_INVOKE( \ + name, kg, state, tile, render_buffer, tile->x, tile->y, tile->start_sample); \ + } + +DEFINE_INTEGRATOR_INIT_KERNEL(init_from_camera) +DEFINE_INTEGRATOR_INIT_KERNEL(init_from_bake) +DEFINE_INTEGRATOR_KERNEL(intersect_closest) +DEFINE_INTEGRATOR_KERNEL(intersect_shadow) +DEFINE_INTEGRATOR_KERNEL(intersect_subsurface) +DEFINE_INTEGRATOR_KERNEL(intersect_volume_stack) +DEFINE_INTEGRATOR_SHADE_KERNEL(shade_background) +DEFINE_INTEGRATOR_SHADE_KERNEL(shade_light) +DEFINE_INTEGRATOR_SHADE_KERNEL(shade_shadow) +DEFINE_INTEGRATOR_SHADE_KERNEL(shade_surface) +DEFINE_INTEGRATOR_SHADE_KERNEL(shade_volume) +DEFINE_INTEGRATOR_SHADE_KERNEL(megakernel) + +/* -------------------------------------------------------------------- + * Shader evaluation. + */ + +void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg, + const KernelShaderEvalInput *input, + float4 *output, + const int offset) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, shader_eval_displace); +#else + kernel_displace_evaluate(kg, input, output, offset); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg, + const KernelShaderEvalInput *input, + float4 *output, + const int offset) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, shader_eval_background); +#else + kernel_background_evaluate(kg, input, output, offset); +#endif +} + +/* -------------------------------------------------------------------- + * Adaptive sampling. + */ + +bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)( + const KernelGlobals *kg, + ccl_global float *render_buffer, + int x, + int y, + float threshold, + bool reset, + int offset, + int stride) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_convergence_check); + return false; +#else + return kernel_adaptive_sampling_convergence_check( + kg, render_buffer, x, y, threshold, reset, offset, stride); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int y, + int start_x, + int width, + int offset, + int stride) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_x); +#else + kernel_adaptive_sampling_filter_x(kg, render_buffer, y, start_x, width, offset, stride); +#endif +} + +void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int x, + int start_y, + int height, + int offset, + int stride) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_y); +#else + kernel_adaptive_sampling_filter_y(kg, render_buffer, x, start_y, height, offset, stride); +#endif +} + +/* -------------------------------------------------------------------- + * Cryptomatte. + */ + +void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int pixel_index) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, cryptomatte_postprocess); +#else + kernel_cryptomatte_post(kg, render_buffer, pixel_index); +#endif +} + +/* -------------------------------------------------------------------- + * Bake. + */ +/* TODO(sergey): Needs to be re-implemented. Or not? 
Brecht did it already :) */ + +void KERNEL_FUNCTION_FULL_NAME(bake)( + const KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride) +{ +#if 0 +# ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, bake); +# else +# ifdef __BAKING__ + kernel_bake_evaluate(kg, buffer, sample, x, y, offset, stride); +# endif +# endif /* KERNEL_STUB */ +#endif +} + +#undef KERNEL_INVOKE +#undef DEFINE_INTEGRATOR_KERNEL +#undef DEFINE_INTEGRATOR_SHADE_KERNEL +#undef DEFINE_INTEGRATOR_INIT_KERNEL + +#undef KERNEL_STUB +#undef STUB_ASSERT +#undef KERNEL_ARCH + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/cpu/kernel_avx.cpp b/intern/cycles/kernel/device/cpu/kernel_avx.cpp new file mode 100644 index 00000000000..220768036ab --- /dev/null +++ b/intern/cycles/kernel/device/cpu/kernel_avx.cpp @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug T36316. */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ + +#include "kernel/device/cpu/kernel.h" +#define KERNEL_ARCH cpu_avx +#include "kernel/device/cpu/kernel_arch_impl.h" diff --git a/intern/cycles/kernel/device/cpu/kernel_avx2.cpp b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp new file mode 100644 index 00000000000..90c05113cbe --- /dev/null +++ b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp @@ -0,0 +1,40 @@ +/* + * Copyright 2011-2014 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug T36316. 
*/ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# define __KERNEL_AVX2__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ + +#include "kernel/device/cpu/kernel.h" +#define KERNEL_ARCH cpu_avx2 +#include "kernel/device/cpu/kernel_arch_impl.h" diff --git a/intern/cycles/kernel/device/cpu/kernel_sse2.cpp b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp new file mode 100644 index 00000000000..fb85ef5b0d0 --- /dev/null +++ b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp @@ -0,0 +1,34 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug T36316. */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ + +#include "kernel/device/cpu/kernel.h" +#define KERNEL_ARCH cpu_sse2 +#include "kernel/device/cpu/kernel_arch_impl.h" diff --git a/intern/cycles/kernel/device/cpu/kernel_sse3.cpp b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp new file mode 100644 index 00000000000..87baf04258a --- /dev/null +++ b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp @@ -0,0 +1,36 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug T36316. 
*/ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ + +#include "kernel/device/cpu/kernel.h" +#define KERNEL_ARCH cpu_sse3 +#include "kernel/device/cpu/kernel_arch_impl.h" diff --git a/intern/cycles/kernel/device/cpu/kernel_sse41.cpp b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp new file mode 100644 index 00000000000..bb421d58815 --- /dev/null +++ b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp @@ -0,0 +1,37 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +#include "util/util_optimization.h" + +#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +# define KERNEL_STUB +#else +/* SSE optimization disabled for now on 32 bit, see bug T36316. */ +# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# endif +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ + +#include "kernel/device/cpu/kernel.h" +#define KERNEL_ARCH cpu_sse41 +#include "kernel/device/cpu/kernel_arch_impl.h" diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h new file mode 100644 index 00000000000..665da43e1a1 --- /dev/null +++ b/intern/cycles/kernel/device/cuda/compat.h @@ -0,0 +1,135 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#define __KERNEL_GPU__ +#define __KERNEL_CUDA__ +#define CCL_NAMESPACE_BEGIN +#define CCL_NAMESPACE_END + +#ifndef ATTR_FALLTHROUGH +# define ATTR_FALLTHROUGH +#endif + +/* Manual definitions so we can compile without CUDA toolkit. 
*/ + +#ifdef __CUDACC_RTC__ +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; +#else +# include <stdint.h> +#endif + +#ifdef CYCLES_CUBIN_CC +# define FLT_MIN 1.175494350822287507969e-38f +# define FLT_MAX 340282346638528859811704183484516925440.0f +# define FLT_EPSILON 1.192092896e-07F +#endif + +/* Qualifiers */ + +#define ccl_device __device__ __inline__ +#if __CUDA_ARCH__ < 500 +# define ccl_device_inline __device__ __forceinline__ +# define ccl_device_forceinline __device__ __forceinline__ +#else +# define ccl_device_inline __device__ __inline__ +# define ccl_device_forceinline __device__ __forceinline__ +#endif +#define ccl_device_noinline __device__ __noinline__ +#define ccl_device_noinline_cpu ccl_device +#define ccl_global +#define ccl_static_constant __constant__ +#define ccl_device_constant __constant__ __device__ +#define ccl_constant const +#define ccl_gpu_shared __shared__ +#define ccl_private +#define ccl_may_alias +#define ccl_addr_space +#define ccl_restrict __restrict__ +#define ccl_loop_no_unroll +#define ccl_align(n) __align__(n) +#define ccl_optional_struct_init + +/* No assert supported for CUDA */ + +#define kernel_assert(cond) + +/* GPU thread, block, grid size and index */ + +#define ccl_gpu_thread_idx_x (threadIdx.x) +#define ccl_gpu_block_dim_x (blockDim.x) +#define ccl_gpu_block_idx_x (blockIdx.x) +#define ccl_gpu_grid_dim_x (gridDim.x) +#define ccl_gpu_warp_size (warpSize) + +#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x) +#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x) + +/* GPU warp synchronization */ + +#define ccl_gpu_syncthreads() __syncthreads() +#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate) +#define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down_sync(mask, var, delta) +#define ccl_gpu_popc(x) __popc(x) + +/* GPU texture objects */ + +typedef unsigned long long CUtexObject; +typedef CUtexObject ccl_gpu_tex_object; + +template<typename T> +ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj, + const float x, + const float y) +{ + return tex2D<T>(texobj, x, y); +} + +template<typename T> +ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj, + const float x, + const float y, + const float z) +{ + return tex3D<T>(texobj, x, y, z); +} + +/* Use fast math functions */ + +#define cosf(x) __cosf(((float)(x))) +#define sinf(x) __sinf(((float)(x))) +#define powf(x, y) __powf(((float)(x)), ((float)(y))) +#define tanf(x) __tanf(((float)(x))) +#define logf(x) __logf(((float)(x))) +#define expf(x) __expf(((float)(x))) + +/* Half */ + +typedef unsigned short half; + +__device__ half __float2half(const float f) +{ + half val; + asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f)); + return val; +} + +/* Types */ + +#include "util/util_half.h" +#include "util/util_types.h" diff --git a/intern/cycles/kernel/device/cuda/config.h b/intern/cycles/kernel/device/cuda/config.h new file mode 100644 index 00000000000..46196dcdb51 --- /dev/null +++ b/intern/cycles/kernel/device/cuda/config.h @@ -0,0 +1,114 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
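A small usage sketch of the warp helpers defined above (illustrative only, assuming it is compiled inside a CUDA kernel that includes this header):

ccl_device_inline int example_warp_count_active(bool predicate)
{
  /* ccl_gpu_ballot builds a 32-bit mask with one bit per warp thread whose
   * predicate is true; ccl_gpu_popc counts the set bits. */
  const unsigned int mask = ccl_gpu_ballot(predicate);
  return ccl_gpu_popc(mask);
}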
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Device data taken from CUDA occupancy calculator. + * + * Terminology + * - CUDA GPUs have multiple streaming multiprocessors + * - Each multiprocessor executes multiple thread blocks + * - Each thread block contains a number of threads, also known as the block size + * - Multiprocessors have a fixed number of registers, and the amount of registers + * used by each threads limits the number of threads per block. + */ + +/* 3.0 and 3.5 */ +#if __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350 +# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536 +# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16 +# define GPU_BLOCK_MAX_THREADS 1024 +# define GPU_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +# define GPU_KERNEL_BLOCK_NUM_THREADS 256 +# define GPU_KERNEL_MAX_REGISTERS 63 + +/* 3.2 */ +#elif __CUDA_ARCH__ == 320 +# define GPU_MULTIPRESSOR_MAX_REGISTERS 32768 +# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16 +# define GPU_BLOCK_MAX_THREADS 1024 +# define GPU_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +# define GPU_KERNEL_BLOCK_NUM_THREADS 256 +# define GPU_KERNEL_MAX_REGISTERS 63 + +/* 3.7 */ +#elif __CUDA_ARCH__ == 370 +# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536 +# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16 +# define GPU_BLOCK_MAX_THREADS 1024 +# define GPU_THREAD_MAX_REGISTERS 255 + +/* tunable parameters */ +# define GPU_KERNEL_BLOCK_NUM_THREADS 256 +# define GPU_KERNEL_MAX_REGISTERS 63 + +/* 5.x, 6.x */ +#elif __CUDA_ARCH__ <= 699 +# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536 +# define GPU_MULTIPROCESSOR_MAX_BLOCKS 32 +# define GPU_BLOCK_MAX_THREADS 1024 +# define GPU_THREAD_MAX_REGISTERS 255 + +/* tunable parameters */ +# define GPU_KERNEL_BLOCK_NUM_THREADS 256 +/* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of + * registers */ +# if __CUDACC_VER_MAJOR__ >= 9 && __CUDA_ARCH__ >= 600 +# define GPU_KERNEL_MAX_REGISTERS 64 +# else +# define GPU_KERNEL_MAX_REGISTERS 48 +# endif + +/* 7.x, 8.x */ +#elif __CUDA_ARCH__ <= 899 +# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536 +# define GPU_MULTIPROCESSOR_MAX_BLOCKS 32 +# define GPU_BLOCK_MAX_THREADS 1024 +# define GPU_THREAD_MAX_REGISTERS 255 + +/* tunable parameters */ +# define GPU_KERNEL_BLOCK_NUM_THREADS 512 +# define GPU_KERNEL_MAX_REGISTERS 96 + +/* unknown architecture */ +#else +# error "Unknown or unsupported CUDA architecture, can't determine launch bounds" +#endif + +/* Compute number of threads per block and minimum blocks per multiprocessor + * given the maximum number of registers per thread. 
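 *
 * For example, with the 7.x/8.x values above (65536 registers per
 * multiprocessor, 512 threads per block, 96 registers per thread), the
 * ccl_gpu_kernel macro below expands to
 * __launch_bounds__(512, 65536 / (512 * 96)) = __launch_bounds__(512, 1),
 * i.e. at least one resident block of 512 threads per multiprocessor.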
*/ + +#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \ + extern "C" __global__ void __launch_bounds__(block_num_threads, \ + GPU_MULTIPRESSOR_MAX_REGISTERS / \ + (block_num_threads * thread_num_registers)) + +/* sanity checks */ + +#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS +# error "Maximum number of threads per block exceeded" +#endif + +#if GPU_MULTIPRESSOR_MAX_REGISTERS / (GPU_KERNEL_BLOCK_NUM_THREADS * GPU_KERNEL_MAX_REGISTERS) > \ + GPU_MULTIPROCESSOR_MAX_BLOCKS +# error "Maximum number of blocks per multiprocessor exceeded" +#endif + +#if GPU_KERNEL_MAX_REGISTERS > GPU_THREAD_MAX_REGISTERS +# error "Maximum number of registers per thread exceeded" +#endif diff --git a/intern/cycles/kernel/device/cuda/globals.h b/intern/cycles/kernel/device/cuda/globals.h new file mode 100644 index 00000000000..169047175f5 --- /dev/null +++ b/intern/cycles/kernel/device/cuda/globals.h @@ -0,0 +1,48 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Constant Globals */ + +#pragma once + +#include "kernel/kernel_profiling.h" +#include "kernel/kernel_types.h" + +#include "kernel/integrator/integrator_state.h" + +CCL_NAMESPACE_BEGIN + +/* Not actually used, just a NULL pointer that gets passed everywhere, which we + * hope gets optimized out by the compiler. */ +struct KernelGlobals { + int unused[1]; +}; + +/* Global scene data and textures */ +__constant__ KernelData __data; +#define KERNEL_TEX(type, name) const __constant__ __device__ type *name; +#include "kernel/kernel_textures.h" + +/* Integrator state */ +__constant__ IntegratorStateGPU __integrator_state; + +/* Abstraction macros */ +#define kernel_data __data +#define kernel_tex_fetch(t, index) t[(index)] +#define kernel_tex_array(t) (t) +#define kernel_integrator_state __integrator_state + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/cuda/kernel.cu b/intern/cycles/kernel/device/cuda/kernel.cu new file mode 100644 index 00000000000..e26fe243642 --- /dev/null +++ b/intern/cycles/kernel/device/cuda/kernel.cu @@ -0,0 +1,28 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CUDA kernel entry points */ + +#ifdef __CUDA_ARCH__ + +# include "kernel/device/cuda/compat.h" +# include "kernel/device/cuda/config.h" +# include "kernel/device/cuda/globals.h" + +# include "kernel/device/gpu/image.h" +# include "kernel/device/gpu/kernel.h" + +#endif diff --git a/intern/cycles/kernel/device/gpu/image.h b/intern/cycles/kernel/device/gpu/image.h new file mode 100644 index 00000000000..b015c78a8f5 --- /dev/null +++ b/intern/cycles/kernel/device/gpu/image.h @@ -0,0 +1,278 @@ +/* + * Copyright 2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +#ifdef WITH_NANOVDB +# define NDEBUG /* Disable "assert" in device code */ +# define NANOVDB_USE_INTRINSICS +# include "nanovdb/NanoVDB.h" +# include "nanovdb/util/SampleFromVoxels.h" +#endif + +/* w0, w1, w2, and w3 are the four cubic B-spline basis functions. */ +ccl_device float cubic_w0(float a) +{ + return (1.0f / 6.0f) * (a * (a * (-a + 3.0f) - 3.0f) + 1.0f); +} +ccl_device float cubic_w1(float a) +{ + return (1.0f / 6.0f) * (a * a * (3.0f * a - 6.0f) + 4.0f); +} +ccl_device float cubic_w2(float a) +{ + return (1.0f / 6.0f) * (a * (a * (-3.0f * a + 3.0f) + 3.0f) + 1.0f); +} +ccl_device float cubic_w3(float a) +{ + return (1.0f / 6.0f) * (a * a * a); +} + +/* g0 and g1 are the two amplitude functions. */ +ccl_device float cubic_g0(float a) +{ + return cubic_w0(a) + cubic_w1(a); +} +ccl_device float cubic_g1(float a) +{ + return cubic_w2(a) + cubic_w3(a); +} + +/* h0 and h1 are the two offset functions */ +ccl_device float cubic_h0(float a) +{ + return (cubic_w1(a) / cubic_g0(a)) - 1.0f; +} +ccl_device float cubic_h1(float a) +{ + return (cubic_w3(a) / cubic_g1(a)) + 1.0f; +} + +/* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. */ +template +ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y) +{ + ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; + + x = (x * info.width) - 0.5f; + y = (y * info.height) - 0.5f; + + float px = floorf(x); + float py = floorf(y); + float fx = x - px; + float fy = y - py; + + float g0x = cubic_g0(fx); + float g1x = cubic_g1(fx); + /* Note +0.5 offset to compensate for CUDA linear filtering convention. */ + float x0 = (px + cubic_h0(fx) + 0.5f) / info.width; + float x1 = (px + cubic_h1(fx) + 0.5f) / info.width; + float y0 = (py + cubic_h0(fy) + 0.5f) / info.height; + float y1 = (py + cubic_h1(fy) + 0.5f) / info.height; + + return cubic_g0(fy) * (g0x * ccl_gpu_tex_object_read_2D(tex, x0, y0) + + g1x * ccl_gpu_tex_object_read_2D(tex, x1, y0)) + + cubic_g1(fy) * (g0x * ccl_gpu_tex_object_read_2D(tex, x0, y1) + + g1x * ccl_gpu_tex_object_read_2D(tex, x1, y1)); +} + +/* Fast tricubic texture lookup using 8 trilinear lookups. 
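 *
 * Both this and the bicubic lookup above rely on folding the four cubic
 * B-spline weights per axis into two hardware-filtered fetches: a linear
 * fetch at p + h0(f) returns lerp(c[p-1], c[p], w1/(w0+w1)), so
 *   g0(f) * tex(p + h0(f)) + g1(f) * tex(p + h1(f))
 *     = w0*c[p-1] + w1*c[p] + w2*c[p+1] + w3*c[p+2],
 * and two fetches per axis suffice: 4 bilinear fetches in 2D, 8 trilinear
 * fetches in 3D.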
*/ +template +ccl_device_noinline T +kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z) +{ + ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; + + x = (x * info.width) - 0.5f; + y = (y * info.height) - 0.5f; + z = (z * info.depth) - 0.5f; + + float px = floorf(x); + float py = floorf(y); + float pz = floorf(z); + float fx = x - px; + float fy = y - py; + float fz = z - pz; + + float g0x = cubic_g0(fx); + float g1x = cubic_g1(fx); + float g0y = cubic_g0(fy); + float g1y = cubic_g1(fy); + float g0z = cubic_g0(fz); + float g1z = cubic_g1(fz); + + /* Note +0.5 offset to compensate for CUDA linear filtering convention. */ + float x0 = (px + cubic_h0(fx) + 0.5f) / info.width; + float x1 = (px + cubic_h1(fx) + 0.5f) / info.width; + float y0 = (py + cubic_h0(fy) + 0.5f) / info.height; + float y1 = (py + cubic_h1(fy) + 0.5f) / info.height; + float z0 = (pz + cubic_h0(fz) + 0.5f) / info.depth; + float z1 = (pz + cubic_h1(fz) + 0.5f) / info.depth; + + return g0z * (g0y * (g0x * ccl_gpu_tex_object_read_3D(tex, x0, y0, z0) + + g1x * ccl_gpu_tex_object_read_3D(tex, x1, y0, z0)) + + g1y * (g0x * ccl_gpu_tex_object_read_3D(tex, x0, y1, z0) + + g1x * ccl_gpu_tex_object_read_3D(tex, x1, y1, z0))) + + g1z * (g0y * (g0x * ccl_gpu_tex_object_read_3D(tex, x0, y0, z1) + + g1x * ccl_gpu_tex_object_read_3D(tex, x1, y0, z1)) + + g1y * (g0x * ccl_gpu_tex_object_read_3D(tex, x0, y1, z1) + + g1x * ccl_gpu_tex_object_read_3D(tex, x1, y1, z1))); +} + +#ifdef WITH_NANOVDB +template +ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, float z) +{ + float px = floorf(x); + float py = floorf(y); + float pz = floorf(z); + float fx = x - px; + float fy = y - py; + float fz = z - pz; + + float g0x = cubic_g0(fx); + float g1x = cubic_g1(fx); + float g0y = cubic_g0(fy); + float g1y = cubic_g1(fy); + float g0z = cubic_g0(fz); + float g1z = cubic_g1(fz); + + float x0 = px + cubic_h0(fx); + float x1 = px + cubic_h1(fx); + float y0 = py + cubic_h0(fy); + float y1 = py + cubic_h1(fy); + float z0 = pz + cubic_h0(fz); + float z1 = pz + cubic_h1(fz); + + using namespace nanovdb; + + return g0z * (g0y * (g0x * s(Vec3f(x0, y0, z0)) + g1x * s(Vec3f(x1, y0, z0))) + + g1y * (g0x * s(Vec3f(x0, y1, z0)) + g1x * s(Vec3f(x1, y1, z0)))) + + g1z * (g0y * (g0x * s(Vec3f(x0, y0, z1)) + g1x * s(Vec3f(x1, y0, z1))) + + g1y * (g0x * s(Vec3f(x0, y1, z1)) + g1x * s(Vec3f(x1, y1, z1)))); +} + +template +ccl_device_noinline T kernel_tex_image_interp_nanovdb( + const TextureInfo &info, float x, float y, float z, uint interpolation) +{ + using namespace nanovdb; + + NanoGrid *const grid = (NanoGrid *)info.data; + typedef typename nanovdb::NanoGrid::AccessorType AccessorType; + AccessorType acc = grid->getAccessor(); + + switch (interpolation) { + case INTERPOLATION_CLOSEST: + return SampleFromVoxels(acc)(Vec3f(x, y, z)); + case INTERPOLATION_LINEAR: + return SampleFromVoxels(acc)(Vec3f(x - 0.5f, y - 0.5f, z - 0.5f)); + default: + SampleFromVoxels s(acc); + return kernel_tex_image_interp_tricubic_nanovdb(s, x - 0.5f, y - 0.5f, z - 0.5f); + } +} +#endif + +ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y) +{ + const TextureInfo &info = kernel_tex_fetch(__texture_info, id); + + /* float4, byte4, ushort4 and half4 */ + const int texture_type = info.data_type; + if (texture_type == IMAGE_DATA_TYPE_FLOAT4 || texture_type == IMAGE_DATA_TYPE_BYTE4 || + texture_type == IMAGE_DATA_TYPE_HALF4 || texture_type == IMAGE_DATA_TYPE_USHORT4) { + if (info.interpolation 
== INTERPOLATION_CUBIC) { + return kernel_tex_image_interp_bicubic(info, x, y); + } + else { + ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; + return ccl_gpu_tex_object_read_2D(tex, x, y); + } + } + /* float, byte and half */ + else { + float f; + + if (info.interpolation == INTERPOLATION_CUBIC) { + f = kernel_tex_image_interp_bicubic(info, x, y); + } + else { + ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; + f = ccl_gpu_tex_object_read_2D(tex, x, y); + } + + return make_float4(f, f, f, 1.0f); + } +} + +ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg, + int id, + float3 P, + InterpolationType interp) +{ + const TextureInfo &info = kernel_tex_fetch(__texture_info, id); + + if (info.use_transform_3d) { + P = transform_point(&info.transform_3d, P); + } + + const float x = P.x; + const float y = P.y; + const float z = P.z; + + uint interpolation = (interp == INTERPOLATION_NONE) ? info.interpolation : interp; + const int texture_type = info.data_type; + +#ifdef WITH_NANOVDB + if (texture_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT) { + float f = kernel_tex_image_interp_nanovdb(info, x, y, z, interpolation); + return make_float4(f, f, f, 1.0f); + } + if (texture_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { + nanovdb::Vec3f f = kernel_tex_image_interp_nanovdb( + info, x, y, z, interpolation); + return make_float4(f[0], f[1], f[2], 1.0f); + } +#endif + if (texture_type == IMAGE_DATA_TYPE_FLOAT4 || texture_type == IMAGE_DATA_TYPE_BYTE4 || + texture_type == IMAGE_DATA_TYPE_HALF4 || texture_type == IMAGE_DATA_TYPE_USHORT4) { + if (interpolation == INTERPOLATION_CUBIC) { + return kernel_tex_image_interp_tricubic(info, x, y, z); + } + else { + ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; + return ccl_gpu_tex_object_read_3D(tex, x, y, z); + } + } + else { + float f; + + if (interpolation == INTERPOLATION_CUBIC) { + f = kernel_tex_image_interp_tricubic(info, x, y, z); + } + else { + ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; + f = ccl_gpu_tex_object_read_3D(tex, x, y, z); + } + + return make_float4(f, f, f, 1.0f); + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h new file mode 100644 index 00000000000..7b79c0aedfa --- /dev/null +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -0,0 +1,843 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Common GPU kernels. 
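 *
 * Each kernel below follows the same pattern: compute a global index with
 * ccl_gpu_global_id_x(), bounds-check it against the work size, and (for the
 * integrator kernels) optionally indirect through path_index_array so the
 * same entry point can operate on either dense or compacted state arrays.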
*/ + +#include "kernel/device/gpu/parallel_active_index.h" +#include "kernel/device/gpu/parallel_prefix_sum.h" +#include "kernel/device/gpu/parallel_sorted_index.h" + +#include "kernel/integrator/integrator_state.h" +#include "kernel/integrator/integrator_state_flow.h" +#include "kernel/integrator/integrator_state_util.h" + +#include "kernel/integrator/integrator_init_from_bake.h" +#include "kernel/integrator/integrator_init_from_camera.h" +#include "kernel/integrator/integrator_intersect_closest.h" +#include "kernel/integrator/integrator_intersect_shadow.h" +#include "kernel/integrator/integrator_intersect_subsurface.h" +#include "kernel/integrator/integrator_intersect_volume_stack.h" +#include "kernel/integrator/integrator_shade_background.h" +#include "kernel/integrator/integrator_shade_light.h" +#include "kernel/integrator/integrator_shade_shadow.h" +#include "kernel/integrator/integrator_shade_surface.h" +#include "kernel/integrator/integrator_shade_volume.h" + +#include "kernel/kernel_adaptive_sampling.h" +#include "kernel/kernel_bake.h" +#include "kernel/kernel_film.h" +#include "kernel/kernel_work_stealing.h" + +/* -------------------------------------------------------------------- + * Integrator. + */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_reset(int num_states) +{ + const int state = ccl_gpu_global_id_x(); + + if (state < num_states) { + INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0; + INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0; + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_init_from_camera(KernelWorkTile *tiles, + const int num_tiles, + float *render_buffer, + const int max_tile_work_size) +{ + const int work_index = ccl_gpu_global_id_x(); + + if (work_index >= max_tile_work_size * num_tiles) { + return; + } + + const int tile_index = work_index / max_tile_work_size; + const int tile_work_index = work_index - tile_index * max_tile_work_size; + + const KernelWorkTile *tile = &tiles[tile_index]; + + if (tile_work_index >= tile->work_size) { + return; + } + + const int state = tile->path_index_offset + tile_work_index; + + uint x, y, sample; + get_work_pixel(tile, tile_work_index, &x, &y, &sample); + + integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample); +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_init_from_bake(KernelWorkTile *tiles, + const int num_tiles, + float *render_buffer, + const int max_tile_work_size) +{ + const int work_index = ccl_gpu_global_id_x(); + + if (work_index >= max_tile_work_size * num_tiles) { + return; + } + + const int tile_index = work_index / max_tile_work_size; + const int tile_work_index = work_index - tile_index * max_tile_work_size; + + const KernelWorkTile *tile = &tiles[tile_index]; + + if (tile_work_index >= tile->work_size) { + return; + } + + const int state = tile->path_index_offset + tile_work_index; + + uint x, y, sample; + get_work_pixel(tile, tile_work_index, &x, &y, &sample); + + integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample); +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_intersect_closest(const int *path_index_array, const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? 
path_index_array[global_index] : global_index; + integrator_intersect_closest(NULL, state); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_intersect_shadow(const int *path_index_array, const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_intersect_shadow(NULL, state); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_intersect_subsurface(const int *path_index_array, const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_intersect_subsurface(NULL, state); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_intersect_volume_stack(const int *path_index_array, const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_intersect_volume_stack(NULL, state); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_shade_background(const int *path_index_array, + float *render_buffer, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_shade_background(NULL, state, render_buffer); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_shade_light(const int *path_index_array, + float *render_buffer, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_shade_light(NULL, state, render_buffer); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_shade_shadow(const int *path_index_array, + float *render_buffer, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_shade_shadow(NULL, state, render_buffer); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_shade_surface(const int *path_index_array, + float *render_buffer, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_shade_surface(NULL, state, render_buffer); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_shade_surface_raytrace(const int *path_index_array, + float *render_buffer, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? 
path_index_array[global_index] : global_index; + integrator_shade_surface_raytrace(NULL, state, render_buffer); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_shade_volume(const int *path_index_array, + float *render_buffer, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int state = (path_index_array) ? path_index_array[global_index] : global_index; + integrator_shade_volume(NULL, state, render_buffer); + } +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_queued_paths_array(int num_states, + int *indices, + int *num_indices, + int kernel) +{ + gpu_parallel_active_index_array( + num_states, indices, num_indices, [kernel](const int state) { + return (INTEGRATOR_STATE(path, queued_kernel) == kernel); + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_queued_shadow_paths_array(int num_states, + int *indices, + int *num_indices, + int kernel) +{ + gpu_parallel_active_index_array( + num_states, indices, num_indices, [kernel](const int state) { + return (INTEGRATOR_STATE(shadow_path, queued_kernel) == kernel); + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_active_paths_array(int num_states, int *indices, int *num_indices) +{ + gpu_parallel_active_index_array( + num_states, indices, num_indices, [](const int state) { + return (INTEGRATOR_STATE(path, queued_kernel) != 0) || + (INTEGRATOR_STATE(shadow_path, queued_kernel) != 0); + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_terminated_paths_array(int num_states, + int *indices, + int *num_indices, + int indices_offset) +{ + gpu_parallel_active_index_array( + num_states, indices + indices_offset, num_indices, [](const int state) { + return (INTEGRATOR_STATE(path, queued_kernel) == 0) && + (INTEGRATOR_STATE(shadow_path, queued_kernel) == 0); + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_sorted_paths_array( + int num_states, int *indices, int *num_indices, int *key_prefix_sum, int kernel) +{ + gpu_parallel_sorted_index_array( + num_states, indices, num_indices, key_prefix_sum, [kernel](const int state) { + return (INTEGRATOR_STATE(path, queued_kernel) == kernel) ? 
+ INTEGRATOR_STATE(path, shader_sort_key) : + GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY; + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_compact_paths_array(int num_states, + int *indices, + int *num_indices, + int num_active_paths) +{ + gpu_parallel_active_index_array( + num_states, indices, num_indices, [num_active_paths](const int state) { + return (state >= num_active_paths) && + ((INTEGRATOR_STATE(path, queued_kernel) != 0) || + (INTEGRATOR_STATE(shadow_path, queued_kernel) != 0)); + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_compact_states(const int *active_terminated_states, + const int active_states_offset, + const int terminated_states_offset, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int from_state = active_terminated_states[active_states_offset + global_index]; + const int to_state = active_terminated_states[terminated_states_offset + global_index]; + + integrator_state_move(to_state, from_state); + } +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) + kernel_gpu_prefix_sum(int *values, int num_values) +{ + gpu_parallel_prefix_sum(values, num_values); +} + +/* -------------------------------------------------------------------- + * Adaptive sampling. + */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_adaptive_sampling_convergence_check(float *render_buffer, + int sx, + int sy, + int sw, + int sh, + float threshold, + bool reset, + int offset, + int stride, + uint *num_active_pixels) +{ + const int work_index = ccl_gpu_global_id_x(); + const int y = work_index / sw; + const int x = work_index - y * sw; + + bool converged = true; + + if (x < sw && y < sh) { + converged = kernel_adaptive_sampling_convergence_check( + nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride); + } + + /* NOTE: All threads specified in the mask must execute the intrinsic. */ + const uint num_active_pixels_mask = ccl_gpu_ballot(!converged); + const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size; + if (lane_id == 0) { + atomic_fetch_and_add_uint32(num_active_pixels, __popc(num_active_pixels_mask)); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_adaptive_sampling_filter_x( + float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride) +{ + const int y = ccl_gpu_global_id_x(); + + if (y < sh) { + kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride); + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_adaptive_sampling_filter_y( + float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride) +{ + const int x = ccl_gpu_global_id_x(); + + if (x < sw) { + kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride); + } +} + +/* -------------------------------------------------------------------- + * Cryptomatte. 
+ */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_cryptomatte_postprocess(float *render_buffer, int num_pixels) +{ + const int pixel_index = ccl_gpu_global_id_x(); + + if (pixel_index < num_pixels) { + kernel_cryptomatte_post(nullptr, render_buffer, pixel_index); + } +} + +/* -------------------------------------------------------------------- + * Film. + */ + +/* Common implementation for float destination. */ +template +ccl_device_inline void kernel_gpu_film_convert_common(const KernelFilmConvert *kfilm_convert, + float *pixels, + float *render_buffer, + int num_pixels, + int width, + int offset, + int stride, + int dst_offset, + int dst_stride, + const Processor &processor) +{ + const int render_pixel_index = ccl_gpu_global_id_x(); + if (render_pixel_index >= num_pixels) { + return; + } + + const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride; + ccl_global const float *buffer = render_buffer + render_buffer_offset; + ccl_global float *pixel = pixels + + (render_pixel_index + dst_offset) * kfilm_convert->pixel_stride; + + processor(kfilm_convert, buffer, pixel); +} + +/* Common implementation for half4 destination and 4-channel input pass. */ +template +ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba( + const KernelFilmConvert *kfilm_convert, + uchar4 *rgba, + float *render_buffer, + int num_pixels, + int width, + int offset, + int stride, + int rgba_offset, + int rgba_stride, + const Processor &processor) +{ + const int render_pixel_index = ccl_gpu_global_id_x(); + if (render_pixel_index >= num_pixels) { + return; + } + + const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride; + ccl_global const float *buffer = render_buffer + render_buffer_offset; + + float pixel[4]; + processor(kfilm_convert, buffer, pixel); + + film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel); + + const int x = render_pixel_index % width; + const int y = render_pixel_index / width; + + ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x; + float4_store_half((ccl_global half *)out, make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); +} + +/* Common implementation for half4 destination and 3-channel input pass. */ +template +ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgb( + const KernelFilmConvert *kfilm_convert, + uchar4 *rgba, + float *render_buffer, + int num_pixels, + int width, + int offset, + int stride, + int rgba_offset, + int rgba_stride, + const Processor &processor) +{ + kernel_gpu_film_convert_half_rgba_common_rgba( + kfilm_convert, + rgba, + render_buffer, + num_pixels, + width, + offset, + stride, + rgba_offset, + rgba_stride, + [&processor](const KernelFilmConvert *kfilm_convert, + ccl_global const float *buffer, + float *pixel_rgba) { + processor(kfilm_convert, buffer, pixel_rgba); + pixel_rgba[3] = 1.0f; + }); +} + +/* Common implementation for half4 destination and single channel input pass. 
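 *
 * The processor writes a single float, which is broadcast to the R, G and B
 * channels with alpha forced to 1.0f by reusing the 4-channel implementation
 * above.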
*/ +template +ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_value( + const KernelFilmConvert *kfilm_convert, + uchar4 *rgba, + float *render_buffer, + int num_pixels, + int width, + int offset, + int stride, + int rgba_offset, + int rgba_stride, + const Processor &processor) +{ + kernel_gpu_film_convert_half_rgba_common_rgba( + kfilm_convert, + rgba, + render_buffer, + num_pixels, + width, + offset, + stride, + rgba_offset, + rgba_stride, + [&processor](const KernelFilmConvert *kfilm_convert, + ccl_global const float *buffer, + float *pixel_rgba) { + float value; + processor(kfilm_convert, buffer, &value); + + pixel_rgba[0] = value; + pixel_rgba[1] = value; + pixel_rgba[2] = value; + pixel_rgba[3] = 1.0f; + }); +} + +#define KERNEL_FILM_CONVERT_PROC(name) \ + ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) name + +#define KERNEL_FILM_CONVERT_DEFINE(variant, channels) \ + KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant) \ + (const KernelFilmConvert kfilm_convert, \ + float *pixels, \ + float *render_buffer, \ + int num_pixels, \ + int width, \ + int offset, \ + int stride, \ + int rgba_offset, \ + int rgba_stride) \ + { \ + kernel_gpu_film_convert_common(&kfilm_convert, \ + pixels, \ + render_buffer, \ + num_pixels, \ + width, \ + offset, \ + stride, \ + rgba_offset, \ + rgba_stride, \ + film_get_pass_pixel_##variant); \ + } \ + KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant##_half_rgba) \ + (const KernelFilmConvert kfilm_convert, \ + uchar4 *rgba, \ + float *render_buffer, \ + int num_pixels, \ + int width, \ + int offset, \ + int stride, \ + int rgba_offset, \ + int rgba_stride) \ + { \ + kernel_gpu_film_convert_half_rgba_common_##channels(&kfilm_convert, \ + rgba, \ + render_buffer, \ + num_pixels, \ + width, \ + offset, \ + stride, \ + rgba_offset, \ + rgba_stride, \ + film_get_pass_pixel_##variant); \ + } + +KERNEL_FILM_CONVERT_DEFINE(depth, value) +KERNEL_FILM_CONVERT_DEFINE(mist, value) +KERNEL_FILM_CONVERT_DEFINE(sample_count, value) +KERNEL_FILM_CONVERT_DEFINE(float, value) + +KERNEL_FILM_CONVERT_DEFINE(light_path, rgb) +KERNEL_FILM_CONVERT_DEFINE(float3, rgb) + +KERNEL_FILM_CONVERT_DEFINE(motion, rgba) +KERNEL_FILM_CONVERT_DEFINE(cryptomatte, rgba) +KERNEL_FILM_CONVERT_DEFINE(shadow_catcher, rgba) +KERNEL_FILM_CONVERT_DEFINE(shadow_catcher_matte_with_shadow, rgba) +KERNEL_FILM_CONVERT_DEFINE(combined, rgba) +KERNEL_FILM_CONVERT_DEFINE(float4, rgba) + +#undef KERNEL_FILM_CONVERT_DEFINE +#undef KERNEL_FILM_CONVERT_HALF_RGBA_DEFINE +#undef KERNEL_FILM_CONVERT_PROC + +/* -------------------------------------------------------------------- + * Shader evaluation. + */ + +/* Displacement */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_shader_eval_displace(KernelShaderEvalInput *input, + float4 *output, + const int offset, + const int work_size) +{ + int i = ccl_gpu_global_id_x(); + if (i < work_size) { + kernel_displace_evaluate(NULL, input, output, offset + i); + } +} + +/* Background Shader Evaluation */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_shader_eval_background(KernelShaderEvalInput *input, + float4 *output, + const int offset, + const int work_size) +{ + int i = ccl_gpu_global_id_x(); + if (i < work_size) { + kernel_background_evaluate(NULL, input, output, offset + i); + } +} + +/* -------------------------------------------------------------------- + * Denoising. 
+ */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_filter_color_preprocess(float *render_buffer, + int full_x, + int full_y, + int width, + int height, + int offset, + int stride, + int pass_stride, + int pass_denoised) +{ + const int work_index = ccl_gpu_global_id_x(); + const int y = work_index / width; + const int x = work_index - y * width; + + if (x >= width || y >= height) { + return; + } + + const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride; + float *buffer = render_buffer + render_pixel_index * pass_stride; + + float *color_out = buffer + pass_denoised; + color_out[0] = clamp(color_out[0], 0.0f, 10000.0f); + color_out[1] = clamp(color_out[1], 0.0f, 10000.0f); + color_out[2] = clamp(color_out[2], 0.0f, 10000.0f); +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_filter_guiding_preprocess(float *guiding_buffer, + int guiding_pass_stride, + int guiding_pass_albedo, + int guiding_pass_normal, + const float *render_buffer, + int render_offset, + int render_stride, + int render_pass_stride, + int render_pass_sample_count, + int render_pass_denoising_albedo, + int render_pass_denoising_normal, + int full_x, + int full_y, + int width, + int height, + int num_samples) +{ + const int work_index = ccl_gpu_global_id_x(); + const int y = work_index / width; + const int x = work_index - y * width; + + if (x >= width || y >= height) { + return; + } + + const uint64_t guiding_pixel_index = x + y * width; + float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride; + + const uint64_t render_pixel_index = render_offset + (x + full_x) + (y + full_y) * render_stride; + const float *buffer = render_buffer + render_pixel_index * render_pass_stride; + + float pixel_scale; + if (render_pass_sample_count == PASS_UNUSED) { + pixel_scale = 1.0f / num_samples; + } + else { + pixel_scale = 1.0f / __float_as_uint(buffer[render_pass_sample_count]); + } + + /* Albedo pass. */ + if (guiding_pass_albedo != PASS_UNUSED) { + kernel_assert(render_pass_denoising_albedo != PASS_UNUSED); + + const float *aledo_in = buffer + render_pass_denoising_albedo; + float *albedo_out = guiding_pixel + guiding_pass_albedo; + + albedo_out[0] = aledo_in[0] * pixel_scale; + albedo_out[1] = aledo_in[1] * pixel_scale; + albedo_out[2] = aledo_in[2] * pixel_scale; + } + + /* Normal pass. 
*/ + if (render_pass_denoising_normal != PASS_UNUSED) { + kernel_assert(render_pass_denoising_normal != PASS_UNUSED); + + const float *normal_in = buffer + render_pass_denoising_normal; + float *normal_out = guiding_pixel + guiding_pass_normal; + + normal_out[0] = normal_in[0] * pixel_scale; + normal_out[1] = normal_in[1] * pixel_scale; + normal_out[2] = normal_in[2] * pixel_scale; + } +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_filter_guiding_set_fake_albedo(float *guiding_buffer, + int guiding_pass_stride, + int guiding_pass_albedo, + int width, + int height) +{ + kernel_assert(guiding_pass_albedo != PASS_UNUSED); + + const int work_index = ccl_gpu_global_id_x(); + const int y = work_index / width; + const int x = work_index - y * width; + + if (x >= width || y >= height) { + return; + } + + const uint64_t guiding_pixel_index = x + y * width; + float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride; + + float *albedo_out = guiding_pixel + guiding_pass_albedo; + + albedo_out[0] = 0.5f; + albedo_out[1] = 0.5f; + albedo_out[2] = 0.5f; +} + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_filter_color_postprocess(float *render_buffer, + int full_x, + int full_y, + int width, + int height, + int offset, + int stride, + int pass_stride, + int num_samples, + int pass_noisy, + int pass_denoised, + int pass_sample_count, + int num_components, + bool use_compositing) +{ + const int work_index = ccl_gpu_global_id_x(); + const int y = work_index / width; + const int x = work_index - y * width; + + if (x >= width || y >= height) { + return; + } + + const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride; + float *buffer = render_buffer + render_pixel_index * pass_stride; + + float pixel_scale; + if (pass_sample_count == PASS_UNUSED) { + pixel_scale = num_samples; + } + else { + pixel_scale = __float_as_uint(buffer[pass_sample_count]); + } + + float *denoised_pixel = buffer + pass_denoised; + + denoised_pixel[0] *= pixel_scale; + denoised_pixel[1] *= pixel_scale; + denoised_pixel[2] *= pixel_scale; + + if (num_components == 3) { + /* Pass without alpha channel. */ + } + else if (!use_compositing) { + /* Currently compositing passes are either 3-component (derived by dividing light passes) + * or do not have transparency (shadow catcher). Implicitly rely on this logic, as it + * simplifies logic and avoids extra memory allocation. */ + const float *noisy_pixel = buffer + pass_noisy; + denoised_pixel[3] = noisy_pixel[3]; + } + else { + /* Assigning to zero since this is a default alpha value for 3-component passes, and it + * is an opaque pixel for 4 component passes. */ + + denoised_pixel[3] = 0; + } +} + +/* -------------------------------------------------------------------- + * Shadow catcher. + */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_integrator_shadow_catcher_count_possible_splits(int num_states, + uint *num_possible_splits) +{ + const int state = ccl_gpu_global_id_x(); + + bool can_split = false; + + if (state < num_states) { + can_split = kernel_shadow_catcher_path_can_split(nullptr, state); + } + + /* NOTE: All threads specified in the mask must execute the intrinsic. 
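   * ccl_gpu_ballot() gathers one predicate bit per warp lane into a 32-bit
   * mask and __popc() counts the set bits, so each warp issues a single
   * atomic add (from lane 0) instead of one atomic per thread; the adaptive
   * sampling convergence check above uses the same warp aggregation.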
*/ + const uint can_split_mask = ccl_gpu_ballot(can_split); + const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size; + if (lane_id == 0) { + atomic_fetch_and_add_uint32(num_possible_splits, __popc(can_split_mask)); + } +} diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h new file mode 100644 index 00000000000..85500bf4d07 --- /dev/null +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -0,0 +1,83 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* Given an array of states, build an array of indices for which the states + * are active. + * + * Shared memory requirement is sizeof(int) * (number_of_warps + 1) */ + +#include "util/util_atomic.h" + +#define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 + +template +__device__ void gpu_parallel_active_index_array(const uint num_states, + int *indices, + int *num_indices, + IsActiveOp is_active_op) +{ + extern ccl_gpu_shared int warp_offset[]; + + const uint thread_index = ccl_gpu_thread_idx_x; + const uint thread_warp = thread_index % ccl_gpu_warp_size; + + const uint warp_index = thread_index / ccl_gpu_warp_size; + const uint num_warps = blocksize / ccl_gpu_warp_size; + + /* Test if state corresponding to this thread is active. */ + const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index; + const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0; + + /* For each thread within a warp compute how many other active states precede it. */ + const uint thread_mask = 0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp); + const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & thread_mask); + + /* Last thread in warp stores number of active states for each warp. */ + if (thread_warp == ccl_gpu_warp_size - 1) { + warp_offset[warp_index] = thread_offset + is_active; + } + + ccl_gpu_syncthreads(); + + /* Last thread in block converts per-warp sizes to offsets, increments global size of + * index array and gets offset to write to. */ + if (thread_index == blocksize - 1) { + /* TODO: parallelize this. */ + int offset = 0; + for (int i = 0; i < num_warps; i++) { + int num_active = warp_offset[i]; + warp_offset[i] = offset; + offset += num_active; + } + + const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active; + warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active); + } + + ccl_gpu_syncthreads(); + + /* Write to index array. 
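   *
   * For example, in a warp whose first four lanes have active flags
   * [1, 0, 1, 1], the per-lane offsets computed above are [0, 1, 1, 2]; each
   * active thread then writes its state_index to
   * indices[block_offset + warp_offset[warp_index] + thread_offset].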
*/ + if (is_active) { + const uint block_offset = warp_offset[num_warps]; + indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h new file mode 100644 index 00000000000..f609520b8b4 --- /dev/null +++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h @@ -0,0 +1,46 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* Parallel prefix sum. + * + * TODO: actually make this work in parallel. + * + * This is used for an array the size of the number of shaders in the scene + * which is not usually huge, so might not be a significant bottleneck. */ + +#include "util/util_atomic.h" + +#define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512 + +template __device__ void gpu_parallel_prefix_sum(int *values, const int num_values) +{ + if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) { + return; + } + + int offset = 0; + for (int i = 0; i < num_values; i++) { + const int new_offset = offset + values[i]; + values[i] = offset; + offset = new_offset; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/gpu/parallel_reduce.h b/intern/cycles/kernel/device/gpu/parallel_reduce.h new file mode 100644 index 00000000000..65b1990dbb8 --- /dev/null +++ b/intern/cycles/kernel/device/gpu/parallel_reduce.h @@ -0,0 +1,83 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* Parallel sum of array input_data with size n into output_sum. + * + * Adapted from "Optimizing Parallel Reduction in GPU", Mark Harris. + * + * This version adds multiple elements per thread sequentially. This reduces + * the overall cost of the algorithm while keeping the work complexity O(n) and + * the step complexity O(log n). 
(Brent's Theorem optimization) */ + +#define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 512 + +template +__device__ void gpu_parallel_sum( + const InputT *input_data, const uint n, OutputT *output_sum, OutputT zero, ConvertOp convert) +{ + extern ccl_gpu_shared OutputT shared_data[]; + + const uint tid = ccl_gpu_thread_idx_x; + const uint gridsize = blocksize * ccl_gpu_grid_dim_x(); + + OutputT sum = zero; + for (uint i = ccl_gpu_block_idx_x * blocksize + tid; i < n; i += gridsize) { + sum += convert(input_data[i]); + } + shared_data[tid] = sum; + + ccl_gpu_syncthreads(); + + if (blocksize >= 512 && tid < 256) { + shared_data[tid] = sum = sum + shared_data[tid + 256]; + } + + ccl_gpu_syncthreads(); + + if (blocksize >= 256 && tid < 128) { + shared_data[tid] = sum = sum + shared_data[tid + 128]; + } + + ccl_gpu_syncthreads(); + + if (blocksize >= 128 && tid < 64) { + shared_data[tid] = sum = sum + shared_data[tid + 64]; + } + + ccl_gpu_syncthreads(); + + if (blocksize >= 64 && tid < 32) { + shared_data[tid] = sum = sum + shared_data[tid + 32]; + } + + ccl_gpu_syncthreads(); + + if (tid < 32) { + for (int offset = ccl_gpu_warp_size / 2; offset > 0; offset /= 2) { + sum += ccl_shfl_down_sync(0xFFFFFFFF, sum, offset); + } + } + + if (tid == 0) { + output_sum[ccl_gpu_block_idx_x] = sum; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h new file mode 100644 index 00000000000..99b35468517 --- /dev/null +++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h @@ -0,0 +1,49 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* Given an array of states, build an array of indices for which the states + * are active and sorted by a given key. The prefix sum of the number of active + * states per key must have already been computed. + * + * TODO: there may be ways to optimize this to avoid this many atomic ops? */ + +#include "util/util_atomic.h" + +#define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512 +#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0) + +template +__device__ void gpu_parallel_sorted_index_array(const uint num_states, + int *indices, + int *num_indices, + int *key_prefix_sum, + GetKeyOp get_key_op) +{ + const uint state_index = ccl_gpu_block_idx_x * blocksize + ccl_gpu_thread_idx_x; + const int key = (state_index < num_states) ? get_key_op(state_index) : + GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY; + + if (key != GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY) { + const uint index = atomic_fetch_and_add_uint32(&key_prefix_sum[key], 1); + indices[index] = state_index; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/optix/compat.h b/intern/cycles/kernel/device/optix/compat.h new file mode 100644 index 00000000000..4e255a135c6 --- /dev/null +++ b/intern/cycles/kernel/device/optix/compat.h @@ -0,0 +1,127 @@ +/* + * Copyright 2019, NVIDIA Corporation. 
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#define OPTIX_DONT_INCLUDE_CUDA
+#include <optix.h>
+
+#define __KERNEL_GPU__
+#define __KERNEL_CUDA__ /* OptiX kernels are implicitly CUDA kernels too */
+#define __KERNEL_OPTIX__
+#define CCL_NAMESPACE_BEGIN
+#define CCL_NAMESPACE_END
+
+#ifndef ATTR_FALLTHROUGH
+# define ATTR_FALLTHROUGH
+#endif
+
+/* Manual definitions so we can compile without CUDA toolkit. */
+
+#ifdef __CUDACC_RTC__
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+#else
+# include <stdint.h>
+#endif
+
+#ifdef CYCLES_CUBIN_CC
+# define FLT_MIN 1.175494350822287507969e-38f
+# define FLT_MAX 340282346638528859811704183484516925440.0f
+# define FLT_EPSILON 1.192092896e-07F
+#endif
+
+#define ccl_device \
+  __device__ __forceinline__  // Function calls are bad for OptiX performance, so inline everything
+#define ccl_device_inline ccl_device
+#define ccl_device_forceinline ccl_device
+#define ccl_device_noinline __device__ __noinline__
+#define ccl_device_noinline_cpu ccl_device
+#define ccl_global
+#define ccl_static_constant __constant__
+#define ccl_device_constant __constant__ __device__
+#define ccl_constant const
+#define ccl_gpu_shared __shared__
+#define ccl_private
+#define ccl_may_alias
+#define ccl_addr_space
+#define ccl_restrict __restrict__
+#define ccl_loop_no_unroll
+#define ccl_align(n) __align__(n)
+
+/* Zero initialize structs to help the compiler figure out scoping */
+#define ccl_optional_struct_init = {}
+
+/* No assert supported for CUDA */
+
+#define kernel_assert(cond)
+
+/* GPU thread, block, grid size and index */
+
+#define ccl_gpu_thread_idx_x (threadIdx.x)
+#define ccl_gpu_block_dim_x (blockDim.x)
+#define ccl_gpu_block_idx_x (blockIdx.x)
+#define ccl_gpu_grid_dim_x (gridDim.x)
+#define ccl_gpu_warp_size (warpSize)
+
+#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
+#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
+
+/* GPU warp synchronization */
+
+#define ccl_gpu_syncthreads() __syncthreads()
+#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
+#define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down_sync(mask, var, delta)
+#define ccl_gpu_popc(x) __popc(x)
+
+/* GPU texture objects */
+
+typedef unsigned long long CUtexObject;
+typedef CUtexObject ccl_gpu_tex_object;
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj,
+                                                    const float x,
+                                                    const float y)
+{
+  return tex2D<T>(texobj, x, y);
+}
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj,
+                                                    const float x,
+                                                    const float y,
+                                                    const float z)
+{
+  return tex3D<T>(texobj, x, y, z);
+}
+
+/* Half */
+
+typedef unsigned short half;
+
+__device__ half __float2half(const float f)
+{
+  half val;
+  asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
+  return val;
+}
+
+/* Types */
+
+#include "util/util_half.h"
+#include "util/util_types.h"
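/* A minimal usage sketch (illustration only, not part of this patch): the
 * ccl_gpu_* warp abstractions defined in the compat headers above are meant
 * to be combined roughly like this hypothetical warp-wide float sum, which
 * mirrors the shuffle loop of gpu_parallel_sum in parallel_reduce.h. */
ccl_device_inline float warp_reduce_sum_sketch(float value)
{
  /* Halve the shuffle distance each step; after log2(warp_size) steps lane 0
   * holds the sum over the whole warp. Every lane named in the 0xFFFFFFFF
   * mask must execute the intrinsic. */
  for (int offset = ccl_gpu_warp_size / 2; offset > 0; offset /= 2) {
    value += ccl_gpu_shfl_down_sync(0xFFFFFFFF, value, offset);
  }
  return value;
}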
diff --git a/intern/cycles/kernel/device/optix/globals.h b/intern/cycles/kernel/device/optix/globals.h new file mode 100644 index 00000000000..7d898ed5d91 --- /dev/null +++ b/intern/cycles/kernel/device/optix/globals.h @@ -0,0 +1,59 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Constant Globals */ + +#pragma once + +#include "kernel/kernel_profiling.h" +#include "kernel/kernel_types.h" + +#include "kernel/integrator/integrator_state.h" + +CCL_NAMESPACE_BEGIN + +/* Not actually used, just a NULL pointer that gets passed everywhere, which we + * hope gets optimized out by the compiler. */ +struct KernelGlobals { + int unused[1]; +}; + +/* Launch parameters */ +struct KernelParamsOptiX { + /* Kernel arguments */ + const int *path_index_array; + float *render_buffer; + + /* Global scene data and textures */ + KernelData data; +#define KERNEL_TEX(type, name) const type *name; +#include "kernel/kernel_textures.h" + + /* Integrator state */ + IntegratorStateGPU __integrator_state; +}; + +#ifdef __NVCC__ +extern "C" static __constant__ KernelParamsOptiX __params; +#endif + +/* Abstraction macros */ +#define kernel_data __params.data +#define kernel_tex_array(t) __params.t +#define kernel_tex_fetch(t, index) __params.t[(index)] +#define kernel_integrator_state __params.__integrator_state + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/optix/kernel.cu b/intern/cycles/kernel/device/optix/kernel.cu new file mode 100644 index 00000000000..c1e36febfc0 --- /dev/null +++ b/intern/cycles/kernel/device/optix/kernel.cu @@ -0,0 +1,347 @@ +/* + * Copyright 2019, NVIDIA Corporation. + * Copyright 2019, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// clang-format off +#include "kernel/device/optix/compat.h" +#include "kernel/device/optix/globals.h" + +#include "kernel/device/gpu/image.h" // Texture lookup uses normal CUDA intrinsics + +#include "kernel/integrator/integrator_state.h" +#include "kernel/integrator/integrator_state_flow.h" +#include "kernel/integrator/integrator_state_util.h" + +#include "kernel/integrator/integrator_intersect_closest.h" +#include "kernel/integrator/integrator_intersect_shadow.h" +#include "kernel/integrator/integrator_intersect_subsurface.h" +#include "kernel/integrator/integrator_intersect_volume_stack.h" + +// clang-format on + +template ccl_device_forceinline T *get_payload_ptr_0() +{ + return (T *)(((uint64_t)optixGetPayload_1() << 32) | optixGetPayload_0()); +} +template ccl_device_forceinline T *get_payload_ptr_2() +{ + return (T *)(((uint64_t)optixGetPayload_3() << 32) | optixGetPayload_2()); +} + +template ccl_device_forceinline uint get_object_id() +{ +#ifdef __OBJECT_MOTION__ + // Always get the the instance ID from the TLAS + // There might be a motion transform node between TLAS and BLAS which does not have one + uint object = optixGetInstanceIdFromHandle(optixGetTransformListHandle(0)); +#else + uint object = optixGetInstanceId(); +#endif + // Choose between always returning object ID or only for instances + if (always || (object & 1) == 0) + // Can just remove the low bit since instance always contains object ID + return object >> 1; + else + // Set to OBJECT_NONE if this is not an instanced object + return OBJECT_NONE; +} + +extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_closest() +{ + const int global_index = optixGetLaunchIndex().x; + const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] : + global_index; + integrator_intersect_closest(nullptr, path_index); +} + +extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_shadow() +{ + const int global_index = optixGetLaunchIndex().x; + const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] : + global_index; + integrator_intersect_shadow(nullptr, path_index); +} + +extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_subsurface() +{ + const int global_index = optixGetLaunchIndex().x; + const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] : + global_index; + integrator_intersect_subsurface(nullptr, path_index); +} + +extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_volume_stack() +{ + const int global_index = optixGetLaunchIndex().x; + const int path_index = (__params.path_index_array) ? 
__params.path_index_array[global_index] : + global_index; + integrator_intersect_volume_stack(nullptr, path_index); +} + +extern "C" __global__ void __miss__kernel_optix_miss() +{ + // 'kernel_path_lamp_emission' checks intersection distance, so need to set it even on a miss + optixSetPayload_0(__float_as_uint(optixGetRayTmax())); + optixSetPayload_5(PRIMITIVE_NONE); +} + +extern "C" __global__ void __anyhit__kernel_optix_local_hit() +{ +#ifdef __BVH_LOCAL__ + const uint object = get_object_id(); + if (object != optixGetPayload_4() /* local_object */) { + // Only intersect with matching object + return optixIgnoreIntersection(); + } + + const uint max_hits = optixGetPayload_5(); + if (max_hits == 0) { + // Special case for when no hit information is requested, just report that something was hit + optixSetPayload_5(true); + return optixTerminateRay(); + } + + int hit = 0; + uint *const lcg_state = get_payload_ptr_0(); + LocalIntersection *const local_isect = get_payload_ptr_2(); + + if (lcg_state) { + for (int i = min(max_hits, local_isect->num_hits) - 1; i >= 0; --i) { + if (optixGetRayTmax() == local_isect->hits[i].t) { + return optixIgnoreIntersection(); + } + } + + hit = local_isect->num_hits++; + + if (local_isect->num_hits > max_hits) { + hit = lcg_step_uint(lcg_state) % local_isect->num_hits; + if (hit >= max_hits) { + return optixIgnoreIntersection(); + } + } + } + else { + if (local_isect->num_hits && optixGetRayTmax() > local_isect->hits[0].t) { + // Record closest intersection only + // Do not terminate ray here, since there is no guarantee about distance ordering in any-hit + return optixIgnoreIntersection(); + } + + local_isect->num_hits = 1; + } + + Intersection *isect = &local_isect->hits[hit]; + isect->t = optixGetRayTmax(); + isect->prim = optixGetPrimitiveIndex(); + isect->object = get_object_id(); + isect->type = kernel_tex_fetch(__prim_type, isect->prim); + + const float2 barycentrics = optixGetTriangleBarycentrics(); + isect->u = 1.0f - barycentrics.y - barycentrics.x; + isect->v = barycentrics.x; + + // Record geometric normal + const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim); + const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 0)); + const float3 tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 1)); + const float3 tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 2)); + local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a)); + + // Continue tracing (without this the trace call would return after the first hit) + optixIgnoreIntersection(); +#endif +} + +extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit() +{ +#ifdef __SHADOW_RECORD_ALL__ + bool ignore_intersection = false; + + const uint prim = optixGetPrimitiveIndex(); +# ifdef __VISIBILITY_FLAG__ + const uint visibility = optixGetPayload_4(); + if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) { + ignore_intersection = true; + } +# endif + + float u = 0.0f, v = 0.0f; + if (optixIsTriangleHit()) { + const float2 barycentrics = optixGetTriangleBarycentrics(); + u = 1.0f - barycentrics.y - barycentrics.x; + v = barycentrics.x; + } +# ifdef __HAIR__ + else { + u = __uint_as_float(optixGetAttribute_0()); + v = __uint_as_float(optixGetAttribute_1()); + + // Filter out curve endcaps + if (u == 0.0f || u == 1.0f) { + ignore_intersection = true; + } + } +# endif + + int num_hits = optixGetPayload_2(); + int record_index = num_hits; + const int max_hits = optixGetPayload_3(); 
+ + if (!ignore_intersection) { + optixSetPayload_2(num_hits + 1); + } + + Intersection *const isect_array = get_payload_ptr_0(); + +# ifdef __TRANSPARENT_SHADOWS__ + if (num_hits >= max_hits) { + /* If maximum number of hits reached, find a hit to replace. */ + const int num_recorded_hits = min(max_hits, num_hits); + float max_recorded_t = isect_array[0].t; + int max_recorded_hit = 0; + + for (int i = 1; i < num_recorded_hits; i++) { + if (isect_array[i].t > max_recorded_t) { + max_recorded_t = isect_array[i].t; + max_recorded_hit = i; + } + } + + if (optixGetRayTmax() >= max_recorded_t) { + /* Accept hit, so that OptiX won't consider any more hits beyond the distance of the current + * hit anymore. */ + return; + } + + record_index = max_recorded_hit; + } +# endif + + if (!ignore_intersection) { + Intersection *const isect = isect_array + record_index; + isect->u = u; + isect->v = v; + isect->t = optixGetRayTmax(); + isect->prim = prim; + isect->object = get_object_id(); + isect->type = kernel_tex_fetch(__prim_type, prim); + +# ifdef __TRANSPARENT_SHADOWS__ + // Detect if this surface has a shader with transparent shadows + if (!shader_transparent_shadow(NULL, isect) || max_hits == 0) { +# endif + // If no transparent shadows, all light is blocked and we can stop immediately + optixSetPayload_5(true); + return optixTerminateRay(); +# ifdef __TRANSPARENT_SHADOWS__ + } +# endif + } + + // Continue tracing + optixIgnoreIntersection(); +#endif +} + +extern "C" __global__ void __anyhit__kernel_optix_visibility_test() +{ + uint visibility = optixGetPayload_4(); +#ifdef __VISIBILITY_FLAG__ + const uint prim = optixGetPrimitiveIndex(); + if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) { + return optixIgnoreIntersection(); + } +#endif + +#ifdef __HAIR__ + if (!optixIsTriangleHit()) { + // Filter out curve endcaps + const float u = __uint_as_float(optixGetAttribute_0()); + if (u == 0.0f || u == 1.0f) { + return optixIgnoreIntersection(); + } + } +#endif + + // Shadow ray early termination + if (visibility & PATH_RAY_SHADOW_OPAQUE) { + return optixTerminateRay(); + } +} + +extern "C" __global__ void __closesthit__kernel_optix_hit() +{ + optixSetPayload_0(__float_as_uint(optixGetRayTmax())); // Intersection distance + optixSetPayload_3(optixGetPrimitiveIndex()); + optixSetPayload_4(get_object_id()); + // Can be PRIMITIVE_TRIANGLE and PRIMITIVE_MOTION_TRIANGLE or curve type and segment index + optixSetPayload_5(kernel_tex_fetch(__prim_type, optixGetPrimitiveIndex())); + + if (optixIsTriangleHit()) { + const float2 barycentrics = optixGetTriangleBarycentrics(); + optixSetPayload_1(__float_as_uint(1.0f - barycentrics.y - barycentrics.x)); + optixSetPayload_2(__float_as_uint(barycentrics.x)); + } + else { + optixSetPayload_1(optixGetAttribute_0()); // Same as 'optixGetCurveParameter()' + optixSetPayload_2(optixGetAttribute_1()); + } +} + +#ifdef __HAIR__ +ccl_device_inline void optix_intersection_curve(const uint prim, const uint type) +{ + const uint object = get_object_id(); + const uint visibility = optixGetPayload_4(); + + float3 P = optixGetObjectRayOrigin(); + float3 dir = optixGetObjectRayDirection(); + + // The direction is not normalized by default, but the curve intersection routine expects that + float len; + dir = normalize_len(dir, &len); + +# ifdef __OBJECT_MOTION__ + const float time = optixGetRayTime(); +# else + const float time = 0.0f; +# endif + + Intersection isect; + isect.t = optixGetRayTmax(); + // Transform maximum distance into object space + if (isect.t != 
FLT_MAX) + isect.t *= len; + + if (curve_intersect(NULL, &isect, P, dir, isect.t, visibility, object, prim, time, type)) { + optixReportIntersection(isect.t / len, + type & PRIMITIVE_ALL, + __float_as_int(isect.u), // Attribute_0 + __float_as_int(isect.v)); // Attribute_1 + } +} + +extern "C" __global__ void __intersection__curve_ribbon() +{ + const uint prim = optixGetPrimitiveIndex(); + const uint type = kernel_tex_fetch(__prim_type, prim); + + if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) { + optix_intersection_curve(prim, type); + } +} +#endif diff --git a/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu new file mode 100644 index 00000000000..bf787e29eaa --- /dev/null +++ b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu @@ -0,0 +1,29 @@ +/* + * Copyright 2021, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Copy of the regular kernels with additional shader ray-tracing kernel that takes + * much longer to compiler. This is only loaded when needed by the scene. */ + +#include "kernel/device/optix/kernel.cu" +#include "kernel/integrator/integrator_shade_surface.h" + +extern "C" __global__ void __raygen__kernel_optix_integrator_shade_surface_raytrace() +{ + const int global_index = optixGetLaunchIndex().x; + const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] : + global_index; + integrator_shade_surface_raytrace(nullptr, path_index, __params.render_buffer); +} -- cgit v1.2.3 From 4d66cbd140b1648b79df0df695046cb718797b70 Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Wed, 22 Sep 2021 14:48:01 +1000 Subject: Cleanup: spelling in comments --- intern/cycles/kernel/device/cuda/compat.h | 2 +- intern/cycles/kernel/device/optix/compat.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'intern/cycles/kernel/device') diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h index 665da43e1a1..3c85a8e7bd2 100644 --- a/intern/cycles/kernel/device/cuda/compat.h +++ b/intern/cycles/kernel/device/cuda/compat.h @@ -80,7 +80,7 @@ typedef unsigned long long uint64_t; #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x) #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x) -/* GPU warp synchronizaton */ +/* GPU warp synchronization. 
*/ #define ccl_gpu_syncthreads() __syncthreads() #define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate) diff --git a/intern/cycles/kernel/device/optix/compat.h b/intern/cycles/kernel/device/optix/compat.h index 4e255a135c6..fb9e094b535 100644 --- a/intern/cycles/kernel/device/optix/compat.h +++ b/intern/cycles/kernel/device/optix/compat.h @@ -81,7 +81,7 @@ typedef unsigned long long uint64_t; #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x) #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x) -/* GPU warp synchronizaton */ +/* GPU warp synchronization. */ #define ccl_gpu_syncthreads() __syncthreads() #define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate) -- cgit v1.2.3 From b659d1a560410425b3454016eeead8dbae7a0898 Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Thu, 23 Sep 2021 22:06:49 +1000 Subject: Cleanup: spelling in comments --- intern/cycles/kernel/device/gpu/parallel_active_index.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'intern/cycles/kernel/device') diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h index 85500bf4d07..a68d1d80c7d 100644 --- a/intern/cycles/kernel/device/gpu/parallel_active_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN /* Given an array of states, build an array of indices for which the states * are active. * - * Shared memory requirement is sizeof(int) * (number_of_warps + 1) */ + * Shared memory requirement is `sizeof(int) * (number_of_warps + 1)`. */ #include "util/util_atomic.h" -- cgit v1.2.3 From 2189dfd6e25a7bb6b734116619d87bc2d2a535ff Mon Sep 17 00:00:00 2001 From: Patrick Mours Date: Wed, 22 Sep 2021 16:23:08 +0200 Subject: Cycles: Rework OptiX visibility flags handling Before the visibility test against the visibility flags was performed in an any-hit program in OptiX (called `__anyhit__kernel_optix_visibility_test`), which was using the `__prim_visibility` array. This is not entirely correct however, since `__prim_visibility` is filled with the merged visibility flags of all objects that reference that primitive, so if one object uses different visibility flags than another object, but they both are instances of the same geometry, they would appear the same way. The reason that the any-hit program was used rather than the OptiX instance visibility mask is that the latter is currently limited to 8 bits only, which is not sufficient to contain all Cycles visibility flags (12 bits). To mostly fix the problem with multiple instances and different visibility flags, I changed things to use the OptiX instance visibility mask for a subset of the Cycles visibility flags (`PATH_RAY_CAMERA` to `PATH_RAY_VOLUME_SCATTER`, which fit into 8 bits) and only fall back to the visibility test any-hit program if that isn't enough (e.g. the ray visibility mask exceeds 8 bits or when using the built-in curves from OptiX, since the any-hit program is then also used to skip the curve endcaps). This may also improve performance in some cases, since by default OptiX can now perform the normal scene intersection trace calls entirely on RT cores without having to jump back to the SM on every hit to execute the any-hit program. 
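To make the mapping described above concrete, a minimal sketch of the host-side reduction could look as follows; the helper name and the exact constant are assumptions for illustration, not code from this commit:

/* Hypothetical sketch: reduce the Cycles ray visibility flags to the 8-bit
 * instance visibility mask that OptiX supports. Rays whose visibility needs
 * bits outside this range are handled by the any-hit fallback instead. */
static unsigned int example_optix_instance_visibility(const unsigned int cycles_visibility)
{
  /* Keep only the low 8 bits (PATH_RAY_CAMERA .. PATH_RAY_VOLUME_SCATTER). */
  return cycles_visibility & 0xFFu;
}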
Fixes T89801 Differential Revision: https://developer.blender.org/D12604 --- intern/cycles/kernel/device/optix/kernel.cu | 87 +++++++++++++++++++---------- 1 file changed, 59 insertions(+), 28 deletions(-) (limited to 'intern/cycles/kernel/device') diff --git a/intern/cycles/kernel/device/optix/kernel.cu b/intern/cycles/kernel/device/optix/kernel.cu index c1e36febfc0..7a79e0c4823 100644 --- a/intern/cycles/kernel/device/optix/kernel.cu +++ b/intern/cycles/kernel/device/optix/kernel.cu @@ -19,7 +19,7 @@ #include "kernel/device/optix/compat.h" #include "kernel/device/optix/globals.h" -#include "kernel/device/gpu/image.h" // Texture lookup uses normal CUDA intrinsics +#include "kernel/device/gpu/image.h" /* Texture lookup uses normal CUDA intrinsics. */ #include "kernel/integrator/integrator_state.h" #include "kernel/integrator/integrator_state_flow.h" @@ -44,18 +44,18 @@ template ccl_device_forceinline T *get_payload_ptr_2() template ccl_device_forceinline uint get_object_id() { #ifdef __OBJECT_MOTION__ - // Always get the the instance ID from the TLAS - // There might be a motion transform node between TLAS and BLAS which does not have one + /* Always get the the instance ID from the TLAS. + * There might be a motion transform node between TLAS and BLAS which does not have one. */ uint object = optixGetInstanceIdFromHandle(optixGetTransformListHandle(0)); #else uint object = optixGetInstanceId(); #endif - // Choose between always returning object ID or only for instances + /* Choose between always returning object ID or only for instances. */ if (always || (object & 1) == 0) - // Can just remove the low bit since instance always contains object ID + /* Can just remove the low bit since instance always contains object ID. */ return object >> 1; else - // Set to OBJECT_NONE if this is not an instanced object + /* Set to OBJECT_NONE if this is not an instanced object. */ return OBJECT_NONE; } @@ -93,23 +93,30 @@ extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_volume_st extern "C" __global__ void __miss__kernel_optix_miss() { - // 'kernel_path_lamp_emission' checks intersection distance, so need to set it even on a miss + /* 'kernel_path_lamp_emission' checks intersection distance, so need to set it even on a miss. */ optixSetPayload_0(__float_as_uint(optixGetRayTmax())); optixSetPayload_5(PRIMITIVE_NONE); } extern "C" __global__ void __anyhit__kernel_optix_local_hit() { +#ifdef __HAIR__ + if (!optixIsTriangleHit()) { + /* Ignore curves. */ + return optixIgnoreIntersection(); + } +#endif + #ifdef __BVH_LOCAL__ const uint object = get_object_id(); if (object != optixGetPayload_4() /* local_object */) { - // Only intersect with matching object + /* Only intersect with matching object. */ return optixIgnoreIntersection(); } const uint max_hits = optixGetPayload_5(); if (max_hits == 0) { - // Special case for when no hit information is requested, just report that something was hit + /* Special case for when no hit information is requested, just report that something was hit */ optixSetPayload_5(true); return optixTerminateRay(); } @@ -136,8 +143,9 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit() } else { if (local_isect->num_hits && optixGetRayTmax() > local_isect->hits[0].t) { - // Record closest intersection only - // Do not terminate ray here, since there is no guarantee about distance ordering in any-hit + /* Record closest intersection only. + * Do not terminate ray here, since there is no guarantee about distance ordering in any-hit. 
+ */ return optixIgnoreIntersection(); } @@ -154,14 +162,14 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit() isect->u = 1.0f - barycentrics.y - barycentrics.x; isect->v = barycentrics.x; - // Record geometric normal + /* Record geometric normal. */ const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim); const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 0)); const float3 tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 1)); const float3 tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 2)); local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a)); - // Continue tracing (without this the trace call would return after the first hit) + /* Continue tracing (without this the trace call would return after the first hit). */ optixIgnoreIntersection(); #endif } @@ -190,7 +198,7 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit() u = __uint_as_float(optixGetAttribute_0()); v = __uint_as_float(optixGetAttribute_1()); - // Filter out curve endcaps + /* Filter out curve endcaps. */ if (u == 0.0f || u == 1.0f) { ignore_intersection = true; } @@ -241,10 +249,10 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit() isect->type = kernel_tex_fetch(__prim_type, prim); # ifdef __TRANSPARENT_SHADOWS__ - // Detect if this surface has a shader with transparent shadows + /* Detect if this surface has a shader with transparent shadows. */ if (!shader_transparent_shadow(NULL, isect) || max_hits == 0) { # endif - // If no transparent shadows, all light is blocked and we can stop immediately + /* If no transparent shadows, all light is blocked and we can stop immediately. */ optixSetPayload_5(true); return optixTerminateRay(); # ifdef __TRANSPARENT_SHADOWS__ @@ -252,24 +260,39 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit() # endif } - // Continue tracing + /* Continue tracing. */ optixIgnoreIntersection(); #endif } -extern "C" __global__ void __anyhit__kernel_optix_visibility_test() +extern "C" __global__ void __anyhit__kernel_optix_volume_test() { - uint visibility = optixGetPayload_4(); +#ifdef __HAIR__ + if (!optixIsTriangleHit()) { + /* Ignore curves. */ + return optixIgnoreIntersection(); + } +#endif + #ifdef __VISIBILITY_FLAG__ const uint prim = optixGetPrimitiveIndex(); + const uint visibility = optixGetPayload_4(); if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) { return optixIgnoreIntersection(); } #endif + const uint object = get_object_id(); + if ((kernel_tex_fetch(__object_flag, object) & SD_OBJECT_HAS_VOLUME) == 0) { + return optixIgnoreIntersection(); + } +} + +extern "C" __global__ void __anyhit__kernel_optix_visibility_test() +{ #ifdef __HAIR__ if (!optixIsTriangleHit()) { - // Filter out curve endcaps + /* Filter out curve endcaps. */ const float u = __uint_as_float(optixGetAttribute_0()); if (u == 0.0f || u == 1.0f) { return optixIgnoreIntersection(); @@ -277,18 +300,26 @@ extern "C" __global__ void __anyhit__kernel_optix_visibility_test() } #endif - // Shadow ray early termination +#ifdef __VISIBILITY_FLAG__ + const uint prim = optixGetPrimitiveIndex(); + const uint visibility = optixGetPayload_4(); + if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) { + return optixIgnoreIntersection(); + } + + /* Shadow ray early termination. 
*/ if (visibility & PATH_RAY_SHADOW_OPAQUE) { return optixTerminateRay(); } +#endif } extern "C" __global__ void __closesthit__kernel_optix_hit() { - optixSetPayload_0(__float_as_uint(optixGetRayTmax())); // Intersection distance + optixSetPayload_0(__float_as_uint(optixGetRayTmax())); /* Intersection distance */ optixSetPayload_3(optixGetPrimitiveIndex()); optixSetPayload_4(get_object_id()); - // Can be PRIMITIVE_TRIANGLE and PRIMITIVE_MOTION_TRIANGLE or curve type and segment index + /* Can be PRIMITIVE_TRIANGLE and PRIMITIVE_MOTION_TRIANGLE or curve type and segment index. */ optixSetPayload_5(kernel_tex_fetch(__prim_type, optixGetPrimitiveIndex())); if (optixIsTriangleHit()) { @@ -297,7 +328,7 @@ extern "C" __global__ void __closesthit__kernel_optix_hit() optixSetPayload_2(__float_as_uint(barycentrics.x)); } else { - optixSetPayload_1(optixGetAttribute_0()); // Same as 'optixGetCurveParameter()' + optixSetPayload_1(optixGetAttribute_0()); /* Same as 'optixGetCurveParameter()' */ optixSetPayload_2(optixGetAttribute_1()); } } @@ -311,7 +342,7 @@ ccl_device_inline void optix_intersection_curve(const uint prim, const uint type float3 P = optixGetObjectRayOrigin(); float3 dir = optixGetObjectRayDirection(); - // The direction is not normalized by default, but the curve intersection routine expects that + /* The direction is not normalized by default, but the curve intersection routine expects that */ float len; dir = normalize_len(dir, &len); @@ -323,15 +354,15 @@ ccl_device_inline void optix_intersection_curve(const uint prim, const uint type Intersection isect; isect.t = optixGetRayTmax(); - // Transform maximum distance into object space + /* Transform maximum distance into object space. */ if (isect.t != FLT_MAX) isect.t *= len; if (curve_intersect(NULL, &isect, P, dir, isect.t, visibility, object, prim, time, type)) { optixReportIntersection(isect.t / len, type & PRIMITIVE_ALL, - __float_as_int(isect.u), // Attribute_0 - __float_as_int(isect.v)); // Attribute_1 + __float_as_int(isect.u), /* Attribute_0 */ + __float_as_int(isect.v)); /* Attribute_1 */ } } -- cgit v1.2.3 From 044a77352f8a8a0e1f60190369d69ef26587b65f Mon Sep 17 00:00:00 2001 From: Brian Savery Date: Tue, 28 Sep 2021 16:51:14 +0200 Subject: Cycles: add HIP device support for AMD GPUs NOTE: this feature is not ready for user testing, and not yet enabled in daily builds. It is being merged now for easier collaboration on development. HIP is a heterogenous compute interface allowing C++ code to be executed on GPUs similar to CUDA. It is intended to bring back AMD GPU rendering support on Windows and Linux. https://github.com/ROCm-Developer-Tools/HIP. As of the time of writing, it should compile and run on Linux with existing HIP compilers and driver runtimes. Publicly available compilers and drivers for Windows will come later. See task T91571 for more details on the current status and work remaining to be done. 
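For readers unfamiliar with HIP, a minimal stand-alone kernel in the CUDA-like model it provides might look like the sketch below; this is purely illustrative and not part of this patch:

#include <hip/hip_runtime.h>

/* Illustrative only: fill a buffer in parallel, one element per GPU thread. */
__global__ void example_fill(float *data, const int num, const float value)
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < num) {
    data[i] = value;
  }
}

/* A host would launch this with e.g.
 *   hipLaunchKernelGGL(example_fill, dim3(blocks), dim3(threads), 0, 0, ptr, num, 0.0f);
 * which mirrors the CUDA <<<blocks, threads>>> launch syntax. */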
Credits: Sayak Biswas (AMD) Arya Rafii (AMD) Brian Savery (AMD) Differential Revision: https://developer.blender.org/D12578 --- .../kernel/device/gpu/parallel_active_index.h | 6 +- .../cycles/kernel/device/gpu/parallel_prefix_sum.h | 6 +- intern/cycles/kernel/device/gpu/parallel_reduce.h | 6 +- .../kernel/device/gpu/parallel_sorted_index.h | 6 +- intern/cycles/kernel/device/hip/compat.h | 121 +++++++++++++++++++++ intern/cycles/kernel/device/hip/config.h | 57 ++++++++++ intern/cycles/kernel/device/hip/globals.h | 49 +++++++++ intern/cycles/kernel/device/hip/kernel.cpp | 28 +++++ 8 files changed, 275 insertions(+), 4 deletions(-) create mode 100644 intern/cycles/kernel/device/hip/compat.h create mode 100644 intern/cycles/kernel/device/hip/config.h create mode 100644 intern/cycles/kernel/device/hip/globals.h create mode 100644 intern/cycles/kernel/device/hip/kernel.cpp (limited to 'intern/cycles/kernel/device') diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h index a68d1d80c7d..db4a4bf71e0 100644 --- a/intern/cycles/kernel/device/gpu/parallel_active_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -25,7 +25,11 @@ CCL_NAMESPACE_BEGIN #include "util/util_atomic.h" -#define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 +#ifdef __HIP__ +# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024 +#else +# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 +#endif template __device__ void gpu_parallel_active_index_array(const uint num_states, diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h index f609520b8b4..a1349e82efb 100644 --- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h +++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h @@ -27,7 +27,11 @@ CCL_NAMESPACE_BEGIN #include "util/util_atomic.h" -#define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512 +#ifdef __HIP__ +# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 1024 +#else +# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512 +#endif template __device__ void gpu_parallel_prefix_sum(int *values, const int num_values) { diff --git a/intern/cycles/kernel/device/gpu/parallel_reduce.h b/intern/cycles/kernel/device/gpu/parallel_reduce.h index 65b1990dbb8..b60dceb2ed0 100644 --- a/intern/cycles/kernel/device/gpu/parallel_reduce.h +++ b/intern/cycles/kernel/device/gpu/parallel_reduce.h @@ -26,7 +26,11 @@ CCL_NAMESPACE_BEGIN * the overall cost of the algorithm while keeping the work complexity O(n) and * the step complexity O(log n). 
(Brent's Theorem optimization) */ -#define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 512 +#ifdef __HIP__ +# define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 1024 +#else +# define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 512 +#endif template __device__ void gpu_parallel_sum( diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h index 99b35468517..9bca1fad22f 100644 --- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h @@ -26,7 +26,11 @@ CCL_NAMESPACE_BEGIN #include "util/util_atomic.h" -#define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512 +#ifdef __HIP__ +# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024 +#else +# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512 +#endif #define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0) template diff --git a/intern/cycles/kernel/device/hip/compat.h b/intern/cycles/kernel/device/hip/compat.h new file mode 100644 index 00000000000..3644925d5be --- /dev/null +++ b/intern/cycles/kernel/device/hip/compat.h @@ -0,0 +1,121 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#define __KERNEL_GPU__ +#define __KERNEL_HIP__ +#define CCL_NAMESPACE_BEGIN +#define CCL_NAMESPACE_END + +#ifndef ATTR_FALLTHROUGH +# define ATTR_FALLTHROUGH +#endif + +#ifdef __HIPCC_RTC__ +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; +#else +# include +#endif + +#ifdef CYCLES_HIPBIN_CC +# define FLT_MIN 1.175494350822287507969e-38f +# define FLT_MAX 340282346638528859811704183484516925440.0f +# define FLT_EPSILON 1.192092896e-07F +#endif + +/* Qualifiers */ + +#define ccl_device __device__ __inline__ +#define ccl_device_inline __device__ __inline__ +#define ccl_device_forceinline __device__ __forceinline__ +#define ccl_device_noinline __device__ __noinline__ +#define ccl_device_noinline_cpu ccl_device +#define ccl_global +#define ccl_static_constant __constant__ +#define ccl_device_constant __constant__ __device__ +#define ccl_constant const +#define ccl_gpu_shared __shared__ +#define ccl_private +#define ccl_may_alias +#define ccl_addr_space +#define ccl_restrict __restrict__ +#define ccl_loop_no_unroll +#define ccl_align(n) __align__(n) +#define ccl_optional_struct_init + +#define kernel_assert(cond) + +/* Types */ +#ifdef __HIP__ +# include "hip/hip_fp16.h" +# include "hip/hip_runtime.h" +#endif + +#ifdef _MSC_VER +# include +#endif + +#define ccl_gpu_thread_idx_x (threadIdx.x) +#define ccl_gpu_block_dim_x (blockDim.x) +#define ccl_gpu_block_idx_x (blockIdx.x) +#define ccl_gpu_grid_dim_x (gridDim.x) +#define ccl_gpu_warp_size (warpSize) + +#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x) +#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x) + +/* GPU warp synchronizaton */ + +#define ccl_gpu_syncthreads() __syncthreads() +#define ccl_gpu_ballot(predicate) __ballot(predicate) 
+#define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down(var, detla) +#define ccl_gpu_popc(x) __popc(x) + +/* GPU texture objects */ +typedef hipTextureObject_t ccl_gpu_tex_object; + +template +ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj, + const float x, + const float y) +{ + return tex2D(texobj, x, y); +} + +template +ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj, + const float x, + const float y, + const float z) +{ + return tex3D(texobj, x, y, z); +} + +/* Use fast math functions */ + +#define cosf(x) __cosf(((float)(x))) +#define sinf(x) __sinf(((float)(x))) +#define powf(x, y) __powf(((float)(x)), ((float)(y))) +#define tanf(x) __tanf(((float)(x))) +#define logf(x) __logf(((float)(x))) +#define expf(x) __expf(((float)(x))) + +/* Types */ + +#include "util/util_half.h" +#include "util/util_types.h" diff --git a/intern/cycles/kernel/device/hip/config.h b/intern/cycles/kernel/device/hip/config.h new file mode 100644 index 00000000000..2fde0d46015 --- /dev/null +++ b/intern/cycles/kernel/device/hip/config.h @@ -0,0 +1,57 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Device data taken from HIP occupancy calculator. + * + * Terminology + * - HIP GPUs have multiple streaming multiprocessors + * - Each multiprocessor executes multiple thread blocks + * - Each thread block contains a number of threads, also known as the block size + * - Multiprocessors have a fixed number of registers, and the amount of registers + * used by each threads limits the number of threads per block. + */ + +/* Launch Bound Definitions */ +#define GPU_MULTIPRESSOR_MAX_REGISTERS 65536 +#define GPU_MULTIPROCESSOR_MAX_BLOCKS 64 +#define GPU_BLOCK_MAX_THREADS 1024 +#define GPU_THREAD_MAX_REGISTERS 255 + +#define GPU_KERNEL_BLOCK_NUM_THREADS 1024 +#define GPU_KERNEL_MAX_REGISTERS 64 + +/* Compute number of threads per block and minimum blocks per multiprocessor + * given the maximum number of registers per thread. 
*/ + +#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \ + extern "C" __global__ void __launch_bounds__(block_num_threads, \ + GPU_MULTIPRESSOR_MAX_REGISTERS / \ + (block_num_threads * thread_num_registers)) + +/* sanity checks */ + +#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS +# error "Maximum number of threads per block exceeded" +#endif + +#if GPU_MULTIPRESSOR_MAX_REGISTERS / (GPU_KERNEL_BLOCK_NUM_THREADS * GPU_KERNEL_MAX_REGISTERS) > \ + GPU_MULTIPROCESSOR_MAX_BLOCKS +# error "Maximum number of blocks per multiprocessor exceeded" +#endif + +#if GPU_KERNEL_MAX_REGISTERS > GPU_THREAD_MAX_REGISTERS +# error "Maximum number of registers per thread exceeded" +#endif diff --git a/intern/cycles/kernel/device/hip/globals.h b/intern/cycles/kernel/device/hip/globals.h new file mode 100644 index 00000000000..39978ae7899 --- /dev/null +++ b/intern/cycles/kernel/device/hip/globals.h @@ -0,0 +1,49 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Constant Globals */ + +#pragma once + +#include "kernel/kernel_profiling.h" +#include "kernel/kernel_types.h" + +#include "kernel/integrator/integrator_state.h" + +CCL_NAMESPACE_BEGIN + +/* Not actually used, just a NULL pointer that gets passed everywhere, which we + * hope gets optimized out by the compiler. */ +struct KernelGlobals { + /* NOTE: Keep the size in sync with SHADOW_STACK_MAX_HITS. */ + int unused[1]; +}; + +/* Global scene data and textures */ +__constant__ KernelData __data; +#define KERNEL_TEX(type, name) __attribute__((used)) const __constant__ __device__ type *name; +#include "kernel/kernel_textures.h" + +/* Integrator state */ +__constant__ IntegratorStateGPU __integrator_state; + +/* Abstraction macros */ +#define kernel_data __data +#define kernel_tex_fetch(t, index) t[(index)] +#define kernel_tex_array(t) (t) +#define kernel_integrator_state __integrator_state + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/hip/kernel.cpp b/intern/cycles/kernel/device/hip/kernel.cpp new file mode 100644 index 00000000000..c801320a2e1 --- /dev/null +++ b/intern/cycles/kernel/device/hip/kernel.cpp @@ -0,0 +1,28 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* HIP kernel entry points */ + +#ifdef __HIP_DEVICE_COMPILE__ + +# include "kernel/device/hip/compat.h" +# include "kernel/device/hip/config.h" +# include "kernel/device/hip/globals.h" + +# include "kernel/device/gpu/image.h" +# include "kernel/device/gpu/kernel.h" + +#endif -- cgit v1.2.3 From 79290f51605e31cff09e4984d4f493d05bfe17e2 Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Wed, 29 Sep 2021 07:29:15 +1000 Subject: Cleanup: spelling in comments --- intern/cycles/kernel/device/hip/compat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'intern/cycles/kernel/device') diff --git a/intern/cycles/kernel/device/hip/compat.h b/intern/cycles/kernel/device/hip/compat.h index 3644925d5be..95338fe7d6e 100644 --- a/intern/cycles/kernel/device/hip/compat.h +++ b/intern/cycles/kernel/device/hip/compat.h @@ -79,7 +79,7 @@ typedef unsigned long long uint64_t; #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x) #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x) -/* GPU warp synchronizaton */ +/* GPU warp synchronization */ #define ccl_gpu_syncthreads() __syncthreads() #define ccl_gpu_ballot(predicate) __ballot(predicate) -- cgit v1.2.3 From 6e268a749fee16b442bcb3fba6cb6e08850d8389 Mon Sep 17 00:00:00 2001 From: Sergey Sharybin Date: Tue, 21 Sep 2021 17:03:22 +0200 Subject: Fix adaptive sampling artifacts on tile boundaries Implement an overscan support for tiles, so that adaptive sampling can rely on the pixels neighbourhood. Differential Revision: https://developer.blender.org/D12599 --- intern/cycles/kernel/device/gpu/kernel.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'intern/cycles/kernel/device') diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 7b79c0aedfa..3379114fc62 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -424,8 +424,12 @@ ccl_device_inline void kernel_gpu_film_convert_common(const KernelFilmConvert *k return; } - const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride; - ccl_global const float *buffer = render_buffer + render_buffer_offset; + const int x = render_pixel_index % width; + const int y = render_pixel_index / width; + + ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert->pass_stride + + y * stride * kfilm_convert->pass_stride; + ccl_global float *pixel = pixels + (render_pixel_index + dst_offset) * kfilm_convert->pixel_stride; @@ -451,17 +455,17 @@ ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba( return; } - const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride; - ccl_global const float *buffer = render_buffer + render_buffer_offset; + const int x = render_pixel_index % width; + const int y = render_pixel_index / width; + + ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert->pass_stride + + y * stride * kfilm_convert->pass_stride; float pixel[4]; processor(kfilm_convert, buffer, pixel); film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel); - const int x = render_pixel_index % width; - const int y = render_pixel_index / width; - ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x; float4_store_half((ccl_global half *)out, make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); } -- cgit v1.2.3 From 
04857cc8efb385af5d8f40b655eeca41e2b73494 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Sun, 28 Feb 2021 23:23:24 +0100 Subject: Cycles: fully decouple triangle and curve primitive storage from BVH2 Previously the storage here was optimized to avoid indirections in BVH2 traversal. This helps improve performance a bit, but makes performance and memory usage of Embree and OptiX BVHs a bit worse also. It also adds code complexity in other parts of the code. Now decouple triangle and curve primitive storage from BVH2. * Reduced peak memory usage on all devices * Bit better performance for OptiX and Embree * Bit worse performance for CUDA * Simplified code: ** Intersection.prim/object now matches ShaderData.prim/object ** No more offset manipulation for mesh displacement before a BVH is built ** Remove primitive packing code and flags for Embree and OptiX ** Curve segments are now stored in a KernelCurve struct * Also happens to fix a bug in baking with incorrect prim/object Fixes T91968, T91770, T91902 Differential Revision: https://developer.blender.org/D12766 --- intern/cycles/kernel/device/optix/kernel.cu | 86 ++++++++++++++++------------- 1 file changed, 49 insertions(+), 37 deletions(-) (limited to 'intern/cycles/kernel/device') diff --git a/intern/cycles/kernel/device/optix/kernel.cu b/intern/cycles/kernel/device/optix/kernel.cu index 7a79e0c4823..736f30d93ef 100644 --- a/intern/cycles/kernel/device/optix/kernel.cu +++ b/intern/cycles/kernel/device/optix/kernel.cu @@ -41,22 +41,15 @@ template ccl_device_forceinline T *get_payload_ptr_2() return (T *)(((uint64_t)optixGetPayload_3() << 32) | optixGetPayload_2()); } -template ccl_device_forceinline uint get_object_id() +ccl_device_forceinline int get_object_id() { #ifdef __OBJECT_MOTION__ - /* Always get the the instance ID from the TLAS. + /* Always get the the instance ID from the TLAS * There might be a motion transform node between TLAS and BLAS which does not have one. */ - uint object = optixGetInstanceIdFromHandle(optixGetTransformListHandle(0)); + return optixGetInstanceIdFromHandle(optixGetTransformListHandle(0)); #else - uint object = optixGetInstanceId(); + return optixGetInstanceId(); #endif - /* Choose between always returning object ID or only for instances. */ - if (always || (object & 1) == 0) - /* Can just remove the low bit since instance always contains object ID. */ - return object >> 1; - else - /* Set to OBJECT_NONE if this is not an instanced object. */ - return OBJECT_NONE; } extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_closest() @@ -108,7 +101,7 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit() #endif #ifdef __BVH_LOCAL__ - const uint object = get_object_id(); + const int object = get_object_id(); if (object != optixGetPayload_4() /* local_object */) { /* Only intersect with matching object. */ return optixIgnoreIntersection(); @@ -152,21 +145,23 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit() local_isect->num_hits = 1; } + const int prim = optixGetPrimitiveIndex(); + Intersection *isect = &local_isect->hits[hit]; isect->t = optixGetRayTmax(); - isect->prim = optixGetPrimitiveIndex(); + isect->prim = prim; isect->object = get_object_id(); - isect->type = kernel_tex_fetch(__prim_type, isect->prim); + isect->type = kernel_tex_fetch(__objects, isect->object).primitive_type; const float2 barycentrics = optixGetTriangleBarycentrics(); isect->u = 1.0f - barycentrics.y - barycentrics.x; isect->v = barycentrics.x; /* Record geometric normal. 
*/ - const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim); - const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 0)); - const float3 tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 1)); - const float3 tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 2)); + const uint tri_vindex = kernel_tex_fetch(__tri_vindex, prim).w; + const float3 tri_a = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 0)); + const float3 tri_b = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 1)); + const float3 tri_c = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 2)); local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a)); /* Continue tracing (without this the trace call would return after the first hit). */ @@ -179,25 +174,32 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit() #ifdef __SHADOW_RECORD_ALL__ bool ignore_intersection = false; - const uint prim = optixGetPrimitiveIndex(); + int prim = optixGetPrimitiveIndex(); + const uint object = get_object_id(); # ifdef __VISIBILITY_FLAG__ const uint visibility = optixGetPayload_4(); - if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) { + if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) { ignore_intersection = true; } # endif float u = 0.0f, v = 0.0f; + int type = 0; if (optixIsTriangleHit()) { const float2 barycentrics = optixGetTriangleBarycentrics(); u = 1.0f - barycentrics.y - barycentrics.x; v = barycentrics.x; + type = kernel_tex_fetch(__objects, object).primitive_type; } # ifdef __HAIR__ else { u = __uint_as_float(optixGetAttribute_0()); v = __uint_as_float(optixGetAttribute_1()); + const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim); + type = segment.type; + prim = segment.prim; + /* Filter out curve endcaps. */ if (u == 0.0f || u == 1.0f) { ignore_intersection = true; @@ -245,8 +247,8 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit() isect->v = v; isect->t = optixGetRayTmax(); isect->prim = prim; - isect->object = get_object_id(); - isect->type = kernel_tex_fetch(__prim_type, prim); + isect->object = object; + isect->type = type; # ifdef __TRANSPARENT_SHADOWS__ /* Detect if this surface has a shader with transparent shadows. 
*/ @@ -274,15 +276,14 @@ extern "C" __global__ void __anyhit__kernel_optix_volume_test() } #endif + const uint object = get_object_id(); #ifdef __VISIBILITY_FLAG__ - const uint prim = optixGetPrimitiveIndex(); const uint visibility = optixGetPayload_4(); - if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) { + if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) { return optixIgnoreIntersection(); } #endif - const uint object = get_object_id(); if ((kernel_tex_fetch(__object_flag, object) & SD_OBJECT_HAS_VOLUME) == 0) { return optixIgnoreIntersection(); } @@ -301,9 +302,9 @@ extern "C" __global__ void __anyhit__kernel_optix_visibility_test() #endif #ifdef __VISIBILITY_FLAG__ - const uint prim = optixGetPrimitiveIndex(); + const uint object = get_object_id(); const uint visibility = optixGetPayload_4(); - if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) { + if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) { return optixIgnoreIntersection(); } @@ -316,28 +317,39 @@ extern "C" __global__ void __anyhit__kernel_optix_visibility_test() extern "C" __global__ void __closesthit__kernel_optix_hit() { + const int object = get_object_id(); + const int prim = optixGetPrimitiveIndex(); + optixSetPayload_0(__float_as_uint(optixGetRayTmax())); /* Intersection distance */ - optixSetPayload_3(optixGetPrimitiveIndex()); - optixSetPayload_4(get_object_id()); - /* Can be PRIMITIVE_TRIANGLE and PRIMITIVE_MOTION_TRIANGLE or curve type and segment index. */ - optixSetPayload_5(kernel_tex_fetch(__prim_type, optixGetPrimitiveIndex())); + optixSetPayload_4(object); if (optixIsTriangleHit()) { const float2 barycentrics = optixGetTriangleBarycentrics(); optixSetPayload_1(__float_as_uint(1.0f - barycentrics.y - barycentrics.x)); optixSetPayload_2(__float_as_uint(barycentrics.x)); + optixSetPayload_3(prim); + optixSetPayload_5(kernel_tex_fetch(__objects, object).primitive_type); } else { + const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim); optixSetPayload_1(optixGetAttribute_0()); /* Same as 'optixGetCurveParameter()' */ optixSetPayload_2(optixGetAttribute_1()); + optixSetPayload_3(segment.prim); + optixSetPayload_5(segment.type); } } #ifdef __HAIR__ -ccl_device_inline void optix_intersection_curve(const uint prim, const uint type) +ccl_device_inline void optix_intersection_curve(const int prim, const int type) { - const uint object = get_object_id(); + const int object = get_object_id(); + +# ifdef __VISIBILITY_FLAG__ const uint visibility = optixGetPayload_4(); + if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) { + return; + } +# endif float3 P = optixGetObjectRayOrigin(); float3 dir = optixGetObjectRayDirection(); @@ -358,7 +370,7 @@ ccl_device_inline void optix_intersection_curve(const uint prim, const uint type if (isect.t != FLT_MAX) isect.t *= len; - if (curve_intersect(NULL, &isect, P, dir, isect.t, visibility, object, prim, time, type)) { + if (curve_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) { optixReportIntersection(isect.t / len, type & PRIMITIVE_ALL, __float_as_int(isect.u), /* Attribute_0 */ @@ -368,9 +380,9 @@ ccl_device_inline void optix_intersection_curve(const uint prim, const uint type extern "C" __global__ void __intersection__curve_ribbon() { - const uint prim = optixGetPrimitiveIndex(); - const uint type = kernel_tex_fetch(__prim_type, prim); - + const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, optixGetPrimitiveIndex()); + const 
int prim = segment.prim; + const int type = segment.type; if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) { optix_intersection_curve(prim, type); } -- cgit v1.2.3 From a0f269f682dab848afc80cd322d04a0c4a815cae Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Thu, 14 Oct 2021 13:53:40 +0100 Subject: Cycles: Kernel address space changes for MSL This is the first of a sequence of changes to support compiling Cycles kernels as MSL (Metal Shading Language) in preparation for a Metal GPU device implementation. MSL requires that all pointer types be declared with explicit address space attributes (device, thread, etc...). There is already precedent for this with Cycles' address space macros (ccl_global, ccl_private, etc...), therefore the first step of MSL-enablement is to apply these consistently. Line-for-line this represents the largest change required to enable MSL. Applying this change first will simplify future patches as well as offering the emergent benefit of enhanced descriptiveness. The vast majority of deltas in this patch fall into one of two cases: - Ensuring ccl_private is specified for thread-local pointer types - Ensuring ccl_global is specified for device-wide pointer types Additionally, the ccl_addr_space qualifier can be removed. Prior to Cycles X, ccl_addr_space was used as a context-dependent address space qualifier, but now it is either redundant (e.g. in struct typedefs), or can be replaced by ccl_global in the case of pointer types. Associated function variants (e.g. lcg_step_float_addrspace) are also redundant. In cases where address space qualifiers are chained with "const", this patch places the address space qualifier first. The rationale for this is that the choice of address space is likely to have the greater impact on runtime performance and overall architecture. The final part of this patch is the addition of a metal/compat.h header. This is partially complete and will be extended in future patches, paving the way for the full Metal implementation. Ref T92212 Reviewed By: brecht Maniphest Tasks: T92212 Differential Revision: https://developer.blender.org/D12864 --- intern/cycles/kernel/device/cpu/compat.h | 2 - intern/cycles/kernel/device/cuda/compat.h | 1 - intern/cycles/kernel/device/hip/compat.h | 1 - intern/cycles/kernel/device/metal/compat.h | 126 +++++++++++++++++++++++++++++ intern/cycles/kernel/device/optix/compat.h | 1 - 5 files changed, 126 insertions(+), 5 deletions(-) create mode 100644 intern/cycles/kernel/device/metal/compat.h (limited to 'intern/cycles/kernel/device') diff --git a/intern/cycles/kernel/device/cpu/compat.h b/intern/cycles/kernel/device/cpu/compat.h index bfd936c7bbd..888c0d5d872 100644 --- a/intern/cycles/kernel/device/cpu/compat.h +++ b/intern/cycles/kernel/device/cpu/compat.h @@ -32,8 +32,6 @@ #include "util/util_texture.h" #include "util/util_types.h" -#define ccl_addr_space - /* On x86_64, versions of glibc < 2.16 have an issue where expf is * much slower than the double version. This was fixed in glibc 2.16. 
*/ diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h index 3c85a8e7bd2..685c7a5b753 100644 --- a/intern/cycles/kernel/device/cuda/compat.h +++ b/intern/cycles/kernel/device/cuda/compat.h @@ -59,7 +59,6 @@ typedef unsigned long long uint64_t; #define ccl_gpu_shared __shared__ #define ccl_private #define ccl_may_alias -#define ccl_addr_space #define ccl_restrict __restrict__ #define ccl_loop_no_unroll #define ccl_align(n) __align__(n) diff --git a/intern/cycles/kernel/device/hip/compat.h b/intern/cycles/kernel/device/hip/compat.h index 95338fe7d6e..089976d84e4 100644 --- a/intern/cycles/kernel/device/hip/compat.h +++ b/intern/cycles/kernel/device/hip/compat.h @@ -52,7 +52,6 @@ typedef unsigned long long uint64_t; #define ccl_gpu_shared __shared__ #define ccl_private #define ccl_may_alias -#define ccl_addr_space #define ccl_restrict __restrict__ #define ccl_loop_no_unroll #define ccl_align(n) __align__(n) diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h new file mode 100644 index 00000000000..77cea30914c --- /dev/null +++ b/intern/cycles/kernel/device/metal/compat.h @@ -0,0 +1,126 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#define __KERNEL_GPU__ +#define __KERNEL_METAL__ +#define CCL_NAMESPACE_BEGIN +#define CCL_NAMESPACE_END + +#ifndef ATTR_FALLTHROUGH +# define ATTR_FALLTHROUGH +#endif + +#include +#include +#include +#include + +using namespace metal; + +#pragma clang diagnostic ignored "-Wunused-variable" +#pragma clang diagnostic ignored "-Wsign-compare" + +/* Qualifiers */ + +#define ccl_device +#define ccl_device_inline ccl_device +#define ccl_device_forceinline ccl_device +#define ccl_device_noinline ccl_device __attribute__((noinline)) +#define ccl_device_noinline_cpu ccl_device +#define ccl_global device +#define ccl_static_constant static constant constexpr +#define ccl_device_constant constant +#define ccl_constant const device +#define ccl_gpu_shared threadgroup +#define ccl_private thread +#define ccl_may_alias +#define ccl_restrict __restrict +#define ccl_loop_no_unroll +#define ccl_align(n) alignas(n) +#define ccl_optional_struct_init + +/* No assert supported for Metal */ + +#define kernel_assert(cond) + +/* make_type definitions with Metal style element initializers */ +#ifdef make_float2 +# undef make_float2 +#endif +#ifdef make_float3 +# undef make_float3 +#endif +#ifdef make_float4 +# undef make_float4 +#endif +#ifdef make_int2 +# undef make_int2 +#endif +#ifdef make_int3 +# undef make_int3 +#endif +#ifdef make_int4 +# undef make_int4 +#endif +#ifdef make_uchar4 +# undef make_uchar4 +#endif + +#define make_float2(x, y) float2(x, y) +#define make_float3(x, y, z) float3(x, y, z) +#define make_float4(x, y, z, w) float4(x, y, z, w) +#define make_int2(x, y) int2(x, y) +#define make_int3(x, y, z) int3(x, y, z) +#define make_int4(x, y, z, w) int4(x, y, z, w) +#define make_uchar4(x, y, z, w) uchar4(x, y, z, w) + +/* Math functions */ + +#define __uint_as_float(x) as_type(x) +#define __float_as_uint(x) as_type(x) +#define __int_as_float(x) as_type(x) +#define __float_as_int(x) as_type(x) +#define __float2half(x) half(x) +#define powf(x, y) pow(float(x), float(y)) +#define fabsf(x) fabs(float(x)) +#define copysignf(x, y) copysign(float(x), float(y)) +#define asinf(x) asin(float(x)) +#define acosf(x) acos(float(x)) +#define atanf(x) atan(float(x)) +#define floorf(x) floor(float(x)) +#define ceilf(x) ceil(float(x)) +#define hypotf(x, y) hypot(float(x), float(y)) +#define atan2f(x, y) atan2(float(x), float(y)) +#define fmaxf(x, y) fmax(float(x), float(y)) +#define fminf(x, y) fmin(float(x), float(y)) +#define fmodf(x, y) fmod(float(x), float(y)) +#define sinhf(x) sinh(float(x)) +#define coshf(x) cosh(float(x)) +#define tanhf(x) tanh(float(x)) + +/* Use native functions with possibly lower precision for performance, + * no issues found so far. 
*/ +#define trigmode fast +#define sinf(x) trigmode::sin(float(x)) +#define cosf(x) trigmode::cos(float(x)) +#define tanf(x) trigmode::tan(float(x)) +#define expf(x) trigmode::exp(float(x)) +#define sqrtf(x) trigmode::sqrt(float(x)) +#define logf(x) trigmode::log(float(x)) + +#define NULL 0 diff --git a/intern/cycles/kernel/device/optix/compat.h b/intern/cycles/kernel/device/optix/compat.h index fb9e094b535..c9ec9be05df 100644 --- a/intern/cycles/kernel/device/optix/compat.h +++ b/intern/cycles/kernel/device/optix/compat.h @@ -58,7 +58,6 @@ typedef unsigned long long uint64_t; #define ccl_gpu_shared __shared__ #define ccl_private #define ccl_may_alias -#define ccl_addr_space #define ccl_restrict __restrict__ #define ccl_loop_no_unroll #define ccl_align(n) __align__(n) -- cgit v1.2.3 From 2ba7c3aa650c3c795d903a24998204f67c75b017 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Wed, 13 Oct 2021 19:13:35 +0200 Subject: Cleanup: refactor to make number of channels for shader evaluation variable --- intern/cycles/kernel/device/cpu/kernel_arch.h | 4 ++-- intern/cycles/kernel/device/cpu/kernel_arch_impl.h | 4 ++-- intern/cycles/kernel/device/gpu/kernel.h | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'intern/cycles/kernel/device') diff --git a/intern/cycles/kernel/device/cpu/kernel_arch.h b/intern/cycles/kernel/device/cpu/kernel_arch.h index 81f328c710b..8b7b0ec0548 100644 --- a/intern/cycles/kernel/device/cpu/kernel_arch.h +++ b/intern/cycles/kernel/device/cpu/kernel_arch.h @@ -58,11 +58,11 @@ KERNEL_INTEGRATOR_SHADE_FUNCTION(megakernel); void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg, const KernelShaderEvalInput *input, - float4 *output, + float *output, const int offset); void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg, const KernelShaderEvalInput *input, - float4 *output, + float *output, const int offset); /* -------------------------------------------------------------------- diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h index 1432abfd330..23e371f165f 100644 --- a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h +++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h @@ -114,7 +114,7 @@ DEFINE_INTEGRATOR_SHADE_KERNEL(megakernel) void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg, const KernelShaderEvalInput *input, - float4 *output, + float *output, const int offset) { #ifdef KERNEL_STUB @@ -126,7 +126,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg, void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg, const KernelShaderEvalInput *input, - float4 *output, + float *output, const int offset) { #ifdef KERNEL_STUB diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 3379114fc62..21901215757 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -615,7 +615,7 @@ KERNEL_FILM_CONVERT_DEFINE(float4, rgba) ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) kernel_gpu_shader_eval_displace(KernelShaderEvalInput *input, - float4 *output, + float *output, const int offset, const int work_size) { @@ -629,7 +629,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) kernel_gpu_shader_eval_background(KernelShaderEvalInput *input, - float4 *output, + 
float *output, const int offset, const int work_size) { -- cgit v1.2.3 From eb71157e2a9c7abdeb7045bdf9b79d8ca27ba263 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Thu, 14 Oct 2021 17:51:27 +0200 Subject: Cleanup: add utility functions for packing integers --- intern/cycles/kernel/device/optix/kernel.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'intern/cycles/kernel/device') diff --git a/intern/cycles/kernel/device/optix/kernel.cu b/intern/cycles/kernel/device/optix/kernel.cu index 736f30d93ef..c9577bb2aa2 100644 --- a/intern/cycles/kernel/device/optix/kernel.cu +++ b/intern/cycles/kernel/device/optix/kernel.cu @@ -34,11 +34,11 @@ template ccl_device_forceinline T *get_payload_ptr_0() { - return (T *)(((uint64_t)optixGetPayload_1() << 32) | optixGetPayload_0()); + return pointer_unpack_from_uint(optixGetPayload_0(), optixGetPayload_1()); } template ccl_device_forceinline T *get_payload_ptr_2() { - return (T *)(((uint64_t)optixGetPayload_3() << 32) | optixGetPayload_2()); + return pointer_unpack_from_uint(optixGetPayload_2(), optixGetPayload_3()); } ccl_device_forceinline int get_object_id() -- cgit v1.2.3 From 5d565062edc25575bbabf173a4e26f184103944b Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Wed, 13 Oct 2021 18:19:51 +0200 Subject: Cleanup: refactor OptiX shadow intersection for upcoming changes --- intern/cycles/kernel/device/optix/kernel.cu | 68 +++++++++++++---------------- 1 file changed, 31 insertions(+), 37 deletions(-) (limited to 'intern/cycles/kernel/device') diff --git a/intern/cycles/kernel/device/optix/kernel.cu b/intern/cycles/kernel/device/optix/kernel.cu index c9577bb2aa2..e97b25d31a2 100644 --- a/intern/cycles/kernel/device/optix/kernel.cu +++ b/intern/cycles/kernel/device/optix/kernel.cu @@ -172,14 +172,12 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit() extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit() { #ifdef __SHADOW_RECORD_ALL__ - bool ignore_intersection = false; - int prim = optixGetPrimitiveIndex(); const uint object = get_object_id(); # ifdef __VISIBILITY_FLAG__ const uint visibility = optixGetPayload_4(); if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) { - ignore_intersection = true; + return optixIgnoreIntersection(); } # endif @@ -202,29 +200,39 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit() /* Filter out curve endcaps. */ if (u == 0.0f || u == 1.0f) { - ignore_intersection = true; + return optixIgnoreIntersection(); } } # endif - int num_hits = optixGetPayload_2(); - int record_index = num_hits; +# ifndef __TRANSPARENT_SHADOWS__ + /* No transparent shadows support compiled in, make opaque. */ + optixSetPayload_5(true); + return optixTerminateRay(); +# else const int max_hits = optixGetPayload_3(); - if (!ignore_intersection) { - optixSetPayload_2(num_hits + 1); + /* If no transparent shadows, all light is blocked and we can stop immediately. */ + if (max_hits == 0 || + !(intersection_get_shader_flags(NULL, prim, type) & SD_HAS_TRANSPARENT_SHADOW)) { + optixSetPayload_5(true); + return optixTerminateRay(); } + /* Record transparent intersection. */ + const int num_hits = optixGetPayload_2(); + int record_index = num_hits; + + optixSetPayload_2(num_hits + 1); + Intersection *const isect_array = get_payload_ptr_0(); -# ifdef __TRANSPARENT_SHADOWS__ - if (num_hits >= max_hits) { + if (record_index >= max_hits) { /* If maximum number of hits reached, find a hit to replace. 
*/ - const int num_recorded_hits = min(max_hits, num_hits); float max_recorded_t = isect_array[0].t; int max_recorded_hit = 0; - for (int i = 1; i < num_recorded_hits; i++) { + for (int i = 1; i < max_hits; i++) { if (isect_array[i].t > max_recorded_t) { max_recorded_t = isect_array[i].t; max_recorded_hit = i; @@ -232,39 +240,25 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit() } if (optixGetRayTmax() >= max_recorded_t) { - /* Accept hit, so that OptiX won't consider any more hits beyond the distance of the current - * hit anymore. */ + /* Accept hit, so that OptiX won't consider any more hits beyond the distance of the + * current hit anymore. */ return; } record_index = max_recorded_hit; } -# endif - if (!ignore_intersection) { - Intersection *const isect = isect_array + record_index; - isect->u = u; - isect->v = v; - isect->t = optixGetRayTmax(); - isect->prim = prim; - isect->object = object; - isect->type = type; - -# ifdef __TRANSPARENT_SHADOWS__ - /* Detect if this surface has a shader with transparent shadows. */ - if (!shader_transparent_shadow(NULL, isect) || max_hits == 0) { -# endif - /* If no transparent shadows, all light is blocked and we can stop immediately. */ - optixSetPayload_5(true); - return optixTerminateRay(); -# ifdef __TRANSPARENT_SHADOWS__ - } -# endif - } + Intersection *const isect = isect_array + record_index; + isect->u = u; + isect->v = v; + isect->t = optixGetRayTmax(); + isect->prim = prim; + isect->object = object; + isect->type = type; - /* Continue tracing. */ optixIgnoreIntersection(); -#endif +# endif /* __TRANSPARENT_SHADOWS__ */ +#endif /* __SHADOW_RECORD_ALL__ */ } extern "C" __global__ void __anyhit__kernel_optix_volume_test() -- cgit v1.2.3 From 1df3b51988852fa8ee6b530a64aa23346db9acd4 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Sun, 17 Oct 2021 16:10:10 +0200 Subject: Cycles: replace integrator state argument macros * Rename struct KernelGlobals to struct KernelGlobalsCPU * Add KernelGlobals, IntegratorState and ConstIntegratorState typedefs that every device can define in its own way. * Remove INTEGRATOR_STATE_ARGS and INTEGRATOR_STATE_PASS macros and replace with these new typedefs. * Add explicit state argument to INTEGRATOR_STATE and similar macros In preparation for decoupling main and shadow paths. 
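To illustrate the shape of this refactor, here is a small standalone C++ sketch of the idea (not the actual Cycles headers; the struct layout and macro bodies below are invented for the example): a per-device typedef decides what the state handle is, a struct pointer on the CPU or an SoA index on the GPU, and because INTEGRATOR_STATE now takes the state explicitly, the same macro invocation compiles against either representation.

#include <cstdio>

/* CPU flavour of the state: a plain struct accessed through a pointer.
 * On the GPU the typedef would instead be an integer index into SoA arrays. */
struct IntegratorStateCPU {
  struct {
    int queued_kernel;
    int shader_sort_key;
  } path;
};

typedef IntegratorStateCPU *IntegratorState;
typedef const IntegratorStateCPU *ConstIntegratorState;

/* With an explicit state argument there is no hidden INTEGRATOR_STATE_PASS;
 * each device can expand these macros to match its own state representation. */
#define INTEGRATOR_STATE(state, nested_struct, member) ((state)->nested_struct.member)
#define INTEGRATOR_STATE_WRITE(state, nested_struct, member) ((state)->nested_struct.member)

int main()
{
  IntegratorStateCPU storage = {};
  IntegratorState state = &storage;

  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 1;
  printf("queued_kernel = %d\n", INTEGRATOR_STATE(state, path, queued_kernel));
  return 0;
}
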
Differential Revision: https://developer.blender.org/D12888 --- intern/cycles/kernel/device/cpu/globals.h | 6 ++++-- intern/cycles/kernel/device/cpu/image.h | 4 ++-- intern/cycles/kernel/device/cpu/kernel.cpp | 4 ++-- intern/cycles/kernel/device/cpu/kernel.h | 14 ++++++------- intern/cycles/kernel/device/cpu/kernel_arch.h | 20 +++++++++--------- intern/cycles/kernel/device/cpu/kernel_arch_impl.h | 20 +++++++++--------- intern/cycles/kernel/device/cuda/globals.h | 3 ++- intern/cycles/kernel/device/gpu/image.h | 4 ++-- intern/cycles/kernel/device/gpu/kernel.h | 24 +++++++++++----------- intern/cycles/kernel/device/optix/globals.h | 3 ++- 10 files changed, 53 insertions(+), 49 deletions(-) (limited to 'intern/cycles/kernel/device') diff --git a/intern/cycles/kernel/device/cpu/globals.h b/intern/cycles/kernel/device/cpu/globals.h index 98b036e269d..fb9aae38cfc 100644 --- a/intern/cycles/kernel/device/cpu/globals.h +++ b/intern/cycles/kernel/device/cpu/globals.h @@ -34,7 +34,7 @@ struct OSLThreadData; struct OSLShadingSystem; #endif -typedef struct KernelGlobals { +typedef struct KernelGlobalsCPU { #define KERNEL_TEX(type, name) texture name; #include "kernel/kernel_textures.h" @@ -51,7 +51,9 @@ typedef struct KernelGlobals { /* **** Run-time data **** */ ProfilingState profiler; -} KernelGlobals; +} KernelGlobalsCPU; + +typedef const KernelGlobalsCPU *ccl_restrict KernelGlobals; /* Abstraction macros */ #define kernel_tex_fetch(tex, index) (kg->tex.fetch(index)) diff --git a/intern/cycles/kernel/device/cpu/image.h b/intern/cycles/kernel/device/cpu/image.h index 57e81ab186d..44c5d7ef065 100644 --- a/intern/cycles/kernel/device/cpu/image.h +++ b/intern/cycles/kernel/device/cpu/image.h @@ -583,7 +583,7 @@ template struct NanoVDBInterpolator { #undef SET_CUBIC_SPLINE_WEIGHTS -ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y) +ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y) { const TextureInfo &info = kernel_tex_fetch(__texture_info, id); @@ -611,7 +611,7 @@ ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float } } -ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg, +ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg, int id, float3 P, InterpolationType interp) diff --git a/intern/cycles/kernel/device/cpu/kernel.cpp b/intern/cycles/kernel/device/cpu/kernel.cpp index ac1cdf5fffe..8519b77aa08 100644 --- a/intern/cycles/kernel/device/cpu/kernel.cpp +++ b/intern/cycles/kernel/device/cpu/kernel.cpp @@ -64,7 +64,7 @@ CCL_NAMESPACE_BEGIN /* Memory Copy */ -void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t) +void kernel_const_copy(KernelGlobalsCPU *kg, const char *name, void *host, size_t) { if (strcmp(name, "__data") == 0) { kg->__data = *(KernelData *)host; @@ -74,7 +74,7 @@ void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t) } } -void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size) +void kernel_global_memory_copy(KernelGlobalsCPU *kg, const char *name, void *mem, size_t size) { if (0) { } diff --git a/intern/cycles/kernel/device/cpu/kernel.h b/intern/cycles/kernel/device/cpu/kernel.h index ae2a841835a..28337a58898 100644 --- a/intern/cycles/kernel/device/cpu/kernel.h +++ b/intern/cycles/kernel/device/cpu/kernel.h @@ -29,17 +29,17 @@ CCL_NAMESPACE_BEGIN #define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name) struct IntegratorStateCPU; -struct 
KernelGlobals; +struct KernelGlobalsCPU; struct KernelData; -KernelGlobals *kernel_globals_create(); -void kernel_globals_free(KernelGlobals *kg); +KernelGlobalsCPU *kernel_globals_create(); +void kernel_globals_free(KernelGlobalsCPU *kg); -void *kernel_osl_memory(const KernelGlobals *kg); -bool kernel_osl_use(const KernelGlobals *kg); +void *kernel_osl_memory(const KernelGlobalsCPU *kg); +bool kernel_osl_use(const KernelGlobalsCPU *kg); -void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size); -void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size); +void kernel_const_copy(KernelGlobalsCPU *kg, const char *name, void *host, size_t size); +void kernel_global_memory_copy(KernelGlobalsCPU *kg, const char *name, void *mem, size_t size); #define KERNEL_ARCH cpu #include "kernel/device/cpu/kernel_arch.h" diff --git a/intern/cycles/kernel/device/cpu/kernel_arch.h b/intern/cycles/kernel/device/cpu/kernel_arch.h index 8b7b0ec0548..ae7fab65100 100644 --- a/intern/cycles/kernel/device/cpu/kernel_arch.h +++ b/intern/cycles/kernel/device/cpu/kernel_arch.h @@ -21,16 +21,16 @@ */ #define KERNEL_INTEGRATOR_FUNCTION(name) \ - void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *ccl_restrict kg, \ IntegratorStateCPU *state) #define KERNEL_INTEGRATOR_SHADE_FUNCTION(name) \ - void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *ccl_restrict kg, \ IntegratorStateCPU *state, \ ccl_global float *render_buffer) #define KERNEL_INTEGRATOR_INIT_FUNCTION(name) \ - bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \ + bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *ccl_restrict kg, \ IntegratorStateCPU *state, \ KernelWorkTile *tile, \ ccl_global float *render_buffer) @@ -56,11 +56,11 @@ KERNEL_INTEGRATOR_SHADE_FUNCTION(megakernel); * Shader evaluation. 
*/ -void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobalsCPU *kg, const KernelShaderEvalInput *input, float *output, const int offset); -void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobalsCPU *kg, const KernelShaderEvalInput *input, float *output, const int offset); @@ -70,7 +70,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg, */ bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)( - const KernelGlobals *kg, + const KernelGlobalsCPU *kg, ccl_global float *render_buffer, int x, int y, @@ -79,14 +79,14 @@ bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)( int offset, int stride); -void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobalsCPU *kg, ccl_global float *render_buffer, int y, int start_x, int width, int offset, int stride); -void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobalsCPU *kg, ccl_global float *render_buffer, int x, int start_y, @@ -98,7 +98,7 @@ void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals * * Cryptomatte. */ -void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU *kg, ccl_global float *render_buffer, int pixel_index); @@ -108,6 +108,6 @@ void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg, /* TODO(sergey): Needs to be re-implemented. Or not? Brecht did it already :) */ void KERNEL_FUNCTION_FULL_NAME(bake)( - const KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride); + const KernelGlobalsCPU *kg, float *buffer, int sample, int x, int y, int offset, int stride); #undef KERNEL_ARCH diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h index 23e371f165f..bf8667ac045 100644 --- a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h +++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h @@ -70,7 +70,7 @@ CCL_NAMESPACE_BEGIN #endif #define DEFINE_INTEGRATOR_KERNEL(name) \ - void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *kg, \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \ IntegratorStateCPU *state) \ { \ KERNEL_INVOKE(name, kg, state); \ @@ -78,7 +78,7 @@ CCL_NAMESPACE_BEGIN #define DEFINE_INTEGRATOR_SHADE_KERNEL(name) \ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \ - const KernelGlobals *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \ + const KernelGlobalsCPU *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \ { \ KERNEL_INVOKE(name, kg, state, render_buffer); \ } @@ -86,7 +86,7 @@ CCL_NAMESPACE_BEGIN /* TODO: Either use something like get_work_pixel(), or simplify tile which is passed here, so * that it does not contain unused fields. 
*/ #define DEFINE_INTEGRATOR_INIT_KERNEL(name) \ - bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *kg, \ + bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \ IntegratorStateCPU *state, \ KernelWorkTile *tile, \ ccl_global float *render_buffer) \ @@ -112,7 +112,7 @@ DEFINE_INTEGRATOR_SHADE_KERNEL(megakernel) * Shader evaluation. */ -void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobalsCPU *kg, const KernelShaderEvalInput *input, float *output, const int offset) @@ -124,7 +124,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg, #endif } -void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobalsCPU *kg, const KernelShaderEvalInput *input, float *output, const int offset) @@ -141,7 +141,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg, */ bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)( - const KernelGlobals *kg, + const KernelGlobalsCPU *kg, ccl_global float *render_buffer, int x, int y, @@ -159,7 +159,7 @@ bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)( #endif } -void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobalsCPU *kg, ccl_global float *render_buffer, int y, int start_x, @@ -174,7 +174,7 @@ void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals * #endif } -void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobalsCPU *kg, ccl_global float *render_buffer, int x, int start_y, @@ -193,7 +193,7 @@ void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals * * Cryptomatte. */ -void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU *kg, ccl_global float *render_buffer, int pixel_index) { @@ -210,7 +210,7 @@ void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg, /* TODO(sergey): Needs to be re-implemented. Or not? Brecht did it already :) */ void KERNEL_FUNCTION_FULL_NAME(bake)( - const KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride) + const KernelGlobalsCPU *kg, float *buffer, int sample, int x, int y, int offset, int stride) { #if 0 # ifdef KERNEL_STUB diff --git a/intern/cycles/kernel/device/cuda/globals.h b/intern/cycles/kernel/device/cuda/globals.h index 169047175f5..2c187cf8a23 100644 --- a/intern/cycles/kernel/device/cuda/globals.h +++ b/intern/cycles/kernel/device/cuda/globals.h @@ -27,9 +27,10 @@ CCL_NAMESPACE_BEGIN /* Not actually used, just a NULL pointer that gets passed everywhere, which we * hope gets optimized out by the compiler. 
*/ -struct KernelGlobals { +struct KernelGlobalsGPU { int unused[1]; }; +typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals; /* Global scene data and textures */ __constant__ KernelData __data; diff --git a/intern/cycles/kernel/device/gpu/image.h b/intern/cycles/kernel/device/gpu/image.h index b015c78a8f5..95a37c693ae 100644 --- a/intern/cycles/kernel/device/gpu/image.h +++ b/intern/cycles/kernel/device/gpu/image.h @@ -189,7 +189,7 @@ ccl_device_noinline T kernel_tex_image_interp_nanovdb( } #endif -ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y) +ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y) { const TextureInfo &info = kernel_tex_fetch(__texture_info, id); @@ -221,7 +221,7 @@ ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float } } -ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg, +ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg, int id, float3 P, InterpolationType interp) diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 21901215757..56beaf1fd91 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -51,8 +51,8 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) const int state = ccl_gpu_global_id_x(); if (state < num_states) { - INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0; - INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0; + INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0; + INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0; } } @@ -244,7 +244,7 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B { gpu_parallel_active_index_array( num_states, indices, num_indices, [kernel](const int state) { - return (INTEGRATOR_STATE(path, queued_kernel) == kernel); + return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel); }); } @@ -256,7 +256,7 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B { gpu_parallel_active_index_array( num_states, indices, num_indices, [kernel](const int state) { - return (INTEGRATOR_STATE(shadow_path, queued_kernel) == kernel); + return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel); }); } @@ -265,8 +265,8 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B { gpu_parallel_active_index_array( num_states, indices, num_indices, [](const int state) { - return (INTEGRATOR_STATE(path, queued_kernel) != 0) || - (INTEGRATOR_STATE(shadow_path, queued_kernel) != 0); + return (INTEGRATOR_STATE(state, path, queued_kernel) != 0) || + (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0); }); } @@ -278,8 +278,8 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B { gpu_parallel_active_index_array( num_states, indices + indices_offset, num_indices, [](const int state) { - return (INTEGRATOR_STATE(path, queued_kernel) == 0) && - (INTEGRATOR_STATE(shadow_path, queued_kernel) == 0); + return (INTEGRATOR_STATE(state, path, queued_kernel) == 0) && + (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0); }); } @@ -289,8 +289,8 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B { gpu_parallel_sorted_index_array( num_states, indices, num_indices, key_prefix_sum, [kernel](const int state) { - return (INTEGRATOR_STATE(path, queued_kernel) == kernel) ? 
- INTEGRATOR_STATE(path, shader_sort_key) : + return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel) ? + INTEGRATOR_STATE(state, path, shader_sort_key) : GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY; }); } @@ -304,8 +304,8 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B gpu_parallel_active_index_array( num_states, indices, num_indices, [num_active_paths](const int state) { return (state >= num_active_paths) && - ((INTEGRATOR_STATE(path, queued_kernel) != 0) || - (INTEGRATOR_STATE(shadow_path, queued_kernel) != 0)); + ((INTEGRATOR_STATE(state, path, queued_kernel) != 0) || + (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0)); }); } diff --git a/intern/cycles/kernel/device/optix/globals.h b/intern/cycles/kernel/device/optix/globals.h index 7d898ed5d91..7b8ebfe50e6 100644 --- a/intern/cycles/kernel/device/optix/globals.h +++ b/intern/cycles/kernel/device/optix/globals.h @@ -27,9 +27,10 @@ CCL_NAMESPACE_BEGIN /* Not actually used, just a NULL pointer that gets passed everywhere, which we * hope gets optimized out by the compiler. */ -struct KernelGlobals { +struct KernelGlobalsGPU { int unused[1]; }; +typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals; /* Launch parameters */ struct KernelParamsOptiX { -- cgit v1.2.3 From 3065d2609700d14100490a16c91152a6e71790e8 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Sun, 17 Oct 2021 20:43:06 +0200 Subject: Cycles: optimize volume stack copying for shadow catcher/compaction Only copy the number of items used instead of the max items. Ref D12889 --- intern/cycles/kernel/device/gpu/kernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'intern/cycles/kernel/device') diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 56beaf1fd91..b5ecab2a4db 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -321,7 +321,7 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B const int from_state = active_terminated_states[active_states_offset + global_index]; const int to_state = active_terminated_states[terminated_states_offset + global_index]; - integrator_state_move(to_state, from_state); + integrator_state_move(NULL, to_state, from_state); } } -- cgit v1.2.3 From 943e73b07e26d64c04ccb7d8f656e3818a57cca0 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Sun, 17 Oct 2021 16:22:20 +0200 Subject: Cycles: decouple shadow paths from main path on GPU The motivation for this is twofold. It improves performance (5-10% on most benchmark scenes), and will help to bring back transparency support for the ambient occlusion pass. * Duplicate some members from the main path state in the shadow path state. * Add shadow paths incrementally to the array similar to what we do for the shadow catchers. * For the scheduling, allow running shade surface and shade volume kernels as long as there is enough space in the shadow paths array. If not, execute shadow kernels until it is empty. * Add IntegratorShadowState and ConstIntegratorShadowState typedefs that can be different between CPU and GPU. For GPU both main and shadow paths juse have an integer for SoA access. Bt with CPU it's a different pointer type so we get type safety checks in code shared between CPU and GPU. * For CPU, add a separate IntegratorShadowStateCPU struct embedded in IntegratorShadowState. 
* Update various functions to take the shadow state, and make SVM take either type of state using templates. Differential Revision: https://developer.blender.org/D12889 --- intern/cycles/kernel/device/cpu/kernel_arch_impl.h | 36 +++++++++++++++------- intern/cycles/kernel/device/gpu/kernel.h | 10 ++---- 2 files changed, 28 insertions(+), 18 deletions(-) (limited to 'intern/cycles/kernel/device') diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h index bf8667ac045..2b0eea4fb61 100644 --- a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h +++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h @@ -69,6 +69,18 @@ CCL_NAMESPACE_BEGIN # define KERNEL_INVOKE(name, ...) integrator_##name(__VA_ARGS__) #endif +/* TODO: Either use something like get_work_pixel(), or simplify tile which is passed here, so + * that it does not contain unused fields. */ +#define DEFINE_INTEGRATOR_INIT_KERNEL(name) \ + bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \ + IntegratorStateCPU *state, \ + KernelWorkTile *tile, \ + ccl_global float *render_buffer) \ + { \ + return KERNEL_INVOKE( \ + name, kg, state, tile, render_buffer, tile->x, tile->y, tile->start_sample); \ + } + #define DEFINE_INTEGRATOR_KERNEL(name) \ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \ IntegratorStateCPU *state) \ @@ -83,30 +95,32 @@ CCL_NAMESPACE_BEGIN KERNEL_INVOKE(name, kg, state, render_buffer); \ } -/* TODO: Either use something like get_work_pixel(), or simplify tile which is passed here, so - * that it does not contain unused fields. */ -#define DEFINE_INTEGRATOR_INIT_KERNEL(name) \ - bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \ - IntegratorStateCPU *state, \ - KernelWorkTile *tile, \ - ccl_global float *render_buffer) \ +#define DEFINE_INTEGRATOR_SHADOW_KERNEL(name) \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \ + IntegratorStateCPU *state) \ { \ - return KERNEL_INVOKE( \ - name, kg, state, tile, render_buffer, tile->x, tile->y, tile->start_sample); \ + KERNEL_INVOKE(name, kg, &state->shadow); \ + } + +#define DEFINE_INTEGRATOR_SHADOW_SHADE_KERNEL(name) \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \ + const KernelGlobalsCPU *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \ + { \ + KERNEL_INVOKE(name, kg, &state->shadow, render_buffer); \ } DEFINE_INTEGRATOR_INIT_KERNEL(init_from_camera) DEFINE_INTEGRATOR_INIT_KERNEL(init_from_bake) DEFINE_INTEGRATOR_KERNEL(intersect_closest) -DEFINE_INTEGRATOR_KERNEL(intersect_shadow) DEFINE_INTEGRATOR_KERNEL(intersect_subsurface) DEFINE_INTEGRATOR_KERNEL(intersect_volume_stack) DEFINE_INTEGRATOR_SHADE_KERNEL(shade_background) DEFINE_INTEGRATOR_SHADE_KERNEL(shade_light) -DEFINE_INTEGRATOR_SHADE_KERNEL(shade_shadow) DEFINE_INTEGRATOR_SHADE_KERNEL(shade_surface) DEFINE_INTEGRATOR_SHADE_KERNEL(shade_volume) DEFINE_INTEGRATOR_SHADE_KERNEL(megakernel) +DEFINE_INTEGRATOR_SHADOW_KERNEL(intersect_shadow) +DEFINE_INTEGRATOR_SHADOW_SHADE_KERNEL(shade_shadow) /* -------------------------------------------------------------------- * Shader evaluation. 
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index b5ecab2a4db..6b4d79ed5b7 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -265,8 +265,7 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B { gpu_parallel_active_index_array( num_states, indices, num_indices, [](const int state) { - return (INTEGRATOR_STATE(state, path, queued_kernel) != 0) || - (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0); + return (INTEGRATOR_STATE(state, path, queued_kernel) != 0); }); } @@ -278,8 +277,7 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B { gpu_parallel_active_index_array( num_states, indices + indices_offset, num_indices, [](const int state) { - return (INTEGRATOR_STATE(state, path, queued_kernel) == 0) && - (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0); + return (INTEGRATOR_STATE(state, path, queued_kernel) == 0); }); } @@ -303,9 +301,7 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B { gpu_parallel_active_index_array( num_states, indices, num_indices, [num_active_paths](const int state) { - return (state >= num_active_paths) && - ((INTEGRATOR_STATE(state, path, queued_kernel) != 0) || - (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0)); + return (state >= num_active_paths) && (INTEGRATOR_STATE(state, path, queued_kernel) != 0); }); } -- cgit v1.2.3 From d06828f0b8ebb083de59fd2cb8c5f8fe6af1da22 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Mon, 18 Oct 2021 19:20:09 +0200 Subject: Cycles: avoid intermediate stack array for writing shadow intersections Helps save one OptiX payload and is a bit more efficient. Differential Revision: https://developer.blender.org/D12909 --- intern/cycles/kernel/device/optix/kernel.cu | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'intern/cycles/kernel/device') diff --git a/intern/cycles/kernel/device/optix/kernel.cu b/intern/cycles/kernel/device/optix/kernel.cu index e97b25d31a2..574f66ab708 100644 --- a/intern/cycles/kernel/device/optix/kernel.cu +++ b/intern/cycles/kernel/device/optix/kernel.cu @@ -225,16 +225,17 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit() optixSetPayload_2(num_hits + 1); - Intersection *const isect_array = get_payload_ptr_0(); + const IntegratorShadowState state = optixGetPayload_0(); if (record_index >= max_hits) { /* If maximum number of hits reached, find a hit to replace. 
*/ - float max_recorded_t = isect_array[0].t; + float max_recorded_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, t); int max_recorded_hit = 0; for (int i = 1; i < max_hits; i++) { - if (isect_array[i].t > max_recorded_t) { - max_recorded_t = isect_array[i].t; + const float isect_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, i, t); + if (isect_t > max_recorded_t) { + max_recorded_t = isect_t; max_recorded_hit = i; } } @@ -248,13 +249,12 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit() record_index = max_recorded_hit; } - Intersection *const isect = isect_array + record_index; - isect->u = u; - isect->v = v; - isect->t = optixGetRayTmax(); - isect->prim = prim; - isect->object = object; - isect->type = type; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, u) = u; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, v) = v; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, t) = optixGetRayTmax(); + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, prim) = prim; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, object) = object; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, type) = type; optixIgnoreIntersection(); # endif /* __TRANSPARENT_SHADOWS__ */ -- cgit v1.2.3 From fd77a28031daff3122ded3a1cb37a7fb44feedf6 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Mon, 20 Sep 2021 16:16:11 +0200 Subject: Cycles: bake transparent shadows for hair These transparent shadows can be expansive to evaluate. Especially on the GPU they can lead to poor occupancy when only some pixels require many kernel launches to trace and evaluate many layers of transparency. Baked transparency allows tracing a single ray in many cases by accumulating the throughput directly in the intersection program without recording hits or evaluating shaders. Transparency is baked at curve vertices and interpolated, for most shaders this will look practically the same as actual shader evaluation. Fixes T91428, performance regression with spring demo file due to transparent hair, and makes it render significantly faster than Blender 2.93. Differential Revision: https://developer.blender.org/D12880 --- intern/cycles/kernel/device/cpu/kernel_arch.h | 5 +++ intern/cycles/kernel/device/cpu/kernel_arch_impl.h | 13 ++++++++ intern/cycles/kernel/device/gpu/kernel.h | 16 ++++++++- intern/cycles/kernel/device/optix/kernel.cu | 38 +++++++++++++++++----- 4 files changed, 63 insertions(+), 9 deletions(-) (limited to 'intern/cycles/kernel/device') diff --git a/intern/cycles/kernel/device/cpu/kernel_arch.h b/intern/cycles/kernel/device/cpu/kernel_arch.h index ae7fab65100..7a438b58e73 100644 --- a/intern/cycles/kernel/device/cpu/kernel_arch.h +++ b/intern/cycles/kernel/device/cpu/kernel_arch.h @@ -64,6 +64,11 @@ void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobalsCPU *kg, const KernelShaderEvalInput *input, float *output, const int offset); +void KERNEL_FUNCTION_FULL_NAME(shader_eval_curve_shadow_transparency)( + const KernelGlobalsCPU *kg, + const KernelShaderEvalInput *input, + float *output, + const int offset); /* -------------------------------------------------------------------- * Adaptive sampling. 
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h index 2b0eea4fb61..ac606c768db 100644 --- a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h +++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h @@ -150,6 +150,19 @@ void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobalsCPU *k #endif } +void KERNEL_FUNCTION_FULL_NAME(shader_eval_curve_shadow_transparency)( + const KernelGlobalsCPU *kg, + const KernelShaderEvalInput *input, + float *output, + const int offset) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, shader_eval_curve_shadow_transparency); +#else + kernel_curve_shadow_transparency_evaluate(kg, input, output, offset); +#endif +} + /* -------------------------------------------------------------------- * Adaptive sampling. */ diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 6b4d79ed5b7..b6df74e835a 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -621,7 +621,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } } -/* Background Shader Evaluation */ +/* Background */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) kernel_gpu_shader_eval_background(KernelShaderEvalInput *input, @@ -635,6 +635,20 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } } +/* Curve Shadow Transparency */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_shader_eval_curve_shadow_transparency(KernelShaderEvalInput *input, + float *output, + const int offset, + const int work_size) +{ + int i = ccl_gpu_global_id_x(); + if (i < work_size) { + kernel_curve_shadow_transparency_evaluate(NULL, input, output, offset + i); + } +} + /* -------------------------------------------------------------------- * Denoising. */ diff --git a/intern/cycles/kernel/device/optix/kernel.cu b/intern/cycles/kernel/device/optix/kernel.cu index 574f66ab708..a3bafb9846c 100644 --- a/intern/cycles/kernel/device/optix/kernel.cu +++ b/intern/cycles/kernel/device/optix/kernel.cu @@ -210,29 +210,50 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit() optixSetPayload_5(true); return optixTerminateRay(); # else - const int max_hits = optixGetPayload_3(); + const uint max_hits = optixGetPayload_3(); + const uint num_hits_packed = optixGetPayload_2(); + const uint num_recorded_hits = uint16_unpack_from_uint_0(num_hits_packed); + const uint num_hits = uint16_unpack_from_uint_1(num_hits_packed); /* If no transparent shadows, all light is blocked and we can stop immediately. */ - if (max_hits == 0 || + if (num_hits >= max_hits || !(intersection_get_shader_flags(NULL, prim, type) & SD_HAS_TRANSPARENT_SHADOW)) { optixSetPayload_5(true); return optixTerminateRay(); } + /* Always use baked shadow transparency for curves. */ + if (type & PRIMITIVE_ALL_CURVE) { + float throughput = __uint_as_float(optixGetPayload_1()); + throughput *= intersection_curve_shadow_transparency(nullptr, object, prim, u); + optixSetPayload_1(__float_as_uint(throughput)); + optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits, num_hits + 1)); + + if (throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) { + optixSetPayload_4(true); + return optixTerminateRay(); + } + else { + /* Continue tracing. */ + optixIgnoreIntersection(); + return; + } + } + /* Record transparent intersection. 
*/ - const int num_hits = optixGetPayload_2(); - int record_index = num_hits; + optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits + 1, num_hits + 1)); - optixSetPayload_2(num_hits + 1); + uint record_index = num_recorded_hits; const IntegratorShadowState state = optixGetPayload_0(); - if (record_index >= max_hits) { + const uint max_record_hits = min(max_hits, INTEGRATOR_SHADOW_ISECT_SIZE); + if (record_index >= max_record_hits) { /* If maximum number of hits reached, find a hit to replace. */ float max_recorded_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, t); - int max_recorded_hit = 0; + uint max_recorded_hit = 0; - for (int i = 1; i < max_hits; i++) { + for (int i = 1; i < max_record_hits; i++) { const float isect_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, i, t); if (isect_t > max_recorded_t) { max_recorded_t = isect_t; @@ -256,6 +277,7 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit() INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, object) = object; INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, type) = type; + /* Continue tracing. */ optixIgnoreIntersection(); # endif /* __TRANSPARENT_SHADOWS__ */ #endif /* __SHADOW_RECORD_ALL__ */ -- cgit v1.2.3 From cccfa597ba69944817e0913944cf3c3d0a6e1165 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Sun, 17 Oct 2021 18:08:00 +0200 Subject: Cycles: make ambient occlusion pass take into account transparency again Taking advantage of the new decoupled main and shadow paths. For CPU we just store two nested structs in the integrator state, one for direct light shadows and one for AO. For the GPU we restrict the number of shade surface states to be executed based on available space in the shadow paths queue. This also helps improve performance in benchmark scenes with an AO pass, since it is no longer needed to use the shader raytracing kernel there, which has worse performance. Differential Revision: https://developer.blender.org/D12900 --- intern/cycles/kernel/device/gpu/kernel.h | 22 +++++++++++++++++----- .../cycles/kernel/device/gpu/parallel_prefix_sum.h | 8 +++++--- .../kernel/device/gpu/parallel_sorted_index.h | 12 +++++++++++- 3 files changed, 33 insertions(+), 9 deletions(-) (limited to 'intern/cycles/kernel/device') diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index b6df74e835a..fcb398f7e6d 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -282,11 +282,22 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B } extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_sorted_paths_array( - int num_states, int *indices, int *num_indices, int *key_prefix_sum, int kernel) + kernel_gpu_integrator_sorted_paths_array(int num_states, + int num_states_limit, + int *indices, + int *num_indices, + int *key_counter, + int *key_prefix_sum, + int kernel) { gpu_parallel_sorted_index_array( - num_states, indices, num_indices, key_prefix_sum, [kernel](const int state) { + num_states, + num_states_limit, + indices, + num_indices, + key_counter, + key_prefix_sum, + [kernel](const int state) { return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel) ? 
INTEGRATOR_STATE(state, path, shader_sort_key) : GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY; @@ -322,9 +333,10 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B } extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) - kernel_gpu_prefix_sum(int *values, int num_values) + kernel_gpu_prefix_sum(int *counter, int *prefix_sum, int num_values) { - gpu_parallel_prefix_sum(values, num_values); + gpu_parallel_prefix_sum( + counter, prefix_sum, num_values); } /* -------------------------------------------------------------------- diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h index a1349e82efb..aabe6e2e27a 100644 --- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h +++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h @@ -33,7 +33,8 @@ CCL_NAMESPACE_BEGIN # define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512 #endif -template __device__ void gpu_parallel_prefix_sum(int *values, const int num_values) +template +__device__ void gpu_parallel_prefix_sum(int *counter, int *prefix_sum, const int num_values) { if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) { return; @@ -41,8 +42,9 @@ template __device__ void gpu_parallel_prefix_sum(int *values, co int offset = 0; for (int i = 0; i < num_values; i++) { - const int new_offset = offset + values[i]; - values[i] = offset; + const int new_offset = offset + counter[i]; + prefix_sum[i] = offset; + counter[i] = 0; offset = new_offset; } } diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h index 9bca1fad22f..7570c5a6bbd 100644 --- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h @@ -35,8 +35,10 @@ CCL_NAMESPACE_BEGIN template __device__ void gpu_parallel_sorted_index_array(const uint num_states, + const int num_states_limit, int *indices, int *num_indices, + int *key_counter, int *key_prefix_sum, GetKeyOp get_key_op) { @@ -46,7 +48,15 @@ __device__ void gpu_parallel_sorted_index_array(const uint num_states, if (key != GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY) { const uint index = atomic_fetch_and_add_uint32(&key_prefix_sum[key], 1); - indices[index] = state_index; + if (index < num_states_limit) { + /* Assign state index. */ + indices[index] = state_index; + } + else { + /* Can't process this state now, increase the counter again so that + * it will be handled in another iteration. */ + atomic_fetch_and_add_uint32(&key_counter[key], 1); + } } } -- cgit v1.2.3 From 7d111f4ac2b1321fc9387922c606381cadd3f3ad Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Wed, 20 Oct 2021 17:58:16 +0200 Subject: Cleanup: remove unused code --- intern/cycles/kernel/device/cpu/kernel_arch.h | 8 -------- intern/cycles/kernel/device/cpu/kernel_arch_impl.h | 19 ------------------- 2 files changed, 27 deletions(-) (limited to 'intern/cycles/kernel/device') diff --git a/intern/cycles/kernel/device/cpu/kernel_arch.h b/intern/cycles/kernel/device/cpu/kernel_arch.h index 7a438b58e73..432ac5e15a9 100644 --- a/intern/cycles/kernel/device/cpu/kernel_arch.h +++ b/intern/cycles/kernel/device/cpu/kernel_arch.h @@ -107,12 +107,4 @@ void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU * ccl_global float *render_buffer, int pixel_index); -/* -------------------------------------------------------------------- - * Bake. 
- */ -/* TODO(sergey): Needs to be re-implemented. Or not? Brecht did it already :) */ - -void KERNEL_FUNCTION_FULL_NAME(bake)( - const KernelGlobalsCPU *kg, float *buffer, int sample, int x, int y, int offset, int stride); - #undef KERNEL_ARCH diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h index ac606c768db..ba777062113 100644 --- a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h +++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h @@ -231,25 +231,6 @@ void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU * #endif } -/* -------------------------------------------------------------------- - * Bake. - */ -/* TODO(sergey): Needs to be re-implemented. Or not? Brecht did it already :) */ - -void KERNEL_FUNCTION_FULL_NAME(bake)( - const KernelGlobalsCPU *kg, float *buffer, int sample, int x, int y, int offset, int stride) -{ -#if 0 -# ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, bake); -# else -# ifdef __BAKING__ - kernel_bake_evaluate(kg, buffer, sample, x, y, offset, stride); -# endif -# endif /* KERNEL_STUB */ -#endif -} - #undef KERNEL_INVOKE #undef DEFINE_INTEGRATOR_KERNEL #undef DEFINE_INTEGRATOR_SHADE_KERNEL -- cgit v1.2.3 From df004637643241136a3294a63c7d4ca865cdea98 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Thu, 21 Oct 2021 15:14:30 +0200 Subject: Cycles: add shadow path compaction for GPU rendering Similar to main path compaction that happens before adding work tiles, this compacts shadow paths before launching kernels that may add shadow paths. Only do it when more than 50% of space is wasted. It's not a clear win in all scenes, some are up to 1.5% slower. Likely caused by different order of scheduling kernels having an unpredictable performance impact. Still feels like compaction is just the right thing to avoid cases where a few shadow paths can hold up a lot of main paths. 
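The compaction strategy itself can be shown with a short standalone C++ sketch. This is a toy model only, with an invented queued_kernel array; in the patch the index gathering runs on the GPU via the terminated_shadow_paths_array / compact_shadow_paths_array kernels and the actual copy is done by integrator_shadow_state_move. The idea: active states from the tail of the array are moved into terminated slots near the front, so the active set becomes contiguous.

#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
  /* One entry per path state slot: 0 = terminated, non-zero = queued kernel. */
  std::vector<int> queued_kernel = {3, 0, 0, 5, 0, 2, 0, 4};

  const int num_states = (int)queued_kernel.size();
  const int num_active = (int)std::count_if(
      queued_kernel.begin(), queued_kernel.end(), [](const int k) { return k != 0; });

  /* Gather terminated slots in the front region and active slots in the tail. */
  std::vector<int> terminated_front, active_tail;
  for (int i = 0; i < num_states; i++) {
    if (i < num_active && queued_kernel[i] == 0) {
      terminated_front.push_back(i);
    }
    else if (i >= num_active && queued_kernel[i] != 0) {
      active_tail.push_back(i);
    }
  }

  /* Move each active tail state into a terminated front slot, so that all
   * active states end up in the first num_active slots. */
  for (size_t i = 0; i < active_tail.size(); i++) {
    std::swap(queued_kernel[active_tail[i]], queued_kernel[terminated_front[i]]);
  }

  for (const int k : queued_kernel) {
    printf("%d ", k); /* Prints: 3 2 4 5 0 0 0 0 */
  }
  printf("\n");
  return 0;
}
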
Differential Revision: https://developer.blender.org/D12944 --- intern/cycles/kernel/device/gpu/kernel.h | 41 ++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'intern/cycles/kernel/device') diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index fcb398f7e6d..eeac09d4b29 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -281,6 +281,18 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B }); } +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_terminated_shadow_paths_array(int num_states, + int *indices, + int *num_indices, + int indices_offset) +{ + gpu_parallel_active_index_array( + num_states, indices + indices_offset, num_indices, [](const int state) { + return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0); + }); +} + extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) kernel_gpu_integrator_sorted_paths_array(int num_states, int num_states_limit, @@ -332,6 +344,35 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B } } +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_compact_shadow_paths_array(int num_states, + int *indices, + int *num_indices, + int num_active_paths) +{ + gpu_parallel_active_index_array( + num_states, indices, num_indices, [num_active_paths](const int state) { + return (state >= num_active_paths) && + (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0); + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_compact_shadow_states(const int *active_terminated_states, + const int active_states_offset, + const int terminated_states_offset, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int from_state = active_terminated_states[active_states_offset + global_index]; + const int to_state = active_terminated_states[terminated_states_offset + global_index]; + + integrator_shadow_state_move(NULL, to_state, from_state); + } +} + extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) kernel_gpu_prefix_sum(int *counter, int *prefix_sum, int num_values) { -- cgit v1.2.3 From d092933abbadb3a6d5ab53a0b2b3b865cd5c9079 Mon Sep 17 00:00:00 2001 From: Sayak Biswas Date: Thu, 21 Oct 2021 20:57:17 +0200 Subject: Cycles: various fixes for HIP and compilation of HIP binaries * Additional structs added to the hipew loader for device props * Adds hipRTC functions to the loader for future usage * Enables CPU+GPU usage for HIP * Cleanup to the adaptive kernel compilation process * Fix for kernel compilation failures with HIP with latest master Ref T92393, D12958 --- intern/cycles/kernel/device/hip/globals.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'intern/cycles/kernel/device') diff --git a/intern/cycles/kernel/device/hip/globals.h b/intern/cycles/kernel/device/hip/globals.h index 39978ae7899..28e1cc4282f 100644 --- a/intern/cycles/kernel/device/hip/globals.h +++ b/intern/cycles/kernel/device/hip/globals.h @@ -27,10 +27,10 @@ CCL_NAMESPACE_BEGIN /* Not actually used, just a NULL pointer that gets passed everywhere, which we * hope gets optimized out by the compiler. 
*/ -struct KernelGlobals { - /* NOTE: Keep the size in sync with SHADOW_STACK_MAX_HITS. */ +struct KernelGlobalsGPU { int unused[1]; }; +typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals; /* Global scene data and textures */ __constant__ KernelData __data; -- cgit v1.2.3 From 282516e53eba9bb3aaddd67b2b099fea98bd4c1f Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Thu, 21 Oct 2021 19:25:38 +0200 Subject: Cleanup: refactor float/half conversions for clarity --- intern/cycles/kernel/device/cpu/image.h | 4 ++-- intern/cycles/kernel/device/cuda/compat.h | 7 +++++++ intern/cycles/kernel/device/gpu/kernel.h | 2 +- intern/cycles/kernel/device/optix/compat.h | 7 +++++++ 4 files changed, 17 insertions(+), 3 deletions(-) (limited to 'intern/cycles/kernel/device') diff --git a/intern/cycles/kernel/device/cpu/image.h b/intern/cycles/kernel/device/cpu/image.h index 44c5d7ef065..93f956e354d 100644 --- a/intern/cycles/kernel/device/cpu/image.h +++ b/intern/cycles/kernel/device/cpu/image.h @@ -72,12 +72,12 @@ template struct TextureInterpolator { static ccl_always_inline float4 read(half4 r) { - return half4_to_float4(r); + return half4_to_float4_image(r); } static ccl_always_inline float4 read(half r) { - float f = half_to_float(r); + float f = half_to_float_image(r); return make_float4(f, f, f, 1.0f); } diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h index 685c7a5b753..8a50eb1a3d5 100644 --- a/intern/cycles/kernel/device/cuda/compat.h +++ b/intern/cycles/kernel/device/cuda/compat.h @@ -128,6 +128,13 @@ __device__ half __float2half(const float f) return val; } +__device__ float __half2float(const half h) +{ + float val; + asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h)); + return val; +} + /* Types */ #include "util/util_half.h" diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index eeac09d4b29..335cb1ec0c0 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -516,7 +516,7 @@ ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba( film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel); ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x; - float4_store_half((ccl_global half *)out, make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); + *out = float4_to_half4_display(make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); } /* Common implementation for half4 destination and 3-channel input pass. */ diff --git a/intern/cycles/kernel/device/optix/compat.h b/intern/cycles/kernel/device/optix/compat.h index c9ec9be05df..d27b7d55475 100644 --- a/intern/cycles/kernel/device/optix/compat.h +++ b/intern/cycles/kernel/device/optix/compat.h @@ -120,6 +120,13 @@ __device__ half __float2half(const float f) return val; } +__device__ float __half2float(const half h) +{ + float val; + asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h)); + return val; +} + /* Types */ #include "util/util_half.h" -- cgit v1.2.3
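The __half2float added in the last commit above is implemented with inline PTX (cvt.f32.f16) in the CUDA and OptiX compat headers, while the CPU path goes through the helpers in util_half.h referenced in the image.h and kernel.h hunks (half_to_float_image, float4_to_half4_display). As a rough sketch of what such a conversion involves, and only a sketch (it truncates instead of rounding and ignores NaNs and denormals, details a real implementation has to handle), a float can be repacked into the 1-bit sign / 5-bit exponent / 10-bit mantissa half layout like this:

#include <cstdint>
#include <cstdio>
#include <cstring>

/* Truncating float -> half-bits conversion for normal numbers. */
static uint16_t float_to_half_bits(const float f)
{
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));

  const uint32_t sign = (bits >> 16) & 0x8000u;
  const int32_t exponent = (int32_t)((bits >> 23) & 0xFFu) - 127 + 15;
  const uint32_t mantissa = bits & 0x7FFFFFu;

  if (exponent <= 0) {
    return (uint16_t)sign; /* Too small for a normal half: flush to zero. */
  }
  if (exponent >= 31) {
    return (uint16_t)(sign | 0x7C00u); /* Too large: clamp to infinity. */
  }
  return (uint16_t)(sign | ((uint32_t)exponent << 10) | (mantissa >> 13));
}

static float half_bits_to_float(const uint16_t h)
{
  const uint32_t sign = (uint32_t)(h & 0x8000u) << 16;
  const uint32_t exponent = (h >> 10) & 0x1Fu;
  const uint32_t mantissa = h & 0x3FFu;

  uint32_t bits;
  if (exponent == 0) {
    bits = sign; /* Zero (denormal halves flushed). */
  }
  else if (exponent == 31) {
    bits = sign | 0x7F800000u | (mantissa << 13); /* Infinity / NaN. */
  }
  else {
    bits = sign | ((exponent + 127 - 15) << 23) | (mantissa << 13);
  }

  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

int main()
{
  const float value = 0.3333f;
  const uint16_t h = float_to_half_bits(value);
  printf("%g -> 0x%04x -> %g\n", (double)value, h, (double)half_bits_to_float(h));
  return 0;
}
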