git.blender.org/blender.git
author     Brecht Van Lommel <brecht@blender.org>    2021-09-20 18:59:20 +0300
committer  Brecht Van Lommel <brecht@blender.org>    2021-09-21 15:55:54 +0300
commit     08031197250aeecbaca3803254e6f25b8c7b7b37 (patch)
tree       6fe7ab045f0dc0a423d6557c4073f34309ef4740 /intern/cycles/kernel/device/gpu
parent     fa6b1007bad065440950cd67deb16a04f368856f (diff)
Cycles: merge of cycles-x branch, a major update to the renderer
This includes much improved GPU rendering performance, viewport interactivity,
a new shadow catcher, revamped sampling settings, subsurface scattering
anisotropy, new GPU volume sampling, an improved PMJ sampling pattern, and more.

Some features have also been removed or changed, breaking backwards
compatibility; this includes the removal of the OpenCL backend, for which
alternatives are under development.

Release notes and code docs:
https://wiki.blender.org/wiki/Reference/Release_Notes/3.0/Cycles
https://wiki.blender.org/wiki/Source/Render/Cycles

Credits:
* Sergey Sharybin
* Brecht Van Lommel
* Patrick Mours (OptiX backend)
* Christophe Hery (subsurface scattering anisotropy)
* William Leeson (PMJ sampling pattern)
* Alaska (various fixes and tweaks)
* Thomas Dinges (various fixes)

For the full commit history, see the cycles-x branch. This squashes together
all the changes, since intermediate commits would often fail to build or pass
tests.

Ref T87839, T87837, T87836
Fixes T90734, T89353, T80267, T77185, T69800
Diffstat (limited to 'intern/cycles/kernel/device/gpu')
-rw-r--r--  intern/cycles/kernel/device/gpu/image.h                  278
-rw-r--r--  intern/cycles/kernel/device/gpu/kernel.h                 843
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_active_index.h   83
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_prefix_sum.h     46
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_reduce.h         83
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_sorted_index.h   49
6 files changed, 1382 insertions(+), 0 deletions(-)
diff --git a/intern/cycles/kernel/device/gpu/image.h b/intern/cycles/kernel/device/gpu/image.h
new file mode 100644
index 00000000000..b015c78a8f5
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/image.h
@@ -0,0 +1,278 @@
+/*
+ * Copyright 2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef WITH_NANOVDB
+# define NDEBUG /* Disable "assert" in device code */
+# define NANOVDB_USE_INTRINSICS
+# include "nanovdb/NanoVDB.h"
+# include "nanovdb/util/SampleFromVoxels.h"
+#endif
+
+/* w0, w1, w2, and w3 are the four cubic B-spline basis functions. */
+ccl_device float cubic_w0(float a)
+{
+ return (1.0f / 6.0f) * (a * (a * (-a + 3.0f) - 3.0f) + 1.0f);
+}
+ccl_device float cubic_w1(float a)
+{
+ return (1.0f / 6.0f) * (a * a * (3.0f * a - 6.0f) + 4.0f);
+}
+ccl_device float cubic_w2(float a)
+{
+ return (1.0f / 6.0f) * (a * (a * (-3.0f * a + 3.0f) + 3.0f) + 1.0f);
+}
+ccl_device float cubic_w3(float a)
+{
+ return (1.0f / 6.0f) * (a * a * a);
+}
+
+/* g0 and g1 are the two amplitude functions. */
+ccl_device float cubic_g0(float a)
+{
+ return cubic_w0(a) + cubic_w1(a);
+}
+ccl_device float cubic_g1(float a)
+{
+ return cubic_w2(a) + cubic_w3(a);
+}
+
+/* h0 and h1 are the two offset functions. */
+ccl_device float cubic_h0(float a)
+{
+ return (cubic_w1(a) / cubic_g0(a)) - 1.0f;
+}
+ccl_device float cubic_h1(float a)
+{
+ return (cubic_w3(a) / cubic_g1(a)) + 1.0f;
+}
+
+/* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. */
+template<typename T>
+ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y)
+{
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
+
+ x = (x * info.width) - 0.5f;
+ y = (y * info.height) - 0.5f;
+
+ float px = floorf(x);
+ float py = floorf(y);
+ float fx = x - px;
+ float fy = y - py;
+
+ float g0x = cubic_g0(fx);
+ float g1x = cubic_g1(fx);
+ /* Note +0.5 offset to compensate for CUDA linear filtering convention. */
+ float x0 = (px + cubic_h0(fx) + 0.5f) / info.width;
+ float x1 = (px + cubic_h1(fx) + 0.5f) / info.width;
+ float y0 = (py + cubic_h0(fy) + 0.5f) / info.height;
+ float y1 = (py + cubic_h1(fy) + 0.5f) / info.height;
+
+ return cubic_g0(fy) * (g0x * ccl_gpu_tex_object_read_2D<T>(tex, x0, y0) +
+ g1x * ccl_gpu_tex_object_read_2D<T>(tex, x1, y0)) +
+ cubic_g1(fy) * (g0x * ccl_gpu_tex_object_read_2D<T>(tex, x0, y1) +
+ g1x * ccl_gpu_tex_object_read_2D<T>(tex, x1, y1));
+}
+
+/* Fast tricubic texture lookup using 8 trilinear lookups. */
+template<typename T>
+ccl_device_noinline T
+kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z)
+{
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
+
+ x = (x * info.width) - 0.5f;
+ y = (y * info.height) - 0.5f;
+ z = (z * info.depth) - 0.5f;
+
+ float px = floorf(x);
+ float py = floorf(y);
+ float pz = floorf(z);
+ float fx = x - px;
+ float fy = y - py;
+ float fz = z - pz;
+
+ float g0x = cubic_g0(fx);
+ float g1x = cubic_g1(fx);
+ float g0y = cubic_g0(fy);
+ float g1y = cubic_g1(fy);
+ float g0z = cubic_g0(fz);
+ float g1z = cubic_g1(fz);
+
+ /* Note +0.5 offset to compensate for CUDA linear filtering convention. */
+ float x0 = (px + cubic_h0(fx) + 0.5f) / info.width;
+ float x1 = (px + cubic_h1(fx) + 0.5f) / info.width;
+ float y0 = (py + cubic_h0(fy) + 0.5f) / info.height;
+ float y1 = (py + cubic_h1(fy) + 0.5f) / info.height;
+ float z0 = (pz + cubic_h0(fz) + 0.5f) / info.depth;
+ float z1 = (pz + cubic_h1(fz) + 0.5f) / info.depth;
+
+ return g0z * (g0y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y0, z0) +
+ g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y0, z0)) +
+ g1y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y1, z0) +
+ g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y1, z0))) +
+ g1z * (g0y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y0, z1) +
+ g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y0, z1)) +
+ g1y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y1, z1) +
+ g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y1, z1)));
+}
+
+#ifdef WITH_NANOVDB
+template<typename T, typename S>
+ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, float z)
+{
+ float px = floorf(x);
+ float py = floorf(y);
+ float pz = floorf(z);
+ float fx = x - px;
+ float fy = y - py;
+ float fz = z - pz;
+
+ float g0x = cubic_g0(fx);
+ float g1x = cubic_g1(fx);
+ float g0y = cubic_g0(fy);
+ float g1y = cubic_g1(fy);
+ float g0z = cubic_g0(fz);
+ float g1z = cubic_g1(fz);
+
+ float x0 = px + cubic_h0(fx);
+ float x1 = px + cubic_h1(fx);
+ float y0 = py + cubic_h0(fy);
+ float y1 = py + cubic_h1(fy);
+ float z0 = pz + cubic_h0(fz);
+ float z1 = pz + cubic_h1(fz);
+
+ using namespace nanovdb;
+
+ return g0z * (g0y * (g0x * s(Vec3f(x0, y0, z0)) + g1x * s(Vec3f(x1, y0, z0))) +
+ g1y * (g0x * s(Vec3f(x0, y1, z0)) + g1x * s(Vec3f(x1, y1, z0)))) +
+ g1z * (g0y * (g0x * s(Vec3f(x0, y0, z1)) + g1x * s(Vec3f(x1, y0, z1))) +
+ g1y * (g0x * s(Vec3f(x0, y1, z1)) + g1x * s(Vec3f(x1, y1, z1))));
+}
+
+template<typename T>
+ccl_device_noinline T kernel_tex_image_interp_nanovdb(
+ const TextureInfo &info, float x, float y, float z, uint interpolation)
+{
+ using namespace nanovdb;
+
+ NanoGrid<T> *const grid = (NanoGrid<T> *)info.data;
+ typedef typename nanovdb::NanoGrid<T>::AccessorType AccessorType;
+ AccessorType acc = grid->getAccessor();
+
+ switch (interpolation) {
+ case INTERPOLATION_CLOSEST:
+ return SampleFromVoxels<AccessorType, 0, false>(acc)(Vec3f(x, y, z));
+ case INTERPOLATION_LINEAR:
+ return SampleFromVoxels<AccessorType, 1, false>(acc)(Vec3f(x - 0.5f, y - 0.5f, z - 0.5f));
+ default:
+ SampleFromVoxels<AccessorType, 1, false> s(acc);
+ return kernel_tex_image_interp_tricubic_nanovdb<T>(s, x - 0.5f, y - 0.5f, z - 0.5f);
+ }
+}
+#endif
+
+ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y)
+{
+ const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+
+ /* float4, byte4, ushort4 and half4 */
+ const int texture_type = info.data_type;
+ if (texture_type == IMAGE_DATA_TYPE_FLOAT4 || texture_type == IMAGE_DATA_TYPE_BYTE4 ||
+ texture_type == IMAGE_DATA_TYPE_HALF4 || texture_type == IMAGE_DATA_TYPE_USHORT4) {
+ if (info.interpolation == INTERPOLATION_CUBIC) {
+ return kernel_tex_image_interp_bicubic<float4>(info, x, y);
+ }
+ else {
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
+ return ccl_gpu_tex_object_read_2D<float4>(tex, x, y);
+ }
+ }
+ /* float, byte and half */
+ else {
+ float f;
+
+ if (info.interpolation == INTERPOLATION_CUBIC) {
+ f = kernel_tex_image_interp_bicubic<float>(info, x, y);
+ }
+ else {
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
+ f = ccl_gpu_tex_object_read_2D<float>(tex, x, y);
+ }
+
+ return make_float4(f, f, f, 1.0f);
+ }
+}
+
+ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg,
+ int id,
+ float3 P,
+ InterpolationType interp)
+{
+ const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+
+ if (info.use_transform_3d) {
+ P = transform_point(&info.transform_3d, P);
+ }
+
+ const float x = P.x;
+ const float y = P.y;
+ const float z = P.z;
+
+ uint interpolation = (interp == INTERPOLATION_NONE) ? info.interpolation : interp;
+ const int texture_type = info.data_type;
+
+#ifdef WITH_NANOVDB
+ if (texture_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT) {
+ float f = kernel_tex_image_interp_nanovdb<float>(info, x, y, z, interpolation);
+ return make_float4(f, f, f, 1.0f);
+ }
+ if (texture_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
+ nanovdb::Vec3f f = kernel_tex_image_interp_nanovdb<nanovdb::Vec3f>(
+ info, x, y, z, interpolation);
+ return make_float4(f[0], f[1], f[2], 1.0f);
+ }
+#endif
+ if (texture_type == IMAGE_DATA_TYPE_FLOAT4 || texture_type == IMAGE_DATA_TYPE_BYTE4 ||
+ texture_type == IMAGE_DATA_TYPE_HALF4 || texture_type == IMAGE_DATA_TYPE_USHORT4) {
+ if (interpolation == INTERPOLATION_CUBIC) {
+ return kernel_tex_image_interp_tricubic<float4>(info, x, y, z);
+ }
+ else {
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
+ return ccl_gpu_tex_object_read_3D<float4>(tex, x, y, z);
+ }
+ }
+ else {
+ float f;
+
+ if (interpolation == INTERPOLATION_CUBIC) {
+ f = kernel_tex_image_interp_tricubic<float>(info, x, y, z);
+ }
+ else {
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
+ f = ccl_gpu_tex_object_read_3D<float>(tex, x, y, z);
+ }
+
+ return make_float4(f, f, f, 1.0f);
+ }
+}
+
+CCL_NAMESPACE_END
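
The two-lookup trick used by kernel_tex_image_interp_bicubic above can be checked on the CPU. Below is a minimal, self-contained C++ sketch (not part of the patch) that compares a direct four-tap 1D cubic B-spline evaluation against the g0/g1-weighted pair of linear fetches at offsets h0/h1; the +0.5 texel-center offset of CUDA texture addressing is left out because the sketch indexes the array directly.

// cubic_lerp_check.cpp: verify that two linear fetches weighted by g0/g1
// reproduce the four-tap cubic B-spline sum, in 1D on plain CPU data.
#include <cmath>
#include <cstdio>
#include <vector>

static float w0(float a) { return (1.0f / 6.0f) * (a * (a * (-a + 3.0f) - 3.0f) + 1.0f); }
static float w1(float a) { return (1.0f / 6.0f) * (a * a * (3.0f * a - 6.0f) + 4.0f); }
static float w2(float a) { return (1.0f / 6.0f) * (a * (a * (-3.0f * a + 3.0f) + 3.0f) + 1.0f); }
static float w3(float a) { return (1.0f / 6.0f) * (a * a * a); }

/* Linear fetch with the same blending as hardware bilinear filtering,
 * minus the texel-center offset: lerp the two samples around x. */
static float fetch_linear(const std::vector<float> &data, float x)
{
  const int i = (int)std::floor(x);
  const float t = x - (float)i;
  return data[i] * (1.0f - t) + data[i + 1] * t;
}

int main()
{
  const std::vector<float> data = {0.0f, 1.0f, 4.0f, 9.0f, 16.0f, 25.0f};
  const float x = 2.3f; /* Sample between data[2] and data[3]. */

  const float px = std::floor(x);
  const float fx = x - px;
  const int i = (int)px;

  /* Direct evaluation: weighted sum of the four neighboring samples. */
  const float direct = w0(fx) * data[i - 1] + w1(fx) * data[i] + w2(fx) * data[i + 1] +
                       w3(fx) * data[i + 2];

  /* Two linear fetches at offsets h0/h1, weighted by the amplitudes g0/g1. */
  const float g0 = w0(fx) + w1(fx), g1 = w2(fx) + w3(fx);
  const float h0 = (w1(fx) / g0) - 1.0f, h1 = (w3(fx) / g1) + 1.0f;
  const float two_taps = g0 * fetch_linear(data, px + h0) + g1 * fetch_linear(data, px + h1);

  printf("direct: %f  two taps: %f\n", direct, two_taps);
  return std::fabs(direct - two_taps) < 1e-4f ? 0 : 1;
}
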
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
new file mode 100644
index 00000000000..7b79c0aedfa
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -0,0 +1,843 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Common GPU kernels. */
+
+#include "kernel/device/gpu/parallel_active_index.h"
+#include "kernel/device/gpu/parallel_prefix_sum.h"
+#include "kernel/device/gpu/parallel_sorted_index.h"
+
+#include "kernel/integrator/integrator_state.h"
+#include "kernel/integrator/integrator_state_flow.h"
+#include "kernel/integrator/integrator_state_util.h"
+
+#include "kernel/integrator/integrator_init_from_bake.h"
+#include "kernel/integrator/integrator_init_from_camera.h"
+#include "kernel/integrator/integrator_intersect_closest.h"
+#include "kernel/integrator/integrator_intersect_shadow.h"
+#include "kernel/integrator/integrator_intersect_subsurface.h"
+#include "kernel/integrator/integrator_intersect_volume_stack.h"
+#include "kernel/integrator/integrator_shade_background.h"
+#include "kernel/integrator/integrator_shade_light.h"
+#include "kernel/integrator/integrator_shade_shadow.h"
+#include "kernel/integrator/integrator_shade_surface.h"
+#include "kernel/integrator/integrator_shade_volume.h"
+
+#include "kernel/kernel_adaptive_sampling.h"
+#include "kernel/kernel_bake.h"
+#include "kernel/kernel_film.h"
+#include "kernel/kernel_work_stealing.h"
+
+/* --------------------------------------------------------------------
+ * Integrator.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_reset(int num_states)
+{
+ const int state = ccl_gpu_global_id_x();
+
+ if (state < num_states) {
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0;
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0;
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_init_from_camera(KernelWorkTile *tiles,
+ const int num_tiles,
+ float *render_buffer,
+ const int max_tile_work_size)
+{
+ const int work_index = ccl_gpu_global_id_x();
+
+ if (work_index >= max_tile_work_size * num_tiles) {
+ return;
+ }
+
+ const int tile_index = work_index / max_tile_work_size;
+ const int tile_work_index = work_index - tile_index * max_tile_work_size;
+
+ const KernelWorkTile *tile = &tiles[tile_index];
+
+ if (tile_work_index >= tile->work_size) {
+ return;
+ }
+
+ const int state = tile->path_index_offset + tile_work_index;
+
+ uint x, y, sample;
+ get_work_pixel(tile, tile_work_index, &x, &y, &sample);
+
+ integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample);
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_init_from_bake(KernelWorkTile *tiles,
+ const int num_tiles,
+ float *render_buffer,
+ const int max_tile_work_size)
+{
+ const int work_index = ccl_gpu_global_id_x();
+
+ if (work_index >= max_tile_work_size * num_tiles) {
+ return;
+ }
+
+ const int tile_index = work_index / max_tile_work_size;
+ const int tile_work_index = work_index - tile_index * max_tile_work_size;
+
+ const KernelWorkTile *tile = &tiles[tile_index];
+
+ if (tile_work_index >= tile->work_size) {
+ return;
+ }
+
+ const int state = tile->path_index_offset + tile_work_index;
+
+ uint x, y, sample;
+ get_work_pixel(tile, tile_work_index, &x, &y, &sample);
+
+ integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample);
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_intersect_closest(const int *path_index_array, const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_intersect_closest(NULL, state);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_intersect_shadow(const int *path_index_array, const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_intersect_shadow(NULL, state);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_intersect_subsurface(const int *path_index_array, const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_intersect_subsurface(NULL, state);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_intersect_volume_stack(const int *path_index_array, const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_intersect_volume_stack(NULL, state);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_background(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_background(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_light(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_light(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_shadow(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_shadow(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_surface(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_surface(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_surface_raytrace(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_surface_raytrace(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_volume(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_volume(NULL, state, render_buffer);
+ }
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_queued_paths_array(int num_states,
+ int *indices,
+ int *num_indices,
+ int kernel)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, [kernel](const int state) {
+ return (INTEGRATOR_STATE(path, queued_kernel) == kernel);
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_queued_shadow_paths_array(int num_states,
+ int *indices,
+ int *num_indices,
+ int kernel)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, [kernel](const int state) {
+ return (INTEGRATOR_STATE(shadow_path, queued_kernel) == kernel);
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_active_paths_array(int num_states, int *indices, int *num_indices)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, [](const int state) {
+ return (INTEGRATOR_STATE(path, queued_kernel) != 0) ||
+ (INTEGRATOR_STATE(shadow_path, queued_kernel) != 0);
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_terminated_paths_array(int num_states,
+ int *indices,
+ int *num_indices,
+ int indices_offset)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices + indices_offset, num_indices, [](const int state) {
+ return (INTEGRATOR_STATE(path, queued_kernel) == 0) &&
+ (INTEGRATOR_STATE(shadow_path, queued_kernel) == 0);
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_sorted_paths_array(
+ int num_states, int *indices, int *num_indices, int *key_prefix_sum, int kernel)
+{
+ gpu_parallel_sorted_index_array<GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, key_prefix_sum, [kernel](const int state) {
+ return (INTEGRATOR_STATE(path, queued_kernel) == kernel) ?
+ INTEGRATOR_STATE(path, shader_sort_key) :
+ GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_compact_paths_array(int num_states,
+ int *indices,
+ int *num_indices,
+ int num_active_paths)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, [num_active_paths](const int state) {
+ return (state >= num_active_paths) &&
+ ((INTEGRATOR_STATE(path, queued_kernel) != 0) ||
+ (INTEGRATOR_STATE(shadow_path, queued_kernel) != 0));
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_compact_states(const int *active_terminated_states,
+ const int active_states_offset,
+ const int terminated_states_offset,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int from_state = active_terminated_states[active_states_offset + global_index];
+ const int to_state = active_terminated_states[terminated_states_offset + global_index];
+
+ integrator_state_move(to_state, from_state);
+ }
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_prefix_sum(int *values, int num_values)
+{
+ gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(values, num_values);
+}
+
+/* --------------------------------------------------------------------
+ * Adaptive sampling.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_adaptive_sampling_convergence_check(float *render_buffer,
+ int sx,
+ int sy,
+ int sw,
+ int sh,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride,
+ uint *num_active_pixels)
+{
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / sw;
+ const int x = work_index - y * sw;
+
+ bool converged = true;
+
+ if (x < sw && y < sh) {
+ converged = kernel_adaptive_sampling_convergence_check(
+ nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride);
+ }
+
+ /* NOTE: All threads specified in the mask must execute the intrinsic. */
+ const uint num_active_pixels_mask = ccl_gpu_ballot(!converged);
+ const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
+ if (lane_id == 0) {
+ atomic_fetch_and_add_uint32(num_active_pixels, __popc(num_active_pixels_mask));
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_adaptive_sampling_filter_x(
+ float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+ const int y = ccl_gpu_global_id_x();
+
+ if (y < sh) {
+ kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_adaptive_sampling_filter_y(
+ float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+ const int x = ccl_gpu_global_id_x();
+
+ if (x < sw) {
+ kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride);
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Cryptomatte.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_cryptomatte_postprocess(float *render_buffer, int num_pixels)
+{
+ const int pixel_index = ccl_gpu_global_id_x();
+
+ if (pixel_index < num_pixels) {
+ kernel_cryptomatte_post(nullptr, render_buffer, pixel_index);
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Film.
+ */
+
+/* Common implementation for float destination. */
+template<typename Processor>
+ccl_device_inline void kernel_gpu_film_convert_common(const KernelFilmConvert *kfilm_convert,
+ float *pixels,
+ float *render_buffer,
+ int num_pixels,
+ int width,
+ int offset,
+ int stride,
+ int dst_offset,
+ int dst_stride,
+ const Processor &processor)
+{
+ const int render_pixel_index = ccl_gpu_global_id_x();
+ if (render_pixel_index >= num_pixels) {
+ return;
+ }
+
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride;
+ ccl_global const float *buffer = render_buffer + render_buffer_offset;
+ ccl_global float *pixel = pixels +
+ (render_pixel_index + dst_offset) * kfilm_convert->pixel_stride;
+
+ processor(kfilm_convert, buffer, pixel);
+}
+
+/* Common implementation for half4 destination and 4-channel input pass. */
+template<typename Processor>
+ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba(
+ const KernelFilmConvert *kfilm_convert,
+ uchar4 *rgba,
+ float *render_buffer,
+ int num_pixels,
+ int width,
+ int offset,
+ int stride,
+ int rgba_offset,
+ int rgba_stride,
+ const Processor &processor)
+{
+ const int render_pixel_index = ccl_gpu_global_id_x();
+ if (render_pixel_index >= num_pixels) {
+ return;
+ }
+
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride;
+ ccl_global const float *buffer = render_buffer + render_buffer_offset;
+
+ float pixel[4];
+ processor(kfilm_convert, buffer, pixel);
+
+ film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel);
+
+ const int x = render_pixel_index % width;
+ const int y = render_pixel_index / width;
+
+ ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x;
+ float4_store_half((ccl_global half *)out, make_float4(pixel[0], pixel[1], pixel[2], pixel[3]));
+}
+
+/* Common implementation for half4 destination and 3-channel input pass. */
+template<typename Processor>
+ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgb(
+ const KernelFilmConvert *kfilm_convert,
+ uchar4 *rgba,
+ float *render_buffer,
+ int num_pixels,
+ int width,
+ int offset,
+ int stride,
+ int rgba_offset,
+ int rgba_stride,
+ const Processor &processor)
+{
+ kernel_gpu_film_convert_half_rgba_common_rgba(
+ kfilm_convert,
+ rgba,
+ render_buffer,
+ num_pixels,
+ width,
+ offset,
+ stride,
+ rgba_offset,
+ rgba_stride,
+ [&processor](const KernelFilmConvert *kfilm_convert,
+ ccl_global const float *buffer,
+ float *pixel_rgba) {
+ processor(kfilm_convert, buffer, pixel_rgba);
+ pixel_rgba[3] = 1.0f;
+ });
+}
+
+/* Common implementation for half4 destination and single channel input pass. */
+template<typename Processor>
+ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_value(
+ const KernelFilmConvert *kfilm_convert,
+ uchar4 *rgba,
+ float *render_buffer,
+ int num_pixels,
+ int width,
+ int offset,
+ int stride,
+ int rgba_offset,
+ int rgba_stride,
+ const Processor &processor)
+{
+ kernel_gpu_film_convert_half_rgba_common_rgba(
+ kfilm_convert,
+ rgba,
+ render_buffer,
+ num_pixels,
+ width,
+ offset,
+ stride,
+ rgba_offset,
+ rgba_stride,
+ [&processor](const KernelFilmConvert *kfilm_convert,
+ ccl_global const float *buffer,
+ float *pixel_rgba) {
+ float value;
+ processor(kfilm_convert, buffer, &value);
+
+ pixel_rgba[0] = value;
+ pixel_rgba[1] = value;
+ pixel_rgba[2] = value;
+ pixel_rgba[3] = 1.0f;
+ });
+}
+
+#define KERNEL_FILM_CONVERT_PROC(name) \
+ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) name
+
+#define KERNEL_FILM_CONVERT_DEFINE(variant, channels) \
+ KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant) \
+ (const KernelFilmConvert kfilm_convert, \
+ float *pixels, \
+ float *render_buffer, \
+ int num_pixels, \
+ int width, \
+ int offset, \
+ int stride, \
+ int rgba_offset, \
+ int rgba_stride) \
+ { \
+ kernel_gpu_film_convert_common(&kfilm_convert, \
+ pixels, \
+ render_buffer, \
+ num_pixels, \
+ width, \
+ offset, \
+ stride, \
+ rgba_offset, \
+ rgba_stride, \
+ film_get_pass_pixel_##variant); \
+ } \
+ KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant##_half_rgba) \
+ (const KernelFilmConvert kfilm_convert, \
+ uchar4 *rgba, \
+ float *render_buffer, \
+ int num_pixels, \
+ int width, \
+ int offset, \
+ int stride, \
+ int rgba_offset, \
+ int rgba_stride) \
+ { \
+ kernel_gpu_film_convert_half_rgba_common_##channels(&kfilm_convert, \
+ rgba, \
+ render_buffer, \
+ num_pixels, \
+ width, \
+ offset, \
+ stride, \
+ rgba_offset, \
+ rgba_stride, \
+ film_get_pass_pixel_##variant); \
+ }
+
+KERNEL_FILM_CONVERT_DEFINE(depth, value)
+KERNEL_FILM_CONVERT_DEFINE(mist, value)
+KERNEL_FILM_CONVERT_DEFINE(sample_count, value)
+KERNEL_FILM_CONVERT_DEFINE(float, value)
+
+KERNEL_FILM_CONVERT_DEFINE(light_path, rgb)
+KERNEL_FILM_CONVERT_DEFINE(float3, rgb)
+
+KERNEL_FILM_CONVERT_DEFINE(motion, rgba)
+KERNEL_FILM_CONVERT_DEFINE(cryptomatte, rgba)
+KERNEL_FILM_CONVERT_DEFINE(shadow_catcher, rgba)
+KERNEL_FILM_CONVERT_DEFINE(shadow_catcher_matte_with_shadow, rgba)
+KERNEL_FILM_CONVERT_DEFINE(combined, rgba)
+KERNEL_FILM_CONVERT_DEFINE(float4, rgba)
+
+#undef KERNEL_FILM_CONVERT_DEFINE
+#undef KERNEL_FILM_CONVERT_HALF_RGBA_DEFINE
+#undef KERNEL_FILM_CONVERT_PROC
+
+/* --------------------------------------------------------------------
+ * Shader evaluation.
+ */
+
+/* Displacement */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_shader_eval_displace(KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset,
+ const int work_size)
+{
+ int i = ccl_gpu_global_id_x();
+ if (i < work_size) {
+ kernel_displace_evaluate(NULL, input, output, offset + i);
+ }
+}
+
+/* Background Shader Evaluation */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_shader_eval_background(KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset,
+ const int work_size)
+{
+ int i = ccl_gpu_global_id_x();
+ if (i < work_size) {
+ kernel_background_evaluate(NULL, input, output, offset + i);
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Denoising.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_filter_color_preprocess(float *render_buffer,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int offset,
+ int stride,
+ int pass_stride,
+ int pass_denoised)
+{
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / width;
+ const int x = work_index - y * width;
+
+ if (x >= width || y >= height) {
+ return;
+ }
+
+ const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
+ float *buffer = render_buffer + render_pixel_index * pass_stride;
+
+ float *color_out = buffer + pass_denoised;
+ color_out[0] = clamp(color_out[0], 0.0f, 10000.0f);
+ color_out[1] = clamp(color_out[1], 0.0f, 10000.0f);
+ color_out[2] = clamp(color_out[2], 0.0f, 10000.0f);
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_filter_guiding_preprocess(float *guiding_buffer,
+ int guiding_pass_stride,
+ int guiding_pass_albedo,
+ int guiding_pass_normal,
+ const float *render_buffer,
+ int render_offset,
+ int render_stride,
+ int render_pass_stride,
+ int render_pass_sample_count,
+ int render_pass_denoising_albedo,
+ int render_pass_denoising_normal,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int num_samples)
+{
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / width;
+ const int x = work_index - y * width;
+
+ if (x >= width || y >= height) {
+ return;
+ }
+
+ const uint64_t guiding_pixel_index = x + y * width;
+ float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+
+ const uint64_t render_pixel_index = render_offset + (x + full_x) + (y + full_y) * render_stride;
+ const float *buffer = render_buffer + render_pixel_index * render_pass_stride;
+
+ float pixel_scale;
+ if (render_pass_sample_count == PASS_UNUSED) {
+ pixel_scale = 1.0f / num_samples;
+ }
+ else {
+ pixel_scale = 1.0f / __float_as_uint(buffer[render_pass_sample_count]);
+ }
+
+ /* Albedo pass. */
+ if (guiding_pass_albedo != PASS_UNUSED) {
+ kernel_assert(render_pass_denoising_albedo != PASS_UNUSED);
+
+ const float *albedo_in = buffer + render_pass_denoising_albedo;
+ float *albedo_out = guiding_pixel + guiding_pass_albedo;
+
+ albedo_out[0] = albedo_in[0] * pixel_scale;
+ albedo_out[1] = albedo_in[1] * pixel_scale;
+ albedo_out[2] = albedo_in[2] * pixel_scale;
+ }
+
+ /* Normal pass. */
+ if (guiding_pass_normal != PASS_UNUSED) {
+ kernel_assert(render_pass_denoising_normal != PASS_UNUSED);
+
+ const float *normal_in = buffer + render_pass_denoising_normal;
+ float *normal_out = guiding_pixel + guiding_pass_normal;
+
+ normal_out[0] = normal_in[0] * pixel_scale;
+ normal_out[1] = normal_in[1] * pixel_scale;
+ normal_out[2] = normal_in[2] * pixel_scale;
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_filter_guiding_set_fake_albedo(float *guiding_buffer,
+ int guiding_pass_stride,
+ int guiding_pass_albedo,
+ int width,
+ int height)
+{
+ kernel_assert(guiding_pass_albedo != PASS_UNUSED);
+
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / width;
+ const int x = work_index - y * width;
+
+ if (x >= width || y >= height) {
+ return;
+ }
+
+ const uint64_t guiding_pixel_index = x + y * width;
+ float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+
+ float *albedo_out = guiding_pixel + guiding_pass_albedo;
+
+ albedo_out[0] = 0.5f;
+ albedo_out[1] = 0.5f;
+ albedo_out[2] = 0.5f;
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_filter_color_postprocess(float *render_buffer,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int offset,
+ int stride,
+ int pass_stride,
+ int num_samples,
+ int pass_noisy,
+ int pass_denoised,
+ int pass_sample_count,
+ int num_components,
+ bool use_compositing)
+{
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / width;
+ const int x = work_index - y * width;
+
+ if (x >= width || y >= height) {
+ return;
+ }
+
+ const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
+ float *buffer = render_buffer + render_pixel_index * pass_stride;
+
+ float pixel_scale;
+ if (pass_sample_count == PASS_UNUSED) {
+ pixel_scale = num_samples;
+ }
+ else {
+ pixel_scale = __float_as_uint(buffer[pass_sample_count]);
+ }
+
+ float *denoised_pixel = buffer + pass_denoised;
+
+ denoised_pixel[0] *= pixel_scale;
+ denoised_pixel[1] *= pixel_scale;
+ denoised_pixel[2] *= pixel_scale;
+
+ if (num_components == 3) {
+ /* Pass without alpha channel. */
+ }
+ else if (!use_compositing) {
+ /* Currently compositing passes are either 3-component (derived by dividing light passes)
+ * or do not have transparency (shadow catcher). Rely on this implicitly, as it keeps the
+ * logic simple and avoids extra memory allocation. */
+ const float *noisy_pixel = buffer + pass_noisy;
+ denoised_pixel[3] = noisy_pixel[3];
+ }
+ else {
+ /* Assigning to zero since this is a default alpha value for 3-component passes, and it
+ * is an opaque pixel for 4 component passes. */
+
+ denoised_pixel[3] = 0;
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Shadow catcher.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shadow_catcher_count_possible_splits(int num_states,
+ uint *num_possible_splits)
+{
+ const int state = ccl_gpu_global_id_x();
+
+ bool can_split = false;
+
+ if (state < num_states) {
+ can_split = kernel_shadow_catcher_path_can_split(nullptr, state);
+ }
+
+ /* NOTE: All threads specified in the mask must execute the intrinsic. */
+ const uint can_split_mask = ccl_gpu_ballot(can_split);
+ const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
+ if (lane_id == 0) {
+ atomic_fetch_and_add_uint32(num_possible_splits, __popc(can_split_mask));
+ }
+}
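
Several kernels above (the adaptive-sampling convergence check and the shadow-catcher split counter) reduce one boolean per thread into a single counter with ccl_gpu_ballot plus a population count, so only one lane per warp issues an atomic add. A rough CPU model of that pattern, with plain loops standing in for the warp intrinsics, could look like this (file and helper names are illustrative):

// warp_ballot_count.cpp: CPU model of counting set flags with one atomic per warp,
// mirroring the ballot + popc use in kernel_gpu_adaptive_sampling_convergence_check.
#include <atomic>
#include <bitset>
#include <cstdio>
#include <vector>

static void count_active(const std::vector<bool> &active, std::atomic<unsigned> &counter)
{
  const int warp_size = 32;
  for (size_t warp_start = 0; warp_start < active.size(); warp_start += warp_size) {
    /* ccl_gpu_ballot: every lane contributes one bit to a per-warp mask. */
    unsigned mask = 0;
    for (int lane = 0; lane < warp_size && warp_start + lane < active.size(); lane++) {
      mask |= active[warp_start + lane] ? (1u << lane) : 0u;
    }
    /* Only lane 0 adds the popcount of the whole warp: one atomic per 32 threads. */
    counter.fetch_add((unsigned)std::bitset<32>(mask).count());
  }
}

int main()
{
  std::vector<bool> not_converged(100);
  for (size_t i = 0; i < not_converged.size(); i++) {
    not_converged[i] = (i % 3 == 0); /* 34 "active" pixels. */
  }
  std::atomic<unsigned> num_active_pixels{0};
  count_active(not_converged, num_active_pixels);
  printf("active pixels: %u\n", num_active_pixels.load()); /* Prints 34. */
  return 0;
}
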
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
new file mode 100644
index 00000000000..85500bf4d07
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Given an array of states, build an array of indices for which the states
+ * are active.
+ *
+ * Shared memory requirement is sizeof(int) * (number_of_warps + 1) */
+
+#include "util/util_atomic.h"
+
+#define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
+
+template<uint blocksize, typename IsActiveOp>
+__device__ void gpu_parallel_active_index_array(const uint num_states,
+ int *indices,
+ int *num_indices,
+ IsActiveOp is_active_op)
+{
+ extern ccl_gpu_shared int warp_offset[];
+
+ const uint thread_index = ccl_gpu_thread_idx_x;
+ const uint thread_warp = thread_index % ccl_gpu_warp_size;
+
+ const uint warp_index = thread_index / ccl_gpu_warp_size;
+ const uint num_warps = blocksize / ccl_gpu_warp_size;
+
+ /* Test if state corresponding to this thread is active. */
+ const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index;
+ const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
+
+ /* For each thread within a warp compute how many other active states precede it. */
+ const uint thread_mask = 0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp);
+ const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & thread_mask);
+
+ /* Last thread in warp stores number of active states for each warp. */
+ if (thread_warp == ccl_gpu_warp_size - 1) {
+ warp_offset[warp_index] = thread_offset + is_active;
+ }
+
+ ccl_gpu_syncthreads();
+
+ /* Last thread in block converts per-warp sizes to offsets, increments global size of
+ * index array and gets offset to write to. */
+ if (thread_index == blocksize - 1) {
+ /* TODO: parallelize this. */
+ int offset = 0;
+ for (int i = 0; i < num_warps; i++) {
+ int num_active = warp_offset[i];
+ warp_offset[i] = offset;
+ offset += num_active;
+ }
+
+ const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
+ warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
+ }
+
+ ccl_gpu_syncthreads();
+
+ /* Write to index array. */
+ if (is_active) {
+ const uint block_offset = warp_offset[num_warps];
+ indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
+ }
+}
+
+CCL_NAMESPACE_END
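
A host-side walkthrough of this ranking scheme may help: each thread's write position within its warp is the popcount of the ballot mask restricted to earlier lanes, the last thread turns per-warp counts into offsets, and one atomic reserves the block's slice of the global index array. The sketch below models a single block with plain loops; the warp size and block size are deliberately small and illustrative.

// active_index_cpu.cpp: single-block CPU model of gpu_parallel_active_index_array.
#include <cstdio>
#include <vector>

int main()
{
  const int warp_size = 4;  /* Illustrative; real warps are 32 lanes wide. */
  const int blocksize = 12; /* 3 warps. */
  const std::vector<int> queued_kernel = {0, 2, 0, 1, 1, 0, 0, 3, 2, 0, 0, 1};

  const int num_warps = blocksize / warp_size;
  std::vector<int> warp_offset(num_warps + 1, 0);
  std::vector<int> indices;
  int num_indices = 0; /* The global counter the kernel bumps atomically. */

  /* Pass 1: per-warp active counts (what the last lane of each warp stores). */
  for (int w = 0; w < num_warps; w++) {
    for (int lane = 0; lane < warp_size; lane++) {
      warp_offset[w] += (queued_kernel[w * warp_size + lane] != 0) ? 1 : 0;
    }
  }

  /* Last thread in the block: convert counts to offsets, reserve global space. */
  int offset = 0;
  for (int w = 0; w < num_warps; w++) {
    const int num_active = warp_offset[w];
    warp_offset[w] = offset;
    offset += num_active;
  }
  warp_offset[num_warps] = num_indices; /* atomic_fetch_and_add(num_indices, offset). */
  num_indices += offset;
  indices.resize(num_indices);

  /* Pass 2: each active thread writes its state at block + warp + lane rank. */
  for (int w = 0; w < num_warps; w++) {
    int thread_offset = 0; /* popc(ballot(is_active) & mask of earlier lanes). */
    for (int lane = 0; lane < warp_size; lane++) {
      const int state = w * warp_size + lane;
      if (queued_kernel[state] != 0) {
        indices[warp_offset[num_warps] + warp_offset[w] + thread_offset] = state;
        thread_offset++;
      }
    }
  }

  for (int i = 0; i < num_indices; i++) {
    printf("%d ", indices[i]); /* Prints: 1 3 4 7 8 11 */
  }
  printf("\n");
  return 0;
}
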
diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
new file mode 100644
index 00000000000..f609520b8b4
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Parallel prefix sum.
+ *
+ * TODO: actually make this work in parallel.
+ *
+ * This is used for an array the size of the number of shaders in the scene
+ * which is not usually huge, so might not be a significant bottleneck. */
+
+#include "util/util_atomic.h"
+
+#define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
+
+template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, const int num_values)
+{
+ if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) {
+ return;
+ }
+
+ int offset = 0;
+ for (int i = 0; i < num_values; i++) {
+ const int new_offset = offset + values[i];
+ values[i] = offset;
+ offset = new_offset;
+ }
+}
+
+CCL_NAMESPACE_END
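
In the integrator this scan runs over per-shader-key state counts, so the sorted-index kernel knows where each key's bucket starts. A minimal host-side example of the same exclusive scan (plain C++, nothing Cycles-specific):

// prefix_sum_example.cpp: the exclusive scan of gpu_parallel_prefix_sum, on the host.
#include <cstdio>
#include <vector>

static void exclusive_prefix_sum(std::vector<int> &values)
{
  int offset = 0;
  for (size_t i = 0; i < values.size(); i++) {
    const int new_offset = offset + values[i];
    values[i] = offset; /* Each entry becomes the sum of everything before it. */
    offset = new_offset;
  }
}

int main()
{
  /* E.g. number of queued states per shader sort key. */
  std::vector<int> counts = {3, 1, 0, 2};
  exclusive_prefix_sum(counts);
  for (int v : counts) {
    printf("%d ", v); /* Prints: 0 3 4 4 */
  }
  printf("\n");
  return 0;
}
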
diff --git a/intern/cycles/kernel/device/gpu/parallel_reduce.h b/intern/cycles/kernel/device/gpu/parallel_reduce.h
new file mode 100644
index 00000000000..65b1990dbb8
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/parallel_reduce.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Parallel sum of array input_data with size n into output_sum.
+ *
+ * Adapted from "Optimizing Parallel Reduction in GPU", Mark Harris.
+ *
+ * This version adds multiple elements per thread sequentially. This reduces
+ * the overall cost of the algorithm while keeping the work complexity O(n) and
+ * the step complexity O(log n). (Brent's Theorem optimization) */
+
+#define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 512
+
+template<uint blocksize, typename InputT, typename OutputT, typename ConvertOp>
+__device__ void gpu_parallel_sum(
+ const InputT *input_data, const uint n, OutputT *output_sum, OutputT zero, ConvertOp convert)
+{
+ extern ccl_gpu_shared OutputT shared_data[];
+
+ const uint tid = ccl_gpu_thread_idx_x;
+ const uint gridsize = blocksize * ccl_gpu_grid_dim_x();
+
+ OutputT sum = zero;
+ for (uint i = ccl_gpu_block_idx_x * blocksize + tid; i < n; i += gridsize) {
+ sum += convert(input_data[i]);
+ }
+ shared_data[tid] = sum;
+
+ ccl_gpu_syncthreads();
+
+ if (blocksize >= 512 && tid < 256) {
+ shared_data[tid] = sum = sum + shared_data[tid + 256];
+ }
+
+ ccl_gpu_syncthreads();
+
+ if (blocksize >= 256 && tid < 128) {
+ shared_data[tid] = sum = sum + shared_data[tid + 128];
+ }
+
+ ccl_gpu_syncthreads();
+
+ if (blocksize >= 128 && tid < 64) {
+ shared_data[tid] = sum = sum + shared_data[tid + 64];
+ }
+
+ ccl_gpu_syncthreads();
+
+ if (blocksize >= 64 && tid < 32) {
+ shared_data[tid] = sum = sum + shared_data[tid + 32];
+ }
+
+ ccl_gpu_syncthreads();
+
+ if (tid < 32) {
+ for (int offset = ccl_gpu_warp_size / 2; offset > 0; offset /= 2) {
+ sum += ccl_shfl_down_sync(0xFFFFFFFF, sum, offset);
+ }
+ }
+
+ if (tid == 0) {
+ output_sum[ccl_gpu_block_idx_x] = sum;
+ }
+}
+
+CCL_NAMESPACE_END
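
A compact CPU model of the reduction above, with a strided serial accumulation per "thread" followed by a tree combine, shows the shape of the algorithm without shared memory or warp shuffles (block size and input are illustrative):

// parallel_sum_cpu.cpp: CPU model of gpu_parallel_sum's two phases:
// each thread first sums a strided subset, then the block tree-reduces.
#include <cstdio>
#include <vector>

int main()
{
  const int blocksize = 8; /* Illustrative; the kernel default is 512. */
  std::vector<float> input(1000);
  for (size_t i = 0; i < input.size(); i++) {
    input[i] = 1.0f; /* Expected total: 1000. */
  }

  /* Phase 1: sequential, strided accumulation (the Brent's theorem part). */
  std::vector<float> shared_data(blocksize, 0.0f);
  for (int tid = 0; tid < blocksize; tid++) {
    for (size_t i = tid; i < input.size(); i += blocksize) {
      shared_data[tid] += input[i];
    }
  }

  /* Phase 2: tree reduction, halving the number of active "threads" each step. */
  for (int active = blocksize / 2; active > 0; active /= 2) {
    for (int tid = 0; tid < active; tid++) {
      shared_data[tid] += shared_data[tid + active];
    }
  }

  printf("sum: %f\n", shared_data[0]); /* Prints 1000.000000 */
  return 0;
}
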
diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
new file mode 100644
index 00000000000..99b35468517
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Given an array of states, build an array of indices for which the states
+ * are active and sorted by a given key. The prefix sum of the number of active
+ * states per key must have already been computed.
+ *
+ * TODO: there may be ways to optimize this to avoid this many atomic ops? */
+
+#include "util/util_atomic.h"
+
+#define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
+#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
+
+template<uint blocksize, typename GetKeyOp>
+__device__ void gpu_parallel_sorted_index_array(const uint num_states,
+ int *indices,
+ int *num_indices,
+ int *key_prefix_sum,
+ GetKeyOp get_key_op)
+{
+ const uint state_index = ccl_gpu_block_idx_x * blocksize + ccl_gpu_thread_idx_x;
+ const int key = (state_index < num_states) ? get_key_op(state_index) :
+ GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
+
+ if (key != GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY) {
+ const uint index = atomic_fetch_and_add_uint32(&key_prefix_sum[key], 1);
+ indices[index] = state_index;
+ }
+}
+
+CCL_NAMESPACE_END
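
Putting the last two headers together: the prefix sum turns per-key counts into bucket start offsets, and this kernel scatters every active state into its key's bucket with one atomic increment, which is effectively a counting sort. A single-threaded host model of that scatter (illustrative keys; a plain int stands in for the atomic counter):

// sorted_index_cpu.cpp: CPU model of gpu_parallel_sorted_index_array, fed by
// the exclusive prefix sum of per-key counts (see parallel_prefix_sum.h).
#include <cstdio>
#include <vector>

#define INACTIVE_KEY (~0)

int main()
{
  /* Shader sort key per state; ~0 marks states not queued for this kernel. */
  const std::vector<int> keys = {1, 0, INACTIVE_KEY, 1, 2, 0, INACTIVE_KEY, 1};

  /* Exclusive prefix sum of the per-key counts {2, 3, 1} -> {0, 2, 5}. */
  std::vector<int> key_prefix_sum = {0, 2, 5};

  std::vector<int> indices(6); /* Six states have a valid key above. */
  for (size_t state = 0; state < keys.size(); state++) {
    const int key = keys[state];
    if (key != INACTIVE_KEY) {
      /* atomic_fetch_and_add_uint32(&key_prefix_sum[key], 1) in the kernel. */
      const int index = key_prefix_sum[key]++;
      indices[index] = (int)state;
    }
  }

  /* States grouped by key: key 0 first, then key 1, then key 2. */
  for (int state : indices) {
    printf("%d ", state); /* Prints: 1 5 0 3 7 4 */
  }
  printf("\n");
  return 0;
}
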