git.blender.org/blender.git
Diffstat (limited to 'intern/cycles/kernel/film')
-rw-r--r--  intern/cycles/kernel/film/film_accumulate.h         553
-rw-r--r--  intern/cycles/kernel/film/film_adaptive_sampling.h   160
-rw-r--r--  intern/cycles/kernel/film/film_id_passes.h           106
-rw-r--r--  intern/cycles/kernel/film/film_passes.h              334
-rw-r--r--  intern/cycles/kernel/film/film_read.h                532
-rw-r--r--  intern/cycles/kernel/film/film_write_passes.h         88
6 files changed, 1773 insertions(+), 0 deletions(-)
diff --git a/intern/cycles/kernel/film/film_accumulate.h b/intern/cycles/kernel/film/film_accumulate.h
new file mode 100644
index 00000000000..914e165a9cd
--- /dev/null
+++ b/intern/cycles/kernel/film/film_accumulate.h
@@ -0,0 +1,553 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/film/film_adaptive_sampling.h"
+#include "kernel/film/film_write_passes.h"
+
+#include "kernel/integrator/integrator_shadow_catcher.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * BSDF Evaluation
+ *
+ * BSDF evaluation result, split between diffuse and glossy. This is used to
+ * accumulate render passes separately. Note that reflection, transmission
+ * and volume scattering are written to different render passes, but we assume
+ * that only one of those can happen at a bounce, and so do not need to accumulate
+ * them separately. */
+
+ccl_device_inline void bsdf_eval_init(ccl_private BsdfEval *eval,
+ const bool is_diffuse,
+ float3 value)
+{
+ eval->diffuse = zero_float3();
+ eval->glossy = zero_float3();
+
+ if (is_diffuse) {
+ eval->diffuse = value;
+ }
+ else {
+ eval->glossy = value;
+ }
+}
+
+ccl_device_inline void bsdf_eval_accum(ccl_private BsdfEval *eval,
+ const bool is_diffuse,
+ float3 value,
+ float mis_weight)
+{
+ value *= mis_weight;
+
+ if (is_diffuse) {
+ eval->diffuse += value;
+ }
+ else {
+ eval->glossy += value;
+ }
+}
+
+ccl_device_inline bool bsdf_eval_is_zero(ccl_private BsdfEval *eval)
+{
+ return is_zero(eval->diffuse) && is_zero(eval->glossy);
+}
+
+ccl_device_inline void bsdf_eval_mul(ccl_private BsdfEval *eval, float value)
+{
+ eval->diffuse *= value;
+ eval->glossy *= value;
+}
+
+ccl_device_inline void bsdf_eval_mul3(ccl_private BsdfEval *eval, float3 value)
+{
+ eval->diffuse *= value;
+ eval->glossy *= value;
+}
+
+ccl_device_inline float3 bsdf_eval_sum(ccl_private const BsdfEval *eval)
+{
+ return eval->diffuse + eval->glossy;
+}
+
+ccl_device_inline float3 bsdf_eval_diffuse_glossy_ratio(ccl_private const BsdfEval *eval)
+{
+ /* Ratio of diffuse and glossy to recover proportions for writing to render pass.
+ * We assume reflection, transmission and volume scatter to be exclusive. */
+ return safe_divide_float3_float3(eval->diffuse, eval->diffuse + eval->glossy);
+}
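
As an aside (not part of the patch above): the stored diffuse/glossy ratio exists so that a later
pass write, which only knows the total contribution, can reconstruct the split. A minimal,
self-contained C++ sketch of that reconstruction, with plain floats standing in for float3 and all
names illustrative:

#include <cstdio>

int main()
{
  /* A hypothetical single-channel BSDF evaluation split into diffuse and glossy parts,
   * mirroring BsdfEval. */
  const float diffuse = 0.6f;
  const float glossy = 0.2f;

  /* Ratio kept with the path, as in bsdf_eval_diffuse_glossy_ratio(). */
  const float ratio = diffuse / (diffuse + glossy); /* 0.75 */

  /* Later only the total contribution is known; reconstruct the split from the ratio,
   * as done when writing the diffuse and glossy passes. */
  const float contribution = 0.4f;
  const float diffuse_part = ratio * contribution;         /* written to the diffuse pass */
  const float glossy_part = (1.0f - ratio) * contribution; /* written to the glossy pass */

  printf("diffuse %.3f, glossy %.3f\n", diffuse_part, glossy_part);
  return 0;
}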
+
+/* --------------------------------------------------------------------
+ * Clamping
+ *
+ * Clamping is done on a per-contribution basis so that we can write directly
+ * to render buffers instead of using per-thread memory, and to avoid the
+ * impact of clamping on other contributions. */
+
+ccl_device_forceinline void kernel_accum_clamp(KernelGlobals kg, ccl_private float3 *L, int bounce)
+{
+#ifdef __KERNEL_DEBUG_NAN__
+ if (!isfinite3_safe(*L)) {
+ kernel_assert(!"Cycles sample with non-finite value detected");
+ }
+#endif
+  /* Make sure all components are finite, so that the contribution is usable by the adaptive
+   * sampling convergence check, but also so that the render result never causes issues with
+   * post-processing. */
+ *L = ensure_finite3(*L);
+
+#ifdef __CLAMP_SAMPLE__
+ float limit = (bounce > 0) ? kernel_data.integrator.sample_clamp_indirect :
+ kernel_data.integrator.sample_clamp_direct;
+ float sum = reduce_add(fabs(*L));
+ if (sum > limit) {
+ *L *= limit / sum;
+ }
+#endif
+}
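
To make the clamp above concrete (a worked example, not part of the patch): with the clamp limit
set to 10 and a contribution $L = (6, 5, 4)$,

\[ \mathrm{sum} = |6| + |5| + |4| = 15 > 10 \quad\Rightarrow\quad L \leftarrow L \cdot \tfrac{10}{15} = (4,\ 10/3,\ 8/3), \]

so the color of the sample is preserved while its total energy is limited.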
+
+/* --------------------------------------------------------------------
+ * Pass accumulation utilities.
+ */
+
+/* Get pointer to pixel in render buffer. */
+ccl_device_forceinline ccl_global float *kernel_accum_pixel_render_buffer(
+ KernelGlobals kg, ConstIntegratorState state, ccl_global float *ccl_restrict render_buffer)
+{
+ const uint32_t render_pixel_index = INTEGRATOR_STATE(state, path, render_pixel_index);
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+ kernel_data.film.pass_stride;
+ return render_buffer + render_buffer_offset;
+}
+
+/* --------------------------------------------------------------------
+ * Adaptive sampling.
+ */
+
+ccl_device_inline int kernel_accum_sample(KernelGlobals kg,
+ ConstIntegratorState state,
+ ccl_global float *ccl_restrict render_buffer,
+ int sample)
+{
+ if (kernel_data.film.pass_sample_count == PASS_UNUSED) {
+ return sample;
+ }
+
+ ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer);
+
+ return atomic_fetch_and_add_uint32((uint *)(buffer) + kernel_data.film.pass_sample_count, 1);
+}
+
+ccl_device void kernel_accum_adaptive_buffer(KernelGlobals kg,
+ const int sample,
+ const float3 contribution,
+ ccl_global float *ccl_restrict buffer)
+{
+ /* Adaptive Sampling. Fill the additional buffer with the odd samples and calculate our stopping
+ * criteria. This is the heuristic from "A hierarchical automatic stopping condition for Monte
+ * Carlo global illumination" except that here it is applied per pixel and not in hierarchical
+ * tiles. */
+
+ if (kernel_data.film.pass_adaptive_aux_buffer == PASS_UNUSED) {
+ return;
+ }
+
+ if (sample_is_even(kernel_data.integrator.sampling_pattern, sample)) {
+ kernel_write_pass_float4(
+ buffer + kernel_data.film.pass_adaptive_aux_buffer,
+ make_float4(contribution.x * 2.0f, contribution.y * 2.0f, contribution.z * 2.0f, 0.0f));
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Shadow catcher.
+ */
+
+#ifdef __SHADOW_CATCHER__
+
+/* Accumulate contribution to the Shadow Catcher pass.
+ *
+ * Returns true if the contribution is fully handled here and is not to be added to the other
+ * passes (like combined, adaptive sampling). */
+
+ccl_device bool kernel_accum_shadow_catcher(KernelGlobals kg,
+ const uint32_t path_flag,
+ const float3 contribution,
+ ccl_global float *ccl_restrict buffer)
+{
+ if (!kernel_data.integrator.has_shadow_catcher) {
+ return false;
+ }
+
+ kernel_assert(kernel_data.film.pass_shadow_catcher != PASS_UNUSED);
+ kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
+
+ /* Matte pass. */
+ if (kernel_shadow_catcher_is_matte_path(path_flag)) {
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow_catcher_matte, contribution);
+    /* NOTE: Accumulate to the combined pass and to the sample count pass, so that adaptive
+     * sampling is based on how noisy the combined pass is, as if there were no catchers in the
+     * scene. */
+ }
+
+ /* Shadow catcher pass. */
+ if (kernel_shadow_catcher_is_object_pass(path_flag)) {
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow_catcher, contribution);
+ return true;
+ }
+
+ return false;
+}
+
+ccl_device bool kernel_accum_shadow_catcher_transparent(KernelGlobals kg,
+ const uint32_t path_flag,
+ const float3 contribution,
+ const float transparent,
+ ccl_global float *ccl_restrict buffer)
+{
+ if (!kernel_data.integrator.has_shadow_catcher) {
+ return false;
+ }
+
+ kernel_assert(kernel_data.film.pass_shadow_catcher != PASS_UNUSED);
+ kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
+
+ if (path_flag & PATH_RAY_SHADOW_CATCHER_BACKGROUND) {
+ return true;
+ }
+
+ /* Matte pass. */
+ if (kernel_shadow_catcher_is_matte_path(path_flag)) {
+ kernel_write_pass_float4(
+ buffer + kernel_data.film.pass_shadow_catcher_matte,
+ make_float4(contribution.x, contribution.y, contribution.z, transparent));
+    /* NOTE: Accumulate to the combined pass and to the sample count pass, so that adaptive
+     * sampling is based on how noisy the combined pass is, as if there were no catchers in the
+     * scene. */
+ }
+
+ /* Shadow catcher pass. */
+ if (kernel_shadow_catcher_is_object_pass(path_flag)) {
+    /* NOTE: The transparency of the shadow catcher pass is ignored. It is not needed for the
+     * calculation, and the alpha channel of the pass contains the number of samples contributed
+     * to a pixel of the pass. */
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow_catcher, contribution);
+ return true;
+ }
+
+ return false;
+}
+
+ccl_device void kernel_accum_shadow_catcher_transparent_only(KernelGlobals kg,
+ const uint32_t path_flag,
+ const float transparent,
+ ccl_global float *ccl_restrict buffer)
+{
+ if (!kernel_data.integrator.has_shadow_catcher) {
+ return;
+ }
+
+ kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
+
+ /* Matte pass. */
+ if (kernel_shadow_catcher_is_matte_path(path_flag)) {
+ kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_matte + 3, transparent);
+ }
+}
+
+#endif /* __SHADOW_CATCHER__ */
+
+/* --------------------------------------------------------------------
+ * Render passes.
+ */
+
+/* Write combined pass. */
+ccl_device_inline void kernel_accum_combined_pass(KernelGlobals kg,
+ const uint32_t path_flag,
+ const int sample,
+ const float3 contribution,
+ ccl_global float *ccl_restrict buffer)
+{
+#ifdef __SHADOW_CATCHER__
+ if (kernel_accum_shadow_catcher(kg, path_flag, contribution, buffer)) {
+ return;
+ }
+#endif
+
+ if (kernel_data.film.light_pass_flag & PASSMASK(COMBINED)) {
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_combined, contribution);
+ }
+
+ kernel_accum_adaptive_buffer(kg, sample, contribution, buffer);
+}
+
+/* Write combined pass with transparency. */
+ccl_device_inline void kernel_accum_combined_transparent_pass(KernelGlobals kg,
+ const uint32_t path_flag,
+ const int sample,
+ const float3 contribution,
+ const float transparent,
+ ccl_global float *ccl_restrict
+ buffer)
+{
+#ifdef __SHADOW_CATCHER__
+ if (kernel_accum_shadow_catcher_transparent(kg, path_flag, contribution, transparent, buffer)) {
+ return;
+ }
+#endif
+
+ if (kernel_data.film.light_pass_flag & PASSMASK(COMBINED)) {
+ kernel_write_pass_float4(
+ buffer + kernel_data.film.pass_combined,
+ make_float4(contribution.x, contribution.y, contribution.z, transparent));
+ }
+
+ kernel_accum_adaptive_buffer(kg, sample, contribution, buffer);
+}
+
+/* Write background or emission to appropriate pass. */
+ccl_device_inline void kernel_accum_emission_or_background_pass(KernelGlobals kg,
+ ConstIntegratorState state,
+ float3 contribution,
+ ccl_global float *ccl_restrict
+ buffer,
+ const int pass)
+{
+ if (!(kernel_data.film.light_pass_flag & PASS_ANY)) {
+ return;
+ }
+
+#ifdef __PASSES__
+ const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
+ int pass_offset = PASS_UNUSED;
+
+ /* Denoising albedo. */
+# ifdef __DENOISING_FEATURES__
+ if (path_flag & PATH_RAY_DENOISING_FEATURES) {
+ if (kernel_data.film.pass_denoising_albedo != PASS_UNUSED) {
+ const float3 denoising_feature_throughput = INTEGRATOR_STATE(
+ state, path, denoising_feature_throughput);
+ const float3 denoising_albedo = denoising_feature_throughput * contribution;
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_albedo, denoising_albedo);
+ }
+ }
+# endif /* __DENOISING_FEATURES__ */
+
+ if (!(path_flag & PATH_RAY_ANY_PASS)) {
+ /* Directly visible, write to emission or background pass. */
+ pass_offset = pass;
+ }
+ else if (path_flag & (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS)) {
+    /* Indirectly visible through reflection or transmission. */
+ const int glossy_pass_offset = (path_flag & PATH_RAY_REFLECT_PASS) ?
+ ((INTEGRATOR_STATE(state, path, bounce) == 1) ?
+ kernel_data.film.pass_glossy_direct :
+ kernel_data.film.pass_glossy_indirect) :
+ ((INTEGRATOR_STATE(state, path, bounce) == 1) ?
+ kernel_data.film.pass_transmission_direct :
+ kernel_data.film.pass_transmission_indirect);
+
+ if (glossy_pass_offset != PASS_UNUSED) {
+ /* Glossy is a subset of the throughput, reconstruct it here using the
+ * diffuse-glossy ratio. */
+ const float3 ratio = INTEGRATOR_STATE(state, path, diffuse_glossy_ratio);
+ const float3 glossy_contribution = (one_float3() - ratio) * contribution;
+ kernel_write_pass_float3(buffer + glossy_pass_offset, glossy_contribution);
+ }
+
+ /* Reconstruct diffuse subset of throughput. */
+ pass_offset = (INTEGRATOR_STATE(state, path, bounce) == 1) ?
+ kernel_data.film.pass_diffuse_direct :
+ kernel_data.film.pass_diffuse_indirect;
+ if (pass_offset != PASS_UNUSED) {
+ contribution *= INTEGRATOR_STATE(state, path, diffuse_glossy_ratio);
+ }
+ }
+ else if (path_flag & PATH_RAY_VOLUME_PASS) {
+ /* Indirectly visible through volume. */
+ pass_offset = (INTEGRATOR_STATE(state, path, bounce) == 1) ?
+ kernel_data.film.pass_volume_direct :
+ kernel_data.film.pass_volume_indirect;
+ }
+
+ /* Single write call for GPU coherence. */
+ if (pass_offset != PASS_UNUSED) {
+ kernel_write_pass_float3(buffer + pass_offset, contribution);
+ }
+#endif /* __PASSES__ */
+}
+
+/* Write light contribution to render buffer. */
+ccl_device_inline void kernel_accum_light(KernelGlobals kg,
+ ConstIntegratorShadowState state,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ /* The throughput for shadow paths already contains the light shader evaluation. */
+ float3 contribution = INTEGRATOR_STATE(state, shadow_path, throughput);
+ kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(state, shadow_path, bounce));
+
+ const uint32_t render_pixel_index = INTEGRATOR_STATE(state, shadow_path, render_pixel_index);
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+ kernel_data.film.pass_stride;
+ ccl_global float *buffer = render_buffer + render_buffer_offset;
+
+ const uint32_t path_flag = INTEGRATOR_STATE(state, shadow_path, flag);
+ const int sample = INTEGRATOR_STATE(state, shadow_path, sample);
+
+ /* Ambient occlusion. */
+ if (path_flag & PATH_RAY_SHADOW_FOR_AO) {
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, contribution);
+ return;
+ }
+
+ /* Direct light shadow. */
+ kernel_accum_combined_pass(kg, path_flag, sample, contribution, buffer);
+
+#ifdef __PASSES__
+ if (kernel_data.film.light_pass_flag & PASS_ANY) {
+ const uint32_t path_flag = INTEGRATOR_STATE(state, shadow_path, flag);
+ int pass_offset = PASS_UNUSED;
+
+ if (path_flag & (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS)) {
+      /* Indirectly visible through reflection or transmission. */
+ const int glossy_pass_offset = (path_flag & PATH_RAY_REFLECT_PASS) ?
+ ((INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
+ kernel_data.film.pass_glossy_direct :
+ kernel_data.film.pass_glossy_indirect) :
+ ((INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
+ kernel_data.film.pass_transmission_direct :
+ kernel_data.film.pass_transmission_indirect);
+
+ if (glossy_pass_offset != PASS_UNUSED) {
+ /* Glossy is a subset of the throughput, reconstruct it here using the
+ * diffuse-glossy ratio. */
+ const float3 ratio = INTEGRATOR_STATE(state, shadow_path, diffuse_glossy_ratio);
+ const float3 glossy_contribution = (one_float3() - ratio) * contribution;
+ kernel_write_pass_float3(buffer + glossy_pass_offset, glossy_contribution);
+ }
+
+ /* Reconstruct diffuse subset of throughput. */
+ pass_offset = (INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
+ kernel_data.film.pass_diffuse_direct :
+ kernel_data.film.pass_diffuse_indirect;
+ if (pass_offset != PASS_UNUSED) {
+ contribution *= INTEGRATOR_STATE(state, shadow_path, diffuse_glossy_ratio);
+ }
+ }
+ else if (path_flag & PATH_RAY_VOLUME_PASS) {
+ /* Indirectly visible through volume. */
+ pass_offset = (INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
+ kernel_data.film.pass_volume_direct :
+ kernel_data.film.pass_volume_indirect;
+ }
+
+ /* Single write call for GPU coherence. */
+ if (pass_offset != PASS_UNUSED) {
+ kernel_write_pass_float3(buffer + pass_offset, contribution);
+ }
+
+ /* Write shadow pass. */
+ if (kernel_data.film.pass_shadow != PASS_UNUSED && (path_flag & PATH_RAY_SHADOW_FOR_LIGHT) &&
+ (path_flag & PATH_RAY_CAMERA)) {
+ const float3 unshadowed_throughput = INTEGRATOR_STATE(
+ state, shadow_path, unshadowed_throughput);
+ const float3 shadowed_throughput = INTEGRATOR_STATE(state, shadow_path, throughput);
+ const float3 shadow = safe_divide_float3_float3(shadowed_throughput, unshadowed_throughput) *
+ kernel_data.film.pass_shadow_scale;
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow, shadow);
+ }
+ }
+#endif
+}
+
+/* Write transparency to render buffer.
+ *
+ * Note that we accumulate transparency = 1 - alpha in the render buffer.
+ * Otherwise we'd have to write alpha on path termination, which happens
+ * in many places. */
+ccl_device_inline void kernel_accum_transparent(KernelGlobals kg,
+ ConstIntegratorState state,
+ const uint32_t path_flag,
+ const float transparent,
+ ccl_global float *ccl_restrict buffer)
+{
+ if (kernel_data.film.light_pass_flag & PASSMASK(COMBINED)) {
+ kernel_write_pass_float(buffer + kernel_data.film.pass_combined + 3, transparent);
+ }
+
+ kernel_accum_shadow_catcher_transparent_only(kg, path_flag, transparent, buffer);
+}
+
+/* Write holdout to render buffer. */
+ccl_device_inline void kernel_accum_holdout(KernelGlobals kg,
+ ConstIntegratorState state,
+ const uint32_t path_flag,
+ const float transparent,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer);
+ kernel_accum_transparent(kg, state, path_flag, transparent, buffer);
+}
+
+/* Write background contribution to render buffer.
+ *
+ * Includes transparency, matching kernel_accum_transparent. */
+ccl_device_inline void kernel_accum_background(KernelGlobals kg,
+ ConstIntegratorState state,
+ const float3 L,
+ const float transparent,
+ const bool is_transparent_background_ray,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ float3 contribution = INTEGRATOR_STATE(state, path, throughput) * L;
+ kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(state, path, bounce) - 1);
+
+ ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer);
+ const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
+
+ if (is_transparent_background_ray) {
+ kernel_accum_transparent(kg, state, path_flag, transparent, buffer);
+ }
+ else {
+ const int sample = INTEGRATOR_STATE(state, path, sample);
+ kernel_accum_combined_transparent_pass(
+ kg, path_flag, sample, contribution, transparent, buffer);
+ }
+ kernel_accum_emission_or_background_pass(
+ kg, state, contribution, buffer, kernel_data.film.pass_background);
+}
+
+/* Write emission to render buffer. */
+ccl_device_inline void kernel_accum_emission(KernelGlobals kg,
+ ConstIntegratorState state,
+ const float3 throughput,
+ const float3 L,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ float3 contribution = throughput * L;
+ kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(state, path, bounce) - 1);
+
+ ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer);
+ const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
+ const int sample = INTEGRATOR_STATE(state, path, sample);
+
+ kernel_accum_combined_pass(kg, path_flag, sample, contribution, buffer);
+ kernel_accum_emission_or_background_pass(
+ kg, state, contribution, buffer, kernel_data.film.pass_emission);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/film/film_adaptive_sampling.h b/intern/cycles/kernel/film/film_adaptive_sampling.h
new file mode 100644
index 00000000000..c78b5f6b707
--- /dev/null
+++ b/intern/cycles/kernel/film/film_adaptive_sampling.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/film/film_write_passes.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Check whether the pixel has converged and should not be sampled anymore. */
+
+ccl_device_forceinline bool kernel_need_sample_pixel(KernelGlobals kg,
+ ConstIntegratorState state,
+ ccl_global float *render_buffer)
+{
+ if (kernel_data.film.pass_adaptive_aux_buffer == PASS_UNUSED) {
+ return true;
+ }
+
+ const uint32_t render_pixel_index = INTEGRATOR_STATE(state, path, render_pixel_index);
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+ kernel_data.film.pass_stride;
+ ccl_global float *buffer = render_buffer + render_buffer_offset;
+
+ const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;
+ return buffer[aux_w_offset] == 0.0f;
+}
+
+/* Determines whether to continue sampling a given pixel or if it has sufficiently converged. */
+
+ccl_device bool kernel_adaptive_sampling_convergence_check(KernelGlobals kg,
+ ccl_global float *render_buffer,
+ int x,
+ int y,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride)
+{
+ kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED);
+ kernel_assert(kernel_data.film.pass_sample_count != PASS_UNUSED);
+
+ const int render_pixel_index = offset + x + y * stride;
+ ccl_global float *buffer = render_buffer +
+ (uint64_t)render_pixel_index * kernel_data.film.pass_stride;
+
+ /* TODO(Stefan): Is this better in linear, sRGB or something else? */
+
+ const float4 A = kernel_read_pass_float4(buffer + kernel_data.film.pass_adaptive_aux_buffer);
+ if (!reset && A.w != 0.0f) {
+ /* If the pixel was considered converged, its state will not change in this kernel. Early
+ * output before doing any math.
+ *
+ * TODO(sergey): On a GPU it might be better to keep thread alive for better coherency? */
+ return true;
+ }
+
+ const float4 I = kernel_read_pass_float4(buffer + kernel_data.film.pass_combined);
+
+ const float sample = __float_as_uint(buffer[kernel_data.film.pass_sample_count]);
+ const float inv_sample = 1.0f / sample;
+
+ /* The per pixel error as seen in section 2.1 of
+ * "A hierarchical automatic stopping condition for Monte Carlo global illumination" */
+ const float error_difference = (fabsf(I.x - A.x) + fabsf(I.y - A.y) + fabsf(I.z - A.z)) *
+ inv_sample;
+ const float error_normalize = sqrtf((I.x + I.y + I.z) * inv_sample);
+ /* A small epsilon is added to the divisor to prevent division by zero. */
+ const float error = error_difference / (0.0001f + error_normalize);
+ const bool did_converge = (error < threshold);
+
+ const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;
+ buffer[aux_w_offset] = did_converge;
+
+ return did_converge;
+}
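
Restated as a formula (not part of the patch), with $I$ the combined pass, $A$ the auxiliary pass
that accumulates every other sample scaled by two, and $N$ the per-pixel sample count:

\[ \mathrm{error} = \frac{\left(|I_x - A_x| + |I_y - A_y| + |I_z - A_z|\right)/N}{10^{-4} + \sqrt{(I_x + I_y + I_z)/N}}, \qquad \text{converged} \iff \mathrm{error} < \mathrm{threshold}. \]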
+
+/* This is a simple box filter in two passes.
+ * When a pixel demands more adaptive samples, let its neighboring pixels draw more samples too. */
+
+ccl_device void kernel_adaptive_sampling_filter_x(KernelGlobals kg,
+ ccl_global float *render_buffer,
+ int y,
+ int start_x,
+ int width,
+ int offset,
+ int stride)
+{
+ kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED);
+
+ bool prev = false;
+ for (int x = start_x; x < start_x + width; ++x) {
+ int index = offset + x + y * stride;
+ ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
+ const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;
+
+ if (buffer[aux_w_offset] == 0.0f) {
+ if (x > start_x && !prev) {
+ index = index - 1;
+ buffer = render_buffer + index * kernel_data.film.pass_stride;
+ buffer[aux_w_offset] = 0.0f;
+ }
+ prev = true;
+ }
+ else {
+ if (prev) {
+ buffer[aux_w_offset] = 0.0f;
+ }
+ prev = false;
+ }
+ }
+}
+
+ccl_device void kernel_adaptive_sampling_filter_y(KernelGlobals kg,
+ ccl_global float *render_buffer,
+ int x,
+ int start_y,
+ int height,
+ int offset,
+ int stride)
+{
+ kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED);
+
+ bool prev = false;
+ for (int y = start_y; y < start_y + height; ++y) {
+ int index = offset + x + y * stride;
+ ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
+ const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;
+
+ if (buffer[aux_w_offset] == 0.0f) {
+ if (y > start_y && !prev) {
+ index = index - stride;
+ buffer = render_buffer + index * kernel_data.film.pass_stride;
+ buffer[aux_w_offset] = 0.0f;
+ }
+ prev = true;
+ }
+ else {
+ if (prev) {
+ buffer[aux_w_offset] = 0.0f;
+ }
+ prev = false;
+ }
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/film/film_id_passes.h b/intern/cycles/kernel/film/film_id_passes.h
new file mode 100644
index 00000000000..d5b8c90a828
--- /dev/null
+++ b/intern/cycles/kernel/film/film_id_passes.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2018 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Element of ID pass stored in the render buffers.
+ * It is `float2` semantically, but it must be unaligned since the offset of ID passes in the
+ * render buffers might not meet the alignment expected by the compiler. */
+typedef struct IDPassBufferElement {
+ float x;
+ float y;
+} IDPassBufferElement;
+
+ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer,
+ int num_slots,
+ float id,
+ float weight)
+{
+ kernel_assert(id != ID_NONE);
+ if (weight == 0.0f) {
+ return;
+ }
+
+ for (int slot = 0; slot < num_slots; slot++) {
+ ccl_global IDPassBufferElement *id_buffer = (ccl_global IDPassBufferElement *)buffer;
+#ifdef __ATOMIC_PASS_WRITE__
+ /* If the loop reaches an empty slot, the ID isn't in any slot yet - so add it! */
+ if (id_buffer[slot].x == ID_NONE) {
+ /* Use an atomic to claim this slot.
+ * If a different thread got here first, try again from this slot on. */
+ float old_id = atomic_compare_and_swap_float(buffer + slot * 2, ID_NONE, id);
+ if (old_id != ID_NONE && old_id != id) {
+ continue;
+ }
+ atomic_add_and_fetch_float(buffer + slot * 2 + 1, weight);
+ break;
+ }
+ /* If there already is a slot for that ID, add the weight.
+ * If no slot was found, add it to the last. */
+ else if (id_buffer[slot].x == id || slot == num_slots - 1) {
+ atomic_add_and_fetch_float(buffer + slot * 2 + 1, weight);
+ break;
+ }
+#else /* __ATOMIC_PASS_WRITE__ */
+ /* If the loop reaches an empty slot, the ID isn't in any slot yet - so add it! */
+ if (id_buffer[slot].x == ID_NONE) {
+ id_buffer[slot].x = id;
+ id_buffer[slot].y = weight;
+ break;
+ }
+ /* If there already is a slot for that ID, add the weight.
+ * If no slot was found, add it to the last. */
+ else if (id_buffer[slot].x == id || slot == num_slots - 1) {
+ id_buffer[slot].y += weight;
+ break;
+ }
+#endif /* __ATOMIC_PASS_WRITE__ */
+ }
+}
+
+ccl_device_inline void kernel_sort_id_slots(ccl_global float *buffer, int num_slots)
+{
+ ccl_global IDPassBufferElement *id_buffer = (ccl_global IDPassBufferElement *)buffer;
+ for (int slot = 1; slot < num_slots; ++slot) {
+ if (id_buffer[slot].x == ID_NONE) {
+ return;
+ }
+ /* Since we're dealing with a tiny number of elements, insertion sort should be fine. */
+ int i = slot;
+ while (i > 0 && id_buffer[i].y > id_buffer[i - 1].y) {
+ const IDPassBufferElement swap = id_buffer[i];
+ id_buffer[i] = id_buffer[i - 1];
+ id_buffer[i - 1] = swap;
+ --i;
+ }
+ }
+}
+
+/* post-sorting for Cryptomatte */
+ccl_device_inline void kernel_cryptomatte_post(KernelGlobals kg,
+ ccl_global float *render_buffer,
+ int pixel_index)
+{
+ const int pass_stride = kernel_data.film.pass_stride;
+ const uint64_t render_buffer_offset = (uint64_t)pixel_index * pass_stride;
+ ccl_global float *cryptomatte_buffer = render_buffer + render_buffer_offset +
+ kernel_data.film.pass_cryptomatte;
+ kernel_sort_id_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth);
+}
+
+CCL_NAMESPACE_END
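
As an aside (not part of the patch): a small host-side C++ sketch of the slot behavior above,
following the non-atomic path. Weights accumulate per ID, an unknown ID claims the first empty
slot, and sorting orders slots by descending weight. ID_NONE is stood in for by 0.0f here, and all
names are illustrative.

#include <algorithm>
#include <cstdio>

struct Slot {
  float id;
  float weight;
};

static const float kIdNone = 0.0f; /* Stand-in for ID_NONE. */

/* Mirrors the non-atomic branch of kernel_write_id_slots(). */
static void write_id_slot(Slot *slots, int num_slots, float id, float weight)
{
  if (weight == 0.0f) {
    return;
  }
  for (int i = 0; i < num_slots; i++) {
    if (slots[i].id == kIdNone) {
      slots[i].id = id;
      slots[i].weight = weight;
      return;
    }
    if (slots[i].id == id || i == num_slots - 1) {
      slots[i].weight += weight;
      return;
    }
  }
}

int main()
{
  Slot slots[4] = {};
  write_id_slot(slots, 4, 7.0f, 0.5f);
  write_id_slot(slots, 4, 3.0f, 0.25f);
  write_id_slot(slots, 4, 7.0f, 0.25f);

  /* Same effect as kernel_sort_id_slots(): order used slots by descending weight. */
  std::sort(slots, slots + 4, [](const Slot &a, const Slot &b) { return a.weight > b.weight; });

  for (const Slot &s : slots) {
    printf("id %.1f weight %.2f\n", s.id, s.weight);
  }
  return 0;
}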
diff --git a/intern/cycles/kernel/film/film_passes.h b/intern/cycles/kernel/film/film_passes.h
new file mode 100644
index 00000000000..40ddb1b5563
--- /dev/null
+++ b/intern/cycles/kernel/film/film_passes.h
@@ -0,0 +1,334 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/geom/geom.h"
+
+#include "kernel/film/film_id_passes.h"
+#include "kernel/film/film_write_passes.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Get pointer to pixel in render buffer. */
+ccl_device_forceinline ccl_global float *kernel_pass_pixel_render_buffer(
+ KernelGlobals kg, ConstIntegratorState state, ccl_global float *ccl_restrict render_buffer)
+{
+ const uint32_t render_pixel_index = INTEGRATOR_STATE(state, path, render_pixel_index);
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+ kernel_data.film.pass_stride;
+ return render_buffer + render_buffer_offset;
+}
+
+#ifdef __DENOISING_FEATURES__
+
+ccl_device_forceinline void kernel_write_denoising_features_surface(
+ KernelGlobals kg,
+ IntegratorState state,
+ ccl_private const ShaderData *sd,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ if (!(INTEGRATOR_STATE(state, path, flag) & PATH_RAY_DENOISING_FEATURES)) {
+ return;
+ }
+
+ /* Skip implicitly transparent surfaces. */
+ if (sd->flag & SD_HAS_ONLY_VOLUME) {
+ return;
+ }
+
+ ccl_global float *buffer = kernel_pass_pixel_render_buffer(kg, state, render_buffer);
+
+ float3 normal = zero_float3();
+ float3 diffuse_albedo = zero_float3();
+ float3 specular_albedo = zero_float3();
+ float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f;
+
+ for (int i = 0; i < sd->num_closure; i++) {
+ ccl_private const ShaderClosure *sc = &sd->closure[i];
+
+ if (!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+ continue;
+ }
+
+ /* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. */
+ normal += sc->N * sc->sample_weight;
+ sum_weight += sc->sample_weight;
+
+ float3 closure_albedo = sc->weight;
+ /* Closures that include a Fresnel term typically have weights close to 1 even though their
+ * actual contribution is significantly lower.
+ * To account for this, we scale their weight by the average fresnel factor (the same is also
+ * done for the sample weight in the BSDF setup, so we don't need to scale that here). */
+ if (CLOSURE_IS_BSDF_MICROFACET_FRESNEL(sc->type)) {
+ ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)sc;
+ closure_albedo *= bsdf->extra->fresnel_color;
+ }
+ else if (sc->type == CLOSURE_BSDF_PRINCIPLED_SHEEN_ID) {
+ ccl_private PrincipledSheenBsdf *bsdf = (ccl_private PrincipledSheenBsdf *)sc;
+ closure_albedo *= bsdf->avg_value;
+ }
+ else if (sc->type == CLOSURE_BSDF_HAIR_PRINCIPLED_ID) {
+ closure_albedo *= bsdf_principled_hair_albedo(sc);
+ }
+
+ if (bsdf_get_specular_roughness_squared(sc) > sqr(0.075f)) {
+ diffuse_albedo += closure_albedo;
+ sum_nonspecular_weight += sc->sample_weight;
+ }
+ else {
+ specular_albedo += closure_albedo;
+ }
+ }
+
+ /* Wait for next bounce if 75% or more sample weight belongs to specular-like closures. */
+ if ((sum_weight == 0.0f) || (sum_nonspecular_weight * 4.0f > sum_weight)) {
+ if (sum_weight != 0.0f) {
+ normal /= sum_weight;
+ }
+
+ if (kernel_data.film.pass_denoising_normal != PASS_UNUSED) {
+ /* Transform normal into camera space. */
+ const Transform worldtocamera = kernel_data.cam.worldtocamera;
+ normal = transform_direction(&worldtocamera, normal);
+
+ const float3 denoising_normal = ensure_finite3(normal);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_normal, denoising_normal);
+ }
+
+ if (kernel_data.film.pass_denoising_albedo != PASS_UNUSED) {
+ const float3 denoising_feature_throughput = INTEGRATOR_STATE(
+ state, path, denoising_feature_throughput);
+ const float3 denoising_albedo = ensure_finite3(denoising_feature_throughput *
+ diffuse_albedo);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_albedo, denoising_albedo);
+ }
+
+ INTEGRATOR_STATE_WRITE(state, path, flag) &= ~PATH_RAY_DENOISING_FEATURES;
+ }
+ else {
+ INTEGRATOR_STATE_WRITE(state, path, denoising_feature_throughput) *= specular_albedo;
+ }
+}
+
+ccl_device_forceinline void kernel_write_denoising_features_volume(KernelGlobals kg,
+ IntegratorState state,
+ const float3 albedo,
+ const bool scatter,
+ ccl_global float *ccl_restrict
+ render_buffer)
+{
+ ccl_global float *buffer = kernel_pass_pixel_render_buffer(kg, state, render_buffer);
+ const float3 denoising_feature_throughput = INTEGRATOR_STATE(
+ state, path, denoising_feature_throughput);
+
+ if (scatter && kernel_data.film.pass_denoising_normal != PASS_UNUSED) {
+ /* Assume scatter is sufficiently diffuse to stop writing denoising features. */
+ INTEGRATOR_STATE_WRITE(state, path, flag) &= ~PATH_RAY_DENOISING_FEATURES;
+
+ /* Write view direction as normal. */
+ const float3 denoising_normal = make_float3(0.0f, 0.0f, -1.0f);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_normal, denoising_normal);
+ }
+
+ if (kernel_data.film.pass_denoising_albedo != PASS_UNUSED) {
+ /* Write albedo. */
+ const float3 denoising_albedo = ensure_finite3(denoising_feature_throughput * albedo);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_albedo, denoising_albedo);
+ }
+}
+#endif /* __DENOISING_FEATURES__ */
+
+#ifdef __SHADOW_CATCHER__
+
+/* Write shadow catcher passes on a bounce from the shadow catcher object. */
+ccl_device_forceinline void kernel_write_shadow_catcher_bounce_data(
+ KernelGlobals kg,
+ IntegratorState state,
+ ccl_private const ShaderData *sd,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ if (!kernel_data.integrator.has_shadow_catcher) {
+ return;
+ }
+
+ kernel_assert(kernel_data.film.pass_shadow_catcher_sample_count != PASS_UNUSED);
+ kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
+
+ if (!kernel_shadow_catcher_is_path_split_bounce(kg, state, sd->object_flag)) {
+ return;
+ }
+
+ ccl_global float *buffer = kernel_pass_pixel_render_buffer(kg, state, render_buffer);
+
+ /* Count sample for the shadow catcher object. */
+ kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_sample_count, 1.0f);
+
+ /* Since the split is done, the sample does not contribute to the matte, so accumulate it as
+ * transparency to the matte. */
+ const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
+ kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_matte + 3,
+ average(throughput));
+}
+
+#endif /* __SHADOW_CATCHER__ */
+
+ccl_device_inline size_t kernel_write_id_pass(ccl_global float *ccl_restrict buffer,
+ size_t depth,
+ float id,
+ float matte_weight)
+{
+ kernel_write_id_slots(buffer, depth * 2, id, matte_weight);
+ return depth * 4;
+}
+
+ccl_device_inline void kernel_write_data_passes(KernelGlobals kg,
+ IntegratorState state,
+ ccl_private const ShaderData *sd,
+ ccl_global float *ccl_restrict render_buffer)
+{
+#ifdef __PASSES__
+ const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
+
+ if (!(path_flag & PATH_RAY_CAMERA)) {
+ return;
+ }
+
+ const int flag = kernel_data.film.pass_flag;
+
+ if (!(flag & PASS_ANY)) {
+ return;
+ }
+
+ ccl_global float *buffer = kernel_pass_pixel_render_buffer(kg, state, render_buffer);
+
+ if (!(path_flag & PATH_RAY_SINGLE_PASS_DONE)) {
+ if (!(sd->flag & SD_TRANSPARENT) || kernel_data.film.pass_alpha_threshold == 0.0f ||
+ average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold) {
+ if (INTEGRATOR_STATE(state, path, sample) == 0) {
+ if (flag & PASSMASK(DEPTH)) {
+ const float depth = camera_z_depth(kg, sd->P);
+ kernel_write_pass_float(buffer + kernel_data.film.pass_depth, depth);
+ }
+ if (flag & PASSMASK(OBJECT_ID)) {
+ const float id = object_pass_id(kg, sd->object);
+ kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, id);
+ }
+ if (flag & PASSMASK(MATERIAL_ID)) {
+ const float id = shader_pass_id(kg, sd);
+ kernel_write_pass_float(buffer + kernel_data.film.pass_material_id, id);
+ }
+ }
+
+ if (flag & PASSMASK(POSITION)) {
+ const float3 position = sd->P;
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_position, position);
+ }
+ if (flag & PASSMASK(NORMAL)) {
+ const float3 normal = shader_bsdf_average_normal(kg, sd);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, normal);
+ }
+ if (flag & PASSMASK(ROUGHNESS)) {
+ const float roughness = shader_bsdf_average_roughness(sd);
+ kernel_write_pass_float(buffer + kernel_data.film.pass_roughness, roughness);
+ }
+ if (flag & PASSMASK(UV)) {
+ const float3 uv = primitive_uv(kg, sd);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_uv, uv);
+ }
+ if (flag & PASSMASK(MOTION)) {
+ const float4 speed = primitive_motion_vector(kg, sd);
+ kernel_write_pass_float4(buffer + kernel_data.film.pass_motion, speed);
+ kernel_write_pass_float(buffer + kernel_data.film.pass_motion_weight, 1.0f);
+ }
+
+ INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SINGLE_PASS_DONE;
+ }
+ }
+
+ if (kernel_data.film.cryptomatte_passes) {
+ const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
+ const float matte_weight = average(throughput) *
+ (1.0f - average(shader_bsdf_transparency(kg, sd)));
+ if (matte_weight > 0.0f) {
+ ccl_global float *cryptomatte_buffer = buffer + kernel_data.film.pass_cryptomatte;
+ if (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) {
+ const float id = object_cryptomatte_id(kg, sd->object);
+ cryptomatte_buffer += kernel_write_id_pass(
+ cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight);
+ }
+ if (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) {
+ const float id = shader_cryptomatte_id(kg, sd->shader);
+ cryptomatte_buffer += kernel_write_id_pass(
+ cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight);
+ }
+ if (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) {
+ const float id = object_cryptomatte_asset_id(kg, sd->object);
+ cryptomatte_buffer += kernel_write_id_pass(
+ cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight);
+ }
+ }
+ }
+
+ if (flag & PASSMASK(DIFFUSE_COLOR)) {
+ const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_color,
+ shader_bsdf_diffuse(kg, sd) * throughput);
+ }
+ if (flag & PASSMASK(GLOSSY_COLOR)) {
+ const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_color,
+ shader_bsdf_glossy(kg, sd) * throughput);
+ }
+ if (flag & PASSMASK(TRANSMISSION_COLOR)) {
+ const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_color,
+ shader_bsdf_transmission(kg, sd) * throughput);
+ }
+ if (flag & PASSMASK(MIST)) {
+ /* Bring depth into 0..1 range. */
+ const float mist_start = kernel_data.film.mist_start;
+ const float mist_inv_depth = kernel_data.film.mist_inv_depth;
+
+ const float depth = camera_distance(kg, sd->P);
+ float mist = saturate((depth - mist_start) * mist_inv_depth);
+
+ /* Falloff */
+ const float mist_falloff = kernel_data.film.mist_falloff;
+
+ if (mist_falloff == 1.0f)
+ ;
+ else if (mist_falloff == 2.0f)
+ mist = mist * mist;
+ else if (mist_falloff == 0.5f)
+ mist = sqrtf(mist);
+ else
+ mist = powf(mist, mist_falloff);
+
+ /* Modulate by transparency */
+ const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
+ const float3 alpha = shader_bsdf_alpha(kg, sd);
+ const float mist_output = (1.0f - mist) * average(throughput * alpha);
+
+    /* Note that the final value we want in the render buffer is 1 - mist_output;
+     * to avoid having to track this in the integrator state we do the negation
+     * after rendering. */
+ kernel_write_pass_float(buffer + kernel_data.film.pass_mist, mist_output);
+ }
+#endif
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/film/film_read.h b/intern/cycles/kernel/film/film_read.h
new file mode 100644
index 00000000000..a87eff3832e
--- /dev/null
+++ b/intern/cycles/kernel/film/film_read.h
@@ -0,0 +1,532 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * Common utilities.
+ */
+
+/* The input buffer contains transparency = 1 - alpha; this converts it to
+ * alpha. Also clamp, since alpha might end up outside of 0..1 due to Russian
+ * roulette. */
+ccl_device_forceinline float film_transparency_to_alpha(float transparency)
+{
+ return saturate(1.0f - transparency);
+}
+
+ccl_device_inline float film_get_scale(ccl_global const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer)
+{
+ if (kfilm_convert->pass_sample_count == PASS_UNUSED) {
+ return kfilm_convert->scale;
+ }
+
+ if (kfilm_convert->pass_use_filter) {
+ const uint sample_count = *(
+ (ccl_global const uint *)(buffer + kfilm_convert->pass_sample_count));
+ return 1.0f / sample_count;
+ }
+
+ return 1.0f;
+}
+
+ccl_device_inline float film_get_scale_exposure(ccl_global const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer)
+{
+ if (kfilm_convert->pass_sample_count == PASS_UNUSED) {
+ return kfilm_convert->scale_exposure;
+ }
+
+ const float scale = film_get_scale(kfilm_convert, buffer);
+
+ if (kfilm_convert->pass_use_exposure) {
+ return scale * kfilm_convert->exposure;
+ }
+
+ return scale;
+}
+
+ccl_device_inline bool film_get_scale_and_scale_exposure(
+ ccl_global const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ ccl_private float *ccl_restrict scale,
+ ccl_private float *ccl_restrict scale_exposure)
+{
+ if (kfilm_convert->pass_sample_count == PASS_UNUSED) {
+ *scale = kfilm_convert->scale;
+ *scale_exposure = kfilm_convert->scale_exposure;
+ return true;
+ }
+
+ const uint sample_count = *(
+ (ccl_global const uint *)(buffer + kfilm_convert->pass_sample_count));
+ if (!sample_count) {
+ *scale = 0.0f;
+ *scale_exposure = 0.0f;
+ return false;
+ }
+
+ if (kfilm_convert->pass_use_filter) {
+ *scale = 1.0f / sample_count;
+ }
+ else {
+ *scale = 1.0f;
+ }
+
+ if (kfilm_convert->pass_use_exposure) {
+ *scale_exposure = *scale * kfilm_convert->exposure;
+ }
+ else {
+ *scale_exposure = *scale;
+ }
+
+ return true;
+}
+
+/* --------------------------------------------------------------------
+ * Float (scalar) passes.
+ */
+
+ccl_device_inline void film_get_pass_pixel_depth(ccl_global const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ ccl_private float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 1);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float scale_exposure = film_get_scale_exposure(kfilm_convert, buffer);
+
+ ccl_global const float *in = buffer + kfilm_convert->pass_offset;
+ const float f = *in;
+
+ pixel[0] = (f == 0.0f) ? 1e10f : f * scale_exposure;
+}
+
+ccl_device_inline void film_get_pass_pixel_mist(ccl_global const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ ccl_private float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 1);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float scale_exposure = film_get_scale_exposure(kfilm_convert, buffer);
+
+ ccl_global const float *in = buffer + kfilm_convert->pass_offset;
+ const float f = *in;
+
+ /* Note that we accumulate 1 - mist in the kernel to avoid having to
+ * track the mist values in the integrator state. */
+ pixel[0] = saturate(1.0f - f * scale_exposure);
+}
+
+ccl_device_inline void film_get_pass_pixel_sample_count(
+ ccl_global const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ ccl_private float *ccl_restrict pixel)
+{
+  /* TODO(sergey): Consider normalizing into the [0..1] range, so that it is possible to see a
+   * meaningful value when the adaptive sampler stopped rendering the image well before the
+   * maximum number of samples was reached (for example when the number of samples is set to 0
+   * in the viewport). */
+
+ kernel_assert(kfilm_convert->num_components >= 1);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ ccl_global const float *in = buffer + kfilm_convert->pass_offset;
+ const float f = *in;
+
+ pixel[0] = __float_as_uint(f) * kfilm_convert->scale;
+}
+
+ccl_device_inline void film_get_pass_pixel_float(ccl_global const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ ccl_private float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 1);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float scale_exposure = film_get_scale_exposure(kfilm_convert, buffer);
+
+ ccl_global const float *in = buffer + kfilm_convert->pass_offset;
+ const float f = *in;
+
+ pixel[0] = f * scale_exposure;
+}
+
+/* --------------------------------------------------------------------
+ * Float 3 passes.
+ */
+
+ccl_device_inline void film_get_pass_pixel_light_path(
+ ccl_global const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ ccl_private float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 3);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ /* Read light pass. */
+ ccl_global const float *in = buffer + kfilm_convert->pass_offset;
+ float3 f = make_float3(in[0], in[1], in[2]);
+
+ /* Optionally add indirect light pass. */
+ if (kfilm_convert->pass_indirect != PASS_UNUSED) {
+ ccl_global const float *in_indirect = buffer + kfilm_convert->pass_indirect;
+ const float3 f_indirect = make_float3(in_indirect[0], in_indirect[1], in_indirect[2]);
+ f += f_indirect;
+ }
+
+ /* Optionally divide out color. */
+ if (kfilm_convert->pass_divide != PASS_UNUSED) {
+ ccl_global const float *in_divide = buffer + kfilm_convert->pass_divide;
+ const float3 f_divide = make_float3(in_divide[0], in_divide[1], in_divide[2]);
+ f = safe_divide_even_color(f, f_divide);
+
+ /* Exposure only, sample scale cancels out. */
+ f *= kfilm_convert->exposure;
+ }
+ else {
+ /* Sample scale and exposure. */
+ f *= film_get_scale_exposure(kfilm_convert, buffer);
+ }
+
+ pixel[0] = f.x;
+ pixel[1] = f.y;
+ pixel[2] = f.z;
+}
+
+ccl_device_inline void film_get_pass_pixel_float3(ccl_global const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ ccl_private float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 3);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float scale_exposure = film_get_scale_exposure(kfilm_convert, buffer);
+
+ ccl_global const float *in = buffer + kfilm_convert->pass_offset;
+
+ const float3 f = make_float3(in[0], in[1], in[2]) * scale_exposure;
+
+ pixel[0] = f.x;
+ pixel[1] = f.y;
+ pixel[2] = f.z;
+}
+
+/* --------------------------------------------------------------------
+ * Float4 passes.
+ */
+
+ccl_device_inline void film_get_pass_pixel_motion(ccl_global const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ ccl_private float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components == 4);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+ kernel_assert(kfilm_convert->pass_motion_weight != PASS_UNUSED);
+
+ ccl_global const float *in = buffer + kfilm_convert->pass_offset;
+ ccl_global const float *in_weight = buffer + kfilm_convert->pass_motion_weight;
+
+ const float weight = in_weight[0];
+ const float weight_inv = (weight > 0.0f) ? 1.0f / weight : 0.0f;
+
+ const float4 motion = make_float4(in[0], in[1], in[2], in[3]) * weight_inv;
+
+ pixel[0] = motion.x;
+ pixel[1] = motion.y;
+ pixel[2] = motion.z;
+ pixel[3] = motion.w;
+}
+
+ccl_device_inline void film_get_pass_pixel_cryptomatte(
+ ccl_global const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ ccl_private float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components == 4);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float scale = film_get_scale(kfilm_convert, buffer);
+
+ ccl_global const float *in = buffer + kfilm_convert->pass_offset;
+
+ const float4 f = make_float4(in[0], in[1], in[2], in[3]);
+
+ /* x and z contain integer IDs, don't rescale them.
+ * y and w contain matte weights, they get scaled. */
+ pixel[0] = f.x;
+ pixel[1] = f.y * scale;
+ pixel[2] = f.z;
+ pixel[3] = f.w * scale;
+}
+
+ccl_device_inline void film_get_pass_pixel_float4(ccl_global const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ ccl_private float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components == 4);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ float scale, scale_exposure;
+ film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure);
+
+ ccl_global const float *in = buffer + kfilm_convert->pass_offset;
+
+ const float3 color = make_float3(in[0], in[1], in[2]) * scale_exposure;
+ const float alpha = in[3] * scale;
+
+ pixel[0] = color.x;
+ pixel[1] = color.y;
+ pixel[2] = color.z;
+ pixel[3] = alpha;
+}
+
+ccl_device_inline void film_get_pass_pixel_combined(
+ ccl_global const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ ccl_private float *ccl_restrict pixel)
+{
+  kernel_assert(kfilm_convert->num_components == 4);
+  kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+  /* The last component (index 3) contains transparency = 1 - alpha for the combined pass. */
+
+ float scale, scale_exposure;
+ if (!film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure)) {
+ pixel[0] = 0.0f;
+ pixel[1] = 0.0f;
+ pixel[2] = 0.0f;
+ pixel[3] = 0.0f;
+ return;
+ }
+
+ ccl_global const float *in = buffer + kfilm_convert->pass_offset;
+
+ const float3 color = make_float3(in[0], in[1], in[2]) * scale_exposure;
+ const float alpha = in[3] * scale;
+
+ pixel[0] = color.x;
+ pixel[1] = color.y;
+ pixel[2] = color.z;
+ pixel[3] = film_transparency_to_alpha(alpha);
+}
+
+/* --------------------------------------------------------------------
+ * Shadow catcher.
+ */
+
+ccl_device_inline float3 film_calculate_shadow_catcher_denoised(
+ ccl_global const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer)
+{
+ kernel_assert(kfilm_convert->pass_shadow_catcher != PASS_UNUSED);
+
+ float scale, scale_exposure;
+ film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure);
+
+ ccl_global const float *in_catcher = buffer + kfilm_convert->pass_shadow_catcher;
+
+ const float3 pixel = make_float3(in_catcher[0], in_catcher[1], in_catcher[2]) * scale_exposure;
+
+ return pixel;
+}
+
+ccl_device_inline float3 safe_divide_shadow_catcher(float3 a, float3 b)
+{
+ float x, y, z;
+
+ x = (b.x != 0.0f) ? a.x / b.x : 1.0f;
+ y = (b.y != 0.0f) ? a.y / b.y : 1.0f;
+ z = (b.z != 0.0f) ? a.z / b.z : 1.0f;
+
+ return make_float3(x, y, z);
+}
+
+ccl_device_inline float3
+film_calculate_shadow_catcher(ccl_global const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer)
+{
+  /* For the shadow catcher pass we divide the combined pass by the shadow catcher.
+   * Note that the denoised shadow catcher pass contains a value which only needs to be scaled
+   * (and does not need to be computed as a division). */
+
+ if (kfilm_convert->is_denoised) {
+ return film_calculate_shadow_catcher_denoised(kfilm_convert, buffer);
+ }
+
+ kernel_assert(kfilm_convert->pass_shadow_catcher_sample_count != PASS_UNUSED);
+
+ /* If there is no shadow catcher object in this pixel, there is no modification of the light
+ * needed, so return one. */
+ ccl_global const float *in_catcher_sample_count =
+ buffer + kfilm_convert->pass_shadow_catcher_sample_count;
+ const float num_samples = in_catcher_sample_count[0];
+ if (num_samples == 0.0f) {
+ return one_float3();
+ }
+
+ kernel_assert(kfilm_convert->pass_shadow_catcher != PASS_UNUSED);
+ ccl_global const float *in_catcher = buffer + kfilm_convert->pass_shadow_catcher;
+
+  /* NOTE: It is possible that the Shadow Catcher pass is requested as an output without actual
+   * shadow catcher objects in the scene. In this case no auxiliary passes are required for the
+   * decision (to save memory), so the asserts are delayed until this point so that the sample
+   * count check above handles such a configuration. */
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+ kernel_assert(kfilm_convert->pass_combined != PASS_UNUSED);
+ kernel_assert(kfilm_convert->pass_shadow_catcher_matte != PASS_UNUSED);
+
+ ccl_global const float *in_combined = buffer + kfilm_convert->pass_combined;
+ ccl_global const float *in_matte = buffer + kfilm_convert->pass_shadow_catcher_matte;
+
+  /* No scaling needed. The integration works in such a way that the number of samples in the
+   * combined and shadow catcher passes is the same, and exposure is canceled out during the
+   * division. */
+ const float3 color_catcher = make_float3(in_catcher[0], in_catcher[1], in_catcher[2]);
+ const float3 color_combined = make_float3(in_combined[0], in_combined[1], in_combined[2]);
+ const float3 color_matte = make_float3(in_matte[0], in_matte[1], in_matte[2]);
+
+  /* Need to ignore the contribution of matte objects when doing the division (otherwise there
+   * will be artifacts caused by anti-aliasing). Since the combined pass is used for adaptive
+   * sampling and needs to contain matte objects, we subtract the matte objects' contribution
+   * here. This is the same as if the matte objects had not been accumulated to the combined
+   * pass. */
+ const float3 combined_no_matte = color_combined - color_matte;
+
+ const float3 shadow_catcher = safe_divide_shadow_catcher(combined_no_matte, color_catcher);
+
+ const float scale = film_get_scale(kfilm_convert, buffer);
+ const float transparency = in_combined[3] * scale;
+ const float alpha = film_transparency_to_alpha(transparency);
+
+  /* Alpha-over on white using the transparency of the combined pass. This eliminates artifacts
+   * which happen at the edges of a shadow catcher when using transparent film.
+   * Note that we treat the shadow catcher as straight alpha here because alpha was canceled out
+   * during the division. */
+ const float3 pixel = (1.0f - alpha) * one_float3() + alpha * shadow_catcher;
+
+ return pixel;
+}
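
In formula form (not part of the patch), with $C$ the combined pass, $M$ the shadow catcher matte
pass, $S$ the shadow catcher pass, and $\alpha$ derived from the combined pass transparency:

\[ \mathrm{catcher} = \frac{C - M}{S} \ \text{(component-wise, with 1 where } S = 0\text{)}, \qquad \mathrm{pixel} = (1 - \alpha) \cdot 1 + \alpha \cdot \mathrm{catcher}. \]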
+
+ccl_device_inline float4 film_calculate_shadow_catcher_matte_with_shadow(
+ ccl_global const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer)
+{
+  /* The approximation of the shadow is 1 - average(shadow_catcher_pass). A better approximation
+   * is possible.
+   *
+   * The matte is alpha-overed onto the shadow (which is effectively alpha-overing the shadow
+   * onto the footage, and then alpha-overing the synthetic objects on top). */
+
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+ kernel_assert(kfilm_convert->pass_shadow_catcher != PASS_UNUSED);
+ kernel_assert(kfilm_convert->pass_shadow_catcher_matte != PASS_UNUSED);
+
+ float scale, scale_exposure;
+ if (!film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure)) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+
+ ccl_global const float *in_matte = buffer + kfilm_convert->pass_shadow_catcher_matte;
+
+ const float3 shadow_catcher = film_calculate_shadow_catcher(kfilm_convert, buffer);
+ const float3 color_matte = make_float3(in_matte[0], in_matte[1], in_matte[2]) * scale_exposure;
+
+ const float transparency = in_matte[3] * scale;
+ const float alpha = saturate(1.0f - transparency);
+
+ const float alpha_matte = (1.0f - alpha) * (1.0f - average(shadow_catcher)) + alpha;
+
+ if (kfilm_convert->use_approximate_shadow_catcher_background) {
+ kernel_assert(kfilm_convert->pass_background != PASS_UNUSED);
+
+ ccl_global const float *in_background = buffer + kfilm_convert->pass_background;
+ const float3 color_background = make_float3(
+ in_background[0], in_background[1], in_background[2]) *
+ scale_exposure;
+ const float3 alpha_over = color_matte + color_background * (1.0f - alpha_matte);
+ return make_float4(alpha_over.x, alpha_over.y, alpha_over.z, 1.0f);
+ }
+
+ return make_float4(color_matte.x, color_matte.y, color_matte.z, alpha_matte);
+}
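
Similarly (not part of the patch), with $t$ the accumulated transparency of the matte pass and
$\overline{S}$ the average of the shadow catcher value computed above:

\[ \alpha = \mathrm{saturate}(1 - t), \qquad \alpha_{\mathrm{matte}} = (1 - \alpha)\,(1 - \overline{S}) + \alpha, \]

and when the approximate shadow catcher background is used, the returned color is
$\mathrm{matte} + \mathrm{background} \cdot (1 - \alpha_{\mathrm{matte}})$ with alpha set to 1.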
+
+ccl_device_inline void film_get_pass_pixel_shadow_catcher(
+ ccl_global const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ ccl_private float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 3);
+
+ const float3 pixel_value = film_calculate_shadow_catcher(kfilm_convert, buffer);
+
+ pixel[0] = pixel_value.x;
+ pixel[1] = pixel_value.y;
+ pixel[2] = pixel_value.z;
+}
+
+ccl_device_inline void film_get_pass_pixel_shadow_catcher_matte_with_shadow(
+ ccl_global const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ ccl_private float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components == 3 || kfilm_convert->num_components == 4);
+
+ const float4 pixel_value = film_calculate_shadow_catcher_matte_with_shadow(kfilm_convert,
+ buffer);
+
+ pixel[0] = pixel_value.x;
+ pixel[1] = pixel_value.y;
+ pixel[2] = pixel_value.z;
+ if (kfilm_convert->num_components == 4) {
+ pixel[3] = pixel_value.w;
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Compositing and overlays.
+ */
+
+ccl_device_inline void film_apply_pass_pixel_overlays_rgba(
+ ccl_global const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ ccl_private float *ccl_restrict pixel)
+{
+ if (kfilm_convert->show_active_pixels &&
+ kfilm_convert->pass_adaptive_aux_buffer != PASS_UNUSED) {
+ if (buffer[kfilm_convert->pass_adaptive_aux_buffer + 3] == 0.0f) {
+ const float3 active_rgb = make_float3(1.0f, 0.0f, 0.0f);
+ const float3 mix_rgb = interp(make_float3(pixel[0], pixel[1], pixel[2]), active_rgb, 0.5f);
+ pixel[0] = mix_rgb.x;
+ pixel[1] = mix_rgb.y;
+ pixel[2] = mix_rgb.z;
+ }
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/film/film_write_passes.h b/intern/cycles/kernel/film/film_write_passes.h
new file mode 100644
index 00000000000..9d379495629
--- /dev/null
+++ b/intern/cycles/kernel/film/film_write_passes.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef __KERNEL_GPU__
+# define __ATOMIC_PASS_WRITE__
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_write_pass_float(ccl_global float *ccl_restrict buffer, float value)
+{
+#ifdef __ATOMIC_PASS_WRITE__
+ atomic_add_and_fetch_float(buffer, value);
+#else
+ *buffer += value;
+#endif
+}
+
+ccl_device_inline void kernel_write_pass_float3(ccl_global float *ccl_restrict buffer,
+ float3 value)
+{
+#ifdef __ATOMIC_PASS_WRITE__
+ ccl_global float *buf_x = buffer + 0;
+ ccl_global float *buf_y = buffer + 1;
+ ccl_global float *buf_z = buffer + 2;
+
+ atomic_add_and_fetch_float(buf_x, value.x);
+ atomic_add_and_fetch_float(buf_y, value.y);
+ atomic_add_and_fetch_float(buf_z, value.z);
+#else
+ buffer[0] += value.x;
+ buffer[1] += value.y;
+ buffer[2] += value.z;
+#endif
+}
+
+ccl_device_inline void kernel_write_pass_float4(ccl_global float *ccl_restrict buffer,
+ float4 value)
+{
+#ifdef __ATOMIC_PASS_WRITE__
+ ccl_global float *buf_x = buffer + 0;
+ ccl_global float *buf_y = buffer + 1;
+ ccl_global float *buf_z = buffer + 2;
+ ccl_global float *buf_w = buffer + 3;
+
+ atomic_add_and_fetch_float(buf_x, value.x);
+ atomic_add_and_fetch_float(buf_y, value.y);
+ atomic_add_and_fetch_float(buf_z, value.z);
+ atomic_add_and_fetch_float(buf_w, value.w);
+#else
+ buffer[0] += value.x;
+ buffer[1] += value.y;
+ buffer[2] += value.z;
+ buffer[3] += value.w;
+#endif
+}
+
+ccl_device_inline float kernel_read_pass_float(ccl_global float *ccl_restrict buffer)
+{
+ return *buffer;
+}
+
+ccl_device_inline float3 kernel_read_pass_float3(ccl_global float *ccl_restrict buffer)
+{
+ return make_float3(buffer[0], buffer[1], buffer[2]);
+}
+
+ccl_device_inline float4 kernel_read_pass_float4(ccl_global float *ccl_restrict buffer)
+{
+ return make_float4(buffer[0], buffer[1], buffer[2], buffer[3]);
+}
+
+CCL_NAMESPACE_END
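
As a final aside (not part of the patch): on the GPU many paths can accumulate into the same pixel
concurrently, which is why __ATOMIC_PASS_WRITE__ switches the writes above to atomic adds. A
minimal, self-contained C++ sketch of the compare-and-swap loop that an atomic float add such as
atomic_add_and_fetch_float typically reduces to (std::atomic is used purely for illustration, and
all names are assumptions):

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

/* CAS loop implementing an atomic float add. */
static float atomic_add_float(std::atomic<float> *target, float value)
{
  float expected = target->load();
  while (!target->compare_exchange_weak(expected, expected + value)) {
    /* 'expected' now holds the current value; retry with it. */
  }
  return expected + value;
}

int main()
{
  std::atomic<float> pixel{0.0f};

  /* Many "paths" writing to the same pass value at once. */
  std::vector<std::thread> threads;
  for (int i = 0; i < 8; i++) {
    threads.emplace_back([&pixel]() {
      for (int j = 0; j < 1000; j++) {
        atomic_add_float(&pixel, 0.001f);
      }
    });
  }
  for (std::thread &t : threads) {
    t.join();
  }

  /* Without atomics, racing read-modify-writes would silently lose contributions. */
  printf("accumulated: %f (expected ~8.0)\n", pixel.load());
  return 0;
}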