git.blender.org/blender.git
author    Brecht Van Lommel <brecht@blender.org>  2021-09-20 18:59:20 +0300
committer Brecht Van Lommel <brecht@blender.org>  2021-09-21 15:55:54 +0300
commit    08031197250aeecbaca3803254e6f25b8c7b7b37 (patch)
tree      6fe7ab045f0dc0a423d6557c4073f34309ef4740 /intern/cycles/kernel/kernel_adaptive_sampling.h
parent    fa6b1007bad065440950cd67deb16a04f368856f (diff)
Cycles: merge of cycles-x branch, a major update to the renderer
This includes much improved GPU rendering performance, viewport interactivity,
a new shadow catcher, revamped sampling settings, subsurface scattering
anisotropy, new GPU volume sampling, an improved PMJ sampling pattern, and more.

Some features have also been removed or changed, breaking backwards
compatibility, including the removal of the OpenCL backend, for which
alternatives are under development.

Release notes and code docs:
https://wiki.blender.org/wiki/Reference/Release_Notes/3.0/Cycles
https://wiki.blender.org/wiki/Source/Render/Cycles

Credits:
* Sergey Sharybin
* Brecht Van Lommel
* Patrick Mours (OptiX backend)
* Christophe Hery (subsurface scattering anisotropy)
* William Leeson (PMJ sampling pattern)
* Alaska (various fixes and tweaks)
* Thomas Dinges (various fixes)

For the full commit history, see the cycles-x branch. This squashes together
all the changes, since intermediate changes would often fail building or tests.

Ref T87839, T87837, T87836
Fixes T90734, T89353, T80267, T77185, T69800
Diffstat (limited to 'intern/cycles/kernel/kernel_adaptive_sampling.h')
-rw-r--r--  intern/cycles/kernel/kernel_adaptive_sampling.h | 274
1 file changed, 97 insertions(+), 177 deletions(-)
diff --git a/intern/cycles/kernel/kernel_adaptive_sampling.h b/intern/cycles/kernel/kernel_adaptive_sampling.h
index 98b7bf7e7dc..2bee12f0473 100644
--- a/intern/cycles/kernel/kernel_adaptive_sampling.h
+++ b/intern/cycles/kernel/kernel_adaptive_sampling.h
@@ -14,226 +14,146 @@
* limitations under the License.
*/
-#ifndef __KERNEL_ADAPTIVE_SAMPLING_H__
-#define __KERNEL_ADAPTIVE_SAMPLING_H__
+#pragma once
+
+#include "kernel/kernel_write_passes.h"
CCL_NAMESPACE_BEGIN
-/* Determines whether to continue sampling a given pixel or if it has sufficiently converged. */
+/* Check whether the pixel has converged and should not be sampled anymore. */
-ccl_device void kernel_do_adaptive_stopping(KernelGlobals *kg,
- ccl_global float *buffer,
- int sample)
+ccl_device_forceinline bool kernel_need_sample_pixel(INTEGRATOR_STATE_CONST_ARGS,
+ ccl_global float *render_buffer)
{
- /* TODO Stefan: Is this better in linear, sRGB or something else? */
- float4 I = *((ccl_global float4 *)buffer);
- float4 A = *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer);
- /* The per pixel error as seen in section 2.1 of
- * "A hierarchical automatic stopping condition for Monte Carlo global illumination"
- * A small epsilon is added to the divisor to prevent division by zero. */
- float error = (fabsf(I.x - A.x) + fabsf(I.y - A.y) + fabsf(I.z - A.z)) /
- (sample * 0.0001f + sqrtf(I.x + I.y + I.z));
- if (error < kernel_data.integrator.adaptive_threshold * (float)sample) {
- /* Set the fourth component to non-zero value to indicate that this pixel has converged. */
- buffer[kernel_data.film.pass_adaptive_aux_buffer + 3] += 1.0f;
+ if (kernel_data.film.pass_adaptive_aux_buffer == PASS_UNUSED) {
+ return true;
}
-}
-
-/* Adjust the values of an adaptively sampled pixel. */
-
-ccl_device void kernel_adaptive_post_adjust(KernelGlobals *kg,
- ccl_global float *buffer,
- float sample_multiplier)
-{
- *(ccl_global float4 *)(buffer) *= sample_multiplier;
- /* Scale the aux pass too, this is necessary for progressive rendering to work properly. */
- kernel_assert(kernel_data.film.pass_adaptive_aux_buffer);
- *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer) *= sample_multiplier;
+ const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index);
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+ kernel_data.film.pass_stride;
+ ccl_global float *buffer = render_buffer + render_buffer_offset;
-#ifdef __PASSES__
- int flag = kernel_data.film.pass_flag;
-
- if (flag & PASSMASK(NORMAL))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_normal) *= sample_multiplier;
+ const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;
+ return buffer[aux_w_offset] == 0.0f;
+}
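The new kernel_need_sample_pixel helper above treats the fourth (w) channel of
the adaptive auxiliary pass as a per-pixel convergence flag: zero means the
pixel still needs samples. A minimal, self-contained C++ sketch of that check,
where need_sample_pixel, kPassStride and kPassAdaptiveAux are hypothetical
stand-ins for the helper and for the kernel_data.film layout:

#include <cstdint>

/* Hypothetical pass layout, standing in for kernel_data.film. */
constexpr uint32_t kPassStride = 16;     /* floats per pixel */
constexpr uint32_t kPassAdaptiveAux = 8; /* float4: xyz estimate, w flag */

/* True while the pixel still needs samples (w flag not yet set). */
static bool need_sample_pixel(const float *render_buffer, uint32_t pixel_index)
{
  const float *buffer = render_buffer + uint64_t(pixel_index) * kPassStride;
  return buffer[kPassAdaptiveAux + 3] == 0.0f;
}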
- if (flag & PASSMASK(UV))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_uv) *= sample_multiplier;
+/* Determines whether to continue sampling a given pixel or if it has sufficiently converged. */
- if (flag & PASSMASK(MOTION)) {
- *(ccl_global float4 *)(buffer + kernel_data.film.pass_motion) *= sample_multiplier;
- *(ccl_global float *)(buffer + kernel_data.film.pass_motion_weight) *= sample_multiplier;
+ccl_device bool kernel_adaptive_sampling_convergence_check(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int y,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride)
+{
+ kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED);
+ kernel_assert(kernel_data.film.pass_sample_count != PASS_UNUSED);
+
+ const int render_pixel_index = offset + x + y * stride;
+ ccl_global float *buffer = render_buffer +
+ (uint64_t)render_pixel_index * kernel_data.film.pass_stride;
+
+ /* TODO(Stefan): Is this better in linear, sRGB or something else? */
+
+ const float4 A = kernel_read_pass_float4(buffer + kernel_data.film.pass_adaptive_aux_buffer);
+ if (!reset && A.w != 0.0f) {
+ /* If the pixel was considered converged, its state will not change in this kernel. Early
+ * out before doing any math.
+ *
+ * TODO(sergey): On a GPU it might be better to keep thread alive for better coherency? */
+ return true;
}
- if (kernel_data.film.use_light_pass) {
- int light_flag = kernel_data.film.light_pass_flag;
-
- if (light_flag & PASSMASK(MIST))
- *(ccl_global float *)(buffer + kernel_data.film.pass_mist) *= sample_multiplier;
-
- /* Shadow pass omitted on purpose. It has its own scale parameter. */
-
- if (light_flag & PASSMASK(DIFFUSE_INDIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_indirect) *= sample_multiplier;
- if (light_flag & PASSMASK(GLOSSY_INDIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_indirect) *= sample_multiplier;
- if (light_flag & PASSMASK(TRANSMISSION_INDIRECT))
- *(ccl_global float3 *)(buffer +
- kernel_data.film.pass_transmission_indirect) *= sample_multiplier;
- if (light_flag & PASSMASK(VOLUME_INDIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_indirect) *= sample_multiplier;
- if (light_flag & PASSMASK(DIFFUSE_DIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_direct) *= sample_multiplier;
- if (light_flag & PASSMASK(GLOSSY_DIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_direct) *= sample_multiplier;
- if (light_flag & PASSMASK(TRANSMISSION_DIRECT))
- *(ccl_global float3 *)(buffer +
- kernel_data.film.pass_transmission_direct) *= sample_multiplier;
- if (light_flag & PASSMASK(VOLUME_DIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_direct) *= sample_multiplier;
-
- if (light_flag & PASSMASK(EMISSION))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_emission) *= sample_multiplier;
- if (light_flag & PASSMASK(BACKGROUND))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_background) *= sample_multiplier;
- if (light_flag & PASSMASK(AO))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_ao) *= sample_multiplier;
-
- if (light_flag & PASSMASK(DIFFUSE_COLOR))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_color) *= sample_multiplier;
- if (light_flag & PASSMASK(GLOSSY_COLOR))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_color) *= sample_multiplier;
- if (light_flag & PASSMASK(TRANSMISSION_COLOR))
- *(ccl_global float3 *)(buffer +
- kernel_data.film.pass_transmission_color) *= sample_multiplier;
- }
-#endif
-
-#ifdef __DENOISING_FEATURES__
-
-# define scale_float3_variance(buffer, offset, scale) \
- *(buffer + offset) *= scale; \
- *(buffer + offset + 1) *= scale; \
- *(buffer + offset + 2) *= scale; \
- *(buffer + offset + 3) *= scale * scale; \
- *(buffer + offset + 4) *= scale * scale; \
- *(buffer + offset + 5) *= scale * scale;
-
-# define scale_shadow_variance(buffer, offset, scale) \
- *(buffer + offset) *= scale; \
- *(buffer + offset + 1) *= scale; \
- *(buffer + offset + 2) *= scale * scale;
-
- if (kernel_data.film.pass_denoising_data) {
- scale_shadow_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_A, sample_multiplier);
- scale_shadow_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_B, sample_multiplier);
- if (kernel_data.film.pass_denoising_clean) {
- scale_float3_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier);
- *(buffer + kernel_data.film.pass_denoising_clean) *= sample_multiplier;
- *(buffer + kernel_data.film.pass_denoising_clean + 1) *= sample_multiplier;
- *(buffer + kernel_data.film.pass_denoising_clean + 2) *= sample_multiplier;
- }
- else {
- scale_float3_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier);
- }
- scale_float3_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL, sample_multiplier);
- scale_float3_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO, sample_multiplier);
- *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH) *= sample_multiplier;
- *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH +
- 1) *= sample_multiplier * sample_multiplier;
- }
-#endif /* __DENOISING_FEATURES__ */
-
- /* Cryptomatte. */
- if (kernel_data.film.cryptomatte_passes) {
- int num_slots = 0;
- num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) ? 1 : 0;
- num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) ? 1 : 0;
- num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) ? 1 : 0;
- num_slots = num_slots * 2 * kernel_data.film.cryptomatte_depth;
- ccl_global float2 *id_buffer = (ccl_global float2 *)(buffer +
- kernel_data.film.pass_cryptomatte);
- for (int slot = 0; slot < num_slots; slot++) {
- id_buffer[slot].y *= sample_multiplier;
- }
- }
+ const float4 I = kernel_read_pass_float4(buffer + kernel_data.film.pass_combined);
- /* AOVs. */
- for (int i = 0; i < kernel_data.film.pass_aov_value_num; i++) {
- *(buffer + kernel_data.film.pass_aov_value + i) *= sample_multiplier;
- }
- for (int i = 0; i < kernel_data.film.pass_aov_color_num; i++) {
- *((ccl_global float4 *)(buffer + kernel_data.film.pass_aov_color) + i) *= sample_multiplier;
- }
+ const float sample = __float_as_uint(buffer[kernel_data.film.pass_sample_count]);
+ const float inv_sample = 1.0f / sample;
+
+ /* The per pixel error as seen in section 2.1 of
+ * "A hierarchical automatic stopping condition for Monte Carlo global illumination" */
+ const float error_difference = (fabsf(I.x - A.x) + fabsf(I.y - A.y) + fabsf(I.z - A.z)) *
+ inv_sample;
+ const float error_normalize = sqrtf((I.x + I.y + I.z) * inv_sample);
+ /* A small epsilon is added to the divisor to prevent division by zero. */
+ const float error = error_difference / (0.0001f + error_normalize);
+ const bool did_converge = (error < threshold);
+
+ const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;
+ buffer[aux_w_offset] = did_converge;
+
+ return did_converge;
}
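Written out in LaTeX, the error metric computed above (after section 2.1 of
"A hierarchical automatic stopping condition for Monte Carlo global
illumination") is, with I the accumulated combined pass, A the adaptive
auxiliary estimate and N the per-pixel sample count:

\mathrm{error} = \frac{\left(|I_x - A_x| + |I_y - A_y| + |I_z - A_z|\right)/N}
                      {0.0001 + \sqrt{(I_x + I_y + I_z)/N}}

The pixel is flagged converged when \mathrm{error} < \mathrm{threshold}; the
0.0001 term in the divisor prevents division by zero for black pixels.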
/* This is a simple box filter in two passes.
* When a pixel demands more adaptive samples, let its neighboring pixels draw more samples too. */
-ccl_device bool kernel_do_adaptive_filter_x(KernelGlobals *kg, int y, ccl_global WorkTile *tile)
+ccl_device void kernel_adaptive_sampling_filter_x(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int y,
+ int start_x,
+ int width,
+ int offset,
+ int stride)
{
- bool any = false;
+ kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED);
+
bool prev = false;
- for (int x = tile->x; x < tile->x + tile->w; ++x) {
- int index = tile->offset + x + y * tile->stride;
- ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
- ccl_global float4 *aux = (ccl_global float4 *)(buffer +
- kernel_data.film.pass_adaptive_aux_buffer);
- if ((*aux).w == 0.0f) {
- any = true;
- if (x > tile->x && !prev) {
+ for (int x = start_x; x < start_x + width; ++x) {
+ int index = offset + x + y * stride;
+ ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
+ const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;
+
+ if (buffer[aux_w_offset] == 0.0f) {
+ if (x > start_x && !prev) {
index = index - 1;
- buffer = tile->buffer + index * kernel_data.film.pass_stride;
- aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer);
- (*aux).w = 0.0f;
+ buffer = render_buffer + index * kernel_data.film.pass_stride;
+ buffer[aux_w_offset] = 0.0f;
}
prev = true;
}
else {
if (prev) {
- (*aux).w = 0.0f;
+ buffer[aux_w_offset] = 0.0f;
}
prev = false;
}
}
- return any;
}
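The row filter above is a one-pixel dilation of the "needs more samples"
state: an unconverged pixel (aux w == 0.0f) re-opens a converged neighbor on
either side. A self-contained C++ sketch of the same scan over a plain array
of flags, where filter_row is a hypothetical helper and true means "needs
samples":

#include <vector>

/* Mirror of kernel_adaptive_sampling_filter_x on one row of flags. */
static void filter_row(std::vector<bool> &needs_samples)
{
  bool prev = false;
  for (size_t x = 0; x < needs_samples.size(); ++x) {
    if (needs_samples[x]) {
      if (x > 0 && !prev) {
        needs_samples[x - 1] = true; /* Re-open the left neighbor. */
      }
      prev = true;
    }
    else {
      if (prev) {
        needs_samples[x] = true; /* Re-open the right neighbor. */
      }
      prev = false;
    }
  }
}

For example, the row {false, false, true, false, false} becomes
{false, true, true, true, false}: only the direct neighbors of the
unconverged pixel are re-opened.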
-ccl_device bool kernel_do_adaptive_filter_y(KernelGlobals *kg, int x, ccl_global WorkTile *tile)
+ccl_device void kernel_adaptive_sampling_filter_y(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int start_y,
+ int height,
+ int offset,
+ int stride)
{
+ kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED);
+
bool prev = false;
- bool any = false;
- for (int y = tile->y; y < tile->y + tile->h; ++y) {
- int index = tile->offset + x + y * tile->stride;
- ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
- ccl_global float4 *aux = (ccl_global float4 *)(buffer +
- kernel_data.film.pass_adaptive_aux_buffer);
- if ((*aux).w == 0.0f) {
- any = true;
- if (y > tile->y && !prev) {
- index = index - tile->stride;
- buffer = tile->buffer + index * kernel_data.film.pass_stride;
- aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer);
- (*aux).w = 0.0f;
+ for (int y = start_y; y < start_y + height; ++y) {
+ int index = offset + x + y * stride;
+ ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
+ const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;
+
+ if (buffer[aux_w_offset] == 0.0f) {
+ if (y > start_y && !prev) {
+ index = index - stride;
+ buffer = render_buffer + index * kernel_data.film.pass_stride;
+ buffer[aux_w_offset] = 0.0f;
}
prev = true;
}
else {
if (prev) {
- (*aux).w = 0.0f;
+ buffer[aux_w_offset] = 0.0f;
}
prev = false;
}
}
- return any;
}
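Because the filter is separable, the two kernels compose into the two-pass box
filter described above: run the x-filter once per row, then the y-filter once
per column. A plausible host-side driver, sketched under the assumption that
the rendered region starts at (0, 0) and that width, height, offset and stride
describe it as in the entry points above (the real scheduling lives in the
integrator, not in this header):

for (int y = 0; y < height; ++y) {
  kernel_adaptive_sampling_filter_x(kg, render_buffer, y, 0, width, offset, stride);
}
for (int x = 0; x < width; ++x) {
  kernel_adaptive_sampling_filter_y(kg, render_buffer, x, 0, height, offset, stride);
}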
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_ADAPTIVE_SAMPLING_H__ */