Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/intern
diff options
context:
space:
mode:
authorSergey Sharybin <sergey@blender.org>2021-10-05 16:05:12 +0300
committerSergey Sharybin <sergey@blender.org>2021-10-06 16:46:32 +0300
commitc6275da852eab77e2cea1ae601a43a2dbaad6c27 (patch)
treeb1b810367a23465e6b9188d9f862599bad07c3b3 /intern
parente41dddd29a17a77e60bde6a2336fcd3937819bec (diff)
Fix T91922: Cycles artifacts with high volume nested level
Make volume stack allocated conditionally, potentially based on the actual nested level of objects in the scene. Currently the nested level is estimated by number of volume objects. This is a non-expensive check which is probably enough in practice to get almost perfect memory usage and performance. The conditional allocation is a bit tricky. For the CPU we declare and define maximum possible volume stack, because there are only that many integrator states on the CPU. On the GPU we declare outer SoA to have all volume stack elements, but only allocate actually needed ones. The actually used volume stack size is passed as a pre-processor, which seems to be easiest and fastest for the GPU state copy. There seems to be no speed regression in the demo files on RTX6000. Note that scenes with high nested level of volume will now be slower but correct. Differential Revision: https://developer.blender.org/D12759
Diffstat (limited to 'intern')
-rw-r--r--intern/cycles/integrator/path_trace_work_gpu.cpp18
-rw-r--r--intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h27
-rw-r--r--intern/cycles/kernel/integrator/integrator_state.h6
-rw-r--r--intern/cycles/kernel/integrator/integrator_state_template.h8
-rw-r--r--intern/cycles/kernel/integrator/integrator_state_util.h5
-rw-r--r--intern/cycles/kernel/integrator/integrator_volume_stack.h2
-rw-r--r--intern/cycles/kernel/kernel_types.h10
-rw-r--r--intern/cycles/render/graph.cpp4
-rw-r--r--intern/cycles/render/object.cpp16
-rw-r--r--intern/cycles/render/object.h7
-rw-r--r--intern/cycles/render/scene.cpp29
-rw-r--r--intern/cycles/render/scene.h3
12 files changed, 112 insertions, 23 deletions
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index c29b0fb039e..8af8f9a02e2 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -23,6 +23,7 @@
#include "render/buffers.h"
#include "render/scene.h"
#include "util/util_logging.h"
+#include "util/util_string.h"
#include "util/util_tbb.h"
#include "util/util_time.h"
@@ -30,7 +31,7 @@
CCL_NAMESPACE_BEGIN
-static size_t estimate_single_state_size()
+static size_t estimate_single_state_size(DeviceScene *device_scene)
{
size_t state_size = 0;
@@ -45,12 +46,14 @@ static size_t estimate_single_state_size()
break; \
} \
}
+#define KERNEL_STRUCT_VOLUME_STACK_SIZE (device_scene->data.volume_stack_size)
#include "kernel/integrator/integrator_state_template.h"
#undef KERNEL_STRUCT_BEGIN
#undef KERNEL_STRUCT_MEMBER
#undef KERNEL_STRUCT_ARRAY_MEMBER
#undef KERNEL_STRUCT_END
#undef KERNEL_STRUCT_END_ARRAY
+#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
return state_size;
}
@@ -72,7 +75,7 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
work_tiles_(device, "work_tiles", MEM_READ_WRITE),
display_rgba_half_(device, "display buffer half", MEM_READ_WRITE),
- max_num_paths_(queue_->num_concurrent_states(estimate_single_state_size())),
+ max_num_paths_(queue_->num_concurrent_states(estimate_single_state_size(device_scene))),
min_num_active_paths_(queue_->num_concurrent_busy_states()),
max_active_path_index_(0)
{
@@ -125,12 +128,23 @@ void PathTraceWorkGPU::alloc_integrator_soa()
break; \
} \
}
+#define KERNEL_STRUCT_VOLUME_STACK_SIZE (device_scene_->data.volume_stack_size)
#include "kernel/integrator/integrator_state_template.h"
#undef KERNEL_STRUCT_BEGIN
#undef KERNEL_STRUCT_MEMBER
#undef KERNEL_STRUCT_ARRAY_MEMBER
#undef KERNEL_STRUCT_END
#undef KERNEL_STRUCT_END_ARRAY
+#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
+
+ if (VLOG_IS_ON(3)) {
+ size_t total_soa_size = 0;
+ for (auto &&soa_memory : integrator_state_soa_) {
+ total_soa_size += soa_memory->memory_size();
+ }
+
+ VLOG(3) << "GPU SoA state size: " << string_human_readable_size(total_soa_size);
+ }
}
void PathTraceWorkGPU::alloc_integrator_queue()
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h b/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h
index 60d8a8e3e54..99f6cf35e9e 100644
--- a/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h
+++ b/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h
@@ -38,10 +38,13 @@ ccl_device void integrator_volume_stack_update_for_subsurface(INTEGRATOR_STATE_A
volume_ray.P = from_P;
volume_ray.D = normalize_len(to_P - from_P, &volume_ray.t);
+ /* Store to avoid global fetches on every intersection step. */
+ const uint volume_stack_size = kernel_data.volume_stack_size;
+
#ifdef __VOLUME_RECORD_ALL__
- Intersection hits[2 * VOLUME_STACK_SIZE + 1];
+ Intersection hits[2 * volume_stack_size + 1];
uint num_hits = scene_intersect_volume_all(
- kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, PATH_RAY_ALL_VISIBILITY);
+ kg, &volume_ray, hits, 2 * volume_stack_size, PATH_RAY_ALL_VISIBILITY);
if (num_hits > 0) {
Intersection *isect = hits;
@@ -55,7 +58,7 @@ ccl_device void integrator_volume_stack_update_for_subsurface(INTEGRATOR_STATE_A
#else
Intersection isect;
int step = 0;
- while (step < 2 * VOLUME_STACK_SIZE &&
+ while (step < 2 * volume_stack_size &&
scene_intersect_volume(kg, &volume_ray, &isect, PATH_RAY_ALL_VISIBILITY)) {
shader_setup_from_ray(kg, stack_sd, &volume_ray, &isect);
volume_stack_enter_exit(INTEGRATOR_STATE_PASS, stack_sd);
@@ -91,12 +94,15 @@ ccl_device void integrator_intersect_volume_stack(INTEGRATOR_STATE_ARGS)
stack_index++;
}
+ /* Store to avoid global fetches on every intersection step. */
+ const uint volume_stack_size = kernel_data.volume_stack_size;
+
#ifdef __VOLUME_RECORD_ALL__
- Intersection hits[2 * VOLUME_STACK_SIZE + 1];
+ Intersection hits[2 * volume_stack_size + 1];
uint num_hits = scene_intersect_volume_all(
- kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, visibility);
+ kg, &volume_ray, hits, 2 * volume_stack_size, visibility);
if (num_hits > 0) {
- int enclosed_volumes[VOLUME_STACK_SIZE];
+ int enclosed_volumes[volume_stack_size];
Intersection *isect = hits;
qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
@@ -121,7 +127,7 @@ ccl_device void integrator_intersect_volume_stack(INTEGRATOR_STATE_ARGS)
break;
}
}
- if (need_add && stack_index < VOLUME_STACK_SIZE - 1) {
+ if (need_add && stack_index < volume_stack_size - 1) {
const VolumeStack new_entry = {stack_sd->object, stack_sd->shader};
integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, stack_index, new_entry);
++stack_index;
@@ -136,11 +142,12 @@ ccl_device void integrator_intersect_volume_stack(INTEGRATOR_STATE_ARGS)
}
}
#else
- int enclosed_volumes[VOLUME_STACK_SIZE];
+  /* CUDA does not support definition of variable-size arrays, so use the maximum possible. */
+ int enclosed_volumes[MAX_VOLUME_STACK_SIZE];
int step = 0;
- while (stack_index < VOLUME_STACK_SIZE - 1 && enclosed_index < VOLUME_STACK_SIZE - 1 &&
- step < 2 * VOLUME_STACK_SIZE) {
+ while (stack_index < volume_stack_size - 1 && enclosed_index < volume_stack_size - 1 &&
+ step < 2 * volume_stack_size) {
Intersection isect;
if (!scene_intersect_volume(kg, &volume_ray, &isect, visibility)) {
break;
diff --git a/intern/cycles/kernel/integrator/integrator_state.h b/intern/cycles/kernel/integrator/integrator_state.h
index f745ad3f4b9..efc7576d95b 100644
--- a/intern/cycles/kernel/integrator/integrator_state.h
+++ b/intern/cycles/kernel/integrator/integrator_state.h
@@ -59,8 +59,6 @@ CCL_NAMESPACE_BEGIN
*
* TODO: these could be made dynamic depending on the features used in the scene. */
-#define INTEGRATOR_VOLUME_STACK_SIZE VOLUME_STACK_SIZE
-
#define INTEGRATOR_SHADOW_ISECT_SIZE_CPU 1024
#define INTEGRATOR_SHADOW_ISECT_SIZE_GPU 4
@@ -85,12 +83,14 @@ typedef struct IntegratorStateCPU {
#define KERNEL_STRUCT_END_ARRAY(name, cpu_size, gpu_size) \
} \
name[cpu_size];
+#define KERNEL_STRUCT_VOLUME_STACK_SIZE MAX_VOLUME_STACK_SIZE
#include "kernel/integrator/integrator_state_template.h"
#undef KERNEL_STRUCT_BEGIN
#undef KERNEL_STRUCT_MEMBER
#undef KERNEL_STRUCT_ARRAY_MEMBER
#undef KERNEL_STRUCT_END
#undef KERNEL_STRUCT_END_ARRAY
+#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
} IntegratorStateCPU;
/* Path Queue
@@ -114,12 +114,14 @@ typedef struct IntegratorStateGPU {
#define KERNEL_STRUCT_END_ARRAY(name, cpu_size, gpu_size) \
} \
name[gpu_size];
+#define KERNEL_STRUCT_VOLUME_STACK_SIZE MAX_VOLUME_STACK_SIZE
#include "kernel/integrator/integrator_state_template.h"
#undef KERNEL_STRUCT_BEGIN
#undef KERNEL_STRUCT_MEMBER
#undef KERNEL_STRUCT_ARRAY_MEMBER
#undef KERNEL_STRUCT_END
#undef KERNEL_STRUCT_END_ARRAY
+#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
/* Count number of queued kernels. */
IntegratorQueueCounter *queue_counter;
diff --git a/intern/cycles/kernel/integrator/integrator_state_template.h b/intern/cycles/kernel/integrator/integrator_state_template.h
index 0d8126c64aa..15998ee6edf 100644
--- a/intern/cycles/kernel/integrator/integrator_state_template.h
+++ b/intern/cycles/kernel/integrator/integrator_state_template.h
@@ -107,7 +107,9 @@ KERNEL_STRUCT_END(subsurface)
KERNEL_STRUCT_BEGIN(volume_stack)
KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, object, KERNEL_FEATURE_VOLUME)
KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, shader, KERNEL_FEATURE_VOLUME)
-KERNEL_STRUCT_END_ARRAY(volume_stack, INTEGRATOR_VOLUME_STACK_SIZE, INTEGRATOR_VOLUME_STACK_SIZE)
+KERNEL_STRUCT_END_ARRAY(volume_stack,
+ KERNEL_STRUCT_VOLUME_STACK_SIZE,
+ KERNEL_STRUCT_VOLUME_STACK_SIZE)
/********************************* Shadow Path State **************************/
@@ -163,5 +165,5 @@ KERNEL_STRUCT_BEGIN(shadow_volume_stack)
KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, object, KERNEL_FEATURE_VOLUME)
KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, shader, KERNEL_FEATURE_VOLUME)
KERNEL_STRUCT_END_ARRAY(shadow_volume_stack,
- INTEGRATOR_VOLUME_STACK_SIZE,
- INTEGRATOR_VOLUME_STACK_SIZE)
+ KERNEL_STRUCT_VOLUME_STACK_SIZE,
+ KERNEL_STRUCT_VOLUME_STACK_SIZE)
diff --git a/intern/cycles/kernel/integrator/integrator_state_util.h b/intern/cycles/kernel/integrator/integrator_state_util.h
index 08d6cb00114..453ec49c7b0 100644
--- a/intern/cycles/kernel/integrator/integrator_state_util.h
+++ b/intern/cycles/kernel/integrator/integrator_state_util.h
@@ -155,7 +155,7 @@ ccl_device_forceinline void integrator_state_read_shadow_isect(INTEGRATOR_STATE_
ccl_device_forceinline void integrator_state_copy_volume_stack_to_shadow(INTEGRATOR_STATE_ARGS)
{
if (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) {
- for (int i = 0; i < INTEGRATOR_VOLUME_STACK_SIZE; i++) {
+ for (int i = 0; i < kernel_data.volume_stack_size; i++) {
INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, object) = INTEGRATOR_STATE_ARRAY(
volume_stack, i, object);
INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, shader) = INTEGRATOR_STATE_ARRAY(
@@ -223,6 +223,8 @@ ccl_device_inline void integrator_state_copy_only(const IntegratorState to_state
while (index < gpu_array_size) \
;
+# define KERNEL_STRUCT_VOLUME_STACK_SIZE kernel_data.volume_stack_size
+
# include "kernel/integrator/integrator_state_template.h"
# undef KERNEL_STRUCT_BEGIN
@@ -230,6 +232,7 @@ ccl_device_inline void integrator_state_copy_only(const IntegratorState to_state
# undef KERNEL_STRUCT_ARRAY_MEMBER
# undef KERNEL_STRUCT_END
# undef KERNEL_STRUCT_END_ARRAY
+# undef KERNEL_STRUCT_VOLUME_STACK_SIZE
}
ccl_device_inline void integrator_state_move(const IntegratorState to_state,
diff --git a/intern/cycles/kernel/integrator/integrator_volume_stack.h b/intern/cycles/kernel/integrator/integrator_volume_stack.h
index d53070095f0..01ebf8376b1 100644
--- a/intern/cycles/kernel/integrator/integrator_volume_stack.h
+++ b/intern/cycles/kernel/integrator/integrator_volume_stack.h
@@ -72,7 +72,7 @@ ccl_device void volume_stack_enter_exit(INTEGRATOR_STATE_ARGS,
}
/* If we exceed the stack limit, ignore. */
- if (i >= VOLUME_STACK_SIZE - 1) {
+ if (i >= kernel_data.volume_stack_size - 1) {
return;
}
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 6107e1028ba..22dde3537eb 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -61,8 +61,6 @@ CCL_NAMESPACE_BEGIN
#define ID_NONE (0.0f)
#define PASS_UNUSED (~0)
-#define VOLUME_STACK_SIZE 4
-
/* Kernel features */
#define __SOBOL__
#define __DPDU__
@@ -608,6 +606,12 @@ typedef struct AttributeDescriptor {
# define MAX_CLOSURE __MAX_CLOSURE__
#endif
+#ifndef __MAX_VOLUME_STACK_SIZE__
+# define MAX_VOLUME_STACK_SIZE 32
+#else
+# define MAX_VOLUME_STACK_SIZE __MAX_VOLUME_STACK_SIZE__
+#endif
+
#define MAX_VOLUME_CLOSURE 8
/* This struct is the base class for all closures. The common members are
@@ -1223,7 +1227,7 @@ typedef struct KernelData {
uint kernel_features;
uint max_closures;
uint max_shaders;
- uint pad;
+ uint volume_stack_size;
KernelCamera cam;
KernelFilm film;
diff --git a/intern/cycles/render/graph.cpp b/intern/cycles/render/graph.cpp
index e9da48b624d..ee1a6e68bcf 100644
--- a/intern/cycles/render/graph.cpp
+++ b/intern/cycles/render/graph.cpp
@@ -1149,7 +1149,9 @@ int ShaderGraph::get_num_closures()
num_closures += 8;
}
else if (CLOSURE_IS_VOLUME(closure_type)) {
- num_closures += VOLUME_STACK_SIZE;
+ /* TODO(sergey): Verify this is still needed, since we have special minimized volume storage
+ * for the volume steps. */
+ num_closures += MAX_VOLUME_STACK_SIZE;
}
else if (closure_type == CLOSURE_BSDF_HAIR_PRINCIPLED_ID) {
num_closures += 4;
diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp
index 4637f8fe989..1320a5eb7a6 100644
--- a/intern/cycles/render/object.cpp
+++ b/intern/cycles/render/object.cpp
@@ -366,6 +366,22 @@ float Object::compute_volume_step_size() const
return step_size;
}
+bool Object::check_is_volume() const
+{
+ if (geometry->geometry_type == Geometry::VOLUME) {
+ return true;
+ }
+
+ for (Node *node : get_geometry()->get_used_shaders()) {
+ const Shader *shader = static_cast<const Shader *>(node);
+ if (shader->has_volume_connected) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
int Object::get_device_index() const
{
return index;
diff --git a/intern/cycles/render/object.h b/intern/cycles/render/object.h
index c52ddce48da..84e2dfffebb 100644
--- a/intern/cycles/render/object.h
+++ b/intern/cycles/render/object.h
@@ -109,6 +109,13 @@ class Object : public Node {
/* Compute step size from attributes, shaders, transforms. */
float compute_volume_step_size() const;
+ /* Check whether this object requires volume sampling (and hence might require space in the
+ * volume stack).
+ *
+   * Note that this is a naive iteration over shaders, which allows accessing information prior
+ * to `scene_update()`. */
+ bool check_is_volume() const;
+
protected:
/* Specifies the position of the object in scene->objects and
* in the device vectors. Gets set in device_update. */
diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp
index a4b030190dc..ecd6946bbf8 100644
--- a/intern/cycles/render/scene.cpp
+++ b/intern/cycles/render/scene.cpp
@@ -527,6 +527,8 @@ void Scene::update_kernel_features()
const uint max_closures = (params.background) ? get_max_closure_count() : MAX_CLOSURE;
dscene.data.max_closures = max_closures;
dscene.data.max_shaders = shaders.size();
+
+ dscene.data.volume_stack_size = get_volume_stack_size();
}
bool Scene::update(Progress &progress)
@@ -642,6 +644,33 @@ int Scene::get_max_closure_count()
return max_closure_global;
}
+int Scene::get_volume_stack_size() const
+{
+ /* Quick non-expensive check. Can over-estimate maximum possible nested level, but does not
+ * require expensive calculation during pre-processing. */
+ int num_volume_objects = 0;
+ for (const Object *object : objects) {
+ if (object->check_is_volume()) {
+ ++num_volume_objects;
+ }
+
+ if (num_volume_objects == MAX_VOLUME_STACK_SIZE) {
+ break;
+ }
+ }
+
+ /* Count background world for the stack. */
+ const Shader *background_shader = background->get_shader(this);
+ if (background_shader && background_shader->has_volume_connected) {
+ ++num_volume_objects;
+ }
+
+ /* Space for terminator. */
+ ++num_volume_objects;
+
+ return min(num_volume_objects, MAX_VOLUME_STACK_SIZE);
+}
+
bool Scene::has_shadow_catcher()
{
if (shadow_catcher_modified_) {
diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h
index cf4a3ba6b12..8076d0dc09c 100644
--- a/intern/cycles/render/scene.h
+++ b/intern/cycles/render/scene.h
@@ -344,6 +344,9 @@ class Scene : public NodeOwner {
/* Get maximum number of closures to be used in kernel. */
int get_max_closure_count();
+ /* Get size of a volume stack needed to render this scene. */
+ int get_volume_stack_size() const;
+
template<typename T> void delete_node_impl(T *node)
{
delete node;