Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSergey Sharybin <sergey@blender.org>2021-10-05 16:05:12 +0300
committerSergey Sharybin <sergey@blender.org>2021-10-06 16:46:32 +0300
commitc6275da852eab77e2cea1ae601a43a2dbaad6c27 (patch)
treeb1b810367a23465e6b9188d9f862599bad07c3b3 /intern/cycles/integrator
parente41dddd29a17a77e60bde6a2336fcd3937819bec (diff)
Fix T91922: Cycles artifacts with high volume nested level
Make the volume stack conditionally allocated, potentially based on the actual nesting level of objects in the scene. Currently the nesting level is estimated from the number of volume objects. This is an inexpensive check which is probably enough in practice to get almost perfect memory usage and performance. The conditional allocation is a bit tricky. For the CPU we declare and define the maximum possible volume stack, because there are only that many integrator states on the CPU. On the GPU we declare the outer SoA to have all volume stack elements, but only allocate the ones actually needed. The actually used volume stack size is passed as a pre-processor define, which seems to be the easiest and fastest approach for the GPU state copy. There seems to be no speed regression in the demo files on an RTX 6000. Note that scenes with a high nesting level of volumes will now be slower but correct. Differential Revision: https://developer.blender.org/D12759
Diffstat (limited to 'intern/cycles/integrator')
-rw-r--r--intern/cycles/integrator/path_trace_work_gpu.cpp18
1 file changed, 16 insertions, 2 deletions
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index c29b0fb039e..8af8f9a02e2 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -23,6 +23,7 @@
#include "render/buffers.h"
#include "render/scene.h"
#include "util/util_logging.h"
+#include "util/util_string.h"
#include "util/util_tbb.h"
#include "util/util_time.h"
@@ -30,7 +31,7 @@
CCL_NAMESPACE_BEGIN
-static size_t estimate_single_state_size()
+static size_t estimate_single_state_size(DeviceScene *device_scene)
{
size_t state_size = 0;
@@ -45,12 +46,14 @@ static size_t estimate_single_state_size()
break; \
} \
}
+#define KERNEL_STRUCT_VOLUME_STACK_SIZE (device_scene->data.volume_stack_size)
#include "kernel/integrator/integrator_state_template.h"
#undef KERNEL_STRUCT_BEGIN
#undef KERNEL_STRUCT_MEMBER
#undef KERNEL_STRUCT_ARRAY_MEMBER
#undef KERNEL_STRUCT_END
#undef KERNEL_STRUCT_END_ARRAY
+#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
return state_size;
}
@@ -72,7 +75,7 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
work_tiles_(device, "work_tiles", MEM_READ_WRITE),
display_rgba_half_(device, "display buffer half", MEM_READ_WRITE),
- max_num_paths_(queue_->num_concurrent_states(estimate_single_state_size())),
+ max_num_paths_(queue_->num_concurrent_states(estimate_single_state_size(device_scene))),
min_num_active_paths_(queue_->num_concurrent_busy_states()),
max_active_path_index_(0)
{
@@ -125,12 +128,23 @@ void PathTraceWorkGPU::alloc_integrator_soa()
break; \
} \
}
+#define KERNEL_STRUCT_VOLUME_STACK_SIZE (device_scene_->data.volume_stack_size)
#include "kernel/integrator/integrator_state_template.h"
#undef KERNEL_STRUCT_BEGIN
#undef KERNEL_STRUCT_MEMBER
#undef KERNEL_STRUCT_ARRAY_MEMBER
#undef KERNEL_STRUCT_END
#undef KERNEL_STRUCT_END_ARRAY
+#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
+
+ if (VLOG_IS_ON(3)) {
+ size_t total_soa_size = 0;
+ for (auto &&soa_memory : integrator_state_soa_) {
+ total_soa_size += soa_memory->memory_size();
+ }
+
+ VLOG(3) << "GPU SoA state size: " << string_human_readable_size(total_soa_size);
+ }
}
void PathTraceWorkGPU::alloc_integrator_queue()