author | Sergey Sharybin <sergey@blender.org> | 2021-09-20 16:49:10 +0300
committer | Sergey Sharybin <sergey@blender.org> | 2021-09-20 18:43:57 +0300
commit | a9ad15a88f129a0593432e1217610ac840687497
tree | 9bd413ab0fd6db2572d1e0254f378721af52cbd0
parent | 2413da215160403d026d776bdbe0b883de5516ae
WIP: Cycles X: Schedule work based on occupancy
Sacrifice the refresh interval to increase GPU occupancy, lowering the
final render time.
Lowers the `Blender 2.80 - Spring` demo file from a predicted 30 min
(measured as 3 min to render 1/10th of the samples) to about 7.5 min.
This is still considerably longer than the master branch, which finishes
the file in just under 3 min, but it is already a better result.
The timings are from an RTX 5000.
The viewport and CPU rendering should stay unaffected by this change.
Differential Revision: https://developer.blender.org/D12570
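The core of the patch is a feedback loop: each device work reports how busy it kept the device, and the scheduler uses that trailing measurement to size the next batch of samples. A minimal standalone sketch of the scheduling heuristic (plain C++; the function name and the reduction of the scheduler state to plain parameters are illustrative, while the 0.5 threshold and 0.7 factor are the values from the diff below):

```cpp
#include <algorithm>
#include <cmath>

/* Sketch of the occupancy-based batch sizing from get_num_samples_to_path_trace().
 * `prev_num_samples` and `occupancy` describe the previous batch, `scheduled` is
 * the batch size chosen by the existing update-interval logic, and `max_samples`
 * is how many samples are left to render. */
int occupancy_based_batch_size(int prev_num_samples,
                               float occupancy,
                               int scheduled,
                               int max_samples)
{
  int num_samples_to_occupy = prev_num_samples;
  if (occupancy < 0.5f) {
    /* Device was under-occupied: grow the batch, targeting ~0.7 relative to the
     * measured occupancy of the previous batch. */
    num_samples_to_occupy = static_cast<int>(std::lround(prev_num_samples * 0.7f / occupancy));
  }
  /* Never shrink below what the interval logic wants, never exceed the remaining work. */
  return std::max(scheduled, std::min(num_samples_to_occupy, max_samples));
}
```

For example, if the previous batch of 8 samples ran at 0.25 occupancy, the sketch yields lround(8 * 0.7 / 0.25) = 22 samples for the next batch, trading display-refresh frequency for a busier device.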
-rw-r--r-- | intern/cycles/integrator/path_trace.cpp | 24
-rw-r--r-- | intern/cycles/integrator/path_trace_work.h | 6
-rw-r--r-- | intern/cycles/integrator/path_trace_work_cpu.cpp | 6
-rw-r--r-- | intern/cycles/integrator/path_trace_work_cpu.h | 4
-rw-r--r-- | intern/cycles/integrator/path_trace_work_gpu.cpp | 12
-rw-r--r-- | intern/cycles/integrator/path_trace_work_gpu.h | 4
-rw-r--r-- | intern/cycles/integrator/render_scheduler.cpp | 29
-rw-r--r-- | intern/cycles/integrator/render_scheduler.h | 8
-rw-r--r-- | intern/cycles/integrator/work_balancer.h | 3
9 files changed, 86 insertions, 10 deletions
diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
index 55f050d7833..5ce75464acd 100644
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -351,14 +351,32 @@ void PathTrace::path_trace(RenderWork &render_work)
   const double start_time = time_dt();
 
   const int num_works = path_trace_works_.size();
+
   tbb::parallel_for(0, num_works, [&](int i) {
     const double work_start_time = time_dt();
+    const int num_samples = render_work.path_trace.num_samples;
+
     PathTraceWork *path_trace_work = path_trace_works_[i].get();
-    path_trace_work->render_samples(render_work.path_trace.start_sample,
-                                    render_work.path_trace.num_samples);
-    work_balance_infos_[i].time_spent += time_dt() - work_start_time;
+
+    PathTraceWork::RenderStatistics statistics;
+    path_trace_work->render_samples(statistics, render_work.path_trace.start_sample, num_samples);
+
+    const double work_time = time_dt() - work_start_time;
+    work_balance_infos_[i].time_spent += work_time;
+    work_balance_infos_[i].occupancy = statistics.occupancy;
+
+    VLOG(3) << "Rendered " << num_samples << " samples in " << work_time << " seconds ("
+            << work_time / num_samples
+            << " seconds per sample), occupancy: " << statistics.occupancy;
   });
 
+  float occupancy_accum = 0.0f;
+  for (const WorkBalanceInfo &balance_info : work_balance_infos_) {
+    occupancy_accum += balance_info.occupancy;
+  }
+  const float occupancy = occupancy_accum / num_works;
+  render_scheduler_.report_path_trace_occupancy(render_work, occupancy);
+
   render_scheduler_.report_path_trace_time(
       render_work, time_dt() - start_time, is_cancel_requested());
 }
diff --git a/intern/cycles/integrator/path_trace_work.h b/intern/cycles/integrator/path_trace_work.h
index ca64c1c2ffd..97b97f3d888 100644
--- a/intern/cycles/integrator/path_trace_work.h
+++ b/intern/cycles/integrator/path_trace_work.h
@@ -33,6 +33,10 @@ class RenderBuffers;
 
 class PathTraceWork {
  public:
+  struct RenderStatistics {
+    float occupancy = 1.0f;
+  };
+
   /* Create path trace work which fits best the device.
    *
    * The cancel request flag is used for a cheap check whether cancel is to be performed as soon as
@@ -71,7 +75,7 @@ class PathTraceWork {
 
   /* Render given number of samples as a synchronous blocking call.
    * The samples are added to the render buffer associated with this work. */
-  virtual void render_samples(int start_sample, int samples_num) = 0;
+  virtual void render_samples(RenderStatistics &statistics, int start_sample, int samples_num) = 0;
 
   /* Copy render result from this work to the corresponding place of the GPU display.
    *
diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp
index eaed0d0d636..b9a33b64051 100644
--- a/intern/cycles/integrator/path_trace_work_cpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -67,7 +67,9 @@ void PathTraceWorkCPU::init_execution()
   device_->get_cpu_kernel_thread_globals(kernel_thread_globals_);
 }
 
-void PathTraceWorkCPU::render_samples(int start_sample, int samples_num)
+void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
+                                      int start_sample,
+                                      int samples_num)
 {
   const int64_t image_width = effective_buffer_params_.width;
   const int64_t image_height = effective_buffer_params_.height;
@@ -106,6 +108,8 @@ void PathTraceWorkCPU::render_samples(int start_sample, int samples_num)
   for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
     kernel_globals.stop_profiling();
   }
+
+  statistics.occupancy = 1.0f;
 }
 
 void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobals *kernel_globals,
diff --git a/intern/cycles/integrator/path_trace_work_cpu.h b/intern/cycles/integrator/path_trace_work_cpu.h
index 0ea901e452d..ab729bbf879 100644
--- a/intern/cycles/integrator/path_trace_work_cpu.h
+++ b/intern/cycles/integrator/path_trace_work_cpu.h
@@ -46,7 +46,9 @@ class PathTraceWorkCPU : public PathTraceWork {
 
   virtual void init_execution() override;
 
-  virtual void render_samples(int start_sample, int samples_num) override;
+  virtual void render_samples(RenderStatistics &statistics,
+                              int start_sample,
+                              int samples_num) override;
 
   virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
                                    PassMode pass_mode,
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index 7f15237ddbf..10baf869aa6 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -180,7 +180,9 @@ void PathTraceWorkGPU::init_execution()
       "__integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_));
 }
 
-void PathTraceWorkGPU::render_samples(int start_sample, int samples_num)
+void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
+                                      int start_sample,
+                                      int samples_num)
 {
   /* Limit number of states for the tile and rely on a greedy scheduling of tiles. This allows to
    * add more work (because tiles are smaller, so there is higher chance that more paths will
@@ -192,6 +194,9 @@ void PathTraceWorkGPU::render_samples(int start_sample, int samples_num)
 
   enqueue_reset();
 
+  int num_iterations = 0;
+  uint64_t num_busy_accum = 0;
+
   /* TODO: set a hard limit in case of undetected kernel failures? */
   while (true) {
     /* Enqueue work from the scheduler, on start or when there are not enough
@@ -228,7 +233,12 @@ void PathTraceWorkGPU::render_samples(int start_sample, int samples_num)
     if (is_cancel_requested()) {
      break;
     }
+
+    num_busy_accum += get_num_active_paths();
+    ++num_iterations;
   }
+
+  statistics.occupancy = static_cast<float>(num_busy_accum) / num_iterations / max_num_paths_;
 }
 
 DeviceKernel PathTraceWorkGPU::get_most_queued_kernel() const
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index aee54d4a372..38788122b0d 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -44,7 +44,9 @@ class PathTraceWorkGPU : public PathTraceWork {
   virtual void alloc_work_memory() override;
   virtual void init_execution() override;
 
-  virtual void render_samples(int start_sample, int samples_num) override;
+  virtual void render_samples(RenderStatistics &statistics,
+                              int start_sample,
+                              int samples_num) override;
 
   virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
                                    PassMode pass_mode,
diff --git a/intern/cycles/integrator/render_scheduler.cpp b/intern/cycles/integrator/render_scheduler.cpp
index 50017daca38..4eb1dd941f9 100644
--- a/intern/cycles/integrator/render_scheduler.cpp
+++ b/intern/cycles/integrator/render_scheduler.cpp
@@ -155,6 +155,9 @@ void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples)
   state_.end_render_time = 0.0;
   state_.time_limit_reached = false;
 
+  state_.occupancy_num_samples = 0;
+  state_.occupancy = 1.0f;
+
   first_render_time_.path_trace_per_sample = 0.0;
   first_render_time_.denoise_time = 0.0;
   first_render_time_.display_update_time = 0.0;
@@ -475,6 +478,13 @@ void RenderScheduler::report_path_trace_time(const RenderWork &render_work,
   VLOG(4) << "Average path tracing time: " << path_trace_time_.get_average() << " seconds.";
 }
 
+void RenderScheduler::report_path_trace_occupancy(const RenderWork &render_work, float occupancy)
+{
+  state_.occupancy_num_samples = render_work.path_trace.num_samples;
+  state_.occupancy = occupancy;
+  VLOG(4) << "Measured path tracing occupancy: " << occupancy;
+}
+
 void RenderScheduler::report_adaptive_filter_time(const RenderWork &render_work,
                                                   double time,
                                                   bool is_cancelled)
@@ -803,8 +813,23 @@ int RenderScheduler::get_num_samples_to_path_trace() const
    * more than N samples. */
   const int num_samples_pot = round_num_samples_to_power_of_2(num_samples_per_update);
 
-  const int num_samples_to_render = min(num_samples_pot,
-                                        start_sample_ + num_samples_ - path_trace_start_sample);
+  const int max_num_samples_to_render = start_sample_ + num_samples_ - path_trace_start_sample;
+
+  int num_samples_to_render = min(num_samples_pot, max_num_samples_to_render);
+
+  /* When enough statistics is available and doing an offline rendering prefer to keep device
+   * occupied. */
+  if (state_.occupancy_num_samples && (background_ || headless_)) {
+    /* Keep occupancy at about 0.5 (this is more of an empirical figure which seems to match scenes
+     * with good performance without forcing occupancy to be higher). */
+    int num_samples_to_occupy = state_.occupancy_num_samples;
+    if (state_.occupancy < 0.5f) {
+      num_samples_to_occupy = lround(state_.occupancy_num_samples * 0.7f / state_.occupancy);
+    }
+
+    num_samples_to_render = max(num_samples_to_render,
+                                min(num_samples_to_occupy, max_num_samples_to_render));
+  }
 
   /* If adaptive sampling is not used, render as many samples per update as possible, keeping the
    * device fully occupied, without much overhead of display updates. */
diff --git a/intern/cycles/integrator/render_scheduler.h b/intern/cycles/integrator/render_scheduler.h
index 10fbcc52cd6..9c2d107e46d 100644
--- a/intern/cycles/integrator/render_scheduler.h
+++ b/intern/cycles/integrator/render_scheduler.h
@@ -186,6 +186,7 @@ class RenderScheduler {
 
   /* Report time (in seconds) which corresponding part of work took. */
   void report_path_trace_time(const RenderWork &render_work, double time, bool is_cancelled);
+  void report_path_trace_occupancy(const RenderWork &render_work, float occupancy);
   void report_adaptive_filter_time(const RenderWork &render_work, double time, bool is_cancelled);
   void report_denoise_time(const RenderWork &render_work, double time);
   void report_display_update_time(const RenderWork &render_work, double time);
@@ -380,6 +381,13 @@ class RenderScheduler {
     /* Time at which rendering started and finished. */
     double start_render_time = 0.0;
     double end_render_time = 0.0;
+
+    /* Measured occupancy of the render devices, normalized to the number of samples.
+     *
+     * In a way it is "trailing": when scheduling new work this occupancy is measured when the
+     * previous work was rendered. */
+    int occupancy_num_samples = 0;
+    float occupancy = 1.0f;
   } state_;
 
   /* Timing of tasks which were performed at the very first render work at 100% of the
diff --git a/intern/cycles/integrator/work_balancer.h b/intern/cycles/integrator/work_balancer.h
index a2f83ef7f24..94e20ecf054 100644
--- a/intern/cycles/integrator/work_balancer.h
+++ b/intern/cycles/integrator/work_balancer.h
@@ -24,6 +24,9 @@ struct WorkBalanceInfo {
   /* Time spent performing corresponding work. */
   double time_spent = 0;
 
+  /* Average occupancy of the device while performing the work. */
+  float occupancy = 1.0f;
+
   /* Normalized weight, which is ready to be used for work balancing (like calculating fraction of
    * the big tile which is to be rendered on the device). */
   double weight = 1.0;
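For completeness, the occupancy the GPU work reports is just the mean number of active path states per wavefront iteration, normalized by the size of the path-state pool (max_num_paths_). A self-contained sketch of that arithmetic, with made-up per-iteration counts standing in for get_num_active_paths():

```cpp
#include <cstdint>
#include <cstdio>

int main()
{
  const int max_num_paths = 1024; /* stands in for max_num_paths_ */

  /* Pretend three wavefront iterations had 1024, 512 and 128 active paths. */
  const int active_paths[] = {1024, 512, 128};

  uint64_t num_busy_accum = 0;
  int num_iterations = 0;
  for (const int num_active : active_paths) {
    num_busy_accum += num_active; /* mirrors num_busy_accum += get_num_active_paths(); */
    ++num_iterations;
  }

  /* Same expression as in PathTraceWorkGPU::render_samples(). */
  const float occupancy = static_cast<float>(num_busy_accum) / num_iterations / max_num_paths;
  std::printf("occupancy: %.3f\n", occupancy); /* (1024 + 512 + 128) / 3 / 1024 = ~0.542 */
  return 0;
}
```

With 0.542 above the 0.5 threshold the scheduler would leave the next batch size alone; a tail-heavy batch that drops below 0.5 is what triggers the batch growth.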