git.blender.org/blender.git
author     Sergey Sharybin <sergey@blender.org>  2021-09-20 16:49:10 +0300
committer  Sergey Sharybin <sergey@blender.org>  2021-09-20 18:43:57 +0300
commit     a9ad15a88f129a0593432e1217610ac840687497 (patch)
tree       9bd413ab0fd6db2572d1e0254f378721af52cbd0
parent     2413da215160403d026d776bdbe0b883de5516ae (diff)
WIP: Cycles X: Schedule work based on occupancy
Sacrifice refresh interval and increase GPU occupancy, lowering the final render time.

Lowers the `Blender 2.80 - Spring` demo file from a predicted 30 min (measured 3 min to render 1/10th of the samples) to about 7.5 min. This is still considerably slower than the master branch, which finishes the file in just under 3 min, but it is already a better result. The timing is from an RTX 5000.

The viewport and CPU rendering should stay unaffected by this change.

Differential Revision: https://developer.blender.org/D12570
-rw-r--r--  intern/cycles/integrator/path_trace.cpp           | 24
-rw-r--r--  intern/cycles/integrator/path_trace_work.h        |  6
-rw-r--r--  intern/cycles/integrator/path_trace_work_cpu.cpp  |  6
-rw-r--r--  intern/cycles/integrator/path_trace_work_cpu.h    |  4
-rw-r--r--  intern/cycles/integrator/path_trace_work_gpu.cpp  | 12
-rw-r--r--  intern/cycles/integrator/path_trace_work_gpu.h    |  4
-rw-r--r--  intern/cycles/integrator/render_scheduler.cpp     | 29
-rw-r--r--  intern/cycles/integrator/render_scheduler.h       |  8
-rw-r--r--  intern/cycles/integrator/work_balancer.h          |  3
9 files changed, 86 insertions(+), 10 deletions(-)
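The intent of the patch, as a minimal standalone sketch (the names and signatures here are illustrative assumptions, not the actual Cycles code; the real logic is spread across the diffs below): each PathTraceWork measures what fraction of the GPU path-state pool stayed busy while rendering its batch of samples, PathTrace averages the per-device numbers, and RenderScheduler grows the next batch when the measured occupancy of the previous one drops below 0.5.

/* Illustrative sketch only: simplified stand-ins for PathTrace/RenderScheduler. */
#include <algorithm>
#include <cmath>
#include <vector>

struct WorkOccupancy {
  float occupancy = 1.0f; /* Fraction of path states kept busy by one device. */
};

/* Average the per-device occupancy, as path_trace() does after the parallel_for. */
float average_occupancy(const std::vector<WorkOccupancy> &works)
{
  float accum = 0.0f;
  for (const WorkOccupancy &work : works) {
    accum += work.occupancy;
  }
  return works.empty() ? 1.0f : accum / works.size();
}

/* Grow the next batch of samples when the previous batch under-occupied the
 * device: below 0.5 occupancy, scale the sample count to aim at roughly 0.7
 * occupancy, clamped to the samples still remaining in the render. */
int next_num_samples(int previous_num_samples, float occupancy, int samples_remaining)
{
  int num_samples = previous_num_samples;
  if (occupancy < 0.5f) {
    num_samples = static_cast<int>(std::lround(previous_num_samples * 0.7f / occupancy));
  }
  return std::min(num_samples, samples_remaining);
}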
diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
index 55f050d7833..5ce75464acd 100644
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -351,14 +351,32 @@ void PathTrace::path_trace(RenderWork &render_work)
const double start_time = time_dt();
const int num_works = path_trace_works_.size();
+
tbb::parallel_for(0, num_works, [&](int i) {
const double work_start_time = time_dt();
+ const int num_samples = render_work.path_trace.num_samples;
+
PathTraceWork *path_trace_work = path_trace_works_[i].get();
- path_trace_work->render_samples(render_work.path_trace.start_sample,
- render_work.path_trace.num_samples);
- work_balance_infos_[i].time_spent += time_dt() - work_start_time;
+
+ PathTraceWork::RenderStatistics statistics;
+ path_trace_work->render_samples(statistics, render_work.path_trace.start_sample, num_samples);
+
+ const double work_time = time_dt() - work_start_time;
+ work_balance_infos_[i].time_spent += work_time;
+ work_balance_infos_[i].occupancy = statistics.occupancy;
+
+ VLOG(3) << "Rendered " << num_samples << " samples in " << work_time << " seconds ("
+ << work_time / num_samples
+ << " seconds per sample), occupancy: " << statistics.occupancy;
});
+ float occupancy_accum = 0.0f;
+ for (const WorkBalanceInfo &balance_info : work_balance_infos_) {
+ occupancy_accum += balance_info.occupancy;
+ }
+ const float occupancy = occupancy_accum / num_works;
+ render_scheduler_.report_path_trace_occupancy(render_work, occupancy);
+
render_scheduler_.report_path_trace_time(
render_work, time_dt() - start_time, is_cancel_requested());
}
diff --git a/intern/cycles/integrator/path_trace_work.h b/intern/cycles/integrator/path_trace_work.h
index ca64c1c2ffd..97b97f3d888 100644
--- a/intern/cycles/integrator/path_trace_work.h
+++ b/intern/cycles/integrator/path_trace_work.h
@@ -33,6 +33,10 @@ class RenderBuffers;
class PathTraceWork {
public:
+ struct RenderStatistics {
+ float occupancy = 1.0f;
+ };
+
/* Create path trace work which fits best the device.
*
* The cancel request flag is used for a cheap check whether cancel is to be performed as soon as
@@ -71,7 +75,7 @@ class PathTraceWork {
/* Render given number of samples as a synchronous blocking call.
* The samples are added to the render buffer associated with this work. */
- virtual void render_samples(int start_sample, int samples_num) = 0;
+ virtual void render_samples(RenderStatistics &statistics, int start_sample, int samples_num) = 0;
/* Copy render result from this work to the corresponding place of the GPU display.
*
diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp
index eaed0d0d636..b9a33b64051 100644
--- a/intern/cycles/integrator/path_trace_work_cpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -67,7 +67,9 @@ void PathTraceWorkCPU::init_execution()
device_->get_cpu_kernel_thread_globals(kernel_thread_globals_);
}
-void PathTraceWorkCPU::render_samples(int start_sample, int samples_num)
+void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
+ int start_sample,
+ int samples_num)
{
const int64_t image_width = effective_buffer_params_.width;
const int64_t image_height = effective_buffer_params_.height;
@@ -106,6 +108,8 @@ void PathTraceWorkCPU::render_samples(int start_sample, int samples_num)
for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
kernel_globals.stop_profiling();
}
+
+ statistics.occupancy = 1.0f;
}
void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobals *kernel_globals,
diff --git a/intern/cycles/integrator/path_trace_work_cpu.h b/intern/cycles/integrator/path_trace_work_cpu.h
index 0ea901e452d..ab729bbf879 100644
--- a/intern/cycles/integrator/path_trace_work_cpu.h
+++ b/intern/cycles/integrator/path_trace_work_cpu.h
@@ -46,7 +46,9 @@ class PathTraceWorkCPU : public PathTraceWork {
virtual void init_execution() override;
- virtual void render_samples(int start_sample, int samples_num) override;
+ virtual void render_samples(RenderStatistics &statistics,
+ int start_sample,
+ int samples_num) override;
virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
PassMode pass_mode,
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index 7f15237ddbf..10baf869aa6 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -180,7 +180,9 @@ void PathTraceWorkGPU::init_execution()
"__integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_));
}
-void PathTraceWorkGPU::render_samples(int start_sample, int samples_num)
+void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
+ int start_sample,
+ int samples_num)
{
/* Limit number of states for the tile and rely on a greedy scheduling of tiles. This allows to
* add more work (because tiles are smaller, so there is higher chance that more paths will
@@ -192,6 +194,9 @@ void PathTraceWorkGPU::render_samples(int start_sample, int samples_num)
enqueue_reset();
+ int num_iterations = 0;
+ uint64_t num_busy_accum = 0;
+
/* TODO: set a hard limit in case of undetected kernel failures? */
while (true) {
/* Enqueue work from the scheduler, on start or when there are not enough
@@ -228,7 +233,12 @@ void PathTraceWorkGPU::render_samples(int start_sample, int samples_num)
if (is_cancel_requested()) {
break;
}
+
+ num_busy_accum += get_num_active_paths();
+ ++num_iterations;
}
+
+ statistics.occupancy = static_cast<float>(num_busy_accum) / num_iterations / max_num_paths_;
}
DeviceKernel PathTraceWorkGPU::get_most_queued_kernel() const
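The occupancy reported by the GPU work above is simply the average number of active path states per wavefront iteration, divided by the size of the path-state pool (max_num_paths_). A rough standalone sketch of that calculation, with assumed names:

/* Sketch: average GPU occupancy over the wavefront loop. active_paths[i] is
 * assumed to hold the number of in-flight paths observed at iteration i. */
#include <cstdint>
#include <vector>

float measure_occupancy(const std::vector<uint64_t> &active_paths, int max_num_paths)
{
  if (active_paths.empty() || max_num_paths == 0) {
    return 1.0f; /* Nothing measured; assume full occupancy. */
  }

  uint64_t num_busy_accum = 0;
  for (const uint64_t num_active : active_paths) {
    num_busy_accum += num_active;
  }

  /* Mean active paths per iteration, as a fraction of the path-state pool. */
  return static_cast<float>(num_busy_accum) / active_paths.size() / max_num_paths;
}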
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index aee54d4a372..38788122b0d 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -44,7 +44,9 @@ class PathTraceWorkGPU : public PathTraceWork {
virtual void alloc_work_memory() override;
virtual void init_execution() override;
- virtual void render_samples(int start_sample, int samples_num) override;
+ virtual void render_samples(RenderStatistics &statistics,
+ int start_sample,
+ int samples_num) override;
virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
PassMode pass_mode,
diff --git a/intern/cycles/integrator/render_scheduler.cpp b/intern/cycles/integrator/render_scheduler.cpp
index 50017daca38..4eb1dd941f9 100644
--- a/intern/cycles/integrator/render_scheduler.cpp
+++ b/intern/cycles/integrator/render_scheduler.cpp
@@ -155,6 +155,9 @@ void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples)
state_.end_render_time = 0.0;
state_.time_limit_reached = false;
+ state_.occupancy_num_samples = 0;
+ state_.occupancy = 1.0f;
+
first_render_time_.path_trace_per_sample = 0.0;
first_render_time_.denoise_time = 0.0;
first_render_time_.display_update_time = 0.0;
@@ -475,6 +478,13 @@ void RenderScheduler::report_path_trace_time(const RenderWork &render_work,
VLOG(4) << "Average path tracing time: " << path_trace_time_.get_average() << " seconds.";
}
+void RenderScheduler::report_path_trace_occupancy(const RenderWork &render_work, float occupancy)
+{
+ state_.occupancy_num_samples = render_work.path_trace.num_samples;
+ state_.occupancy = occupancy;
+ VLOG(4) << "Measured path tracing occupancy: " << occupancy;
+}
+
void RenderScheduler::report_adaptive_filter_time(const RenderWork &render_work,
double time,
bool is_cancelled)
@@ -803,8 +813,23 @@ int RenderScheduler::get_num_samples_to_path_trace() const
* more than N samples. */
const int num_samples_pot = round_num_samples_to_power_of_2(num_samples_per_update);
- const int num_samples_to_render = min(num_samples_pot,
- start_sample_ + num_samples_ - path_trace_start_sample);
+ const int max_num_samples_to_render = start_sample_ + num_samples_ - path_trace_start_sample;
+
+ int num_samples_to_render = min(num_samples_pot, max_num_samples_to_render);
+
+ /* When enough statistics are available and doing an offline render, prefer to keep the
+ * device occupied. */
+ if (state_.occupancy_num_samples && (background_ || headless_)) {
+ /* Keep occupancy at about 0.5 (this is more of an empirical figure which seems to match scenes
+ * with good performance without forcing occupancy to be higher). */
+ int num_samples_to_occupy = state_.occupancy_num_samples;
+ if (state_.occupancy < 0.5f) {
+ num_samples_to_occupy = lround(state_.occupancy_num_samples * 0.7f / state_.occupancy);
+ }
+
+ num_samples_to_render = max(num_samples_to_render,
+ min(num_samples_to_occupy, max_num_samples_to_render));
+ }
/* If adaptive sampling is not used, render as many samples per update as possible, keeping the
* device fully occupied, without much overhead of display updates. */
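As a worked example of the heuristic above (hypothetical numbers, not taken from the patch): if the previous batch scheduled 4 samples but only reached an occupancy of 0.2, the scheduler raises the next request to at least lround(4 * 0.7 / 0.2) = 14 samples, clamped to the samples still remaining in the render; at an occupancy of 0.5 or higher the heuristic merely keeps the next batch at least as large as the previous one, and the regular power-of-two growth applies.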
diff --git a/intern/cycles/integrator/render_scheduler.h b/intern/cycles/integrator/render_scheduler.h
index 10fbcc52cd6..9c2d107e46d 100644
--- a/intern/cycles/integrator/render_scheduler.h
+++ b/intern/cycles/integrator/render_scheduler.h
@@ -186,6 +186,7 @@ class RenderScheduler {
/* Report time (in seconds) which corresponding part of work took. */
void report_path_trace_time(const RenderWork &render_work, double time, bool is_cancelled);
+ void report_path_trace_occupancy(const RenderWork &render_work, float occupancy);
void report_adaptive_filter_time(const RenderWork &render_work, double time, bool is_cancelled);
void report_denoise_time(const RenderWork &render_work, double time);
void report_display_update_time(const RenderWork &render_work, double time);
@@ -380,6 +381,13 @@ class RenderScheduler {
/* Time at which rendering started and finished. */
double start_render_time = 0.0;
double end_render_time = 0.0;
+
+ /* Measured occupancy of the render devices, normalized to the number of samples.
+ *
+ * In a way it is "trailing": by the time new work is scheduled, this occupancy was measured
+ * during the previously rendered work. */
+ int occupancy_num_samples = 0;
+ float occupancy = 1.0f;
} state_;
/* Timing of tasks which were performed at the very first render work at 100% of the
diff --git a/intern/cycles/integrator/work_balancer.h b/intern/cycles/integrator/work_balancer.h
index a2f83ef7f24..94e20ecf054 100644
--- a/intern/cycles/integrator/work_balancer.h
+++ b/intern/cycles/integrator/work_balancer.h
@@ -24,6 +24,9 @@ struct WorkBalanceInfo {
/* Time spent performing corresponding work. */
double time_spent = 0;
+ /* Average occupancy of the device while performing the work. */
+ float occupancy = 1.0f;
+
/* Normalized weight, which is ready to be used for work balancing (like calculating fraction of
* the big tile which is to be rendered on the device). */
double weight = 1.0;