author | Sergey Sharybin <sergey@blender.org> | 2021-09-20 16:49:10 +0300
committer | Sergey Sharybin <sergey@blender.org> | 2021-09-20 18:43:57 +0300
commit | a9ad15a88f129a0593432e1217610ac840687497
tree | 9bd413ab0fd6db2572d1e0254f378721af52cbd0
parent | 2413da215160403d026d776bdbe0b883de5516ae
WIP: Cycles X: Schedule work based on occupancy
Sacrifice the refresh interval to increase GPU occupancy, lowering the
final render time.
Lowers the `Blender 2.80 - Spring` demo file from a predicted 30 min
(measured as 3 min to render 1/10th of the samples) to about 7.5 min.
This is still considerably longer than the master branch, which finishes
the file in just under 3 min, but it is already a better result.
The timings are from an RTX 5000.
The viewport and CPU rendering should stay unaffected by this change.
Differential Revision: https://developer.blender.org/D12570
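The core of the patch is a feedback loop: each device work reports how busy it kept the device, and the scheduler uses that trailing measurement to size the next batch of samples. A minimal standalone sketch of the scheduling heuristic (plain C++; the function name and the reduction of the scheduler state to plain parameters are illustrative, while the 0.5 threshold and 0.7 factor are the values from the diff below):

```cpp
#include <algorithm>
#include <cmath>

/* Sketch of the occupancy-based batch sizing from get_num_samples_to_path_trace().
 * `prev_num_samples` and `occupancy` describe the previous batch, `scheduled` is
 * the batch size chosen by the existing update-interval logic, and `max_samples`
 * is how many samples are left to render. */
int occupancy_based_batch_size(int prev_num_samples,
                               float occupancy,
                               int scheduled,
                               int max_samples)
{
  int num_samples_to_occupy = prev_num_samples;
  if (occupancy < 0.5f) {
    /* Device was under-occupied: grow the batch, targeting ~0.7 relative to the
     * measured occupancy of the previous batch. */
    num_samples_to_occupy = static_cast<int>(std::lround(prev_num_samples * 0.7f / occupancy));
  }
  /* Never shrink below what the interval logic wants, never exceed the remaining work. */
  return std::max(scheduled, std::min(num_samples_to_occupy, max_samples));
}
```

For example, if the previous batch of 8 samples ran at 0.25 occupancy, the sketch yields lround(8 * 0.7 / 0.25) = 22 samples for the next batch, trading display-refresh frequency for a busier device.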
-rw-r--r-- | intern/cycles/integrator/path_trace.cpp | 24
-rw-r--r-- | intern/cycles/integrator/path_trace_work.h | 6
-rw-r--r-- | intern/cycles/integrator/path_trace_work_cpu.cpp | 6
-rw-r--r-- | intern/cycles/integrator/path_trace_work_cpu.h | 4
-rw-r--r-- | intern/cycles/integrator/path_trace_work_gpu.cpp | 12
-rw-r--r-- | intern/cycles/integrator/path_trace_work_gpu.h | 4
-rw-r--r-- | intern/cycles/integrator/render_scheduler.cpp | 29
-rw-r--r-- | intern/cycles/integrator/render_scheduler.h | 8
-rw-r--r-- | intern/cycles/integrator/work_balancer.h | 3
9 files changed, 86 insertions, 10 deletions
diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
index 55f050d7833..5ce75464acd 100644
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -351,14 +351,32 @@ void PathTrace::path_trace(RenderWork &render_work)
   const double start_time = time_dt();
 
   const int num_works = path_trace_works_.size();
+
   tbb::parallel_for(0, num_works, [&](int i) {
     const double work_start_time = time_dt();
+    const int num_samples = render_work.path_trace.num_samples;
+
     PathTraceWork *path_trace_work = path_trace_works_[i].get();
-    path_trace_work->render_samples(render_work.path_trace.start_sample,
-                                    render_work.path_trace.num_samples);
-    work_balance_infos_[i].time_spent += time_dt() - work_start_time;
+
+    PathTraceWork::RenderStatistics statistics;
+    path_trace_work->render_samples(statistics, render_work.path_trace.start_sample, num_samples);
+
+    const double work_time = time_dt() - work_start_time;
+    work_balance_infos_[i].time_spent += work_time;
+    work_balance_infos_[i].occupancy = statistics.occupancy;
+
+    VLOG(3) << "Rendered " << num_samples << " samples in " << work_time << " seconds ("
+            << work_time / num_samples
+            << " seconds per sample), occupancy: " << statistics.occupancy;
   });
 
+  float occupancy_accum = 0.0f;
+  for (const WorkBalanceInfo &balance_info : work_balance_infos_) {
+    occupancy_accum += balance_info.occupancy;
+  }
+  const float occupancy = occupancy_accum / num_works;
+  render_scheduler_.report_path_trace_occupancy(render_work, occupancy);
+
   render_scheduler_.report_path_trace_time(
       render_work, time_dt() - start_time, is_cancel_requested());
 }
diff --git a/intern/cycles/integrator/path_trace_work.h b/intern/cycles/integrator/path_trace_work.h
index ca64c1c2ffd..97b97f3d888 100644
--- a/intern/cycles/integrator/path_trace_work.h
+++ b/intern/cycles/integrator/path_trace_work.h
@@ -33,6 +33,10 @@ class RenderBuffers;
 
 class PathTraceWork {
  public:
+  struct RenderStatistics {
+    float occupancy = 1.0f;
+  };
+
   /* Create path trace work which fits best the device.
    *
    * The cancel request flag is used for a cheap check whether cancel is to be performed as soon as
@@ -71,7 +75,7 @@ class PathTraceWork {
 
   /* Render given number of samples as a synchronous blocking call.
    * The samples are added to the render buffer associated with this work. */
-  virtual void render_samples(int start_sample, int samples_num) = 0;
+  virtual void render_samples(RenderStatistics &statistics, int start_sample, int samples_num) = 0;
 
   /* Copy render result from this work to the corresponding place of the GPU display.
    *
diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp
index eaed0d0d636..b9a33b64051 100644
--- a/intern/cycles/integrator/path_trace_work_cpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -67,7 +67,9 @@ void PathTraceWorkCPU::init_execution()
   device_->get_cpu_kernel_thread_globals(kernel_thread_globals_);
 }
 
-void PathTraceWorkCPU::render_samples(int start_sample, int samples_num)
+void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
+                                      int start_sample,
+                                      int samples_num)
 {
   const int64_t image_width = effective_buffer_params_.width;
   const int64_t image_height = effective_buffer_params_.height;
@@ -106,6 +108,8 @@ void PathTraceWorkCPU::render_samples(int start_sample, int samples_num)
   for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
     kernel_globals.stop_profiling();
   }
+
+  statistics.occupancy = 1.0f;
 }
 
 void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobals *kernel_globals,
diff --git a/intern/cycles/integrator/path_trace_work_cpu.h b/intern/cycles/integrator/path_trace_work_cpu.h
index 0ea901e452d..ab729bbf879 100644
--- a/intern/cycles/integrator/path_trace_work_cpu.h
+++ b/intern/cycles/integrator/path_trace_work_cpu.h
@@ -46,7 +46,9 @@ class PathTraceWorkCPU : public PathTraceWork {
 
   virtual void init_execution() override;
 
-  virtual void render_samples(int start_sample, int samples_num) override;
+  virtual void render_samples(RenderStatistics &statistics,
+                              int start_sample,
+                              int samples_num) override;
 
   virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
                                    PassMode pass_mode,
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index 7f15237ddbf..10baf869aa6 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -180,7 +180,9 @@ void PathTraceWorkGPU::init_execution()
       "__integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_));
 }
 
-void PathTraceWorkGPU::render_samples(int start_sample, int samples_num)
+void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
+                                      int start_sample,
+                                      int samples_num)
 {
   /* Limit number of states for the tile and rely on a greedy scheduling of tiles. This allows to
    * add more work (because tiles are smaller, so there is higher chance that more paths will
@@ -192,6 +194,9 @@ void PathTraceWorkGPU::render_samples(int start_sample, int samples_num)
 
   enqueue_reset();
 
+  int num_iterations = 0;
+  uint64_t num_busy_accum = 0;
+
   /* TODO: set a hard limit in case of undetected kernel failures? */
   while (true) {
     /* Enqueue work from the scheduler, on start or when there are not enough
@@ -228,7 +233,12 @@ void PathTraceWorkGPU::render_samples(int start_sample, int samples_num)
     if (is_cancel_requested()) {
      break;
     }
+
+    num_busy_accum += get_num_active_paths();
+    ++num_iterations;
   }
+
+  statistics.occupancy = static_cast<float>(num_busy_accum) / num_iterations / max_num_paths_;
 }
 
 DeviceKernel PathTraceWorkGPU::get_most_queued_kernel() const
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index aee54d4a372..38788122b0d 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -44,7 +44,9 @@ class PathTraceWorkGPU : public PathTraceWork {
   virtual void alloc_work_memory() override;
   virtual void init_execution() override;
 
-  virtual void render_samples(int start_sample, int samples_num) override;
+  virtual void render_samples(RenderStatistics &statistics,
+                              int start_sample,
+                              int samples_num) override;
 
   virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
                                    PassMode pass_mode,
diff --git a/intern/cycles/integrator/render_scheduler.cpp b/intern/cycles/integrator/render_scheduler.cpp
index 50017daca38..4eb1dd941f9 100644
--- a/intern/cycles/integrator/render_scheduler.cpp
+++ b/intern/cycles/integrator/render_scheduler.cpp
@@ -155,6 +155,9 @@ void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples)
   state_.end_render_time = 0.0;
   state_.time_limit_reached = false;
 
+  state_.occupancy_num_samples = 0;
+  state_.occupancy = 1.0f;
+
   first_render_time_.path_trace_per_sample = 0.0;
   first_render_time_.denoise_time = 0.0;
   first_render_time_.display_update_time = 0.0;
@@ -475,6 +478,13 @@ void RenderScheduler::report_path_trace_time(const RenderWork &render_work,
   VLOG(4) << "Average path tracing time: " << path_trace_time_.get_average() << " seconds.";
 }
 
+void RenderScheduler::report_path_trace_occupancy(const RenderWork &render_work, float occupancy)
+{
+  state_.occupancy_num_samples = render_work.path_trace.num_samples;
+  state_.occupancy = occupancy;
+  VLOG(4) << "Measured path tracing occupancy: " << occupancy;
+}
+
 void RenderScheduler::report_adaptive_filter_time(const RenderWork &render_work,
                                                   double time,
                                                   bool is_cancelled)
@@ -803,8 +813,23 @@ int RenderScheduler::get_num_samples_to_path_trace() const
    * more than N samples. */
   const int num_samples_pot = round_num_samples_to_power_of_2(num_samples_per_update);
 
-  const int num_samples_to_render = min(num_samples_pot,
-                                        start_sample_ + num_samples_ - path_trace_start_sample);
+  const int max_num_samples_to_render = start_sample_ + num_samples_ - path_trace_start_sample;
+
+  int num_samples_to_render = min(num_samples_pot, max_num_samples_to_render);
+
+  /* When enough statistics is available and doing an offline rendering prefer to keep device
+   * occupied. */
+  if (state_.occupancy_num_samples && (background_ || headless_)) {
+    /* Keep occupancy at about 0.5 (this is more of an empirical figure which seems to match scenes
+     * with good performance without forcing occupancy to be higher). */
+    int num_samples_to_occupy = state_.occupancy_num_samples;
+    if (state_.occupancy < 0.5f) {
+      num_samples_to_occupy = lround(state_.occupancy_num_samples * 0.7f / state_.occupancy);
+    }
+
+    num_samples_to_render = max(num_samples_to_render,
+                                min(num_samples_to_occupy, max_num_samples_to_render));
+  }
 
   /* If adaptive sampling is not used, render as many samples per update as possible, keeping the
    * device fully occupied, without much overhead of display updates. */
diff --git a/intern/cycles/integrator/render_scheduler.h b/intern/cycles/integrator/render_scheduler.h
index 10fbcc52cd6..9c2d107e46d 100644
--- a/intern/cycles/integrator/render_scheduler.h
+++ b/intern/cycles/integrator/render_scheduler.h
@@ -186,6 +186,7 @@ class RenderScheduler {
 
   /* Report time (in seconds) which corresponding part of work took. */
   void report_path_trace_time(const RenderWork &render_work, double time, bool is_cancelled);
+  void report_path_trace_occupancy(const RenderWork &render_work, float occupancy);
   void report_adaptive_filter_time(const RenderWork &render_work, double time, bool is_cancelled);
   void report_denoise_time(const RenderWork &render_work, double time);
   void report_display_update_time(const RenderWork &render_work, double time);
@@ -380,6 +381,13 @@ class RenderScheduler {
     /* Time at which rendering started and finished. */
     double start_render_time = 0.0;
     double end_render_time = 0.0;
+
+    /* Measured occupancy of the render devices, normalized to the number of samples.
+     *
+     * In a way it is "trailing": when scheduling new work this occupancy is measured when the
+     * previous work was rendered. */
+    int occupancy_num_samples = 0;
+    float occupancy = 1.0f;
   } state_;
 
   /* Timing of tasks which were performed at the very first render work at 100% of the
diff --git a/intern/cycles/integrator/work_balancer.h b/intern/cycles/integrator/work_balancer.h
index a2f83ef7f24..94e20ecf054 100644
--- a/intern/cycles/integrator/work_balancer.h
+++ b/intern/cycles/integrator/work_balancer.h
@@ -24,6 +24,9 @@ struct WorkBalanceInfo {
   /* Time spent performing corresponding work. */
   double time_spent = 0;
 
+  /* Average occupancy of the device while performing the work. */
+  float occupancy = 1.0f;
+
   /* Normalized weight, which is ready to be used for work balancing (like calculating fraction of
    * the big tile which is to be rendered on the device). */
   double weight = 1.0;
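For completeness, the occupancy the GPU work reports is just the mean number of active path states per wavefront iteration, normalized by the size of the path-state pool (max_num_paths_). A self-contained sketch of that arithmetic, with made-up per-iteration counts standing in for get_num_active_paths():

```cpp
#include <cstdint>
#include <cstdio>

int main()
{
  const int max_num_paths = 1024; /* stands in for max_num_paths_ */

  /* Pretend three wavefront iterations had 1024, 512 and 128 active paths. */
  const int active_paths[] = {1024, 512, 128};

  uint64_t num_busy_accum = 0;
  int num_iterations = 0;
  for (const int num_active : active_paths) {
    num_busy_accum += num_active; /* mirrors num_busy_accum += get_num_active_paths(); */
    ++num_iterations;
  }

  /* Same expression as in PathTraceWorkGPU::render_samples(). */
  const float occupancy = static_cast<float>(num_busy_accum) / num_iterations / max_num_paths;
  std::printf("occupancy: %.3f\n", occupancy); /* (1024 + 512 + 128) / 3 / 1024 = ~0.542 */
  return 0;
}
```

With 0.542 above the 0.5 threshold the scheduler would leave the next batch size alone; a tail-heavy batch that drops below 0.5 is what triggers the batch growth.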