diff options
author | William Leeson <william@blender.org> | 2021-11-25 11:41:03 +0300 |
---|---|---|
committer | William Leeson <william@blender.org> | 2021-11-25 11:41:03 +0300 |
commit | c49d2cbe92a1a18bb3f9ddc04ddc3351ffd27286 (patch) | |
tree | ab585ec4daf8442b054f9bfdc789b6463ae07cf4 /intern/cycles | |
parent | 827c5b399e0bb93182586723a811fcc5afd0d4db (diff) | |
parent | b41c72b710d4013fd6d67dc49a8ebb2a416b4462 (diff) |
Merge branch 'blender-v3.0-release' to bring in D13042:
Fix performance decrease when Scrambling Distance is enabled
Diffstat (limited to 'intern/cycles')
-rw-r--r-- | intern/cycles/integrator/path_trace_work_gpu.cpp | 3 | ||||
-rw-r--r-- | intern/cycles/integrator/tile.cpp | 5 | ||||
-rw-r--r-- | intern/cycles/integrator/tile.h | 3 | ||||
-rw-r--r-- | intern/cycles/integrator/work_tile_scheduler.cpp | 7 | ||||
-rw-r--r-- | intern/cycles/integrator/work_tile_scheduler.h | 6 | ||||
-rw-r--r-- | intern/cycles/kernel/device/gpu/work_stealing.h | 25 | ||||
-rw-r--r-- | intern/cycles/test/integrator_tile_test.cpp | 16 |
7 files changed, 41 insertions, 24 deletions
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp index 956aa6a8c90..05e53f816a0 100644 --- a/intern/cycles/integrator/path_trace_work_gpu.cpp +++ b/intern/cycles/integrator/path_trace_work_gpu.cpp @@ -258,7 +258,8 @@ void PathTraceWorkGPU::render_samples(RenderStatistics &statistics, * become busy after adding new tiles). This is especially important for the shadow catcher which * schedules work in halves of available number of paths. */ work_tile_scheduler_.set_max_num_path_states(max_num_paths_ / 8); - + work_tile_scheduler_.set_accelerated_rt((device_->get_bvh_layout_mask() & BVH_LAYOUT_OPTIX) != + 0); work_tile_scheduler_.reset(effective_buffer_params_, start_sample, samples_num, diff --git a/intern/cycles/integrator/tile.cpp b/intern/cycles/integrator/tile.cpp index 4a1558cce09..e9a3cbd38aa 100644 --- a/intern/cycles/integrator/tile.cpp +++ b/intern/cycles/integrator/tile.cpp @@ -46,7 +46,8 @@ ccl_device_inline uint round_up_to_power_of_two(uint x) return next_power_of_two(x); } -TileSize tile_calculate_best_size(const int2 &image_size, +TileSize tile_calculate_best_size(const bool accel_rt, + const int2 &image_size, const int num_samples, const int max_num_path_states, const float scrambling_distance) @@ -73,7 +74,7 @@ TileSize tile_calculate_best_size(const int2 &image_size, TileSize tile_size; const int num_path_states_per_sample = max_num_path_states / num_samples; - if (scrambling_distance < 0.9f) { + if (scrambling_distance < 0.9f && accel_rt) { /* Prefer large tiles for scrambling distance, bounded by max num path states. 
*/ tile_size.width = min(image_size.x, max_num_path_states); tile_size.height = min(image_size.y, max(max_num_path_states / tile_size.width, 1)); diff --git a/intern/cycles/integrator/tile.h b/intern/cycles/integrator/tile.h index 61f7d736115..05b1e0af6b1 100644 --- a/intern/cycles/integrator/tile.h +++ b/intern/cycles/integrator/tile.h @@ -49,7 +49,8 @@ std::ostream &operator<<(std::ostream &os, const TileSize &tile_size); * of active path states. * Will attempt to provide best guess to keep path tracing threads of a device as localized as * possible, and have as many threads active for every tile as possible. */ -TileSize tile_calculate_best_size(const int2 &image_size, +TileSize tile_calculate_best_size(const bool accel_rt, + const int2 &image_size, const int num_samples, const int max_num_path_states, const float scrambling_distance); diff --git a/intern/cycles/integrator/work_tile_scheduler.cpp b/intern/cycles/integrator/work_tile_scheduler.cpp index d60f7149bf4..353c357475d 100644 --- a/intern/cycles/integrator/work_tile_scheduler.cpp +++ b/intern/cycles/integrator/work_tile_scheduler.cpp @@ -28,6 +28,11 @@ WorkTileScheduler::WorkTileScheduler() { } +void WorkTileScheduler::set_accelerated_rt(bool accelerated_rt) +{ + accelerated_rt_ = accelerated_rt; +} + void WorkTileScheduler::set_max_num_path_states(int max_num_path_states) { max_num_path_states_ = max_num_path_states; @@ -61,7 +66,7 @@ void WorkTileScheduler::reset(const BufferParams &buffer_params, void WorkTileScheduler::reset_scheduler_state() { tile_size_ = tile_calculate_best_size( - image_size_px_, samples_num_, max_num_path_states_, scrambling_distance_); + accelerated_rt_, image_size_px_, samples_num_, max_num_path_states_, scrambling_distance_); VLOG(3) << "Will schedule tiles of size " << tile_size_; diff --git a/intern/cycles/integrator/work_tile_scheduler.h b/intern/cycles/integrator/work_tile_scheduler.h index 2d6395799f7..b9cef7be7c4 100644 --- 
a/intern/cycles/integrator/work_tile_scheduler.h +++ b/intern/cycles/integrator/work_tile_scheduler.h @@ -31,6 +31,9 @@ class WorkTileScheduler { public: WorkTileScheduler(); + /* To indicate if there is accelerated RT support. */ + void set_accelerated_rt(bool state); + /* MAximum path states which are allowed to be used by a single scheduled work tile. * * Affects the scheduled work size: the work size will be as big as possible, but will not exceed @@ -55,6 +58,9 @@ class WorkTileScheduler { protected: void reset_scheduler_state(); + /* Used to indicate if there is accelerated ray tracing. */ + bool accelerated_rt_ = false; + /* Maximum allowed path states to be used. * * TODO(sergey): Naming can be improved. The fact that this is a limiting factor based on the diff --git a/intern/cycles/kernel/device/gpu/work_stealing.h b/intern/cycles/kernel/device/gpu/work_stealing.h index fab0915c38e..c3083948057 100644 --- a/intern/cycles/kernel/device/gpu/work_stealing.h +++ b/intern/cycles/kernel/device/gpu/work_stealing.h @@ -29,17 +29,20 @@ ccl_device_inline void get_work_pixel(ccl_global const KernelWorkTile *tile, ccl_private uint *y, ccl_private uint *sample) { -#if 0 - /* Keep threads for the same sample together. */ - uint tile_pixels = tile->w * tile->h; - uint sample_offset = global_work_index / tile_pixels; - uint pixel_offset = global_work_index - sample_offset * tile_pixels; -#else - /* Keeping threads for the same pixel together. - * Appears to improve performance by a few % on CUDA and OptiX. */ - uint sample_offset = global_work_index % tile->num_samples; - uint pixel_offset = global_work_index / tile->num_samples; -#endif + uint sample_offset, pixel_offset; + + if (kernel_data.integrator.scrambling_distance < 0.9f) { + /* Keep threads for the same sample together. 
*/ + uint tile_pixels = tile->w * tile->h; + sample_offset = global_work_index / tile_pixels; + pixel_offset = global_work_index - sample_offset * tile_pixels; + } + else { + /* Keeping threads for the same pixel together. + * Appears to improve performance by a few % on CUDA and OptiX. */ + sample_offset = global_work_index % tile->num_samples; + pixel_offset = global_work_index / tile->num_samples; + } uint y_offset = pixel_offset / tile->w; uint x_offset = pixel_offset - y_offset * tile->w; diff --git a/intern/cycles/test/integrator_tile_test.cpp b/intern/cycles/test/integrator_tile_test.cpp index 8bb0856d6a9..822c34c36bf 100644 --- a/intern/cycles/test/integrator_tile_test.cpp +++ b/intern/cycles/test/integrator_tile_test.cpp @@ -24,26 +24,26 @@ CCL_NAMESPACE_BEGIN TEST(tile_calculate_best_size, Basic) { /* Make sure CPU-like case is handled properly. */ - EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 1, 1, 1.0f), TileSize(1, 1, 1)); - EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 100, 1, 1.0f), TileSize(1, 1, 1)); + EXPECT_EQ(tile_calculate_best_size(false, make_int2(1920, 1080), 1, 1, 1.0f), TileSize(1, 1, 1)); + EXPECT_EQ(tile_calculate_best_size(false, make_int2(1920, 1080), 100, 1, 1.0f), TileSize(1, 1, 1)); /* Enough path states to fit an entire image with all samples. 
*/ - EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 1, 1920 * 1080, 1.0f), + EXPECT_EQ(tile_calculate_best_size(false, make_int2(1920, 1080), 1, 1920 * 1080, 1.0f), TileSize(1920, 1080, 1)); - EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 100, 1920 * 1080 * 100, 1.0f), + EXPECT_EQ(tile_calculate_best_size(false, make_int2(1920, 1080), 100, 1920 * 1080 * 100, 1.0f), TileSize(1920, 1080, 100)); } TEST(tile_calculate_best_size, Extreme) { - EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 262144, 131072, 1.0f), + EXPECT_EQ(tile_calculate_best_size(false, make_int2(32, 32), 262144, 131072, 1.0f), TileSize(1, 1, 512)); - EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 1048576, 131072, 1.0f), + EXPECT_EQ(tile_calculate_best_size(false, make_int2(32, 32), 1048576, 131072, 1.0f), TileSize(1, 1, 1024)); - EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 10485760, 131072, 1.0f), + EXPECT_EQ(tile_calculate_best_size(false, make_int2(32, 32), 10485760, 131072, 1.0f), TileSize(1, 1, 4096)); - EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 8192 * 8192 * 2, 1024, 1.0f), + EXPECT_EQ(tile_calculate_best_size(false, make_int2(32, 32), 8192 * 8192 * 2, 1024, 1.0f), TileSize(1, 1, 1024)); } |