From 688e5c6d3895a8f6b5aec06977c9dcb4b00e0a76 Mon Sep 17 00:00:00 2001
From: Lukas Stockner
Date: Mon, 11 Jan 2021 20:42:47 +0100
Subject: Fix T82351: Cycles: Tile stealing glitches with adaptive sampling

In my testing this works, but it requires me to remove the
min(start_sample...) part in the adaptive sampling kernel, and I assume
there's a reason why it was there?

Reviewed By: brecht

Maniphest Tasks: T82351

Differential Revision: https://developer.blender.org/D9445
---
 intern/cycles/device/cuda/device_cuda_impl.cpp     | 14 ++++----
 intern/cycles/device/device_cpu.cpp                |  5 ++-
 intern/cycles/device/device_optix.cpp              | 14 ++++----
 intern/cycles/device/device_split_kernel.cpp       |  4 +--
 intern/cycles/device/device_task.cpp               | 41 ++++++----------------
 intern/cycles/device/device_task.h                 |  3 +-
 intern/cycles/kernel/kernels/cuda/kernel.cu        |  2 +-
 .../kernel/split/kernel_adaptive_adjust_samples.h  |  3 +-
 8 files changed, 33 insertions(+), 53 deletions(-)

diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp
index 48151365e5d..307d1469ba5 100644
--- a/intern/cycles/device/cuda/device_cuda_impl.cpp
+++ b/intern/cycles/device/cuda/device_cuda_impl.cpp
@@ -1929,18 +1929,19 @@ void CUDADevice::render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles)
   }
 
   uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
-  if (task.adaptive_sampling.use) {
-    step_samples = task.adaptive_sampling.align_static_samples(step_samples);
-  }
 
   /* Render all samples. */
   int start_sample = rtile.start_sample;
   int end_sample = rtile.start_sample + rtile.num_samples;
 
-  for (int sample = start_sample; sample < end_sample; sample += step_samples) {
+  for (int sample = start_sample; sample < end_sample;) {
     /* Setup and copy work tile to device. */
     wtile->start_sample = sample;
-    wtile->num_samples = min(step_samples, end_sample - sample);
+    wtile->num_samples = step_samples;
+    if (task.adaptive_sampling.use) {
+      wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
+    }
+    wtile->num_samples = min(wtile->num_samples, end_sample - sample);
     work_tiles.copy_to_device();
 
     CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
@@ -1962,7 +1963,8 @@ void CUDADevice::render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles)
     cuda_assert(cuCtxSynchronize());
 
     /* Update progress. */
-    rtile.sample = sample + wtile->num_samples;
+    sample += wtile->num_samples;
+    rtile.sample = sample;
 
     task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
 
     if (task.get_cancel()) {
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index fea4fc53d1f..bd00d4db775 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -920,8 +920,7 @@ class CPUDevice : public Device {
       ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
       if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
         buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
-        float sample_multiplier = tile.sample / max((float)tile.start_sample + 1.0f,
-                                                    buffer[kernel_data.film.pass_sample_count]);
+        float sample_multiplier = tile.sample / buffer[kernel_data.film.pass_sample_count];
         if (sample_multiplier != 1.0f) {
           kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
         }
@@ -997,7 +996,7 @@ class CPUDevice : public Device {
         coverage.finalize();
       }
 
-      if (task.adaptive_sampling.use) {
+      if (task.adaptive_sampling.use && (tile.stealing_state != RenderTile::WAS_STOLEN)) {
         adaptive_sampling_post(tile, kg);
       }
     }
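Note on the loop change above (mirrored in device_optix.cpp below): the driver
loop no longer advances by a fixed step_samples. Each iteration asks
align_samples() how many samples it may take so that the step ends exactly on
a sample that needs adaptive filtering, then clamps to the tile's remaining
samples. The following is a minimal, self-contained C++ sketch of that
pattern; AdaptiveSampling mirrors the patched Cycles API, while render_step()
and the constants are hypothetical stand-ins for the kernel launch:

#include <algorithm>
#include <cstdio>

/* Mirrors the patched AdaptiveSampling::align_samples(); adaptive_step must
 * be a power of two for the bitmask round-down to work. */
struct AdaptiveSampling {
  bool use = true;
  int adaptive_step = 4; /* Illustrative value; Cycles derives it elsewhere. */

  int align_samples(int sample, int num_samples) const
  {
    int end_sample = sample + num_samples;
    /* Round down to the nearest sample that needs filtering. */
    end_sample &= ~(adaptive_step - 1);
    if (end_sample <= sample) {
      /* Can't reach the next filter sample without growing num_samples. */
      return num_samples;
    }
    return end_sample - sample;
  }
};

/* Hypothetical stand-in for the CUDA/OptiX kernel launch. */
static void render_step(int start_sample, int num_samples)
{
  printf("render samples [%d, %d)\n", start_sample, start_sample + num_samples);
}

int main()
{
  AdaptiveSampling adaptive_sampling;
  const int start_sample = 5; /* e.g. a stolen tile resuming mid-render */
  const int end_sample = 20;
  const int step_samples = 6; /* fixed upper bound per launch */

  for (int sample = start_sample; sample < end_sample;) {
    int num_samples = step_samples;
    if (adaptive_sampling.use) {
      num_samples = adaptive_sampling.align_samples(sample, step_samples);
    }
    num_samples = std::min(num_samples, end_sample - sample);
    render_step(sample, num_samples);
    sample += num_samples;
  }
  return 0;
}

With start_sample = 5 and adaptive_step = 4, the sketch renders [5, 8),
[8, 12), [12, 16) and [16, 20): every step ends on a multiple of
adaptive_step, so the filter kernels run on schedule even for a tile that
starts partway through the sample range.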
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
index f19289f966e..f04113635f3 100644
--- a/intern/cycles/device/device_optix.cpp
+++ b/intern/cycles/device/device_optix.cpp
@@ -760,9 +760,6 @@ class OptiXDevice : public CUDADevice {
     const int end_sample = rtile.start_sample + rtile.num_samples;
     // Keep this number reasonable to avoid running into TDRs
     int step_samples = (info.display_device ? 8 : 32);
-    if (task.adaptive_sampling.use) {
-      step_samples = task.adaptive_sampling.align_static_samples(step_samples);
-    }
 
     // Offset into launch params buffer so that streams use separate data
     device_ptr launch_params_ptr = launch_params.device_pointer +
@@ -770,10 +767,14 @@ class OptiXDevice : public CUDADevice {
 
     const CUDAContextScope scope(cuContext);
 
-    for (int sample = rtile.start_sample; sample < end_sample; sample += step_samples) {
+    for (int sample = rtile.start_sample; sample < end_sample;) {
       // Copy work tile information to device
-      wtile.num_samples = min(step_samples, end_sample - sample);
       wtile.start_sample = sample;
+      wtile.num_samples = step_samples;
+      if (task.adaptive_sampling.use) {
+        wtile.num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
+      }
+      wtile.num_samples = min(wtile.num_samples, end_sample - sample);
       device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
       check_result_cuda(
           cuMemcpyHtoDAsync(d_wtile_ptr, &wtile, sizeof(wtile), cuda_stream[thread_index]));
@@ -815,7 +816,8 @@ class OptiXDevice : public CUDADevice {
       check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
 
       // Update current sample, so it is displayed correctly
-      rtile.sample = wtile.start_sample + wtile.num_samples;
+      sample += wtile.num_samples;
+      rtile.sample = sample;
 
       // Update task progress after the kernel completed rendering
       task.update_progress(&rtile, wtile.w * wtile.h * wtile.num_samples);
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
index 4c288f60c16..9889f688aaa 100644
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -223,8 +223,8 @@ bool DeviceSplitKernel::path_trace(DeviceTask &task,
       subtile.num_samples = samples_per_second;
 
       if (task.adaptive_sampling.use) {
-        subtile.num_samples = task.adaptive_sampling.align_dynamic_samples(subtile.start_sample,
-                                                                           subtile.num_samples);
+        subtile.num_samples = task.adaptive_sampling.align_samples(subtile.start_sample,
+                                                                   subtile.num_samples);
       }
 
       /* Don't go beyond requested number of samples. */
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
index 6e7c184c6c9..2cc9c6a985f 100644
--- a/intern/cycles/device/device_task.cpp
+++ b/intern/cycles/device/device_task.cpp
@@ -144,41 +144,20 @@ AdaptiveSampling::AdaptiveSampling() : use(true), adaptive_step(0), min_samples(
 }
 
 /* Render samples in steps that align with the adaptive filtering. */
-int AdaptiveSampling::align_static_samples(int samples) const
+int AdaptiveSampling::align_samples(int sample, int num_samples) const
 {
-  if (samples > adaptive_step) {
-    /* Make multiple of adaptive_step. */
-    while (samples % adaptive_step != 0) {
-      samples--;
-    }
-  }
-  else if (samples < adaptive_step) {
-    /* Make divisor of adaptive_step. */
-    while (adaptive_step % samples != 0) {
-      samples--;
-    }
-  }
-
-  return max(samples, 1);
-}
+  int end_sample = sample + num_samples;
 
-/* Render samples in steps that align with the adaptive filtering, with the
- * suggested number of samples dynamically changing. */
-int AdaptiveSampling::align_dynamic_samples(int offset, int samples) const
-{
-  /* Round so that we end up on multiples of adaptive_samples. */
-  samples += offset;
+  /* Round down end sample to the nearest sample that needs filtering. */
+  end_sample &= ~(adaptive_step - 1);
 
-  if (samples > adaptive_step) {
-    /* Make multiple of adaptive_step. */
-    while (samples % adaptive_step != 0) {
-      samples--;
-    }
+  if (end_sample <= sample) {
+    /* In order to reach the next sample that needs filtering, we'd need
+     * to increase num_samples. We don't do that in this function, so
+     * just keep it as is and don't filter this time around. */
+    return num_samples;
   }
-
-  samples -= offset;
-
-  return max(samples, 1);
+  return end_sample - sample;
 }
 
 bool AdaptiveSampling::need_filter(int sample) const
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index f819f84eb43..f9b47c59e95 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -117,8 +117,7 @@ class AdaptiveSampling {
  public:
   AdaptiveSampling();
 
-  int align_static_samples(int samples) const;
-  int align_dynamic_samples(int offset, int samples) const;
+  int align_samples(int sample, int num_samples) const;
   bool need_filter(int sample) const;
 
   bool use;
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index d4f41132a11..cf62b6e781e 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -139,7 +139,7 @@ kernel_cuda_adaptive_scale_samples(WorkTile *tile, int start_sample, int sample,
 	ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
 	if(buffer[kernel_data.film.pass_sample_count] < 0.0f) {
 		buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
-		float sample_multiplier = sample / max((float)start_sample + 1.0f, buffer[kernel_data.film.pass_sample_count]);
+		float sample_multiplier = sample / buffer[kernel_data.film.pass_sample_count];
 		if(sample_multiplier != 1.0f) {
 			kernel_adaptive_post_adjust(&kg, buffer, sample_multiplier);
 		}
diff --git a/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h b/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h
index 60ebf415970..437a5c9581b 100644
--- a/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h
+++ b/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h
@@ -29,8 +29,7 @@ ccl_device void kernel_adaptive_adjust_samples(KernelGlobals *kg)
     int sample = kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples;
     if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
       buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
-      float sample_multiplier = sample / max((float)kernel_split_params.tile.start_sample + 1.0f,
-                                             buffer[kernel_data.film.pass_sample_count]);
+      float sample_multiplier = sample / buffer[kernel_data.film.pass_sample_count];
       if (sample_multiplier != 1.0f) {
         kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
       }
--
cgit v1.2.3
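Note on the kernel-side changes (device_cpu.cpp, kernel.cu and
kernel_adaptive_adjust_samples.h): all three drop the
max((float)start_sample + 1.0f, ...) clamp from the sample_multiplier
computation. A standalone C++ sketch with hypothetical numbers shows what the
plain division computes; in Cycles, pass_sample_count stores (negated) the
sample at which a converged pixel stopped accumulating:

#include <cstdio>

int main()
{
  /* Hypothetical values: a pixel that converged and stopped accumulating at
   * sample 64, in a tile that was rendered to 128 samples in total. */
  const float pixel_sample_count = 64.0f; /* pass_sample_count, un-negated */
  const float tile_sample = 128.0f;       /* tile.sample at the end */
  const float accumulated = 32.0f;        /* some accumulated pass value */

  /* With this patch the multiplier is a plain division, so the pixel's
   * passes are scaled as if they had been accumulated for the full tile. */
  const float sample_multiplier = tile_sample / pixel_sample_count; /* 2.0f */
  const float adjusted = accumulated * sample_multiplier;           /* 64.0f */

  /* Averaging by the tile's sample count now gives the same result as
   * averaging the original sum by the pixel's own sample count. */
  printf("%.3f == %.3f\n", adjusted / tile_sample, accumulated / pixel_sample_count);
  return 0;
}

The clamp presumably guarded against tiny divisors when every tile started at
sample 0; for a stolen tile, start_sample can exceed a pixel's true sample
count, so the clamp would override the stored count and skew the multiplier.
That removal is the open question the commit message raises.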