diff options
author | Brecht Van Lommel <brecht@blender.org> | 2020-04-07 20:43:51 +0300 |
---|---|---|
committer | Brecht Van Lommel <brecht@blender.org> | 2020-04-07 21:29:48 +0300 |
commit | 53981c7fb6fdd9973e40f81f867f25d10540c1d1 (patch) | |
tree | 84572c33af5bb0c343f6a2928b135041bb8cb3b3 /intern/cycles | |
parent | 7b4b07a7ddc874efd11b3932f152c7136d720f9c (diff) |
Cleanup: refactor adaptive sampling to more easily change some parameters
No functional changes yet; this is work towards making CPU and GPU results
match more closely.
Diffstat (limited to 'intern/cycles')
-rw-r--r-- | intern/cycles/device/device.cpp | 2 | ||||
-rw-r--r-- | intern/cycles/device/device.h | 14 | ||||
-rw-r--r-- | intern/cycles/device/device_cpu.cpp | 22 | ||||
-rw-r--r-- | intern/cycles/device/device_cuda.cpp | 1 | ||||
-rw-r--r-- | intern/cycles/device/device_network.cpp | 1 | ||||
-rw-r--r-- | intern/cycles/device/device_opencl.cpp | 1 | ||||
-rw-r--r-- | intern/cycles/device/device_task.cpp | 3 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_passes.h | 10 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_types.h | 9 | ||||
-rw-r--r-- | intern/cycles/render/integrator.cpp | 7 | ||||
-rw-r--r-- | intern/cycles/render/session.cpp | 1 |
11 files changed, 50 insertions, 21 deletions
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index d94d409175b..dfbf57e8b88 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -597,6 +597,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices, info.has_half_images = true; info.has_volume_decoupled = true; + info.has_adaptive_stop_per_sample = true; info.has_osl = true; info.has_profiling = true; @@ -639,6 +640,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices, /* Accumulate device info. */ info.has_half_images &= device.has_half_images; info.has_volume_decoupled &= device.has_volume_decoupled; + info.has_adaptive_stop_per_sample &= device.has_adaptive_stop_per_sample; info.has_osl &= device.has_osl; info.has_profiling &= device.has_profiling; } diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index a98ac171709..c55dfb3a83b 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -75,12 +75,13 @@ class DeviceInfo { string description; string id; /* used for user preferences, should stay fixed with changing hardware config */ int num; - bool display_device; /* GPU is used as a display device. */ - bool has_half_images; /* Support half-float textures. */ - bool has_volume_decoupled; /* Decoupled volume shading. */ - bool has_osl; /* Support Open Shading Language. */ - bool use_split_kernel; /* Use split or mega kernel. */ - bool has_profiling; /* Supports runtime collection of profiling info. */ + bool display_device; /* GPU is used as a display device. */ + bool has_half_images; /* Support half-float textures. */ + bool has_volume_decoupled; /* Decoupled volume shading. */ + bool has_adaptive_stop_per_sample; /* Per-sample adaptive sampling stopping. */ + bool has_osl; /* Support Open Shading Language. */ + bool use_split_kernel; /* Use split or mega kernel. */ + bool has_profiling; /* Supports runtime collection of profiling info. 
*/ int cpu_threads; vector<DeviceInfo> multi_devices; vector<DeviceInfo> denoising_devices; @@ -94,6 +95,7 @@ class DeviceInfo { display_device = false; has_half_images = false; has_volume_decoupled = false; + has_adaptive_stop_per_sample = false; has_osl = false; use_split_kernel = false; has_profiling = false; diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 57e8523e02a..c701c14318f 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -839,7 +839,7 @@ class CPUDevice : public Device { return true; } - bool adaptive_sampling_filter(KernelGlobals *kg, RenderTile &tile) + bool adaptive_sampling_filter(KernelGlobals *kg, RenderTile &tile, int sample) { WorkTile wtile; wtile.x = tile.x; @@ -850,11 +850,24 @@ class CPUDevice : public Device { wtile.stride = tile.stride; wtile.buffer = (float *)tile.buffer; + /* For CPU we do adaptive stopping per sample so we can stop earlier, but + * for combined CPU + GPU rendering we match the GPU and do it per tile + * after a given number of sample steps. 
*/ + if (!kernel_data.integrator.adaptive_stop_per_sample) { + for (int y = wtile.y; y < wtile.y + wtile.h; ++y) { + for (int x = wtile.x; x < wtile.x + wtile.w; ++x) { + const int index = wtile.offset + x + y * wtile.stride; + float *buffer = wtile.buffer + index * kernel_data.film.pass_stride; + kernel_do_adaptive_stopping(kg, buffer, sample); + } + } + } + bool any = false; - for (int y = tile.y; y < tile.y + tile.h; ++y) { + for (int y = wtile.y; y < wtile.y + wtile.h; ++y) { any |= kernel_do_adaptive_filter_x(kg, y, &wtile); } - for (int x = tile.x; x < tile.x + tile.w; ++x) { + for (int x = wtile.x; x < wtile.x + wtile.w; ++x) { any |= kernel_do_adaptive_filter_y(kg, x, &wtile); } return (!any); @@ -917,7 +930,7 @@ class CPUDevice : public Device { tile.sample = sample + 1; if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) { - const bool stop = adaptive_sampling_filter(kg, tile); + const bool stop = adaptive_sampling_filter(kg, tile, sample); if (stop) { const int num_progress_samples = end_sample - sample; tile.sample = end_sample; @@ -1327,6 +1340,7 @@ void device_cpu_info(vector<DeviceInfo> &devices) info.id = "CPU"; info.num = 0; info.has_volume_decoupled = true; + info.has_adaptive_stop_per_sample = true; info.has_osl = true; info.has_half_images = true; info.has_profiling = true; diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 9a703b45c0a..4a53fcd151d 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -129,6 +129,7 @@ void device_cuda_info(vector<DeviceInfo> &devices) info.has_half_images = (major >= 3); info.has_volume_decoupled = false; + info.has_adaptive_stop_per_sample = false; int pci_location[3] = {0, 0, 0}; cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num); diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp index 2742cbf53aa..0933d51f321 100644 --- 
a/intern/cycles/device/device_network.cpp +++ b/intern/cycles/device/device_network.cpp @@ -311,6 +311,7 @@ void device_network_info(vector<DeviceInfo> &devices) /* todo: get this info from device */ info.has_volume_decoupled = false; + info.has_adaptive_stop_per_sample = false; info.has_osl = false; devices.push_back(info); diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp index 891b73351a0..8a0b128697f 100644 --- a/intern/cycles/device/device_opencl.cpp +++ b/intern/cycles/device/device_opencl.cpp @@ -119,6 +119,7 @@ void device_opencl_info(vector<DeviceInfo> &devices) info.display_device = true; info.use_split_kernel = true; info.has_volume_decoupled = false; + info.has_adaptive_stop_per_sample = false; info.id = id; /* Check OpenCL extensions */ diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp index c36b1344c3b..d2447eae867 100644 --- a/intern/cycles/device/device_task.cpp +++ b/intern/cycles/device/device_task.cpp @@ -138,8 +138,7 @@ void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples) /* Adaptive Sampling */ -AdaptiveSampling::AdaptiveSampling() - : use(true), adaptive_step(ADAPTIVE_SAMPLE_STEP), min_samples(0) +AdaptiveSampling::AdaptiveSampling() : use(true), adaptive_step(0), min_samples(0) { } diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h index 98136bc7047..7437e540a1f 100644 --- a/intern/cycles/kernel/kernel_passes.h +++ b/intern/cycles/kernel/kernel_passes.h @@ -403,9 +403,13 @@ ccl_device_inline void kernel_write_result(KernelGlobals *kg, make_float4(L_sum.x * 2.0f, L_sum.y * 2.0f, L_sum.z * 2.0f, 0.0f)); } #ifdef __KERNEL_CPU__ - if (sample > kernel_data.integrator.adaptive_min_samples && - (sample & (ADAPTIVE_SAMPLE_STEP - 1)) == (ADAPTIVE_SAMPLE_STEP - 1)) { - kernel_do_adaptive_stopping(kg, buffer, sample); + if ((sample > kernel_data.integrator.adaptive_min_samples) && + 
kernel_data.integrator.adaptive_stop_per_sample) { + const int step = kernel_data.integrator.adaptive_step; + + if ((sample & (step - 1)) == (step - 1)) { + kernel_do_adaptive_stopping(kg, buffer, sample); + } } #endif } diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 44c936da626..a1f8c35348d 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -63,11 +63,6 @@ CCL_NAMESPACE_BEGIN #define VOLUME_STACK_SIZE 32 -/* Adaptive sampling constants */ -#define ADAPTIVE_SAMPLE_STEP 4 -static_assert((ADAPTIVE_SAMPLE_STEP & (ADAPTIVE_SAMPLE_STEP - 1)) == 0, - "ADAPTIVE_SAMPLE_STEP must be power of two for bitwise operations to work"); - /* Split kernel constants */ #define WORK_POOL_SIZE_GPU 64 #define WORK_POOL_SIZE_CPU 1 @@ -1350,6 +1345,8 @@ typedef struct KernelIntegrator { int sampling_pattern; int aa_samples; int adaptive_min_samples; + int adaptive_step; + int adaptive_stop_per_sample; float adaptive_threshold; /* volume render */ @@ -1362,7 +1359,7 @@ typedef struct KernelIntegrator { int max_closures; - int pad1, pad2, pad3; + int pad1; } KernelIntegrator; static_assert_align(KernelIntegrator, 16); diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp index 2f9d088899e..d4beb06e57b 100644 --- a/intern/cycles/render/integrator.cpp +++ b/intern/cycles/render/integrator.cpp @@ -190,6 +190,13 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene else { kintegrator->adaptive_min_samples = max(4, adaptive_min_samples); } + + kintegrator->adaptive_step = 4; + kintegrator->adaptive_stop_per_sample = device->info.has_adaptive_stop_per_sample; + + /* Adaptive step must be a power of two for bitwise operations to work. 
*/ + assert((kintegrator->adaptive_step & (kintegrator->adaptive_step - 1)) == 0); + if (aa_samples > 0 && adaptive_threshold == 0.0f) { kintegrator->adaptive_threshold = max(0.001f, 1.0f / (float)aa_samples); VLOG(1) << "Cycles adaptive sampling: automatic threshold = " diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp index b1b30979b0e..58bcc7ccdfb 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -1110,6 +1110,7 @@ void Session::render(bool with_denoising) task.adaptive_sampling.use = (scene->integrator->sampling_pattern == SAMPLING_PATTERN_PMJ) && scene->dscene.data.film.pass_adaptive_aux_buffer; task.adaptive_sampling.min_samples = scene->dscene.data.integrator.adaptive_min_samples; + task.adaptive_sampling.adaptive_step = scene->dscene.data.integrator.adaptive_step; /* Acquire render tiles by default. */ task.tile_types = RenderTile::PATH_TRACE; |