45 files changed, 1215 insertions(+), 46 deletions(-)
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py index 7917edf8c88..afa573c8dc5 100644 --- a/intern/cycles/blender/addon/engine.py +++ b/intern/cycles/blender/addon/engine.py @@ -255,6 +255,7 @@ def list_render_passes(srl): if crl.pass_debug_bvh_traversed_instances: yield ("Debug BVH Traversed Instances", "X", 'VALUE') if crl.pass_debug_bvh_intersections: yield ("Debug BVH Intersections", "X", 'VALUE') if crl.pass_debug_ray_bounces: yield ("Debug Ray Bounces", "X", 'VALUE') + if crl.pass_debug_sample_count: yield ("Debug Sample Count", "X", 'VALUE') if crl.use_pass_volume_direct: yield ("VolumeDir", "RGB", 'COLOR') if crl.use_pass_volume_indirect: yield ("VolumeInd", "RGB", 'COLOR') diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 77dc29e11e8..f2c40f509e8 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -350,6 +350,24 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): default=0.01, ) + use_adaptive_sampling: BoolProperty( + name="Use adaptive sampling", + description="Automatically determine the number of samples per pixel based on a variance estimation", + default=False, + ) + adaptive_threshold: FloatProperty( + name="Adaptive Sampling Threshold", + description="Zero for automatic setting based on AA samples", + min=0.0, max=1.0, + default=0.0, + ) + adaptive_min_samples: IntProperty( + name="Adaptive Min Samples", + description="Minimum AA samples for adaptive sampling. Zero for automatic setting based on AA samples", + min=0, max=4096, + default=0, + ) + min_light_bounces: IntProperty( name="Min Light Bounces", description="Minimum number of light bounces. Setting this higher reduces noise in the first bounces, " @@ -1298,7 +1316,12 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup): default=False, update=update_render_passes, ) - + pass_debug_sample_count: BoolProperty( + name="Debug Sample Count", + description="Number of samples/camera rays per pixel", + default=False, + update=update_render_passes, + ) use_pass_volume_direct: BoolProperty( name="Volume Direct", description="Deliver direct volumetric scattering pass", diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index ed9e3a4c9cf..d04418fc957 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -190,6 +190,7 @@ class CYCLES_RENDER_PT_sampling(CyclesButtonsPanel, Panel): col.prop(cscene, "aa_samples", text="Render") col.prop(cscene, "preview_aa_samples", text="Viewport") + col.prop(cscene, "use_adaptive_sampling", text="Adaptive Sampling") # Viewport denoising is currently only supported with OptiX if show_optix_denoising(context): col = layout.column() @@ -247,7 +248,13 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel): row.prop(cscene, "seed") row.prop(cscene, "use_animated_seed", text="", icon='TIME') - layout.prop(cscene, "sampling_pattern", text="Pattern") + col = layout.column(align=True) + col.active = not(cscene.use_adaptive_sampling) + col.prop(cscene, "sampling_pattern", text="Pattern") + col = layout.column(align=True) + col.active = cscene.use_adaptive_sampling + col.prop(cscene, "adaptive_min_samples", text="Adaptive Min Samples") + col.prop(cscene, "adaptive_threshold", text="Adaptive Threshold") layout.prop(cscene, "use_square_samples") @@ -813,6 +820,8 @@ class CYCLES_RENDER_PT_passes_data(CyclesButtonsPanel, Panel): col.prop(cycles_view_layer, 
"denoising_store_passes", text="Denoising Data") col = flow.column() col.prop(cycles_view_layer, "pass_debug_render_time", text="Render Time") + col = flow.column() + col.prop(cycles_view_layer, "pass_debug_sample_count", text="Sample Count") layout.separator() diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp index 5cfb1200c7c..ac307743e48 100644 --- a/intern/cycles/blender/blender_session.cpp +++ b/intern/cycles/blender/blender_session.cpp @@ -470,7 +470,8 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_) b_rlay_name = b_view_layer.name(); /* add passes */ - vector<Pass> passes = sync->sync_render_passes(b_rlay, b_view_layer); + vector<Pass> passes = sync->sync_render_passes( + b_rlay, b_view_layer, session_params.adaptive_sampling); buffer_params.passes = passes; PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles"); diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index 50442c6ebdc..8f00f9ccda0 100644 --- a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -296,6 +296,16 @@ void BlenderSync::sync_integrator() integrator->sample_all_lights_indirect = get_boolean(cscene, "sample_all_lights_indirect"); integrator->light_sampling_threshold = get_float(cscene, "light_sampling_threshold"); + if (RNA_boolean_get(&cscene, "use_adaptive_sampling")) { + integrator->sampling_pattern = SAMPLING_PATTERN_PMJ; + integrator->adaptive_min_samples = get_int(cscene, "adaptive_min_samples"); + integrator->adaptive_threshold = get_float(cscene, "adaptive_threshold"); + } + else { + integrator->adaptive_min_samples = INT_MAX; + integrator->adaptive_threshold = 0.0f; + } + int diffuse_samples = get_int(cscene, "diffuse_samples"); int glossy_samples = get_int(cscene, "glossy_samples"); int transmission_samples = get_int(cscene, "transmission_samples"); @@ -312,6 +322,8 @@ void BlenderSync::sync_integrator() integrator->mesh_light_samples = mesh_light_samples * mesh_light_samples; integrator->subsurface_samples = subsurface_samples * subsurface_samples; integrator->volume_samples = volume_samples * volume_samples; + integrator->adaptive_min_samples = min( + integrator->adaptive_min_samples * integrator->adaptive_min_samples, INT_MAX); } else { integrator->diffuse_samples = diffuse_samples; @@ -484,6 +496,8 @@ PassType BlenderSync::get_pass_type(BL::RenderPass &b_pass) MAP_PASS("Debug Ray Bounces", PASS_RAY_BOUNCES); #endif MAP_PASS("Debug Render Time", PASS_RENDER_TIME); + MAP_PASS("AdaptiveAuxBuffer", PASS_ADAPTIVE_AUX_BUFFER); + MAP_PASS("Debug Sample Count", PASS_SAMPLE_COUNT); if (string_startswith(name, cryptomatte_prefix)) { return PASS_CRYPTOMATTE; } @@ -519,7 +533,9 @@ int BlenderSync::get_denoising_pass(BL::RenderPass &b_pass) return -1; } -vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_view_layer) +vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, + BL::ViewLayer &b_view_layer, + bool adaptive_sampling) { vector<Pass> passes; @@ -595,6 +611,10 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLa b_engine.add_pass("Debug Render Time", 1, "X", b_view_layer.name().c_str()); Pass::add(PASS_RENDER_TIME, passes, "Debug Render Time"); } + if (get_boolean(crp, "pass_debug_sample_count")) { + b_engine.add_pass("Debug Sample Count", 1, "X", b_view_layer.name().c_str()); + Pass::add(PASS_SAMPLE_COUNT, passes, "Debug Sample Count"); + } if (get_boolean(crp, 
"use_pass_volume_direct")) { b_engine.add_pass("VolumeDir", 3, "RGB", b_view_layer.name().c_str()); Pass::add(PASS_VOLUME_DIRECT, passes, "VolumeDir"); @@ -641,6 +661,13 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLa CRYPT_ACCURATE); } + if (adaptive_sampling) { + Pass::add(PASS_ADAPTIVE_AUX_BUFFER, passes); + if (!get_boolean(crp, "pass_debug_sample_count")) { + Pass::add(PASS_SAMPLE_COUNT, passes); + } + } + RNA_BEGIN (&crp, b_aov, "aovs") { bool is_color = (get_enum(b_aov, "type") == 1); string name = get_string(b_aov, "name"); @@ -880,6 +907,8 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine, params.use_profiling = params.device.has_profiling && !b_engine.is_preview() && background && BlenderSession::print_render_stats; + params.adaptive_sampling = RNA_boolean_get(&cscene, "use_adaptive_sampling"); + return params; } diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h index 219a3615835..0a7174e407b 100644 --- a/intern/cycles/blender/blender_sync.h +++ b/intern/cycles/blender/blender_sync.h @@ -71,7 +71,9 @@ class BlenderSync { int height, void **python_thread_state); void sync_view_layer(BL::SpaceView3D &b_v3d, BL::ViewLayer &b_view_layer); - vector<Pass> sync_render_passes(BL::RenderLayer &b_render_layer, BL::ViewLayer &b_view_layer); + vector<Pass> sync_render_passes(BL::RenderLayer &b_render_layer, + BL::ViewLayer &b_view_layer, + bool adaptive_sampling); void sync_integrator(); void sync_camera(BL::RenderSettings &b_render, BL::Object &b_override, diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h index a825bd6b128..6a0b39434aa 100644 --- a/intern/cycles/device/cuda/device_cuda.h +++ b/intern/cycles/device/cuda/device_cuda.h @@ -82,6 +82,17 @@ class CUDADevice : public Device { device_vector<TextureInfo> texture_info; bool need_texture_info; + /* Kernels */ + struct { + bool loaded; + + CUfunction adaptive_stopping; + CUfunction adaptive_filter_x; + CUfunction adaptive_filter_y; + CUfunction adaptive_scale_samples; + int adaptive_num_threads_per_block; + } functions; + static bool have_precompiled_kernels(); virtual bool show_samples() const; @@ -114,6 +125,8 @@ class CUDADevice : public Device { virtual bool load_kernels(const DeviceRequestedFeatures &requested_features); + void load_functions(); + void reserve_local_memory(const DeviceRequestedFeatures &requested_features); void init_host_memory(); @@ -197,6 +210,15 @@ class CUDADevice : public Device { void denoise(RenderTile &rtile, DenoisingTask &denoising); + void adaptive_sampling_filter(uint filter_sample, + WorkTile *wtile, + CUdeviceptr d_wtile, + CUstream stream = 0); + void adaptive_sampling_post(RenderTile &rtile, + WorkTile *wtile, + CUdeviceptr d_wtile, + CUstream stream = 0); + void path_trace(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles); void film_convert(DeviceTask &task, diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp index 4a7c45d8b93..11dd9b69f10 100644 --- a/intern/cycles/device/cuda/device_cuda_impl.cpp +++ b/intern/cycles/device/cuda/device_cuda_impl.cpp @@ -208,6 +208,8 @@ CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool map_host_used = 0; can_map_host = 0; + functions.loaded = false; + /* Intialize CUDA. 
*/ if (cuda_error(cuInit(0))) return; @@ -531,9 +533,42 @@ bool CUDADevice::load_kernels(const DeviceRequestedFeatures &requested_features) reserve_local_memory(requested_features); } + load_functions(); + return (result == CUDA_SUCCESS); } +void CUDADevice::load_functions() +{ + /* TODO: load all functions here. */ + if (functions.loaded) { + return; + } + functions.loaded = true; + + cuda_assert(cuModuleGetFunction( + &functions.adaptive_stopping, cuModule, "kernel_cuda_adaptive_stopping")); + cuda_assert(cuModuleGetFunction( + &functions.adaptive_filter_x, cuModule, "kernel_cuda_adaptive_filter_x")); + cuda_assert(cuModuleGetFunction( + &functions.adaptive_filter_y, cuModule, "kernel_cuda_adaptive_filter_y")); + cuda_assert(cuModuleGetFunction( + &functions.adaptive_scale_samples, cuModule, "kernel_cuda_adaptive_scale_samples")); + + cuda_assert(cuFuncSetCacheConfig(functions.adaptive_stopping, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_x, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_y, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(functions.adaptive_scale_samples, CU_FUNC_CACHE_PREFER_L1)); + + int unused_min_blocks; + cuda_assert(cuOccupancyMaxPotentialBlockSize(&unused_min_blocks, + &functions.adaptive_num_threads_per_block, + functions.adaptive_scale_samples, + NULL, + 0, + 0)); +} + void CUDADevice::reserve_local_memory(const DeviceRequestedFeatures &requested_features) { if (use_split_kernel()) { @@ -1666,6 +1701,80 @@ void CUDADevice::denoise(RenderTile &rtile, DenoisingTask &denoising) denoising.run_denoising(&rtile); } +void CUDADevice::adaptive_sampling_filter(uint filter_sample, + WorkTile *wtile, + CUdeviceptr d_wtile, + CUstream stream) +{ + const int num_threads_per_block = functions.adaptive_num_threads_per_block; + + /* These are a series of tiny kernels because there is no grid synchronisation + * from within a kernel, so multiple kernel launches it is.*/ + uint total_work_size = wtile->h * wtile->w; + void *args2[] = {&d_wtile, &filter_sample, &total_work_size}; + uint num_blocks = divide_up(total_work_size, num_threads_per_block); + cuda_assert(cuLaunchKernel(functions.adaptive_stopping, + num_blocks, + 1, + 1, + num_threads_per_block, + 1, + 1, + 0, + stream, + args2, + 0)); + total_work_size = wtile->h; + num_blocks = divide_up(total_work_size, num_threads_per_block); + cuda_assert(cuLaunchKernel(functions.adaptive_filter_x, + num_blocks, + 1, + 1, + num_threads_per_block, + 1, + 1, + 0, + stream, + args2, + 0)); + total_work_size = wtile->w; + num_blocks = divide_up(total_work_size, num_threads_per_block); + cuda_assert(cuLaunchKernel(functions.adaptive_filter_y, + num_blocks, + 1, + 1, + num_threads_per_block, + 1, + 1, + 0, + stream, + args2, + 0)); +} + +void CUDADevice::adaptive_sampling_post(RenderTile &rtile, + WorkTile *wtile, + CUdeviceptr d_wtile, + CUstream stream) +{ + const int num_threads_per_block = functions.adaptive_num_threads_per_block; + uint total_work_size = wtile->h * wtile->w; + + void *args[] = {&d_wtile, &rtile.start_sample, &rtile.sample, &total_work_size}; + uint num_blocks = divide_up(total_work_size, num_threads_per_block); + cuda_assert(cuLaunchKernel(functions.adaptive_scale_samples, + num_blocks, + 1, + 1, + num_threads_per_block, + 1, + 1, + 0, + stream, + args, + 0)); +} + void CUDADevice::path_trace(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles) @@ -1715,6 +1824,9 @@ void 
CUDADevice::path_trace(DeviceTask &task, } uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); + if (task.adaptive_sampling.use) { + step_samples = task.adaptive_sampling.align_static_samples(step_samples); + } /* Render all samples. */ int start_sample = rtile.start_sample; @@ -1736,6 +1848,12 @@ void CUDADevice::path_trace(DeviceTask &task, cuda_assert( cuLaunchKernel(cuPathTrace, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); + /* Run the adaptive sampling kernels at selected samples aligned to step samples. */ + uint filter_sample = sample + wtile->num_samples - 1; + if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { + adaptive_sampling_filter(filter_sample, wtile, d_work_tiles); + } + cuda_assert(cuCtxSynchronize()); /* Update progress. */ @@ -1747,6 +1865,14 @@ void CUDADevice::path_trace(DeviceTask &task, break; } } + + /* Finalize adaptive sampling. */ + if (task.adaptive_sampling.use) { + CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer; + adaptive_sampling_post(rtile, wtile, d_work_tiles); + cuda_assert(cuCtxSynchronize()); + task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); + } } void CUDADevice::film_convert(DeviceTask &task, diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 1c9d2227ac3..cf239ec39e2 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -34,6 +34,7 @@ #include "kernel/kernel_types.h" #include "kernel/split/kernel_split_data.h" #include "kernel/kernel_globals.h" +#include "kernel/kernel_adaptive_sampling.h" #include "kernel/filter/filter.h" @@ -317,6 +318,10 @@ class CPUDevice : public Device { REGISTER_SPLIT_KERNEL(next_iteration_setup); REGISTER_SPLIT_KERNEL(indirect_subsurface); REGISTER_SPLIT_KERNEL(buffer_update); + REGISTER_SPLIT_KERNEL(adaptive_stopping); + REGISTER_SPLIT_KERNEL(adaptive_filter_x); + REGISTER_SPLIT_KERNEL(adaptive_filter_y); + REGISTER_SPLIT_KERNEL(adaptive_adjust_samples); #undef REGISTER_SPLIT_KERNEL #undef KERNEL_FUNCTIONS } @@ -823,6 +828,50 @@ class CPUDevice : public Device { return true; } + bool adaptive_sampling_filter(KernelGlobals *kg, RenderTile &tile, int sample) + { + WorkTile wtile; + wtile.x = tile.x; + wtile.y = tile.y; + wtile.w = tile.w; + wtile.h = tile.h; + wtile.offset = tile.offset; + wtile.stride = tile.stride; + wtile.buffer = (float *)tile.buffer; + + bool any = false; + for (int y = tile.y; y < tile.y + tile.h; ++y) { + any |= kernel_do_adaptive_filter_x(kg, y, &wtile); + } + for (int x = tile.x; x < tile.x + tile.w; ++x) { + any |= kernel_do_adaptive_filter_y(kg, x, &wtile); + } + + return (!any); + } + + void adaptive_sampling_post(const DeviceTask &task, const RenderTile &tile, KernelGlobals *kg) + { + float *render_buffer = (float *)tile.buffer; + for (int y = tile.y; y < tile.y + tile.h; y++) { + for (int x = tile.x; x < tile.x + tile.w; x++) { + int index = tile.offset + x + y * tile.stride; + ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride; + if (buffer[kernel_data.film.pass_sample_count] < 0.0f) { + buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count]; + float sample_multiplier = tile.sample / max((float)tile.start_sample + 1.0f, + buffer[kernel_data.film.pass_sample_count]); + if (sample_multiplier != 1.0f) { + kernel_adaptive_post_adjust(kg, buffer, sample_multiplier); + } + } + else { + kernel_adaptive_post_adjust(kg, buffer, 
tile.sample / (tile.sample - 1.0f)); + } + } + } + } + void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg) { const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE; @@ -855,14 +904,25 @@ class CPUDevice : public Device { path_trace_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride); } } - tile.sample = sample + 1; task.update_progress(&tile, tile.w * tile.h); + + if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) { + const bool stop = adaptive_sampling_filter(kg, tile, sample); + if (stop) { + tile.sample = end_sample; + break; + } + } } if (use_coverage) { coverage.finalize(); } + + if (task.adaptive_sampling.use) { + adaptive_sampling_post(task, tile, kg); + } } void denoise(DenoisingTask &denoising, RenderTile &tile) diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp index 61a5c74f69e..30d624fdf7c 100644 --- a/intern/cycles/device/device_optix.cpp +++ b/intern/cycles/device/device_optix.cpp @@ -627,7 +627,11 @@ class OptiXDevice : public CUDADevice { const int end_sample = rtile.start_sample + rtile.num_samples; // Keep this number reasonable to avoid running into TDRs - const int step_samples = (info.display_device ? 8 : 32); + int step_samples = (info.display_device ? 8 : 32); + if (task.adaptive_sampling.use) { + step_samples = task.adaptive_sampling.align_static_samples(step_samples); + } + // Offset into launch params buffer so that streams use separate data device_ptr launch_params_ptr = launch_params.device_pointer + thread_index * launch_params.data_elements; @@ -638,10 +642,9 @@ class OptiXDevice : public CUDADevice { // Copy work tile information to device wtile.num_samples = min(step_samples, end_sample - sample); wtile.start_sample = sample; - check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, tile), - &wtile, - sizeof(wtile), - cuda_stream[thread_index])); + device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile); + check_result_cuda( + cuMemcpyHtoDAsync(d_wtile_ptr, &wtile, sizeof(wtile), cuda_stream[thread_index])); OptixShaderBindingTable sbt_params = {}; sbt_params.raygenRecord = sbt_data.device_pointer + PG_RGEN * sizeof(SbtRecord); @@ -666,6 +669,12 @@ class OptiXDevice : public CUDADevice { wtile.h, 1)); + // Run the adaptive sampling kernels at selected samples aligned to step samples. 
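// (Illustrative: ADAPTIVE_SAMPLE_STEP is 4 in kernel_types.h, and need_filter(s)
// holds when s > adaptive_min_samples and (s & 3) == 3, so the filter runs after
// every fourth sample once the minimum is reached; the align_static_samples()
// call above keeps step_samples a multiple or divisor of 4 so that a launch
// never straddles one of those filter boundaries.)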
+ uint filter_sample = wtile.start_sample + wtile.num_samples - 1; + if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { + adaptive_sampling_filter(filter_sample, &wtile, d_wtile_ptr, cuda_stream[thread_index]); + } + // Wait for launch to finish check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); @@ -677,6 +686,14 @@ class OptiXDevice : public CUDADevice { if (task.get_cancel() && !task.need_finish_queue) return; // Cancel rendering } + + // Finalize adaptive sampling + if (task.adaptive_sampling.use) { + device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile); + adaptive_sampling_post(rtile, &wtile, d_wtile_ptr, cuda_stream[thread_index]); + check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); + task.update_progress(&rtile, rtile.w * rtile.h * wtile.num_samples); + } } bool launch_denoise(DeviceTask &task, RenderTile &rtile) diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp index 42e597a34d7..f22d8761058 100644 --- a/intern/cycles/device/device_split_kernel.cpp +++ b/intern/cycles/device/device_split_kernel.cpp @@ -55,6 +55,10 @@ DeviceSplitKernel::DeviceSplitKernel(Device *device) kernel_next_iteration_setup = NULL; kernel_indirect_subsurface = NULL; kernel_buffer_update = NULL; + kernel_adaptive_stopping = NULL; + kernel_adaptive_filter_x = NULL; + kernel_adaptive_filter_y = NULL; + kernel_adaptive_adjust_samples = NULL; } DeviceSplitKernel::~DeviceSplitKernel() @@ -83,6 +87,10 @@ DeviceSplitKernel::~DeviceSplitKernel() delete kernel_next_iteration_setup; delete kernel_indirect_subsurface; delete kernel_buffer_update; + delete kernel_adaptive_stopping; + delete kernel_adaptive_filter_x; + delete kernel_adaptive_filter_y; + delete kernel_adaptive_adjust_samples; } bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures &requested_features) @@ -114,6 +122,10 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures &requested_fe LOAD_KERNEL(next_iteration_setup); LOAD_KERNEL(indirect_subsurface); LOAD_KERNEL(buffer_update); + LOAD_KERNEL(adaptive_stopping); + LOAD_KERNEL(adaptive_filter_x); + LOAD_KERNEL(adaptive_filter_y); + LOAD_KERNEL(adaptive_adjust_samples); #undef LOAD_KERNEL @@ -202,13 +214,21 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, /* initial guess to start rolling average */ const int initial_num_samples = 1; /* approx number of samples per second */ - int samples_per_second = (avg_time_per_sample > 0.0) ? - int(double(time_multiplier) / avg_time_per_sample) + 1 : - initial_num_samples; + const int samples_per_second = (avg_time_per_sample > 0.0) ? + int(double(time_multiplier) / avg_time_per_sample) + 1 : + initial_num_samples; RenderTile subtile = tile; subtile.start_sample = tile.sample; - subtile.num_samples = min(samples_per_second, + subtile.num_samples = samples_per_second; + + if (task->adaptive_sampling.use) { + subtile.num_samples = task->adaptive_sampling.align_dynamic_samples(subtile.start_sample, + subtile.num_samples); + } + + /* Don't go beyond requested number of samples. 
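* The samples-per-second estimate above may exceed what is left in the tile,
* so clamp to the tile's requested sample range here.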
*/ + subtile.num_samples = min(subtile.num_samples, tile.start_sample + tile.num_samples - tile.sample); if (device->have_error()) { @@ -302,6 +322,23 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, } } + int filter_sample = tile.sample + subtile.num_samples - 1; + if (task->adaptive_sampling.use && task->adaptive_sampling.need_filter(filter_sample)) { + size_t buffer_size[2]; + buffer_size[0] = round_up(tile.w, local_size[0]); + buffer_size[1] = round_up(tile.h, local_size[1]); + kernel_adaptive_stopping->enqueue( + KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data); + buffer_size[0] = round_up(tile.h, local_size[0]); + buffer_size[1] = round_up(1, local_size[1]); + kernel_adaptive_filter_x->enqueue( + KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data); + buffer_size[0] = round_up(tile.w, local_size[0]); + buffer_size[1] = round_up(1, local_size[1]); + kernel_adaptive_filter_y->enqueue( + KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data); + } + double time_per_sample = ((time_dt() - start_time) / subtile.num_samples); if (avg_time_per_sample == 0.0) { @@ -324,6 +361,28 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, } } + if (task->adaptive_sampling.use) { + /* Reset the start samples. */ + RenderTile subtile = tile; + subtile.start_sample = tile.start_sample; + subtile.num_samples = tile.sample - tile.start_sample; + enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size), + subtile, + num_global_elements, + kgbuffer, + kernel_data, + split_data, + ray_state, + queue_index, + use_queues_flag, + work_pool_wgs); + size_t buffer_size[2]; + buffer_size[0] = round_up(tile.w, local_size[0]); + buffer_size[1] = round_up(tile.h, local_size[1]); + kernel_adaptive_adjust_samples->enqueue( + KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data); + } + return true; } diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h index 6ff326bf214..9d6b9efdd62 100644 --- a/intern/cycles/device/device_split_kernel.h +++ b/intern/cycles/device/device_split_kernel.h @@ -75,6 +75,10 @@ class DeviceSplitKernel { SplitKernelFunction *kernel_next_iteration_setup; SplitKernelFunction *kernel_indirect_subsurface; SplitKernelFunction *kernel_buffer_update; + SplitKernelFunction *kernel_adaptive_stopping; + SplitKernelFunction *kernel_adaptive_filter_x; + SplitKernelFunction *kernel_adaptive_filter_y; + SplitKernelFunction *kernel_adaptive_adjust_samples; /* Global memory variables [porting]; These memory is used for * co-operation between different kernels; Data written by one diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp index 36522b874ab..c36b1344c3b 100644 --- a/intern/cycles/device/device_task.cpp +++ b/intern/cycles/device/device_task.cpp @@ -136,4 +136,59 @@ void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples) } } +/* Adaptive Sampling */ + +AdaptiveSampling::AdaptiveSampling() + : use(true), adaptive_step(ADAPTIVE_SAMPLE_STEP), min_samples(0) +{ +} + +/* Render samples in steps that align with the adaptive filtering. */ +int AdaptiveSampling::align_static_samples(int samples) const +{ + if (samples > adaptive_step) { + /* Make multiple of adaptive_step. */ + while (samples % adaptive_step != 0) { + samples--; + } + } + else if (samples < adaptive_step) { + /* Make divisor of adaptive_step. 
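* (e.g. with adaptive_step == 4, a request of 3 samples becomes 2, so that
* consecutive step boundaries still coincide with filter samples.)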
*/ + while (adaptive_step % samples != 0) { + samples--; + } + } + + return max(samples, 1); +} + +/* Render samples in steps that align with the adaptive filtering, with the + * suggested number of samples dynamically changing. */ +int AdaptiveSampling::align_dynamic_samples(int offset, int samples) const +{ + /* Round so that we end up on multiples of adaptive_samples. */ + samples += offset; + + if (samples > adaptive_step) { + /* Make multiple of adaptive_step. */ + while (samples % adaptive_step != 0) { + samples--; + } + } + + samples -= offset; + + return max(samples, 1); +} + +bool AdaptiveSampling::need_filter(int sample) const +{ + if (sample > min_samples) { + return (sample & (adaptive_step - 1)) == (adaptive_step - 1); + } + else { + return false; + } +} + CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index 972f6131092..8c4e682adb1 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -62,6 +62,19 @@ class DenoiseParams { } }; +class AdaptiveSampling { + public: + AdaptiveSampling(); + + int align_static_samples(int samples) const; + int align_dynamic_samples(int offset, int samples) const; + bool need_filter(int sample) const; + + bool use; + int adaptive_step; + int min_samples; +}; + class DeviceTask : public Task { public: typedef enum { RENDER, FILM_CONVERT, SHADER, DENOISE_BUFFER } Type; @@ -115,6 +128,7 @@ class DeviceTask : public Task { bool need_finish_queue; bool integrator_branched; + AdaptiveSampling adaptive_sampling; protected: double last_update_time; diff --git a/intern/cycles/device/opencl/device_opencl.h b/intern/cycles/device/opencl/device_opencl.h index 61b1e3e3b6b..b761726b1ad 100644 --- a/intern/cycles/device/opencl/device_opencl.h +++ b/intern/cycles/device/opencl/device_opencl.h @@ -445,6 +445,7 @@ class OpenCLDevice : public Device { device_ptr rgba_byte, device_ptr rgba_half); void shader(DeviceTask &task); + void update_adaptive(DeviceTask &task, RenderTile &tile, int sample); void denoise(RenderTile &tile, DenoisingTask &denoising); diff --git a/intern/cycles/device/opencl/device_opencl_impl.cpp b/intern/cycles/device/opencl/device_opencl_impl.cpp index 68cdfd5238c..3dbe54b38aa 100644 --- a/intern/cycles/device/opencl/device_opencl_impl.cpp +++ b/intern/cycles/device/opencl/device_opencl_impl.cpp @@ -56,7 +56,11 @@ static const string SPLIT_BUNDLE_KERNELS = "enqueue_inactive " "next_iteration_setup " "indirect_subsurface " - "buffer_update"; + "buffer_update " + "adaptive_stopping " + "adaptive_filter_x " + "adaptive_filter_y " + "adaptive_adjust_samples"; const string OpenCLDevice::get_opencl_program_name(const string &kernel_name) { @@ -283,6 +287,10 @@ void OpenCLDevice::OpenCLSplitPrograms::load_kernels( ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(next_iteration_setup); ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(indirect_subsurface); ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(buffer_update); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_stopping); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_x); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_y); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_adjust_samples); programs.push_back(&program_split); # undef ADD_SPLIT_KERNEL_PROGRAM diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 566b6e3d191..0dd0da65f82 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -36,6 +36,10 @@ set(SRC_CUDA_KERNELS ) set(SRC_OPENCL_KERNELS + 
kernels/opencl/kernel_adaptive_stopping.cl + kernels/opencl/kernel_adaptive_filter_x.cl + kernels/opencl/kernel_adaptive_filter_y.cl + kernels/opencl/kernel_adaptive_adjust_samples.cl kernels/opencl/kernel_bake.cl kernels/opencl/kernel_base.cl kernels/opencl/kernel_displace.cl @@ -94,6 +98,7 @@ set(SRC_BVH_HEADERS set(SRC_HEADERS kernel_accumulate.h + kernel_adaptive_sampling.h kernel_bake.h kernel_camera.h kernel_color.h @@ -324,6 +329,10 @@ set(SRC_UTIL_HEADERS ) set(SRC_SPLIT_HEADERS + split/kernel_adaptive_adjust_samples.h + split/kernel_adaptive_filter_x.h + split/kernel_adaptive_filter_y.h + split/kernel_adaptive_stopping.h split/kernel_branched.h split/kernel_buffer_update.h split/kernel_data_init.h diff --git a/intern/cycles/kernel/kernel_adaptive_sampling.h b/intern/cycles/kernel/kernel_adaptive_sampling.h new file mode 100644 index 00000000000..502b69e4f7f --- /dev/null +++ b/intern/cycles/kernel/kernel_adaptive_sampling.h @@ -0,0 +1,231 @@ +/* + * Copyright 2019 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __KERNEL_ADAPTIVE_SAMPLING_H__ +#define __KERNEL_ADAPTIVE_SAMPLING_H__ + +CCL_NAMESPACE_BEGIN + +/* Determines whether to continue sampling a given pixel or if it has sufficiently converged. */ + +ccl_device void kernel_do_adaptive_stopping(KernelGlobals *kg, + ccl_global float *buffer, + int sample) +{ + /* TODO Stefan: Is this better in linear, sRGB or something else? */ + float4 I = *((ccl_global float4 *)buffer); + float4 A = *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer); + /* The per pixel error as seen in section 2.1 of + * "A hierarchical automatic stopping condition for Monte Carlo global illumination" + * A small epsilon is added to the divisor to prevent division by zero. */ + float error = (fabsf(I.x - A.x) + fabsf(I.y - A.y) + fabsf(I.z - A.z)) / + (sample * 0.0001f + sqrtf(I.x + I.y + I.z)); + if (error < kernel_data.integrator.adaptive_threshold * (float)sample) { + /* Set the fourth component to non-zero value to indicate that this pixel has converged. */ + buffer[kernel_data.film.pass_adaptive_aux_buffer + 3] += 1.0f; + } +} + +/* Adjust the values of an adaptively sampled pixel. */ + +ccl_device void kernel_adaptive_post_adjust(KernelGlobals *kg, + ccl_global float *buffer, + float sample_multiplier) +{ + *(ccl_global float4 *)(buffer) *= sample_multiplier; + + /* Scale the aux pass too, this is necessary for progressive rendering to work properly. 
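* All passes hold unnormalized running sums, so multiplying every pass by the
* same factor makes the pixel look as if it had been rendered with a uniform
* sample count.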
*/ + kernel_assert(kernel_data.film.pass_adaptive_aux_buffer); + *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer) *= sample_multiplier; + +#ifdef __PASSES__ + int flag = kernel_data.film.pass_flag; + + if (flag & PASSMASK(SHADOW)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_shadow) *= sample_multiplier; + + if (flag & PASSMASK(MIST)) + *(ccl_global float *)(buffer + kernel_data.film.pass_mist) *= sample_multiplier; + + if (flag & PASSMASK(NORMAL)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_normal) *= sample_multiplier; + + if (flag & PASSMASK(UV)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_uv) *= sample_multiplier; + + if (flag & PASSMASK(MOTION)) { + *(ccl_global float4 *)(buffer + kernel_data.film.pass_motion) *= sample_multiplier; + *(ccl_global float *)(buffer + kernel_data.film.pass_motion_weight) *= sample_multiplier; + } + + if (kernel_data.film.use_light_pass) { + int light_flag = kernel_data.film.light_pass_flag; + + if (light_flag & PASSMASK(DIFFUSE_INDIRECT)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_indirect) *= sample_multiplier; + if (light_flag & PASSMASK(GLOSSY_INDIRECT)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_indirect) *= sample_multiplier; + if (light_flag & PASSMASK(TRANSMISSION_INDIRECT)) + *(ccl_global float3 *)(buffer + + kernel_data.film.pass_transmission_indirect) *= sample_multiplier; + if (light_flag & PASSMASK(VOLUME_INDIRECT)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_indirect) *= sample_multiplier; + if (light_flag & PASSMASK(DIFFUSE_DIRECT)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_direct) *= sample_multiplier; + if (light_flag & PASSMASK(GLOSSY_DIRECT)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_direct) *= sample_multiplier; + if (light_flag & PASSMASK(TRANSMISSION_DIRECT)) + *(ccl_global float3 *)(buffer + + kernel_data.film.pass_transmission_direct) *= sample_multiplier; + if (light_flag & PASSMASK(VOLUME_DIRECT)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_direct) *= sample_multiplier; + + if (light_flag & PASSMASK(EMISSION)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_emission) *= sample_multiplier; + if (light_flag & PASSMASK(BACKGROUND)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_background) *= sample_multiplier; + if (light_flag & PASSMASK(AO)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_ao) *= sample_multiplier; + + if (light_flag & PASSMASK(DIFFUSE_COLOR)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_color) *= sample_multiplier; + if (light_flag & PASSMASK(GLOSSY_COLOR)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_color) *= sample_multiplier; + if (light_flag & PASSMASK(TRANSMISSION_COLOR)) + *(ccl_global float3 *)(buffer + + kernel_data.film.pass_transmission_color) *= sample_multiplier; + } +#endif + +#ifdef __DENOISING_FEATURES__ + +# define scale_float3_variance(buffer, offset, scale) \ + *(buffer + offset) *= scale; \ + *(buffer + offset + 1) *= scale; \ + *(buffer + offset + 2) *= scale; \ + *(buffer + offset + 3) *= scale * scale; \ + *(buffer + offset + 4) *= scale * scale; \ + *(buffer + offset + 5) *= scale * scale; + +# define scale_shadow_variance(buffer, offset, scale) \ + *(buffer + offset) *= scale; \ + *(buffer + offset + 1) *= scale; \ + *(buffer + offset + 2) *= scale * scale; + + if (kernel_data.film.pass_denoising_data) { + scale_shadow_variance( + buffer, 
kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_A, sample_multiplier); + scale_shadow_variance( + buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_B, sample_multiplier); + if (kernel_data.film.pass_denoising_clean) { + scale_float3_variance( + buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier); + *(buffer + kernel_data.film.pass_denoising_clean) *= sample_multiplier; + *(buffer + kernel_data.film.pass_denoising_clean + 1) *= sample_multiplier; + *(buffer + kernel_data.film.pass_denoising_clean + 2) *= sample_multiplier; + } + else { + scale_float3_variance( + buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier); + } + scale_float3_variance( + buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL, sample_multiplier); + scale_float3_variance( + buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO, sample_multiplier); + *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH) *= sample_multiplier; + *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH + + 1) *= sample_multiplier * sample_multiplier; + } +#endif /* __DENOISING_FEATURES__ */ + + if (kernel_data.film.cryptomatte_passes) { + int num_slots = 0; + num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) ? 1 : 0; + num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) ? 1 : 0; + num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) ? 1 : 0; + num_slots = num_slots * 2 * kernel_data.film.cryptomatte_depth; + ccl_global float2 *id_buffer = (ccl_global float2 *)(buffer + + kernel_data.film.pass_cryptomatte); + for (int slot = 0; slot < num_slots; slot++) { + id_buffer[slot].y *= sample_multiplier; + } + } +} + +/* This is a simple box filter in two passes. + * When a pixel demands more adaptive samples, let its neighboring pixels draw more samples too. 
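* Convergence is tracked in the w component of the aux pass: the x pass
* re-activates a converged pixel whenever a horizontal neighbour is still
* sampling, and the y pass then repeats the same along columns.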
*/ + +ccl_device bool kernel_do_adaptive_filter_x(KernelGlobals *kg, int y, ccl_global WorkTile *tile) +{ + bool any = false; + bool prev = false; + for (int x = tile->x; x < tile->x + tile->w; ++x) { + int index = tile->offset + x + y * tile->stride; + ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride; + ccl_global float4 *aux = (ccl_global float4 *)(buffer + + kernel_data.film.pass_adaptive_aux_buffer); + if (aux->w == 0.0f) { + any = true; + if (x > tile->x && !prev) { + index = index - 1; + buffer = tile->buffer + index * kernel_data.film.pass_stride; + aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer); + aux->w = 0.0f; + } + prev = true; + } + else { + if (prev) { + aux->w = 0.0f; + } + prev = false; + } + } + return any; +} + +ccl_device bool kernel_do_adaptive_filter_y(KernelGlobals *kg, int x, ccl_global WorkTile *tile) +{ + bool prev = false; + bool any = false; + for (int y = tile->y; y < tile->y + tile->h; ++y) { + int index = tile->offset + x + y * tile->stride; + ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride; + ccl_global float4 *aux = (ccl_global float4 *)(buffer + + kernel_data.film.pass_adaptive_aux_buffer); + if (aux->w == 0.0f) { + any = true; + if (y > tile->y && !prev) { + index = index - tile->stride; + buffer = tile->buffer + index * kernel_data.film.pass_stride; + aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer); + aux->w = 0.0f; + } + prev = true; + } + else { + if (prev) { + aux->w = 0.0f; + } + prev = false; + } + } + return any; +} + +CCL_NAMESPACE_END + +#endif /* __KERNEL_ADAPTIVE_SAMPLING_H__ */ diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h index 187e8340c82..33ec05c6048 100644 --- a/intern/cycles/kernel/kernel_passes.h +++ b/intern/cycles/kernel/kernel_passes.h @@ -29,7 +29,9 @@ ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, if (kernel_data.film.pass_denoising_data == 0) return; - buffer += (sample & 1) ? DENOISING_PASS_SHADOW_B : DENOISING_PASS_SHADOW_A; + buffer += sample_is_even(kernel_data.integrator.sampling_pattern, sample) ? + DENOISING_PASS_SHADOW_B : + DENOISING_PASS_SHADOW_A; path_total = ensure_finite(path_total); path_total_shaded = ensure_finite(path_total_shaded); @@ -386,6 +388,41 @@ ccl_device_inline void kernel_write_result(KernelGlobals *kg, #ifdef __KERNEL_DEBUG__ kernel_write_debug_passes(kg, buffer, L); #endif + + /* Adaptive Sampling. Fill the additional buffer with the odd samples and calculate our stopping + criteria. This is the heuristic from "A hierarchical automatic stopping condition for Monte + Carlo global illumination" except that here it is applied per pixel and not in hierarchical + tiles. */ + if (kernel_data.film.pass_adaptive_aux_buffer && + kernel_data.integrator.adaptive_threshold > 0.0f) { + if (sample_is_even(kernel_data.integrator.sampling_pattern, sample)) { + kernel_write_pass_float4(buffer + kernel_data.film.pass_adaptive_aux_buffer, + make_float4(L_sum.x * 2.0f, L_sum.y * 2.0f, L_sum.z * 2.0f, 0.0f)); + } +#ifdef __KERNEL_CPU__ + if (sample > kernel_data.integrator.adaptive_min_samples && + (sample & (ADAPTIVE_SAMPLE_STEP - 1)) == (ADAPTIVE_SAMPLE_STEP - 1)) { + kernel_do_adaptive_stopping(kg, buffer, sample); + } +#endif + } + + /* Write the sample count as negative numbers initially to mark the samples as in progress. 
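* (Marking is done by OR-ing in the float sign bit, so it can also be applied
* atomically when __ATOMIC_PASS_WRITE__ is defined, see below.)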
+ * Once the tile has finished rendering, the sign gets flipped and all the pixel values + * are scaled as if they were taken at a uniform sample count. */ + if (kernel_data.film.pass_sample_count) { + /* Make sure it's a negative number. In progressive refine mode, this bit gets flipped between + * passes. */ +#ifdef __ATOMIC_PASS_WRITE__ + atomic_fetch_and_or_uint32((ccl_global uint *)(buffer + kernel_data.film.pass_sample_count), + 0x80000000); +#else + if (buffer[kernel_data.film.pass_sample_count] > 0) { + buffer[kernel_data.film.pass_sample_count] *= -1.0f; + } +#endif + kernel_write_pass_float(buffer + kernel_data.film.pass_sample_count, -1.0f); + } } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index 1a0b67275a7..bdd2703a894 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -31,6 +31,7 @@ #include "kernel/kernel_accumulate.h" #include "kernel/kernel_shader.h" #include "kernel/kernel_light.h" +#include "kernel/kernel_adaptive_sampling.h" #include "kernel/kernel_passes.h" #if defined(__VOLUME__) || defined(__SUBSURFACE__) @@ -656,6 +657,14 @@ ccl_device void kernel_path_trace( buffer += index * pass_stride; + if (kernel_data.film.pass_adaptive_aux_buffer) { + ccl_global float4 *aux = (ccl_global float4 *)(buffer + + kernel_data.film.pass_adaptive_aux_buffer); + if (aux->w > 0.0f) { + return; + } + } + /* Initialize random numbers and sample ray. */ uint rng_hash; Ray ray; diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h index f75e4ab4c97..0d5781fe3d1 100644 --- a/intern/cycles/kernel/kernel_path_branched.h +++ b/intern/cycles/kernel/kernel_path_branched.h @@ -523,6 +523,14 @@ ccl_device void kernel_branched_path_trace( buffer += index * pass_stride; + if (kernel_data.film.pass_adaptive_aux_buffer) { + ccl_global float4 *aux = (ccl_global float4 *)(buffer + + kernel_data.film.pass_adaptive_aux_buffer); + if (aux->w > 0.0f) { + return; + } + } + /* initialize random numbers and ray */ uint rng_hash; Ray ray; diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 88c2d0d3196..c5be93e2cda 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -63,6 +63,11 @@ CCL_NAMESPACE_BEGIN #define VOLUME_STACK_SIZE 32 +/* Adaptive sampling constants */ +#define ADAPTIVE_SAMPLE_STEP 4 +static_assert((ADAPTIVE_SAMPLE_STEP & (ADAPTIVE_SAMPLE_STEP - 1)) == 0, + "ADAPTIVE_SAMPLE_STEP must be power of two for bitwise operations to work"); + /* Split kernel constants */ #define WORK_POOL_SIZE_GPU 64 #define WORK_POOL_SIZE_CPU 1 @@ -374,6 +379,8 @@ typedef enum PassType { PASS_CRYPTOMATTE, PASS_AOV_COLOR, PASS_AOV_VALUE, + PASS_ADAPTIVE_AUX_BUFFER, + PASS_SAMPLE_COUNT, PASS_CATEGORY_MAIN_END = 31, PASS_MIST = 32, @@ -1223,6 +1230,9 @@ typedef struct KernelFilm { int cryptomatte_depth; int pass_cryptomatte; + int pass_adaptive_aux_buffer; + int pass_sample_count; + int pass_mist; float mist_start; float mist_inv_depth; @@ -1256,6 +1266,8 @@ typedef struct KernelFilm { int display_divide_pass_stride; int use_display_exposure; int use_display_pass_alpha; + + int pad3, pad4, pad5; } KernelFilm; static_assert_align(KernelFilm, 16); @@ -1337,6 +1349,8 @@ typedef struct KernelIntegrator { /* sampler */ int sampling_pattern; int aa_samples; + int adaptive_min_samples; + float adaptive_threshold; /* volume render */ int use_volumes; @@ -1348,7 +1362,7 @@ typedef struct KernelIntegrator { 
int max_closures; - int pad1; + int pad1, pad2, pad3; } KernelIntegrator; static_assert_align(KernelIntegrator, 16); @@ -1662,7 +1676,7 @@ typedef struct WorkTile { uint start_sample; uint num_samples; - uint offset; + int offset; uint stride; ccl_global float *buffer; diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h index 799561a7466..c642d227e4b 100644 --- a/intern/cycles/kernel/kernel_work_stealing.h +++ b/intern/cycles/kernel/kernel_work_stealing.h @@ -23,17 +23,41 @@ CCL_NAMESPACE_BEGIN * Utility functions for work stealing */ +/* Map global work index to tile, pixel X/Y and sample. */ +ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile, + uint global_work_index, + ccl_private uint *x, + ccl_private uint *y, + ccl_private uint *sample) +{ +#ifdef __KERNEL_CUDA__ + /* Keeping threads for the same pixel together improves performance on CUDA. */ + uint sample_offset = global_work_index % tile->num_samples; + uint pixel_offset = global_work_index / tile->num_samples; +#else /* __KERNEL_CUDA__ */ + uint tile_pixels = tile->w * tile->h; + uint sample_offset = global_work_index / tile_pixels; + uint pixel_offset = global_work_index - sample_offset * tile_pixels; +#endif /* __KERNEL_CUDA__ */ + uint y_offset = pixel_offset / tile->w; + uint x_offset = pixel_offset - y_offset * tile->w; + + *x = tile->x + x_offset; + *y = tile->y + y_offset; + *sample = tile->start_sample + sample_offset; +} + #ifdef __KERNEL_OPENCL__ # pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #endif #ifdef __SPLIT_KERNEL__ /* Returns true if there is work */ -ccl_device bool get_next_work(KernelGlobals *kg, - ccl_global uint *work_pools, - uint total_work_size, - uint ray_index, - ccl_private uint *global_work_index) +ccl_device bool get_next_work_item(KernelGlobals *kg, + ccl_global uint *work_pools, + uint total_work_size, + uint ray_index, + ccl_private uint *global_work_index) { /* With a small amount of work there may be more threads than work due to * rounding up of global size, stop such threads immediately. */ @@ -56,31 +80,37 @@ ccl_device bool get_next_work(KernelGlobals *kg, /* Test if all work for this pool is done. */ return (*global_work_index < total_work_size); } -#endif -/* Map global work index to tile, pixel X/Y and sample. */ -ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile, - uint global_work_index, - ccl_private uint *x, - ccl_private uint *y, - ccl_private uint *sample) +ccl_device bool get_next_work(KernelGlobals *kg, + ccl_global uint *work_pools, + uint total_work_size, + uint ray_index, + ccl_private uint *global_work_index) { -#ifdef __KERNEL_CUDA__ - /* Keeping threads for the same pixel together improves performance on CUDA. 
*/ - uint sample_offset = global_work_index % tile->num_samples; - uint pixel_offset = global_work_index / tile->num_samples; -#else /* __KERNEL_CUDA__ */ - uint tile_pixels = tile->w * tile->h; - uint sample_offset = global_work_index / tile_pixels; - uint pixel_offset = global_work_index - sample_offset * tile_pixels; -#endif /* __KERNEL_CUDA__ */ - uint y_offset = pixel_offset / tile->w; - uint x_offset = pixel_offset - y_offset * tile->w; - - *x = tile->x + x_offset; - *y = tile->y + y_offset; - *sample = tile->start_sample + sample_offset; + bool got_work = false; + if (kernel_data.film.pass_adaptive_aux_buffer) { + do { + got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index); + if (got_work) { + ccl_global WorkTile *tile = &kernel_split_params.tile; + uint x, y, sample; + get_work_pixel(tile, *global_work_index, &x, &y, &sample); + uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride; + ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset; + ccl_global float4 *aux = (ccl_global float4 *)(buffer + + kernel_data.film.pass_adaptive_aux_buffer); + if (aux->w == 0.0f) { + break; + } + } + } while (got_work); + } + else { + got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index); + } + return got_work; } +#endif CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h index f5d981fb71a..683f4b88d79 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h @@ -89,5 +89,9 @@ DECLARE_SPLIT_KERNEL_FUNCTION(enqueue_inactive) DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup) DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update) +DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_stopping) +DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x) +DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y) +DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples) #undef KERNEL_ARCH diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h index 9ca3f46b5b6..96b2bf11132 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h @@ -58,6 +58,10 @@ # include "kernel/split/kernel_next_iteration_setup.h" # include "kernel/split/kernel_indirect_subsurface.h" # include "kernel/split/kernel_buffer_update.h" +# include "kernel/split/kernel_adaptive_stopping.h" +# include "kernel/split/kernel_adaptive_filter_x.h" +# include "kernel/split/kernel_adaptive_filter_y.h" +# include "kernel/split/kernel_adaptive_adjust_samples.h" # endif /* __SPLIT_KERNEL__ */ #else # define STUB_ASSERT(arch, name) \ @@ -204,6 +208,10 @@ DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint) +DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_stopping) +DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x) +DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y) +DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples) #endif /* __SPLIT_KERNEL__ */ #undef KERNEL_STUB diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu index af311027f78..c4c810c6a82 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel.cu @@ 
-33,6 +33,7 @@ #include "kernel/kernel_path_branched.h" #include "kernel/kernel_bake.h" #include "kernel/kernel_work_stealing.h" +#include "kernel/kernel_adaptive_sampling.h" /* kernels */ extern "C" __global__ void @@ -83,6 +84,75 @@ kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size) extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_adaptive_stopping(WorkTile *tile, int sample, uint total_work_size) +{ + int work_index = ccl_global_id(0); + bool thread_is_active = work_index < total_work_size; + KernelGlobals kg; + if(thread_is_active && kernel_data.film.pass_adaptive_aux_buffer) { + uint x = tile->x + work_index % tile->w; + uint y = tile->y + work_index / tile->w; + int index = tile->offset + x + y * tile->stride; + ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride; + kernel_do_adaptive_stopping(&kg, buffer, sample); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_adaptive_filter_x(WorkTile *tile, int sample, uint) +{ + KernelGlobals kg; + if(kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) { + if(ccl_global_id(0) < tile->h) { + int y = tile->y + ccl_global_id(0); + kernel_do_adaptive_filter_x(&kg, y, tile); + } + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_adaptive_filter_y(WorkTile *tile, int sample, uint) +{ + KernelGlobals kg; + if(kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) { + if(ccl_global_id(0) < tile->w) { + int x = tile->x + ccl_global_id(0); + kernel_do_adaptive_filter_y(&kg, x, tile); + } + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_adaptive_scale_samples(WorkTile *tile, int start_sample, int sample, uint total_work_size) +{ + if(kernel_data.film.pass_adaptive_aux_buffer) { + int work_index = ccl_global_id(0); + bool thread_is_active = work_index < total_work_size; + KernelGlobals kg; + if(thread_is_active) { + uint x = tile->x + work_index % tile->w; + uint y = tile->y + work_index / tile->w; + int index = tile->offset + x + y * tile->stride; + ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride; + if(buffer[kernel_data.film.pass_sample_count] < 0.0f) { + buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count]; + float sample_multiplier = sample / max((float)start_sample + 1.0f, buffer[kernel_data.film.pass_sample_count]); + if(sample_multiplier != 1.0f) { + kernel_adaptive_post_adjust(&kg, buffer, sample_multiplier); + } + } + else { + kernel_adaptive_post_adjust(&kg, buffer, sample / (sample - 1.0f)); + } + } + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) { int x = sx + blockDim.x*blockIdx.x + threadIdx.x; diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu index 43b3d0aa0e6..95ad7599cf1 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu @@ -43,6 +43,10 @@ #include "kernel/split/kernel_next_iteration_setup.h" #include 
"kernel/split/kernel_indirect_subsurface.h" #include "kernel/split/kernel_buffer_update.h" +#include "kernel/split/kernel_adaptive_stopping.h" +#include "kernel/split/kernel_adaptive_filter_x.h" +#include "kernel/split/kernel_adaptive_filter_y.h" +#include "kernel/split/kernel_adaptive_adjust_samples.h" #include "kernel/kernel_film.h" @@ -121,6 +125,10 @@ DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint) +DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_stopping) +DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x) +DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y) +DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples) extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl new file mode 100644 index 00000000000..ebdb99d4730 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl @@ -0,0 +1,23 @@ +/* + * Copyright 2019 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_adaptive_adjust_samples.h" + +#define KERNEL_NAME adaptive_adjust_samples +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl new file mode 100644 index 00000000000..76d82d4184e --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl @@ -0,0 +1,23 @@ +/* + * Copyright 2019 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl
new file mode 100644
index 00000000000..76d82d4184e
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_adaptive_filter_x.h"
+
+#define KERNEL_NAME adaptive_filter_x
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl
new file mode 100644
index 00000000000..1e6d15ba0f2
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_adaptive_filter_y.h"
+
+#define KERNEL_NAME adaptive_filter_y
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
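filter_x launches one work item per row of the tile and filter_y one per column, which suggests a separable neighborhood filter over the per-pixel convergence flags; the actual test lives in kernel_do_adaptive_filter_x/y in kernel_adaptive_sampling.h, outside this diff. Assuming it behaves like a dilation of the still-active set, the row pass would look roughly like the sketch below (dilate_rows and the char mask are inventions for illustration).

    #include <vector>

    // One horizontal pass: a pixel next to an active (not yet converged)
    // pixel becomes active again; the vertical pass is the transpose.
    void dilate_rows(std::vector<char> &active, int w, int h)
    {
      for (int y = 0; y < h; y++) {
        char *row = &active[y * w];
        std::vector<char> snap(row, row + w);  // snapshot keeps the pass order-independent
        for (int x = 0; x < w; x++) {
          if ((x > 0 && snap[x - 1]) || (x + 1 < w && snap[x + 1]))
            row[x] = 1;
        }
      }
    }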
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl
new file mode 100644
index 00000000000..51de0059667
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_adaptive_stopping.h"
+
+#define KERNEL_NAME adaptive_stopping
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl b/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl
index 6041f13b52b..c3b7b09460a 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl
@@ -28,3 +28,7 @@
 #include "kernel/kernels/opencl/kernel_next_iteration_setup.cl"
 #include "kernel/kernels/opencl/kernel_indirect_subsurface.cl"
 #include "kernel/kernels/opencl/kernel_buffer_update.cl"
+#include "kernel/kernels/opencl/kernel_adaptive_stopping.cl"
+#include "kernel/kernels/opencl/kernel_adaptive_filter_x.cl"
+#include "kernel/kernels/opencl/kernel_adaptive_filter_y.cl"
+#include "kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl"
diff --git a/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h b/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h
new file mode 100644
index 00000000000..60ebf415970
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_adaptive_adjust_samples(KernelGlobals *kg)
+{
+  int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+  if (pixel_index < kernel_split_params.tile.w * kernel_split_params.tile.h) {
+    int x = kernel_split_params.tile.x + pixel_index % kernel_split_params.tile.w;
+    int y = kernel_split_params.tile.y + pixel_index / kernel_split_params.tile.w;
+    int buffer_offset = (kernel_split_params.tile.offset + x +
+                         y * kernel_split_params.tile.stride) *
+                        kernel_data.film.pass_stride;
+    ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
+    int sample = kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples;
+    if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
+      buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
+      float sample_multiplier = sample / max((float)kernel_split_params.tile.start_sample + 1.0f,
+                                             buffer[kernel_data.film.pass_sample_count]);
+      if (sample_multiplier != 1.0f) {
+        kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
+      }
+    }
+    else {
+      kernel_adaptive_post_adjust(kg, buffer, sample / (sample - 1.0f));
+    }
+  }
+}
+
+CCL_NAMESPACE_END
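A worked example of the adjustment above: a tile rendering samples 0 through 15 (start_sample 0, num_samples 16) has a pixel that converged after 8 samples, so its sample-count pass holds -8. The multiplier is 16 / max(1, 8) = 2, and doubling every accumulated pass means the uniform divide by 16 downstream still yields the pixel's 8-sample mean. The same arithmetic as a self-checking snippet (the values are hypothetical):

    #include <algorithm>
    #include <cassert>

    int main()
    {
      float accum = 4.0f;   // accumulated pass value after 8 samples
      float count = -8.0f;  // negative: pixel was stopped early
      int start_sample = 0, sample = 16;

      count = -count;
      float multiplier = sample / std::max((float)start_sample + 1.0f, count);
      accum *= multiplier;  // per-component work of kernel_adaptive_post_adjust

      assert(accum / sample == 0.5f);  // same mean as 4.0f over 8 samples
      return 0;
    }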
diff --git a/intern/cycles/kernel/split/kernel_adaptive_filter_x.h b/intern/cycles/kernel/split/kernel_adaptive_filter_x.h
new file mode 100644
index 00000000000..93f41f7ced4
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_adaptive_filter_x.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_adaptive_filter_x(KernelGlobals *kg)
+{
+  int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+  if (pixel_index < kernel_split_params.tile.h &&
+      kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
+          kernel_data.integrator.adaptive_min_samples) {
+    int y = kernel_split_params.tile.y + pixel_index;
+    kernel_do_adaptive_filter_x(kg, y, &kernel_split_params.tile);
+  }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_adaptive_filter_y.h b/intern/cycles/kernel/split/kernel_adaptive_filter_y.h
new file mode 100644
index 00000000000..eca53d079ec
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_adaptive_filter_y.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_adaptive_filter_y(KernelGlobals *kg)
+{
+  int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+  if (pixel_index < kernel_split_params.tile.w &&
+      kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
+          kernel_data.integrator.adaptive_min_samples) {
+    int x = kernel_split_params.tile.x + pixel_index;
+    kernel_do_adaptive_filter_y(kg, x, &kernel_split_params.tile);
+  }
+}
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_adaptive_stopping.h b/intern/cycles/kernel/split/kernel_adaptive_stopping.h
new file mode 100644
index 00000000000..c8eb1ebd705
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_adaptive_stopping.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_adaptive_stopping(KernelGlobals *kg)
+{
+  int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+  if (pixel_index < kernel_split_params.tile.w * kernel_split_params.tile.h &&
+      kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
+          kernel_data.integrator.adaptive_min_samples) {
+    int x = kernel_split_params.tile.x + pixel_index % kernel_split_params.tile.w;
+    int y = kernel_split_params.tile.y + pixel_index / kernel_split_params.tile.w;
+    int buffer_offset = (kernel_split_params.tile.offset + x +
+                         y * kernel_split_params.tile.stride) *
+                        kernel_data.film.pass_stride;
+    ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
+    kernel_do_adaptive_stopping(kg,
+                                buffer,
+                                kernel_split_params.tile.start_sample +
+                                    kernel_split_params.tile.num_samples - 1);
+  }
+}
+CCL_NAMESPACE_END
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp
index 41e1b73fdac..19d3f00bf82 100644
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -260,6 +260,22 @@ bool RenderBuffers::get_pass_rect(
     return false;
   }
 
+  float *sample_count = NULL;
+  if (name == "Combined") {
+    int sample_offset = 0;
+    for (size_t j = 0; j < params.passes.size(); j++) {
+      Pass &pass = params.passes[j];
+      if (pass.type != PASS_SAMPLE_COUNT) {
+        sample_offset += pass.components;
+        continue;
+      }
+      else {
+        sample_count = buffer.data() + sample_offset;
+        break;
+      }
+    }
+  }
+
   int pass_offset = 0;
 
   for (size_t j = 0; j < params.passes.size(); j++) {
@@ -420,6 +436,11 @@ bool RenderBuffers::get_pass_rect(
       }
       else {
        for (int i = 0; i < size; i++, in += pass_stride, pixels += 4) {
+          if (sample_count && sample_count[i * pass_stride] < 0.0f) {
+            scale = (pass.filter) ? -1.0f / (sample_count[i * pass_stride]) : 1.0f;
+            scale_exposure = (pass.exposure) ? scale * exposure : scale;
+          }
+
           float4 f = make_float4(in[0], in[1], in[2], in[3]);
 
           pixels[0] = f.x * scale_exposure;
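The buffers.cpp change handles readback while a tile's pixels still carry unadjusted values: a pixel whose sample-count pass is negative stopped early and has not been rescaled yet, so the Combined pass must be averaged by that pixel's own count rather than the tile's sample count. Because the stored value is negative, -1.0f / sample_count yields the positive per-pixel scale. A compact restatement (display_scale is a name invented for this sketch):

    // Per-pixel display scale for the Combined pass, following get_pass_rect.
    float display_scale(float sample_count_value, float tile_scale, bool filter_pass)
    {
      if (sample_count_value < 0.0f)  // e.g. -8.0f after stopping at 8 samples
        return filter_pass ? -1.0f / sample_count_value : 1.0f;  // 1.0f / 8.0f
      return tile_scale;  // usual 1 / num_samples path
    }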
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index 172ea3dd31b..48d9c97e0fb 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -183,6 +183,13 @@ void Pass::add(PassType type, vector<Pass> &passes, const char *name)
     case PASS_CRYPTOMATTE:
       pass.components = 4;
       break;
+    case PASS_ADAPTIVE_AUX_BUFFER:
+      pass.components = 4;
+      break;
+    case PASS_SAMPLE_COUNT:
+      pass.components = 1;
+      pass.exposure = false;
+      break;
     case PASS_AOV_COLOR:
       pass.components = 4;
       break;
@@ -311,6 +318,7 @@ NODE_DEFINE(Film)
   SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false);
   SOCKET_BOOLEAN(denoising_prefiltered_pass, "Generate Denoising Prefiltered Pass", false);
   SOCKET_INT(denoising_flags, "Denoising Flags", 0);
+  SOCKET_BOOLEAN(use_adaptive_sampling, "Use Adaptive Sampling", false);
 
   return type;
 }
@@ -482,6 +490,12 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
             kfilm->pass_stride;
         have_cryptomatte = true;
         break;
+      case PASS_ADAPTIVE_AUX_BUFFER:
+        kfilm->pass_adaptive_aux_buffer = kfilm->pass_stride;
+        break;
+      case PASS_SAMPLE_COUNT:
+        kfilm->pass_sample_count = kfilm->pass_stride;
+        break;
       case PASS_AOV_COLOR:
         if (!have_aov_color) {
           kfilm->pass_aov_color = kfilm->pass_stride;
diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h
index 95e54cb54d8..aae8fb404b0 100644
--- a/intern/cycles/render/film.h
+++ b/intern/cycles/render/film.h
@@ -81,6 +81,8 @@ class Film : public Node {
   CryptomatteType cryptomatte_passes;
   int cryptomatte_depth;
 
+  bool use_adaptive_sampling;
+
   bool need_update;
 
   Film();
diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp
index f289e11fe14..ee1aa5988bf 100644
--- a/intern/cycles/render/integrator.cpp
+++ b/intern/cycles/render/integrator.cpp
@@ -27,6 +27,7 @@
 #include "kernel/kernel_types.h"
 
 #include "util/util_foreach.h"
+#include "util/util_logging.h"
 #include "util/util_hash.h"
 
 CCL_NAMESPACE_BEGIN
@@ -69,6 +70,9 @@ NODE_DEFINE(Integrator)
   SOCKET_INT(volume_samples, "Volume Samples", 1);
   SOCKET_INT(start_sample, "Start Sample", 0);
 
+  SOCKET_FLOAT(adaptive_threshold, "Adaptive Threshold", 0.0f);
+  SOCKET_INT(adaptive_min_samples, "Adaptive Min Samples", 0);
+
   SOCKET_BOOLEAN(sample_all_lights_direct, "Sample All Lights Direct", true);
   SOCKET_BOOLEAN(sample_all_lights_indirect, "Sample All Lights Indirect", true);
   SOCKET_FLOAT(light_sampling_threshold, "Light Sampling Threshold", 0.05f);
@@ -178,6 +182,22 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene)
   kintegrator->sampling_pattern = sampling_pattern;
   kintegrator->aa_samples = aa_samples;
+  if (aa_samples > 0 && adaptive_min_samples == 0) {
+    kintegrator->adaptive_min_samples = max(4, (int)sqrtf(aa_samples));
+    VLOG(1) << "Cycles adaptive sampling: automatic min samples = "
+            << kintegrator->adaptive_min_samples;
+  }
+  else {
+    kintegrator->adaptive_min_samples = max(4, adaptive_min_samples);
+  }
+  if (aa_samples > 0 && adaptive_threshold == 0.0f) {
+    kintegrator->adaptive_threshold = max(0.001f, 1.0f / (float)aa_samples);
+    VLOG(1) << "Cycles adaptive sampling: automatic threshold = "
+            << kintegrator->adaptive_threshold;
+  }
+  else {
+    kintegrator->adaptive_threshold = adaptive_threshold;
+  }
 
   if (light_sampling_threshold > 0.0f) {
     kintegrator->light_inv_rr_threshold = 1.0f / light_sampling_threshold;
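The two integrator sockets double as an automatic mode: leaving them at zero derives both values from the AA sample count. For example, with 256 AA samples the kernel receives adaptive_min_samples = max(4, sqrt(256)) = 16 and adaptive_threshold = max(0.001, 1/256), roughly 0.0039; lower sample counts produce a looser threshold. The derivation extracted into a standalone snippet:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    // Reproduces the automatic defaults from Integrator::device_update.
    int main()
    {
      int aa_samples = 256;  // example value
      int adaptive_min_samples = std::max(4, (int)std::sqrt((float)aa_samples));
      float adaptive_threshold = std::max(0.001f, 1.0f / (float)aa_samples);
      std::printf("min samples = %d, threshold = %f\n",
                  adaptive_min_samples, adaptive_threshold);
      // Prints: min samples = 16, threshold = 0.003906
    }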
diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h
index 32d84c27072..9930e907aea 100644
--- a/intern/cycles/render/integrator.h
+++ b/intern/cycles/render/integrator.h
@@ -75,6 +75,9 @@ class Integrator : public Node {
   bool sample_all_lights_indirect;
   float light_sampling_threshold;
 
+  int adaptive_min_samples;
+  float adaptive_threshold;
+
   enum Method {
     BRANCHED_PATH = 0,
     PATH = 1,
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index 4231403e39a..6bf2160f9fa 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -1103,6 +1103,10 @@ void Session::render(bool with_denoising)
   task.need_finish_queue = params.progressive_refine;
   task.integrator_branched = scene->integrator->method == Integrator::BRANCHED_PATH;
 
+  task.adaptive_sampling.use = (scene->integrator->sampling_pattern == SAMPLING_PATTERN_PMJ) &&
+                               scene->dscene.data.film.pass_adaptive_aux_buffer;
+  task.adaptive_sampling.min_samples = scene->dscene.data.integrator.adaptive_min_samples;
+
   /* Acquire render tiles by default. */
   task.tile_types = RenderTile::PATH_TRACE;
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index 7f3614ccb19..8053e46b12e 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -56,6 +56,7 @@ class SessionParams {
   int denoising_start_sample;
   int pixel_size;
   int threads;
+  bool adaptive_sampling;
 
   bool use_profiling;
 
@@ -89,6 +90,7 @@ class SessionParams {
     denoising_start_sample = 0;
     pixel_size = 1;
     threads = 0;
+    adaptive_sampling = false;
 
     use_profiling = false;
 
@@ -117,6 +119,7 @@ class SessionParams {
            progressive == params.progressive && experimental == params.experimental &&
            tile_size == params.tile_size && start_resolution == params.start_resolution &&
            pixel_size == params.pixel_size && threads == params.threads &&
+           adaptive_sampling == params.adaptive_sampling &&
           use_profiling == params.use_profiling &&
           display_buffer_linear == params.display_buffer_linear &&
           cancel_timeout == params.cancel_timeout && reset_timeout == params.reset_timeout &&
diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h
index a8ea1dc925e..13d177d2b25 100644
--- a/intern/cycles/util/util_atomic.h
+++ b/intern/cycles/util/util_atomic.h
@@ -77,6 +77,7 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float
 #  define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x))
 #  define atomic_fetch_and_inc_uint32(p) atomic_inc((p))
 #  define atomic_fetch_and_dec_uint32(p) atomic_dec((p))
+#  define atomic_fetch_and_or_uint32(p, x) atomic_or((p), (x))
 
 #  define CCL_LOCAL_MEM_FENCE CLK_LOCAL_MEM_FENCE
 #  define ccl_barrier(flags) barrier(flags)
@@ -91,6 +92,7 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float
 #  define atomic_fetch_and_sub_uint32(p, x) atomicSub((unsigned int *)(p), (unsigned int)(x))
 #  define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
 #  define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_sub_uint32((p), 1)
+#  define atomic_fetch_and_or_uint32(p, x) atomicOr((unsigned int *)(p), (unsigned int)(x))
 
 ccl_device_inline float atomic_compare_and_swap_float(volatile float *dest,
                                                       const float old_val,
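atomic_fetch_and_or_uint32 is the one new primitive, mapped to atomic_or on OpenCL and atomicOr on CUDA. An atomic OR lets many threads set bits in a shared word without losing updates, a natural way for the adaptive kernels to publish per-pixel flags (an assumption here, since the call sites are in kernel_adaptive_sampling.h, outside this diff). The semantics in portable C++:

    #include <atomic>
    #include <cstdio>
    #include <thread>

    int main()
    {
      // Two threads OR their bit into one word; the result is the union of
      // the bits no matter how the operations interleave.
      std::atomic<unsigned> flags{0};
      std::thread a([&] { flags.fetch_or(1u << 0); });
      std::thread b([&] { flags.fetch_or(1u << 3); });
      a.join();
      b.join();
      std::printf("flags = 0x%x\n", flags.load());  // always 0x9
    }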
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index 48e9983ac8f..8a1890ad319 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -101,6 +101,11 @@ ccl_device_inline size_t round_down(size_t x, size_t multiple)
   return (x / multiple) * multiple;
 }
 
+ccl_device_inline bool is_power_of_two(size_t x)
+{
+  return (x & (x - 1)) == 0;
+}
+
 CCL_NAMESPACE_END
 
 /* Vectorized types declaration. */
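is_power_of_two relies on the standard bit trick: a power of two has a single set bit, and x & (x - 1) clears the lowest set bit, so the result is zero exactly for powers of two. Note the expression also accepts x == 0, which callers presumably never pass. A quick check:

    #include <cassert>
    #include <cstddef>

    static bool is_power_of_two(std::size_t x)  // same expression as the helper
    {
      return (x & (x - 1)) == 0;
    }

    int main()
    {
      assert(is_power_of_two(64));   // 0b1000000
      assert(!is_power_of_two(96));  // 0b1100000 & 0b1011111 == 0b1000000
      assert(is_power_of_two(0));    // edge case: zero also passes the test
    }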