git.blender.org/blender.git
-rw-r--r--  intern/cycles/blender/addon/engine.py | 1
-rw-r--r--  intern/cycles/blender/addon/properties.py | 25
-rw-r--r--  intern/cycles/blender/addon/ui.py | 11
-rw-r--r--  intern/cycles/blender/blender_session.cpp | 3
-rw-r--r--  intern/cycles/blender/blender_sync.cpp | 31
-rw-r--r--  intern/cycles/blender/blender_sync.h | 4
-rw-r--r--  intern/cycles/device/cuda/device_cuda.h | 22
-rw-r--r--  intern/cycles/device/cuda/device_cuda_impl.cpp | 126
-rw-r--r--  intern/cycles/device/device_cpu.cpp | 62
-rw-r--r--  intern/cycles/device/device_optix.cpp | 27
-rw-r--r--  intern/cycles/device/device_split_kernel.cpp | 67
-rw-r--r--  intern/cycles/device/device_split_kernel.h | 4
-rw-r--r--  intern/cycles/device/device_task.cpp | 55
-rw-r--r--  intern/cycles/device/device_task.h | 14
-rw-r--r--  intern/cycles/device/opencl/device_opencl.h | 1
-rw-r--r--  intern/cycles/device/opencl/device_opencl_impl.cpp | 10
-rw-r--r--  intern/cycles/kernel/CMakeLists.txt | 9
-rw-r--r--  intern/cycles/kernel/kernel_adaptive_sampling.h | 231
-rw-r--r--  intern/cycles/kernel/kernel_passes.h | 39
-rw-r--r--  intern/cycles/kernel/kernel_path.h | 9
-rw-r--r--  intern/cycles/kernel/kernel_path_branched.h | 8
-rw-r--r--  intern/cycles/kernel/kernel_types.h | 18
-rw-r--r--  intern/cycles/kernel/kernel_work_stealing.h | 84
-rw-r--r--  intern/cycles/kernel/kernels/cpu/kernel_cpu.h | 4
-rw-r--r--  intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h | 8
-rw-r--r--  intern/cycles/kernel/kernels/cuda/kernel.cu | 70
-rw-r--r--  intern/cycles/kernel/kernels/cuda/kernel_split.cu | 8
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl | 23
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl | 23
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl | 23
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl | 23
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl | 4
-rw-r--r--  intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h | 44
-rw-r--r--  intern/cycles/kernel/split/kernel_adaptive_filter_x.h | 30
-rw-r--r--  intern/cycles/kernel/split/kernel_adaptive_filter_y.h | 29
-rw-r--r--  intern/cycles/kernel/split/kernel_adaptive_stopping.h | 37
-rw-r--r--  intern/cycles/render/buffers.cpp | 21
-rw-r--r--  intern/cycles/render/film.cpp | 14
-rw-r--r--  intern/cycles/render/film.h | 2
-rw-r--r--  intern/cycles/render/integrator.cpp | 20
-rw-r--r--  intern/cycles/render/integrator.h | 3
-rw-r--r--  intern/cycles/render/session.cpp | 4
-rw-r--r--  intern/cycles/render/session.h | 3
-rw-r--r--  intern/cycles/util/util_atomic.h | 2
-rw-r--r--  intern/cycles/util/util_types.h | 5
45 files changed, 1215 insertions(+), 46 deletions(-)
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index 7917edf8c88..afa573c8dc5 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -255,6 +255,7 @@ def list_render_passes(srl):
if crl.pass_debug_bvh_traversed_instances: yield ("Debug BVH Traversed Instances", "X", 'VALUE')
if crl.pass_debug_bvh_intersections: yield ("Debug BVH Intersections", "X", 'VALUE')
if crl.pass_debug_ray_bounces: yield ("Debug Ray Bounces", "X", 'VALUE')
+ if crl.pass_debug_sample_count: yield ("Debug Sample Count", "X", 'VALUE')
if crl.use_pass_volume_direct: yield ("VolumeDir", "RGB", 'COLOR')
if crl.use_pass_volume_indirect: yield ("VolumeInd", "RGB", 'COLOR')
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 77dc29e11e8..f2c40f509e8 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -350,6 +350,24 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
default=0.01,
)
+ use_adaptive_sampling: BoolProperty(
+ name="Use adaptive sampling",
+ description="Automatically determine the number of samples per pixel based on a variance estimation",
+ default=False,
+ )
+ adaptive_threshold: FloatProperty(
+ name="Adaptive Sampling Threshold",
+ description="Zero for automatic setting based on AA samples",
+ min=0.0, max=1.0,
+ default=0.0,
+ )
+ adaptive_min_samples: IntProperty(
+ name="Adaptive Min Samples",
+ description="Minimum AA samples for adaptive sampling. Zero for automatic setting based on AA samples",
+ min=0, max=4096,
+ default=0,
+ )
+
min_light_bounces: IntProperty(
name="Min Light Bounces",
description="Minimum number of light bounces. Setting this higher reduces noise in the first bounces, "
@@ -1298,7 +1316,12 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
default=False,
update=update_render_passes,
)
-
+ pass_debug_sample_count: BoolProperty(
+ name="Debug Sample Count",
+ description="Number of samples/camera rays per pixel",
+ default=False,
+ update=update_render_passes,
+ )
use_pass_volume_direct: BoolProperty(
name="Volume Direct",
description="Deliver direct volumetric scattering pass",
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index ed9e3a4c9cf..d04418fc957 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -190,6 +190,7 @@ class CYCLES_RENDER_PT_sampling(CyclesButtonsPanel, Panel):
col.prop(cscene, "aa_samples", text="Render")
col.prop(cscene, "preview_aa_samples", text="Viewport")
+ col.prop(cscene, "use_adaptive_sampling", text="Adaptive Sampling")
# Viewport denoising is currently only supported with OptiX
if show_optix_denoising(context):
col = layout.column()
@@ -247,7 +248,13 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel):
row.prop(cscene, "seed")
row.prop(cscene, "use_animated_seed", text="", icon='TIME')
- layout.prop(cscene, "sampling_pattern", text="Pattern")
+ col = layout.column(align=True)
+ col.active = not cscene.use_adaptive_sampling
+ col.prop(cscene, "sampling_pattern", text="Pattern")
+ col = layout.column(align=True)
+ col.active = cscene.use_adaptive_sampling
+ col.prop(cscene, "adaptive_min_samples", text="Adaptive Min Samples")
+ col.prop(cscene, "adaptive_threshold", text="Adaptive Threshold")
layout.prop(cscene, "use_square_samples")
@@ -813,6 +820,8 @@ class CYCLES_RENDER_PT_passes_data(CyclesButtonsPanel, Panel):
col.prop(cycles_view_layer, "denoising_store_passes", text="Denoising Data")
col = flow.column()
col.prop(cycles_view_layer, "pass_debug_render_time", text="Render Time")
+ col = flow.column()
+ col.prop(cycles_view_layer, "pass_debug_sample_count", text="Sample Count")
layout.separator()
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index 5cfb1200c7c..ac307743e48 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -470,7 +470,8 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
b_rlay_name = b_view_layer.name();
/* add passes */
- vector<Pass> passes = sync->sync_render_passes(b_rlay, b_view_layer);
+ vector<Pass> passes = sync->sync_render_passes(
+ b_rlay, b_view_layer, session_params.adaptive_sampling);
buffer_params.passes = passes;
PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles");
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 50442c6ebdc..8f00f9ccda0 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -296,6 +296,16 @@ void BlenderSync::sync_integrator()
integrator->sample_all_lights_indirect = get_boolean(cscene, "sample_all_lights_indirect");
integrator->light_sampling_threshold = get_float(cscene, "light_sampling_threshold");
+ if (RNA_boolean_get(&cscene, "use_adaptive_sampling")) {
+ integrator->sampling_pattern = SAMPLING_PATTERN_PMJ;
+ integrator->adaptive_min_samples = get_int(cscene, "adaptive_min_samples");
+ integrator->adaptive_threshold = get_float(cscene, "adaptive_threshold");
+ }
+ else {
+ integrator->adaptive_min_samples = INT_MAX;
+ integrator->adaptive_threshold = 0.0f;
+ }
+
int diffuse_samples = get_int(cscene, "diffuse_samples");
int glossy_samples = get_int(cscene, "glossy_samples");
int transmission_samples = get_int(cscene, "transmission_samples");
@@ -312,6 +322,8 @@ void BlenderSync::sync_integrator()
integrator->mesh_light_samples = mesh_light_samples * mesh_light_samples;
integrator->subsurface_samples = subsurface_samples * subsurface_samples;
integrator->volume_samples = volume_samples * volume_samples;
+ integrator->adaptive_min_samples = min(
+ integrator->adaptive_min_samples * integrator->adaptive_min_samples, INT_MAX);
}
else {
integrator->diffuse_samples = diffuse_samples;
@@ -484,6 +496,8 @@ PassType BlenderSync::get_pass_type(BL::RenderPass &b_pass)
MAP_PASS("Debug Ray Bounces", PASS_RAY_BOUNCES);
#endif
MAP_PASS("Debug Render Time", PASS_RENDER_TIME);
+ MAP_PASS("AdaptiveAuxBuffer", PASS_ADAPTIVE_AUX_BUFFER);
+ MAP_PASS("Debug Sample Count", PASS_SAMPLE_COUNT);
if (string_startswith(name, cryptomatte_prefix)) {
return PASS_CRYPTOMATTE;
}
@@ -519,7 +533,9 @@ int BlenderSync::get_denoising_pass(BL::RenderPass &b_pass)
return -1;
}
-vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_view_layer)
+vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay,
+ BL::ViewLayer &b_view_layer,
+ bool adaptive_sampling)
{
vector<Pass> passes;
@@ -595,6 +611,10 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLa
b_engine.add_pass("Debug Render Time", 1, "X", b_view_layer.name().c_str());
Pass::add(PASS_RENDER_TIME, passes, "Debug Render Time");
}
+ if (get_boolean(crp, "pass_debug_sample_count")) {
+ b_engine.add_pass("Debug Sample Count", 1, "X", b_view_layer.name().c_str());
+ Pass::add(PASS_SAMPLE_COUNT, passes, "Debug Sample Count");
+ }
if (get_boolean(crp, "use_pass_volume_direct")) {
b_engine.add_pass("VolumeDir", 3, "RGB", b_view_layer.name().c_str());
Pass::add(PASS_VOLUME_DIRECT, passes, "VolumeDir");
@@ -641,6 +661,13 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLa
CRYPT_ACCURATE);
}
+ if (adaptive_sampling) {
+ Pass::add(PASS_ADAPTIVE_AUX_BUFFER, passes);
+ if (!get_boolean(crp, "pass_debug_sample_count")) {
+ Pass::add(PASS_SAMPLE_COUNT, passes);
+ }
+ }
+
RNA_BEGIN (&crp, b_aov, "aovs") {
bool is_color = (get_enum(b_aov, "type") == 1);
string name = get_string(b_aov, "name");
@@ -880,6 +907,8 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
params.use_profiling = params.device.has_profiling && !b_engine.is_preview() && background &&
BlenderSession::print_render_stats;
+ params.adaptive_sampling = RNA_boolean_get(&cscene, "use_adaptive_sampling");
+
return params;
}
diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h
index 219a3615835..0a7174e407b 100644
--- a/intern/cycles/blender/blender_sync.h
+++ b/intern/cycles/blender/blender_sync.h
@@ -71,7 +71,9 @@ class BlenderSync {
int height,
void **python_thread_state);
void sync_view_layer(BL::SpaceView3D &b_v3d, BL::ViewLayer &b_view_layer);
- vector<Pass> sync_render_passes(BL::RenderLayer &b_render_layer, BL::ViewLayer &b_view_layer);
+ vector<Pass> sync_render_passes(BL::RenderLayer &b_render_layer,
+ BL::ViewLayer &b_view_layer,
+ bool adaptive_sampling);
void sync_integrator();
void sync_camera(BL::RenderSettings &b_render,
BL::Object &b_override,
diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h
index a825bd6b128..6a0b39434aa 100644
--- a/intern/cycles/device/cuda/device_cuda.h
+++ b/intern/cycles/device/cuda/device_cuda.h
@@ -82,6 +82,17 @@ class CUDADevice : public Device {
device_vector<TextureInfo> texture_info;
bool need_texture_info;
+ /* Kernels */
+ struct {
+ bool loaded;
+
+ CUfunction adaptive_stopping;
+ CUfunction adaptive_filter_x;
+ CUfunction adaptive_filter_y;
+ CUfunction adaptive_scale_samples;
+ int adaptive_num_threads_per_block;
+ } functions;
+
static bool have_precompiled_kernels();
virtual bool show_samples() const;
@@ -114,6 +125,8 @@ class CUDADevice : public Device {
virtual bool load_kernels(const DeviceRequestedFeatures &requested_features);
+ void load_functions();
+
void reserve_local_memory(const DeviceRequestedFeatures &requested_features);
void init_host_memory();
@@ -197,6 +210,15 @@ class CUDADevice : public Device {
void denoise(RenderTile &rtile, DenoisingTask &denoising);
+ void adaptive_sampling_filter(uint filter_sample,
+ WorkTile *wtile,
+ CUdeviceptr d_wtile,
+ CUstream stream = 0);
+ void adaptive_sampling_post(RenderTile &rtile,
+ WorkTile *wtile,
+ CUdeviceptr d_wtile,
+ CUstream stream = 0);
+
void path_trace(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles);
void film_convert(DeviceTask &task,
diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp
index 4a7c45d8b93..11dd9b69f10 100644
--- a/intern/cycles/device/cuda/device_cuda_impl.cpp
+++ b/intern/cycles/device/cuda/device_cuda_impl.cpp
@@ -208,6 +208,8 @@ CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool
map_host_used = 0;
can_map_host = 0;
+ functions.loaded = false;
+
/* Initialize CUDA. */
if (cuda_error(cuInit(0)))
return;
@@ -531,9 +533,42 @@ bool CUDADevice::load_kernels(const DeviceRequestedFeatures &requested_features)
reserve_local_memory(requested_features);
}
+ load_functions();
+
return (result == CUDA_SUCCESS);
}
+void CUDADevice::load_functions()
+{
+ /* TODO: load all functions here. */
+ if (functions.loaded) {
+ return;
+ }
+ functions.loaded = true;
+
+ cuda_assert(cuModuleGetFunction(
+ &functions.adaptive_stopping, cuModule, "kernel_cuda_adaptive_stopping"));
+ cuda_assert(cuModuleGetFunction(
+ &functions.adaptive_filter_x, cuModule, "kernel_cuda_adaptive_filter_x"));
+ cuda_assert(cuModuleGetFunction(
+ &functions.adaptive_filter_y, cuModule, "kernel_cuda_adaptive_filter_y"));
+ cuda_assert(cuModuleGetFunction(
+ &functions.adaptive_scale_samples, cuModule, "kernel_cuda_adaptive_scale_samples"));
+
+ cuda_assert(cuFuncSetCacheConfig(functions.adaptive_stopping, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_x, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_y, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(functions.adaptive_scale_samples, CU_FUNC_CACHE_PREFER_L1));
+
+ int unused_min_blocks;
+ cuda_assert(cuOccupancyMaxPotentialBlockSize(&unused_min_blocks,
+ &functions.adaptive_num_threads_per_block,
+ functions.adaptive_scale_samples,
+ NULL,
+ 0,
+ 0));
+}
+
void CUDADevice::reserve_local_memory(const DeviceRequestedFeatures &requested_features)
{
if (use_split_kernel()) {
@@ -1666,6 +1701,80 @@ void CUDADevice::denoise(RenderTile &rtile, DenoisingTask &denoising)
denoising.run_denoising(&rtile);
}
+void CUDADevice::adaptive_sampling_filter(uint filter_sample,
+ WorkTile *wtile,
+ CUdeviceptr d_wtile,
+ CUstream stream)
+{
+ const int num_threads_per_block = functions.adaptive_num_threads_per_block;
+
+  /* These are a series of tiny kernels because there is no grid synchronisation
+   * from within a kernel, so the passes are issued as separate kernel launches. */
+ uint total_work_size = wtile->h * wtile->w;
+ void *args2[] = {&d_wtile, &filter_sample, &total_work_size};
+ uint num_blocks = divide_up(total_work_size, num_threads_per_block);
+ cuda_assert(cuLaunchKernel(functions.adaptive_stopping,
+ num_blocks,
+ 1,
+ 1,
+ num_threads_per_block,
+ 1,
+ 1,
+ 0,
+ stream,
+ args2,
+ 0));
+ total_work_size = wtile->h;
+ num_blocks = divide_up(total_work_size, num_threads_per_block);
+ cuda_assert(cuLaunchKernel(functions.adaptive_filter_x,
+ num_blocks,
+ 1,
+ 1,
+ num_threads_per_block,
+ 1,
+ 1,
+ 0,
+ stream,
+ args2,
+ 0));
+ total_work_size = wtile->w;
+ num_blocks = divide_up(total_work_size, num_threads_per_block);
+ cuda_assert(cuLaunchKernel(functions.adaptive_filter_y,
+ num_blocks,
+ 1,
+ 1,
+ num_threads_per_block,
+ 1,
+ 1,
+ 0,
+ stream,
+ args2,
+ 0));
+}
+
+void CUDADevice::adaptive_sampling_post(RenderTile &rtile,
+ WorkTile *wtile,
+ CUdeviceptr d_wtile,
+ CUstream stream)
+{
+ const int num_threads_per_block = functions.adaptive_num_threads_per_block;
+ uint total_work_size = wtile->h * wtile->w;
+
+ void *args[] = {&d_wtile, &rtile.start_sample, &rtile.sample, &total_work_size};
+ uint num_blocks = divide_up(total_work_size, num_threads_per_block);
+ cuda_assert(cuLaunchKernel(functions.adaptive_scale_samples,
+ num_blocks,
+ 1,
+ 1,
+ num_threads_per_block,
+ 1,
+ 1,
+ 0,
+ stream,
+ args,
+ 0));
+}
+
void CUDADevice::path_trace(DeviceTask &task,
RenderTile &rtile,
device_vector<WorkTile> &work_tiles)
@@ -1715,6 +1824,9 @@ void CUDADevice::path_trace(DeviceTask &task,
}
uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
+ if (task.adaptive_sampling.use) {
+ step_samples = task.adaptive_sampling.align_static_samples(step_samples);
+ }
/* Render all samples. */
int start_sample = rtile.start_sample;
@@ -1736,6 +1848,12 @@ void CUDADevice::path_trace(DeviceTask &task,
cuda_assert(
cuLaunchKernel(cuPathTrace, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
+ /* Run the adaptive sampling kernels at selected samples aligned to step samples. */
+ uint filter_sample = sample + wtile->num_samples - 1;
+ if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
+ adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);
+ }
+
cuda_assert(cuCtxSynchronize());
/* Update progress. */
@@ -1747,6 +1865,14 @@ void CUDADevice::path_trace(DeviceTask &task,
break;
}
}
+
+ /* Finalize adaptive sampling. */
+ if (task.adaptive_sampling.use) {
+ CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
+ adaptive_sampling_post(rtile, wtile, d_work_tiles);
+ cuda_assert(cuCtxSynchronize());
+ task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
+ }
}
void CUDADevice::film_convert(DeviceTask &task,
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 1c9d2227ac3..cf239ec39e2 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -34,6 +34,7 @@
#include "kernel/kernel_types.h"
#include "kernel/split/kernel_split_data.h"
#include "kernel/kernel_globals.h"
+#include "kernel/kernel_adaptive_sampling.h"
#include "kernel/filter/filter.h"
@@ -317,6 +318,10 @@ class CPUDevice : public Device {
REGISTER_SPLIT_KERNEL(next_iteration_setup);
REGISTER_SPLIT_KERNEL(indirect_subsurface);
REGISTER_SPLIT_KERNEL(buffer_update);
+ REGISTER_SPLIT_KERNEL(adaptive_stopping);
+ REGISTER_SPLIT_KERNEL(adaptive_filter_x);
+ REGISTER_SPLIT_KERNEL(adaptive_filter_y);
+ REGISTER_SPLIT_KERNEL(adaptive_adjust_samples);
#undef REGISTER_SPLIT_KERNEL
#undef KERNEL_FUNCTIONS
}
@@ -823,6 +828,50 @@ class CPUDevice : public Device {
return true;
}
+ bool adaptive_sampling_filter(KernelGlobals *kg, RenderTile &tile, int sample)
+ {
+ WorkTile wtile;
+ wtile.x = tile.x;
+ wtile.y = tile.y;
+ wtile.w = tile.w;
+ wtile.h = tile.h;
+ wtile.offset = tile.offset;
+ wtile.stride = tile.stride;
+ wtile.buffer = (float *)tile.buffer;
+
+ bool any = false;
+ for (int y = tile.y; y < tile.y + tile.h; ++y) {
+ any |= kernel_do_adaptive_filter_x(kg, y, &wtile);
+ }
+ for (int x = tile.x; x < tile.x + tile.w; ++x) {
+ any |= kernel_do_adaptive_filter_y(kg, x, &wtile);
+ }
+
+ return (!any);
+ }
+
+ void adaptive_sampling_post(const DeviceTask &task, const RenderTile &tile, KernelGlobals *kg)
+ {
+ float *render_buffer = (float *)tile.buffer;
+ for (int y = tile.y; y < tile.y + tile.h; y++) {
+ for (int x = tile.x; x < tile.x + tile.w; x++) {
+ int index = tile.offset + x + y * tile.stride;
+ ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
+ if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
+ buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
+ float sample_multiplier = tile.sample / max((float)tile.start_sample + 1.0f,
+ buffer[kernel_data.film.pass_sample_count]);
+ if (sample_multiplier != 1.0f) {
+ kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
+ }
+ }
+ else {
+ kernel_adaptive_post_adjust(kg, buffer, tile.sample / (tile.sample - 1.0f));
+ }
+ }
+ }
+ }
+
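Aside, not part of the patch: the rescaling above makes a tile with varying per-pixel sample counts look uniformly sampled. With S = tile.sample at the end of the tile, s_0 = tile.start_sample, and n_p the per-pixel count recovered by flipping the sign, each pass value of a converged pixel is multiplied by

\[ m_p = \frac{S}{\max(s_0 + 1,\; n_p)} \]

so, with worked numbers for illustration only, a pixel that stopped after n_p = 24 of S = 64 samples is scaled by 64/24 ≈ 2.67.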
void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
{
const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE;
@@ -855,14 +904,25 @@ class CPUDevice : public Device {
path_trace_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
}
}
-
tile.sample = sample + 1;
task.update_progress(&tile, tile.w * tile.h);
+
+ if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) {
+ const bool stop = adaptive_sampling_filter(kg, tile, sample);
+ if (stop) {
+ tile.sample = end_sample;
+ break;
+ }
+ }
}
if (use_coverage) {
coverage.finalize();
}
+
+ if (task.adaptive_sampling.use) {
+ adaptive_sampling_post(task, tile, kg);
+ }
}
void denoise(DenoisingTask &denoising, RenderTile &tile)
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
index 61a5c74f69e..30d624fdf7c 100644
--- a/intern/cycles/device/device_optix.cpp
+++ b/intern/cycles/device/device_optix.cpp
@@ -627,7 +627,11 @@ class OptiXDevice : public CUDADevice {
const int end_sample = rtile.start_sample + rtile.num_samples;
// Keep this number reasonable to avoid running into TDRs
- const int step_samples = (info.display_device ? 8 : 32);
+ int step_samples = (info.display_device ? 8 : 32);
+ if (task.adaptive_sampling.use) {
+ step_samples = task.adaptive_sampling.align_static_samples(step_samples);
+ }
+
// Offset into launch params buffer so that streams use separate data
device_ptr launch_params_ptr = launch_params.device_pointer +
thread_index * launch_params.data_elements;
@@ -638,10 +642,9 @@ class OptiXDevice : public CUDADevice {
// Copy work tile information to device
wtile.num_samples = min(step_samples, end_sample - sample);
wtile.start_sample = sample;
- check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, tile),
- &wtile,
- sizeof(wtile),
- cuda_stream[thread_index]));
+ device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
+ check_result_cuda(
+ cuMemcpyHtoDAsync(d_wtile_ptr, &wtile, sizeof(wtile), cuda_stream[thread_index]));
OptixShaderBindingTable sbt_params = {};
sbt_params.raygenRecord = sbt_data.device_pointer + PG_RGEN * sizeof(SbtRecord);
@@ -666,6 +669,12 @@ class OptiXDevice : public CUDADevice {
wtile.h,
1));
+ // Run the adaptive sampling kernels at selected samples aligned to step samples.
+ uint filter_sample = wtile.start_sample + wtile.num_samples - 1;
+ if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
+ adaptive_sampling_filter(filter_sample, &wtile, d_wtile_ptr, cuda_stream[thread_index]);
+ }
+
// Wait for launch to finish
check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
@@ -677,6 +686,14 @@ class OptiXDevice : public CUDADevice {
if (task.get_cancel() && !task.need_finish_queue)
return; // Cancel rendering
}
+
+ // Finalize adaptive sampling
+ if (task.adaptive_sampling.use) {
+ device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
+ adaptive_sampling_post(rtile, &wtile, d_wtile_ptr, cuda_stream[thread_index]);
+ check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
+ task.update_progress(&rtile, rtile.w * rtile.h * wtile.num_samples);
+ }
}
bool launch_denoise(DeviceTask &task, RenderTile &rtile)
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
index 42e597a34d7..f22d8761058 100644
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -55,6 +55,10 @@ DeviceSplitKernel::DeviceSplitKernel(Device *device)
kernel_next_iteration_setup = NULL;
kernel_indirect_subsurface = NULL;
kernel_buffer_update = NULL;
+ kernel_adaptive_stopping = NULL;
+ kernel_adaptive_filter_x = NULL;
+ kernel_adaptive_filter_y = NULL;
+ kernel_adaptive_adjust_samples = NULL;
}
DeviceSplitKernel::~DeviceSplitKernel()
@@ -83,6 +87,10 @@ DeviceSplitKernel::~DeviceSplitKernel()
delete kernel_next_iteration_setup;
delete kernel_indirect_subsurface;
delete kernel_buffer_update;
+ delete kernel_adaptive_stopping;
+ delete kernel_adaptive_filter_x;
+ delete kernel_adaptive_filter_y;
+ delete kernel_adaptive_adjust_samples;
}
bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures &requested_features)
@@ -114,6 +122,10 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures &requested_fe
LOAD_KERNEL(next_iteration_setup);
LOAD_KERNEL(indirect_subsurface);
LOAD_KERNEL(buffer_update);
+ LOAD_KERNEL(adaptive_stopping);
+ LOAD_KERNEL(adaptive_filter_x);
+ LOAD_KERNEL(adaptive_filter_y);
+ LOAD_KERNEL(adaptive_adjust_samples);
#undef LOAD_KERNEL
@@ -202,13 +214,21 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
/* initial guess to start rolling average */
const int initial_num_samples = 1;
/* approx number of samples per second */
- int samples_per_second = (avg_time_per_sample > 0.0) ?
- int(double(time_multiplier) / avg_time_per_sample) + 1 :
- initial_num_samples;
+ const int samples_per_second = (avg_time_per_sample > 0.0) ?
+ int(double(time_multiplier) / avg_time_per_sample) + 1 :
+ initial_num_samples;
RenderTile subtile = tile;
subtile.start_sample = tile.sample;
- subtile.num_samples = min(samples_per_second,
+ subtile.num_samples = samples_per_second;
+
+ if (task->adaptive_sampling.use) {
+ subtile.num_samples = task->adaptive_sampling.align_dynamic_samples(subtile.start_sample,
+ subtile.num_samples);
+ }
+
+ /* Don't go beyond requested number of samples. */
+ subtile.num_samples = min(subtile.num_samples,
tile.start_sample + tile.num_samples - tile.sample);
if (device->have_error()) {
@@ -302,6 +322,23 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
}
}
+ int filter_sample = tile.sample + subtile.num_samples - 1;
+ if (task->adaptive_sampling.use && task->adaptive_sampling.need_filter(filter_sample)) {
+ size_t buffer_size[2];
+ buffer_size[0] = round_up(tile.w, local_size[0]);
+ buffer_size[1] = round_up(tile.h, local_size[1]);
+ kernel_adaptive_stopping->enqueue(
+ KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
+ buffer_size[0] = round_up(tile.h, local_size[0]);
+ buffer_size[1] = round_up(1, local_size[1]);
+ kernel_adaptive_filter_x->enqueue(
+ KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
+ buffer_size[0] = round_up(tile.w, local_size[0]);
+ buffer_size[1] = round_up(1, local_size[1]);
+ kernel_adaptive_filter_y->enqueue(
+ KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
+ }
+
double time_per_sample = ((time_dt() - start_time) / subtile.num_samples);
if (avg_time_per_sample == 0.0) {
@@ -324,6 +361,28 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
}
}
+ if (task->adaptive_sampling.use) {
+ /* Reset the start samples. */
+ RenderTile subtile = tile;
+ subtile.start_sample = tile.start_sample;
+ subtile.num_samples = tile.sample - tile.start_sample;
+ enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
+ subtile,
+ num_global_elements,
+ kgbuffer,
+ kernel_data,
+ split_data,
+ ray_state,
+ queue_index,
+ use_queues_flag,
+ work_pool_wgs);
+ size_t buffer_size[2];
+ buffer_size[0] = round_up(tile.w, local_size[0]);
+ buffer_size[1] = round_up(tile.h, local_size[1]);
+ kernel_adaptive_adjust_samples->enqueue(
+ KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
+ }
+
return true;
}
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
index 6ff326bf214..9d6b9efdd62 100644
--- a/intern/cycles/device/device_split_kernel.h
+++ b/intern/cycles/device/device_split_kernel.h
@@ -75,6 +75,10 @@ class DeviceSplitKernel {
SplitKernelFunction *kernel_next_iteration_setup;
SplitKernelFunction *kernel_indirect_subsurface;
SplitKernelFunction *kernel_buffer_update;
+ SplitKernelFunction *kernel_adaptive_stopping;
+ SplitKernelFunction *kernel_adaptive_filter_x;
+ SplitKernelFunction *kernel_adaptive_filter_y;
+ SplitKernelFunction *kernel_adaptive_adjust_samples;
/* Global memory variables [porting]; These memory is used for
* co-operation between different kernels; Data written by one
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
index 36522b874ab..c36b1344c3b 100644
--- a/intern/cycles/device/device_task.cpp
+++ b/intern/cycles/device/device_task.cpp
@@ -136,4 +136,59 @@ void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples)
}
}
+/* Adaptive Sampling */
+
+AdaptiveSampling::AdaptiveSampling()
+ : use(true), adaptive_step(ADAPTIVE_SAMPLE_STEP), min_samples(0)
+{
+}
+
+/* Render samples in steps that align with the adaptive filtering. */
+int AdaptiveSampling::align_static_samples(int samples) const
+{
+ if (samples > adaptive_step) {
+ /* Make multiple of adaptive_step. */
+ while (samples % adaptive_step != 0) {
+ samples--;
+ }
+ }
+ else if (samples < adaptive_step) {
+ /* Make divisor of adaptive_step. */
+ while (adaptive_step % samples != 0) {
+ samples--;
+ }
+ }
+
+ return max(samples, 1);
+}
+
+/* Render samples in steps that align with the adaptive filtering, with the
+ * suggested number of samples dynamically changing. */
+int AdaptiveSampling::align_dynamic_samples(int offset, int samples) const
+{
+  /* Round so that we end up on multiples of adaptive_step. */
+ samples += offset;
+
+ if (samples > adaptive_step) {
+ /* Make multiple of adaptive_step. */
+ while (samples % adaptive_step != 0) {
+ samples--;
+ }
+ }
+
+ samples -= offset;
+
+ return max(samples, 1);
+}
+
+bool AdaptiveSampling::need_filter(int sample) const
+{
+ if (sample > min_samples) {
+ return (sample & (adaptive_step - 1)) == (adaptive_step - 1);
+ }
+ else {
+ return false;
+ }
+}
+
CCL_NAMESPACE_END
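A minimal host-side sketch (illustration only, not part of the patch) of how these helpers interact, assuming adaptive_step == ADAPTIVE_SAMPLE_STEP == 4, min_samples == 0, and samples >= 1. Batch sizes are rounded down to a multiple (or a divisor) of the step so that the last sample of some batch always lands on a step boundary, which is exactly when need_filter() fires:

#include <algorithm>
#include <cassert>

static const int adaptive_step = 4; /* assumed value of ADAPTIVE_SAMPLE_STEP */

static int align_static_samples(int samples)
{
  if (samples > adaptive_step) {
    while (samples % adaptive_step != 0)
      samples--; /* round down to a multiple of the step */
  }
  else if (samples < adaptive_step) {
    while (adaptive_step % samples != 0)
      samples--; /* round down to a divisor of the step */
  }
  return std::max(samples, 1);
}

static bool need_filter(int sample)
{
  /* min_samples check omitted (assumed 0). The mask works because
   * adaptive_step is a power of two, see the static_assert added to
   * kernel_types.h further below. */
  return (sample & (adaptive_step - 1)) == (adaptive_step - 1);
}

int main()
{
  assert(align_static_samples(10) == 8); /* nearest lower multiple of 4 */
  assert(align_static_samples(3) == 2);  /* nearest lower divisor of 4 */
  /* Samples 3, 7, 11, ... end a step, so filtering runs after them. */
  assert(need_filter(7) && !need_filter(8));
  return 0;
}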
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index 972f6131092..8c4e682adb1 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -62,6 +62,19 @@ class DenoiseParams {
}
};
+class AdaptiveSampling {
+ public:
+ AdaptiveSampling();
+
+ int align_static_samples(int samples) const;
+ int align_dynamic_samples(int offset, int samples) const;
+ bool need_filter(int sample) const;
+
+ bool use;
+ int adaptive_step;
+ int min_samples;
+};
+
class DeviceTask : public Task {
public:
typedef enum { RENDER, FILM_CONVERT, SHADER, DENOISE_BUFFER } Type;
@@ -115,6 +128,7 @@ class DeviceTask : public Task {
bool need_finish_queue;
bool integrator_branched;
+ AdaptiveSampling adaptive_sampling;
protected:
double last_update_time;
diff --git a/intern/cycles/device/opencl/device_opencl.h b/intern/cycles/device/opencl/device_opencl.h
index 61b1e3e3b6b..b761726b1ad 100644
--- a/intern/cycles/device/opencl/device_opencl.h
+++ b/intern/cycles/device/opencl/device_opencl.h
@@ -445,6 +445,7 @@ class OpenCLDevice : public Device {
device_ptr rgba_byte,
device_ptr rgba_half);
void shader(DeviceTask &task);
+ void update_adaptive(DeviceTask &task, RenderTile &tile, int sample);
void denoise(RenderTile &tile, DenoisingTask &denoising);
diff --git a/intern/cycles/device/opencl/device_opencl_impl.cpp b/intern/cycles/device/opencl/device_opencl_impl.cpp
index 68cdfd5238c..3dbe54b38aa 100644
--- a/intern/cycles/device/opencl/device_opencl_impl.cpp
+++ b/intern/cycles/device/opencl/device_opencl_impl.cpp
@@ -56,7 +56,11 @@ static const string SPLIT_BUNDLE_KERNELS =
"enqueue_inactive "
"next_iteration_setup "
"indirect_subsurface "
- "buffer_update";
+ "buffer_update "
+ "adaptive_stopping "
+ "adaptive_filter_x "
+ "adaptive_filter_y "
+ "adaptive_adjust_samples";
const string OpenCLDevice::get_opencl_program_name(const string &kernel_name)
{
@@ -283,6 +287,10 @@ void OpenCLDevice::OpenCLSplitPrograms::load_kernels(
ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(next_iteration_setup);
ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(indirect_subsurface);
ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(buffer_update);
+ ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_stopping);
+ ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_x);
+ ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_y);
+ ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_adjust_samples);
programs.push_back(&program_split);
# undef ADD_SPLIT_KERNEL_PROGRAM
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 566b6e3d191..0dd0da65f82 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -36,6 +36,10 @@ set(SRC_CUDA_KERNELS
)
set(SRC_OPENCL_KERNELS
+ kernels/opencl/kernel_adaptive_stopping.cl
+ kernels/opencl/kernel_adaptive_filter_x.cl
+ kernels/opencl/kernel_adaptive_filter_y.cl
+ kernels/opencl/kernel_adaptive_adjust_samples.cl
kernels/opencl/kernel_bake.cl
kernels/opencl/kernel_base.cl
kernels/opencl/kernel_displace.cl
@@ -94,6 +98,7 @@ set(SRC_BVH_HEADERS
set(SRC_HEADERS
kernel_accumulate.h
+ kernel_adaptive_sampling.h
kernel_bake.h
kernel_camera.h
kernel_color.h
@@ -324,6 +329,10 @@ set(SRC_UTIL_HEADERS
)
set(SRC_SPLIT_HEADERS
+ split/kernel_adaptive_adjust_samples.h
+ split/kernel_adaptive_filter_x.h
+ split/kernel_adaptive_filter_y.h
+ split/kernel_adaptive_stopping.h
split/kernel_branched.h
split/kernel_buffer_update.h
split/kernel_data_init.h
diff --git a/intern/cycles/kernel/kernel_adaptive_sampling.h b/intern/cycles/kernel/kernel_adaptive_sampling.h
new file mode 100644
index 00000000000..502b69e4f7f
--- /dev/null
+++ b/intern/cycles/kernel/kernel_adaptive_sampling.h
@@ -0,0 +1,231 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_ADAPTIVE_SAMPLING_H__
+#define __KERNEL_ADAPTIVE_SAMPLING_H__
+
+CCL_NAMESPACE_BEGIN
+
+/* Determines whether to continue sampling a given pixel or if it has sufficiently converged. */
+
+ccl_device void kernel_do_adaptive_stopping(KernelGlobals *kg,
+ ccl_global float *buffer,
+ int sample)
+{
+ /* TODO Stefan: Is this better in linear, sRGB or something else? */
+ float4 I = *((ccl_global float4 *)buffer);
+ float4 A = *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer);
+ /* The per pixel error as seen in section 2.1 of
+ * "A hierarchical automatic stopping condition for Monte Carlo global illumination"
+ * A small epsilon is added to the divisor to prevent division by zero. */
+ float error = (fabsf(I.x - A.x) + fabsf(I.y - A.y) + fabsf(I.z - A.z)) /
+ (sample * 0.0001f + sqrtf(I.x + I.y + I.z));
+ if (error < kernel_data.integrator.adaptive_threshold * (float)sample) {
+    /* Set the fourth component to a non-zero value to indicate that this pixel has converged. */
+ buffer[kernel_data.film.pass_adaptive_aux_buffer + 3] += 1.0f;
+ }
+}
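In symbols (annotation, not part of the patch): let I be the accumulated, unnormalized combined pass after s samples, and A the auxiliary pass holding twice the accumulated contribution of one half of the samples (written in kernel_passes.h further below), so that A ≈ I once both halves agree. The test above is

\[ \mathrm{error} \;=\; \frac{\lvert I_x - A_x\rvert + \lvert I_y - A_y\rvert + \lvert I_z - A_z\rvert}{10^{-4}\, s + \sqrt{I_x + I_y + I_z}} \;<\; \tau \, s \]

with \tau = kernel_data.integrator.adaptive_threshold. Both I and A are sums over samples rather than averages, hence the scaling of the threshold by s.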
+
+/* Adjust the values of an adaptively sampled pixel. */
+
+ccl_device void kernel_adaptive_post_adjust(KernelGlobals *kg,
+ ccl_global float *buffer,
+ float sample_multiplier)
+{
+ *(ccl_global float4 *)(buffer) *= sample_multiplier;
+
+  /* Scale the aux pass too; this is necessary for progressive rendering to work properly. */
+ kernel_assert(kernel_data.film.pass_adaptive_aux_buffer);
+ *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer) *= sample_multiplier;
+
+#ifdef __PASSES__
+ int flag = kernel_data.film.pass_flag;
+
+ if (flag & PASSMASK(SHADOW))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_shadow) *= sample_multiplier;
+
+ if (flag & PASSMASK(MIST))
+ *(ccl_global float *)(buffer + kernel_data.film.pass_mist) *= sample_multiplier;
+
+ if (flag & PASSMASK(NORMAL))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_normal) *= sample_multiplier;
+
+ if (flag & PASSMASK(UV))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_uv) *= sample_multiplier;
+
+ if (flag & PASSMASK(MOTION)) {
+ *(ccl_global float4 *)(buffer + kernel_data.film.pass_motion) *= sample_multiplier;
+ *(ccl_global float *)(buffer + kernel_data.film.pass_motion_weight) *= sample_multiplier;
+ }
+
+ if (kernel_data.film.use_light_pass) {
+ int light_flag = kernel_data.film.light_pass_flag;
+
+ if (light_flag & PASSMASK(DIFFUSE_INDIRECT))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_indirect) *= sample_multiplier;
+ if (light_flag & PASSMASK(GLOSSY_INDIRECT))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_indirect) *= sample_multiplier;
+ if (light_flag & PASSMASK(TRANSMISSION_INDIRECT))
+ *(ccl_global float3 *)(buffer +
+ kernel_data.film.pass_transmission_indirect) *= sample_multiplier;
+ if (light_flag & PASSMASK(VOLUME_INDIRECT))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_indirect) *= sample_multiplier;
+ if (light_flag & PASSMASK(DIFFUSE_DIRECT))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_direct) *= sample_multiplier;
+ if (light_flag & PASSMASK(GLOSSY_DIRECT))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_direct) *= sample_multiplier;
+ if (light_flag & PASSMASK(TRANSMISSION_DIRECT))
+ *(ccl_global float3 *)(buffer +
+ kernel_data.film.pass_transmission_direct) *= sample_multiplier;
+ if (light_flag & PASSMASK(VOLUME_DIRECT))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_direct) *= sample_multiplier;
+
+ if (light_flag & PASSMASK(EMISSION))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_emission) *= sample_multiplier;
+ if (light_flag & PASSMASK(BACKGROUND))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_background) *= sample_multiplier;
+ if (light_flag & PASSMASK(AO))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_ao) *= sample_multiplier;
+
+ if (light_flag & PASSMASK(DIFFUSE_COLOR))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_color) *= sample_multiplier;
+ if (light_flag & PASSMASK(GLOSSY_COLOR))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_color) *= sample_multiplier;
+ if (light_flag & PASSMASK(TRANSMISSION_COLOR))
+ *(ccl_global float3 *)(buffer +
+ kernel_data.film.pass_transmission_color) *= sample_multiplier;
+ }
+#endif
+
+#ifdef __DENOISING_FEATURES__
+
+# define scale_float3_variance(buffer, offset, scale) \
+ *(buffer + offset) *= scale; \
+ *(buffer + offset + 1) *= scale; \
+ *(buffer + offset + 2) *= scale; \
+ *(buffer + offset + 3) *= scale * scale; \
+ *(buffer + offset + 4) *= scale * scale; \
+ *(buffer + offset + 5) *= scale * scale;
+
+# define scale_shadow_variance(buffer, offset, scale) \
+ *(buffer + offset) *= scale; \
+ *(buffer + offset + 1) *= scale; \
+ *(buffer + offset + 2) *= scale * scale;
+
+ if (kernel_data.film.pass_denoising_data) {
+ scale_shadow_variance(
+ buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_A, sample_multiplier);
+ scale_shadow_variance(
+ buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_B, sample_multiplier);
+ if (kernel_data.film.pass_denoising_clean) {
+ scale_float3_variance(
+ buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier);
+ *(buffer + kernel_data.film.pass_denoising_clean) *= sample_multiplier;
+ *(buffer + kernel_data.film.pass_denoising_clean + 1) *= sample_multiplier;
+ *(buffer + kernel_data.film.pass_denoising_clean + 2) *= sample_multiplier;
+ }
+ else {
+ scale_float3_variance(
+ buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier);
+ }
+ scale_float3_variance(
+ buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL, sample_multiplier);
+ scale_float3_variance(
+ buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO, sample_multiplier);
+ *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH) *= sample_multiplier;
+ *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH +
+ 1) *= sample_multiplier * sample_multiplier;
+ }
+#endif /* __DENOISING_FEATURES__ */
+
+ if (kernel_data.film.cryptomatte_passes) {
+ int num_slots = 0;
+ num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) ? 1 : 0;
+ num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) ? 1 : 0;
+ num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) ? 1 : 0;
+ num_slots = num_slots * 2 * kernel_data.film.cryptomatte_depth;
+ ccl_global float2 *id_buffer = (ccl_global float2 *)(buffer +
+ kernel_data.film.pass_cryptomatte);
+ for (int slot = 0; slot < num_slots; slot++) {
+ id_buffer[slot].y *= sample_multiplier;
+ }
+ }
+}
+
+/* This is a simple box filter in two passes.
+ * When a pixel demands more adaptive samples, let its neighboring pixels draw more samples too. */
+
+ccl_device bool kernel_do_adaptive_filter_x(KernelGlobals *kg, int y, ccl_global WorkTile *tile)
+{
+ bool any = false;
+ bool prev = false;
+ for (int x = tile->x; x < tile->x + tile->w; ++x) {
+ int index = tile->offset + x + y * tile->stride;
+ ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
+ ccl_global float4 *aux = (ccl_global float4 *)(buffer +
+ kernel_data.film.pass_adaptive_aux_buffer);
+ if (aux->w == 0.0f) {
+ any = true;
+ if (x > tile->x && !prev) {
+ index = index - 1;
+ buffer = tile->buffer + index * kernel_data.film.pass_stride;
+ aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer);
+ aux->w = 0.0f;
+ }
+ prev = true;
+ }
+ else {
+ if (prev) {
+ aux->w = 0.0f;
+ }
+ prev = false;
+ }
+ }
+ return any;
+}
+
+ccl_device bool kernel_do_adaptive_filter_y(KernelGlobals *kg, int x, ccl_global WorkTile *tile)
+{
+ bool prev = false;
+ bool any = false;
+ for (int y = tile->y; y < tile->y + tile->h; ++y) {
+ int index = tile->offset + x + y * tile->stride;
+ ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
+ ccl_global float4 *aux = (ccl_global float4 *)(buffer +
+ kernel_data.film.pass_adaptive_aux_buffer);
+ if (aux->w == 0.0f) {
+ any = true;
+ if (y > tile->y && !prev) {
+ index = index - tile->stride;
+ buffer = tile->buffer + index * kernel_data.film.pass_stride;
+ aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer);
+ aux->w = 0.0f;
+ }
+ prev = true;
+ }
+ else {
+ if (prev) {
+ aux->w = 0.0f;
+ }
+ prev = false;
+ }
+ }
+ return any;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __KERNEL_ADAPTIVE_SAMPLING_H__ */
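A 1D walk-through of the x-pass above (host-side C++ with made-up flags, not part of the patch): aux->w == 0 marks a pixel that still needs samples, and a converged pixel adjacent to an unconverged run is re-opened, so the active region dilates by one pixel per filter pass and sampling does not stop abruptly along convergence edges. The real kernel additionally returns whether any pixel in the row is still active.

#include <cstdio>

int main()
{
  /* Convergence flags along one row: non-zero = converged (aux->w). */
  float w[8] = {1, 1, 0, 0, 1, 1, 1, 0};
  bool prev = false;
  for (int x = 0; x < 8; x++) {
    if (w[x] == 0.0f) {
      if (x > 0 && !prev)
        w[x - 1] = 0.0f; /* re-open the converged left neighbour */
      prev = true;
    }
    else {
      if (prev)
        w[x] = 0.0f; /* re-open the converged right neighbour */
      prev = false;
    }
  }
  for (int x = 0; x < 8; x++)
    std::printf("%g ", w[x]); /* prints: 1 0 0 0 0 1 0 0 */
  std::printf("\n");
  return 0;
}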
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 187e8340c82..33ec05c6048 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -29,7 +29,9 @@ ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg,
if (kernel_data.film.pass_denoising_data == 0)
return;
- buffer += (sample & 1) ? DENOISING_PASS_SHADOW_B : DENOISING_PASS_SHADOW_A;
+ buffer += sample_is_even(kernel_data.integrator.sampling_pattern, sample) ?
+ DENOISING_PASS_SHADOW_B :
+ DENOISING_PASS_SHADOW_A;
path_total = ensure_finite(path_total);
path_total_shaded = ensure_finite(path_total_shaded);
@@ -386,6 +388,41 @@ ccl_device_inline void kernel_write_result(KernelGlobals *kg,
#ifdef __KERNEL_DEBUG__
kernel_write_debug_passes(kg, buffer, L);
#endif
+
+  /* Adaptive Sampling. Fill the additional buffer with one half of the samples and calculate the
+     stopping criterion. This is the heuristic from "A hierarchical automatic stopping condition
+     for Monte Carlo global illumination", except that here it is applied per pixel and not in
+     hierarchical tiles. */
+ if (kernel_data.film.pass_adaptive_aux_buffer &&
+ kernel_data.integrator.adaptive_threshold > 0.0f) {
+ if (sample_is_even(kernel_data.integrator.sampling_pattern, sample)) {
+ kernel_write_pass_float4(buffer + kernel_data.film.pass_adaptive_aux_buffer,
+ make_float4(L_sum.x * 2.0f, L_sum.y * 2.0f, L_sum.z * 2.0f, 0.0f));
+ }
+#ifdef __KERNEL_CPU__
+ if (sample > kernel_data.integrator.adaptive_min_samples &&
+ (sample & (ADAPTIVE_SAMPLE_STEP - 1)) == (ADAPTIVE_SAMPLE_STEP - 1)) {
+ kernel_do_adaptive_stopping(kg, buffer, sample);
+ }
+#endif
+ }
+
+ /* Write the sample count as negative numbers initially to mark the samples as in progress.
+ * Once the tile has finished rendering, the sign gets flipped and all the pixel values
+ * are scaled as if they were taken at a uniform sample count. */
+ if (kernel_data.film.pass_sample_count) {
+ /* Make sure it's a negative number. In progressive refine mode, this bit gets flipped between
+ * passes. */
+#ifdef __ATOMIC_PASS_WRITE__
+ atomic_fetch_and_or_uint32((ccl_global uint *)(buffer + kernel_data.film.pass_sample_count),
+ 0x80000000);
+#else
+ if (buffer[kernel_data.film.pass_sample_count] > 0) {
+ buffer[kernel_data.film.pass_sample_count] *= -1.0f;
+ }
+#endif
+ kernel_write_pass_float(buffer + kernel_data.film.pass_sample_count, -1.0f);
+ }
}
CCL_NAMESPACE_END
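The in-progress mark above relies on the IEEE-754 layout: OR-ing 0x80000000 into the bits of a non-negative float flips its sign without changing the magnitude, which is why a single atomic OR suffices in the __ATOMIC_PASS_WRITE__ path. A small host-side C++ illustration (not part of the patch):

#include <cstdint>
#include <cstdio>
#include <cstring>

static float set_sign_bit(float f)
{
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  bits |= 0x80000000u; /* same effect as the atomic_fetch_and_or_uint32 above */
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

int main()
{
  float count = 24.0f;         /* 24 samples recorded so far */
  count = set_sign_bit(count); /* marked in progress: -24 */
  count += -1.0f;              /* kernel_write_pass_float adds -1 per sample */
  std::printf("%g\n", count);  /* -25; the post pass flips the sign back */
  return 0;
}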
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 1a0b67275a7..bdd2703a894 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -31,6 +31,7 @@
#include "kernel/kernel_accumulate.h"
#include "kernel/kernel_shader.h"
#include "kernel/kernel_light.h"
+#include "kernel/kernel_adaptive_sampling.h"
#include "kernel/kernel_passes.h"
#if defined(__VOLUME__) || defined(__SUBSURFACE__)
@@ -656,6 +657,14 @@ ccl_device void kernel_path_trace(
buffer += index * pass_stride;
+ if (kernel_data.film.pass_adaptive_aux_buffer) {
+ ccl_global float4 *aux = (ccl_global float4 *)(buffer +
+ kernel_data.film.pass_adaptive_aux_buffer);
+ if (aux->w > 0.0f) {
+ return;
+ }
+ }
+
/* Initialize random numbers and sample ray. */
uint rng_hash;
Ray ray;
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
index f75e4ab4c97..0d5781fe3d1 100644
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -523,6 +523,14 @@ ccl_device void kernel_branched_path_trace(
buffer += index * pass_stride;
+ if (kernel_data.film.pass_adaptive_aux_buffer) {
+ ccl_global float4 *aux = (ccl_global float4 *)(buffer +
+ kernel_data.film.pass_adaptive_aux_buffer);
+ if (aux->w > 0.0f) {
+ return;
+ }
+ }
+
/* initialize random numbers and ray */
uint rng_hash;
Ray ray;
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 88c2d0d3196..c5be93e2cda 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -63,6 +63,11 @@ CCL_NAMESPACE_BEGIN
#define VOLUME_STACK_SIZE 32
+/* Adaptive sampling constants */
+#define ADAPTIVE_SAMPLE_STEP 4
+static_assert((ADAPTIVE_SAMPLE_STEP & (ADAPTIVE_SAMPLE_STEP - 1)) == 0,
+ "ADAPTIVE_SAMPLE_STEP must be power of two for bitwise operations to work");
+
/* Split kernel constants */
#define WORK_POOL_SIZE_GPU 64
#define WORK_POOL_SIZE_CPU 1
@@ -374,6 +379,8 @@ typedef enum PassType {
PASS_CRYPTOMATTE,
PASS_AOV_COLOR,
PASS_AOV_VALUE,
+ PASS_ADAPTIVE_AUX_BUFFER,
+ PASS_SAMPLE_COUNT,
PASS_CATEGORY_MAIN_END = 31,
PASS_MIST = 32,
@@ -1223,6 +1230,9 @@ typedef struct KernelFilm {
int cryptomatte_depth;
int pass_cryptomatte;
+ int pass_adaptive_aux_buffer;
+ int pass_sample_count;
+
int pass_mist;
float mist_start;
float mist_inv_depth;
@@ -1256,6 +1266,8 @@ typedef struct KernelFilm {
int display_divide_pass_stride;
int use_display_exposure;
int use_display_pass_alpha;
+
+ int pad3, pad4, pad5;
} KernelFilm;
static_assert_align(KernelFilm, 16);
@@ -1337,6 +1349,8 @@ typedef struct KernelIntegrator {
/* sampler */
int sampling_pattern;
int aa_samples;
+ int adaptive_min_samples;
+ float adaptive_threshold;
/* volume render */
int use_volumes;
@@ -1348,7 +1362,7 @@ typedef struct KernelIntegrator {
int max_closures;
- int pad1;
+ int pad1, pad2, pad3;
} KernelIntegrator;
static_assert_align(KernelIntegrator, 16);
@@ -1662,7 +1676,7 @@ typedef struct WorkTile {
uint start_sample;
uint num_samples;
- uint offset;
+ int offset;
uint stride;
ccl_global float *buffer;
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
index 799561a7466..c642d227e4b 100644
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -23,17 +23,41 @@ CCL_NAMESPACE_BEGIN
* Utility functions for work stealing
*/
+/* Map global work index to tile, pixel X/Y and sample. */
+ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
+ uint global_work_index,
+ ccl_private uint *x,
+ ccl_private uint *y,
+ ccl_private uint *sample)
+{
+#ifdef __KERNEL_CUDA__
+ /* Keeping threads for the same pixel together improves performance on CUDA. */
+ uint sample_offset = global_work_index % tile->num_samples;
+ uint pixel_offset = global_work_index / tile->num_samples;
+#else /* __KERNEL_CUDA__ */
+ uint tile_pixels = tile->w * tile->h;
+ uint sample_offset = global_work_index / tile_pixels;
+ uint pixel_offset = global_work_index - sample_offset * tile_pixels;
+#endif /* __KERNEL_CUDA__ */
+ uint y_offset = pixel_offset / tile->w;
+ uint x_offset = pixel_offset - y_offset * tile->w;
+
+ *x = tile->x + x_offset;
+ *y = tile->y + y_offset;
+ *sample = tile->start_sample + sample_offset;
+}
+
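A worked example of the two index orderings in get_work_pixel(), using assumed tile numbers (host-side C++, illustration only): on CUDA all samples of one pixel occupy adjacent global indices (sample-major), which keeps threads for the same pixel together; on other devices the whole tile is traversed once per sample (pixel-major).

#include <cassert>

int main()
{
  const unsigned w = 4, h = 2, num_samples = 8; /* assumed 4x2 tile, 8 samples */
  const unsigned gwi = 13;                      /* some global work index */

  /* CUDA ordering: threads of one pixel are adjacent. */
  unsigned sample_cuda = gwi % num_samples; /* 5 */
  unsigned pixel_cuda = gwi / num_samples;  /* pixel 1 */

  /* Other devices: the tile is traversed once per sample. */
  unsigned tile_pixels = w * h;                             /* 8 */
  unsigned sample_other = gwi / tile_pixels;                /* 1 */
  unsigned pixel_other = gwi - sample_other * tile_pixels;  /* pixel 5 */

  assert(sample_cuda == 5 && pixel_cuda == 1);
  assert(sample_other == 1 && pixel_other == 5);
  return 0;
}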
#ifdef __KERNEL_OPENCL__
# pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#endif
#ifdef __SPLIT_KERNEL__
/* Returns true if there is work */
-ccl_device bool get_next_work(KernelGlobals *kg,
- ccl_global uint *work_pools,
- uint total_work_size,
- uint ray_index,
- ccl_private uint *global_work_index)
+ccl_device bool get_next_work_item(KernelGlobals *kg,
+ ccl_global uint *work_pools,
+ uint total_work_size,
+ uint ray_index,
+ ccl_private uint *global_work_index)
{
/* With a small amount of work there may be more threads than work due to
* rounding up of global size, stop such threads immediately. */
@@ -56,31 +80,37 @@ ccl_device bool get_next_work(KernelGlobals *kg,
/* Test if all work for this pool is done. */
return (*global_work_index < total_work_size);
}
-#endif
-/* Map global work index to tile, pixel X/Y and sample. */
-ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
- uint global_work_index,
- ccl_private uint *x,
- ccl_private uint *y,
- ccl_private uint *sample)
+ccl_device bool get_next_work(KernelGlobals *kg,
+ ccl_global uint *work_pools,
+ uint total_work_size,
+ uint ray_index,
+ ccl_private uint *global_work_index)
{
-#ifdef __KERNEL_CUDA__
- /* Keeping threads for the same pixel together improves performance on CUDA. */
- uint sample_offset = global_work_index % tile->num_samples;
- uint pixel_offset = global_work_index / tile->num_samples;
-#else /* __KERNEL_CUDA__ */
- uint tile_pixels = tile->w * tile->h;
- uint sample_offset = global_work_index / tile_pixels;
- uint pixel_offset = global_work_index - sample_offset * tile_pixels;
-#endif /* __KERNEL_CUDA__ */
- uint y_offset = pixel_offset / tile->w;
- uint x_offset = pixel_offset - y_offset * tile->w;
-
- *x = tile->x + x_offset;
- *y = tile->y + y_offset;
- *sample = tile->start_sample + sample_offset;
+ bool got_work = false;
+ if (kernel_data.film.pass_adaptive_aux_buffer) {
+ do {
+ got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index);
+ if (got_work) {
+ ccl_global WorkTile *tile = &kernel_split_params.tile;
+ uint x, y, sample;
+ get_work_pixel(tile, *global_work_index, &x, &y, &sample);
+ uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride;
+ ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
+ ccl_global float4 *aux = (ccl_global float4 *)(buffer +
+ kernel_data.film.pass_adaptive_aux_buffer);
+ if (aux->w == 0.0f) {
+ break;
+ }
+ }
+ } while (got_work);
+ }
+ else {
+ got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index);
+ }
+ return got_work;
}
+#endif
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
index f5d981fb71a..683f4b88d79 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
@@ -89,5 +89,9 @@ DECLARE_SPLIT_KERNEL_FUNCTION(enqueue_inactive)
DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update)
+DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_stopping)
+DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x)
+DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y)
+DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples)
#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index 9ca3f46b5b6..96b2bf11132 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -58,6 +58,10 @@
# include "kernel/split/kernel_next_iteration_setup.h"
# include "kernel/split/kernel_indirect_subsurface.h"
# include "kernel/split/kernel_buffer_update.h"
+# include "kernel/split/kernel_adaptive_stopping.h"
+# include "kernel/split/kernel_adaptive_filter_x.h"
+# include "kernel/split/kernel_adaptive_filter_y.h"
+# include "kernel/split/kernel_adaptive_adjust_samples.h"
# endif /* __SPLIT_KERNEL__ */
#else
# define STUB_ASSERT(arch, name) \
@@ -204,6 +208,10 @@ DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_stopping)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples)
#endif /* __SPLIT_KERNEL__ */
#undef KERNEL_STUB
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index af311027f78..c4c810c6a82 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -33,6 +33,7 @@
#include "kernel/kernel_path_branched.h"
#include "kernel/kernel_bake.h"
#include "kernel/kernel_work_stealing.h"
+#include "kernel/kernel_adaptive_sampling.h"
/* kernels */
extern "C" __global__ void
@@ -83,6 +84,75 @@ kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size)
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_adaptive_stopping(WorkTile *tile, int sample, uint total_work_size)
+{
+ int work_index = ccl_global_id(0);
+ bool thread_is_active = work_index < total_work_size;
+ KernelGlobals kg;
+ if(thread_is_active && kernel_data.film.pass_adaptive_aux_buffer) {
+ uint x = tile->x + work_index % tile->w;
+ uint y = tile->y + work_index / tile->w;
+ int index = tile->offset + x + y * tile->stride;
+ ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
+ kernel_do_adaptive_stopping(&kg, buffer, sample);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_adaptive_filter_x(WorkTile *tile, int sample, uint)
+{
+ KernelGlobals kg;
+ if(kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) {
+ if(ccl_global_id(0) < tile->h) {
+ int y = tile->y + ccl_global_id(0);
+ kernel_do_adaptive_filter_x(&kg, y, tile);
+ }
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_adaptive_filter_y(WorkTile *tile, int sample, uint)
+{
+ KernelGlobals kg;
+ if(kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) {
+ if(ccl_global_id(0) < tile->w) {
+ int x = tile->x + ccl_global_id(0);
+ kernel_do_adaptive_filter_y(&kg, x, tile);
+ }
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_adaptive_scale_samples(WorkTile *tile, int start_sample, int sample, uint total_work_size)
+{
+ if(kernel_data.film.pass_adaptive_aux_buffer) {
+ int work_index = ccl_global_id(0);
+ bool thread_is_active = work_index < total_work_size;
+ KernelGlobals kg;
+ if(thread_is_active) {
+ uint x = tile->x + work_index % tile->w;
+ uint y = tile->y + work_index / tile->w;
+ int index = tile->offset + x + y * tile->stride;
+ ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
+ if(buffer[kernel_data.film.pass_sample_count] < 0.0f) {
+ buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
+ float sample_multiplier = sample / max((float)start_sample + 1.0f, buffer[kernel_data.film.pass_sample_count]);
+ if(sample_multiplier != 1.0f) {
+ kernel_adaptive_post_adjust(&kg, buffer, sample_multiplier);
+ }
+ }
+ else {
+ kernel_adaptive_post_adjust(&kg, buffer, sample / (sample - 1.0f));
+ }
+ }
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
{
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
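kernel_cuda_adaptive_scale_samples exists because render passes store unweighted sums while display divides every pixel by the same global sample count; a pixel that stopped early holds a sum over fewer samples and has to be multiplied back up. A standalone restatement of the multiplier math (not Cycles API; the negative-count convention is the one used above, where the stopping pass stores -n for a pixel frozen after n samples):

    #include <math.h>

    static float adaptive_scale(float stored_count, int start_sample, int sample)
    {
      if (stored_count < 0.0f) {
        float actual = -stored_count;  /* recover the real per-pixel count */
        return sample / fmaxf((float)start_sample + 1.0f, actual);
      }
      /* Pixel stayed active until the end. */
      return sample / (sample - 1.0f);
    }

With sample = 128 and stored_count = -32 the multiplier is 4, so the uniform divide by 128 later on again yields a correct per-sample average for that pixel.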
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
index 43b3d0aa0e6..95ad7599cf1 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
@@ -43,6 +43,10 @@
#include "kernel/split/kernel_next_iteration_setup.h"
#include "kernel/split/kernel_indirect_subsurface.h"
#include "kernel/split/kernel_buffer_update.h"
+#include "kernel/split/kernel_adaptive_stopping.h"
+#include "kernel/split/kernel_adaptive_filter_x.h"
+#include "kernel/split/kernel_adaptive_filter_y.h"
+#include "kernel/split/kernel_adaptive_adjust_samples.h"
#include "kernel/kernel_film.h"
@@ -121,6 +125,10 @@ DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_stopping)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples)
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl
new file mode 100644
index 00000000000..ebdb99d4730
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_adaptive_adjust_samples.h"
+
+#define KERNEL_NAME adaptive_adjust_samples
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
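The next three .cl files are identical to this one apart from the license-header filename and the kernel name: each defines KERNEL_NAME and pulls in the shared kernel_split_function.h template, which stamps out the OpenCL entry point via token pasting. A simplified sketch of the idiom (the real template also unpacks the split-kernel argument bundle before dispatching):

    #define PASTE_(a, b) a##b
    #define PASTE(a, b) PASTE_(a, b)

    #define KERNEL_NAME adaptive_stopping
    /* conceptually, inside the shared template header: */
    __kernel void PASTE(kernel_ocl_path_trace_, KERNEL_NAME)(ccl_global char *kg)
    {
      PASTE(kernel_, KERNEL_NAME)((KernelGlobals *)kg);
    }
    #undef KERNEL_NAME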
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl
new file mode 100644
index 00000000000..76d82d4184e
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_adaptive_filter_x.h"
+
+#define KERNEL_NAME adaptive_filter_x
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl
new file mode 100644
index 00000000000..1e6d15ba0f2
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_adaptive_filter_y.h"
+
+#define KERNEL_NAME adaptive_filter_y
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl
new file mode 100644
index 00000000000..51de0059667
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_adaptive_stopping.h"
+
+#define KERNEL_NAME adaptive_stopping
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl b/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl
index 6041f13b52b..c3b7b09460a 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl
@@ -28,3 +28,7 @@
#include "kernel/kernels/opencl/kernel_next_iteration_setup.cl"
#include "kernel/kernels/opencl/kernel_indirect_subsurface.cl"
#include "kernel/kernels/opencl/kernel_buffer_update.cl"
+#include "kernel/kernels/opencl/kernel_adaptive_stopping.cl"
+#include "kernel/kernels/opencl/kernel_adaptive_filter_x.cl"
+#include "kernel/kernels/opencl/kernel_adaptive_filter_y.cl"
+#include "kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl"
diff --git a/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h b/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h
new file mode 100644
index 00000000000..60ebf415970
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_adaptive_adjust_samples(KernelGlobals *kg)
+{
+ int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ if (pixel_index < kernel_split_params.tile.w * kernel_split_params.tile.h) {
+ int x = kernel_split_params.tile.x + pixel_index % kernel_split_params.tile.w;
+ int y = kernel_split_params.tile.y + pixel_index / kernel_split_params.tile.w;
+ int buffer_offset = (kernel_split_params.tile.offset + x +
+ y * kernel_split_params.tile.stride) *
+ kernel_data.film.pass_stride;
+ ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
+ int sample = kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples;
+ if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
+ buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
+ float sample_multiplier = sample / max((float)kernel_split_params.tile.start_sample + 1.0f,
+ buffer[kernel_data.film.pass_sample_count]);
+ if (sample_multiplier != 1.0f) {
+ kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
+ }
+ }
+ else {
+ kernel_adaptive_post_adjust(kg, buffer, sample / (sample - 1.0f));
+ }
+ }
+}
+
+CCL_NAMESPACE_END
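This is the split-kernel twin of kernel_cuda_adaptive_scale_samples above; the only substantive difference is that the final sample number is derived from the tile (start_sample + num_samples) instead of being passed as a kernel argument. The marker convention both sides rely on, restated (illustrative; 'count' stands for buffer[kernel_data.film.pass_sample_count]):

    count = -n;   /* stopping side: pixel froze after n samples         */
    n = -count;   /* adjust side: recover n, then rescale the pass sums */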
diff --git a/intern/cycles/kernel/split/kernel_adaptive_filter_x.h b/intern/cycles/kernel/split/kernel_adaptive_filter_x.h
new file mode 100644
index 00000000000..93f41f7ced4
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_adaptive_filter_x.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_adaptive_filter_x(KernelGlobals *kg)
+{
+ int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ if (pixel_index < kernel_split_params.tile.h &&
+ kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
+ kernel_data.integrator.adaptive_min_samples) {
+ int y = kernel_split_params.tile.y + pixel_index;
+ kernel_do_adaptive_filter_x(kg, y, &kernel_split_params.tile);
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_adaptive_filter_y.h b/intern/cycles/kernel/split/kernel_adaptive_filter_y.h
new file mode 100644
index 00000000000..eca53d079ec
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_adaptive_filter_y.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_adaptive_filter_y(KernelGlobals *kg)
+{
+ int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ if (pixel_index < kernel_split_params.tile.w &&
+ kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
+ kernel_data.integrator.adaptive_min_samples) {
+ int x = kernel_split_params.tile.x + pixel_index;
+ kernel_do_adaptive_filter_y(kg, x, &kernel_split_params.tile);
+ }
+}
+CCL_NAMESPACE_END
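filter_x assigns one row per work item and filter_y one column, forming a separable pass over the convergence state. Judging from how the rest of the patch uses it, kernel_do_adaptive_filter_x/y appears to dilate the still-sampling region so that an unconverged pixel keeps its direct neighbors active, avoiding hard seams between converged and unconverged areas. A hypothetical 1D version of such a dilation, under that assumption (this is not the actual kernel body):

    /* converged[x] is nonzero once a pixel has been stopped; out-of-tile
     * neighbors are treated as converged. */
    static void dilate_row(int *converged, int w)
    {
      for (int x = 0, prev = 1; x < w; x++) {
        int cur = converged[x];
        int next = (x + 1 < w) ? converged[x + 1] : 1;
        if (!prev || !next)
          converged[x] = 0;  /* a neighbor is still sampling: stay active */
        prev = cur;          /* compare against the pre-pass value */
      }
    }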
diff --git a/intern/cycles/kernel/split/kernel_adaptive_stopping.h b/intern/cycles/kernel/split/kernel_adaptive_stopping.h
new file mode 100644
index 00000000000..c8eb1ebd705
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_adaptive_stopping.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_adaptive_stopping(KernelGlobals *kg)
+{
+ int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ if (pixel_index < kernel_split_params.tile.w * kernel_split_params.tile.h &&
+ kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
+ kernel_data.integrator.adaptive_min_samples) {
+ int x = kernel_split_params.tile.x + pixel_index % kernel_split_params.tile.w;
+ int y = kernel_split_params.tile.y + pixel_index / kernel_split_params.tile.w;
+ int buffer_offset = (kernel_split_params.tile.offset + x +
+ y * kernel_split_params.tile.stride) *
+ kernel_data.film.pass_stride;
+ ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
+ kernel_do_adaptive_stopping(kg,
+ buffer,
+ kernel_split_params.tile.start_sample +
+ kernel_split_params.tile.num_samples - 1);
+ }
+}
+CCL_NAMESPACE_END
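The kernel_do_adaptive_stopping called here is defined in kernel_adaptive_sampling.h (added earlier in this commit; see the file list). As used by this wrapper, its job is to compare a per-pixel error estimate kept in the adaptive aux pass against adaptive_threshold and flag the pixel as converged. A hedged sketch of what such a test can look like; the error metric and flag layout are assumptions here, and kernel_adaptive_sampling.h is authoritative:

    #include <math.h>

    /* I = accumulated color, A = auxiliary accumulator for the error term. */
    static int pixel_has_converged(const float I[3], const float A[3],
                                   int sample, float threshold)
    {
      float err = (fabsf(I[0] - A[0]) + fabsf(I[1] - A[1]) + fabsf(I[2] - A[2])) /
                  (sample * 1e-4f + sqrtf(fmaxf(I[0] + I[1] + I[2], 0.0f)));
      return err < threshold * (float)sample;
    }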
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp
index 41e1b73fdac..19d3f00bf82 100644
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -260,6 +260,22 @@ bool RenderBuffers::get_pass_rect(
return false;
}
+ float *sample_count = NULL;
+ if (name == "Combined") {
+ int sample_offset = 0;
+ for (size_t j = 0; j < params.passes.size(); j++) {
+ Pass &pass = params.passes[j];
+ if (pass.type != PASS_SAMPLE_COUNT) {
+ sample_offset += pass.components;
+ continue;
+ }
+ else {
+ sample_count = buffer.data() + sample_offset;
+ break;
+ }
+ }
+ }
+
int pass_offset = 0;
for (size_t j = 0; j < params.passes.size(); j++) {
@@ -420,6 +436,11 @@ bool RenderBuffers::get_pass_rect(
}
else {
for (int i = 0; i < size; i++, in += pass_stride, pixels += 4) {
+ if (sample_count && sample_count[i * pass_stride] < 0.0f) {
+ scale = (pass.filter) ? -1.0f / (sample_count[i * pass_stride]) : 1.0f;
+ scale_exposure = (pass.exposure) ? scale * exposure : scale;
+ }
+
float4 f = make_float4(in[0], in[1], in[2], in[3]);
pixels[0] = f.x * scale_exposure;
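This get_pass_rect change keeps Combined readback correct when a tile is copied out before kernel_adaptive_post_adjust has run: for a pixel whose sample count is still negative (converged but not yet rescaled), the precomputed uniform 1/sample scale is replaced by 1/|count| for that pixel; note the patch only applies the override when pass.filter is set. Restated as a small helper (illustrative only):

    static float combined_scale(float count, int sample)
    {
      /* count < 0 marks a converged pixel not yet rescaled. */
      return (count < 0.0f) ? -1.0f / count : 1.0f / (float)sample;
    }
    /* combined_scale(-32.0f, 128) == 1.0f/32, while untouched
     * neighbors keep 1.0f/128. */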
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index 172ea3dd31b..48d9c97e0fb 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -183,6 +183,13 @@ void Pass::add(PassType type, vector<Pass> &passes, const char *name)
case PASS_CRYPTOMATTE:
pass.components = 4;
break;
+ case PASS_ADAPTIVE_AUX_BUFFER:
+ pass.components = 4;
+ break;
+ case PASS_SAMPLE_COUNT:
+ pass.components = 1;
+ pass.exposure = false;
+ break;
case PASS_AOV_COLOR:
pass.components = 4;
break;
@@ -311,6 +318,7 @@ NODE_DEFINE(Film)
SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false);
SOCKET_BOOLEAN(denoising_prefiltered_pass, "Generate Denoising Prefiltered Pass", false);
SOCKET_INT(denoising_flags, "Denoising Flags", 0);
+ SOCKET_BOOLEAN(use_adaptive_sampling, "Use Adaptive Sampling", false);
return type;
}
@@ -482,6 +490,12 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
kfilm->pass_stride;
have_cryptomatte = true;
break;
+ case PASS_ADAPTIVE_AUX_BUFFER:
+ kfilm->pass_adaptive_aux_buffer = kfilm->pass_stride;
+ break;
+ case PASS_SAMPLE_COUNT:
+ kfilm->pass_sample_count = kfilm->pass_stride;
+ break;
case PASS_AOV_COLOR:
if (!have_aov_color) {
kfilm->pass_aov_color = kfilm->pass_stride;
diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h
index 95e54cb54d8..aae8fb404b0 100644
--- a/intern/cycles/render/film.h
+++ b/intern/cycles/render/film.h
@@ -81,6 +81,8 @@ class Film : public Node {
CryptomatteType cryptomatte_passes;
int cryptomatte_depth;
+ bool use_adaptive_sampling;
+
bool need_update;
Film();
diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp
index f289e11fe14..ee1aa5988bf 100644
--- a/intern/cycles/render/integrator.cpp
+++ b/intern/cycles/render/integrator.cpp
@@ -27,6 +27,7 @@
#include "kernel/kernel_types.h"
#include "util/util_foreach.h"
+#include "util/util_logging.h"
#include "util/util_hash.h"
CCL_NAMESPACE_BEGIN
@@ -69,6 +70,9 @@ NODE_DEFINE(Integrator)
SOCKET_INT(volume_samples, "Volume Samples", 1);
SOCKET_INT(start_sample, "Start Sample", 0);
+ SOCKET_FLOAT(adaptive_threshold, "Adaptive Threshold", 0.0f);
+ SOCKET_INT(adaptive_min_samples, "Adaptive Min Samples", 0);
+
SOCKET_BOOLEAN(sample_all_lights_direct, "Sample All Lights Direct", true);
SOCKET_BOOLEAN(sample_all_lights_indirect, "Sample All Lights Indirect", true);
SOCKET_FLOAT(light_sampling_threshold, "Light Sampling Threshold", 0.05f);
@@ -178,6 +182,22 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
kintegrator->sampling_pattern = sampling_pattern;
kintegrator->aa_samples = aa_samples;
+ if (aa_samples > 0 && adaptive_min_samples == 0) {
+ kintegrator->adaptive_min_samples = max(4, (int)sqrtf(aa_samples));
+ VLOG(1) << "Cycles adaptive sampling: automatic min samples = "
+ << kintegrator->adaptive_min_samples;
+ }
+ else {
+ kintegrator->adaptive_min_samples = max(4, adaptive_min_samples);
+ }
+ if (aa_samples > 0 && adaptive_threshold == 0.0f) {
+ kintegrator->adaptive_threshold = max(0.001f, 1.0f / (float)aa_samples);
+ VLOG(1) << "Cycles adaptive sampling: automatic threshold = "
+ << kintegrator->adaptive_threshold;
+ }
+ else {
+ kintegrator->adaptive_threshold = adaptive_threshold;
+ }
if (light_sampling_threshold > 0.0f) {
kintegrator->light_inv_rr_threshold = 1.0f / light_sampling_threshold;
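Both new sliders treat 0 as "automatic": min samples then defaults to max(4, sqrt(aa_samples)) and the threshold to max(0.001, 1/aa_samples). Restated compactly, with worked numbers:

    #include <math.h>

    static int imax(int a, int b) { return a > b ? a : b; }

    static void adaptive_defaults(int aa_samples, int user_min, float user_thresh,
                                  int *min_samples, float *threshold)
    {
      *min_samples = (aa_samples > 0 && user_min == 0) ?
                         imax(4, (int)sqrtf((float)aa_samples)) : imax(4, user_min);
      *threshold = (aa_samples > 0 && user_thresh == 0.0f) ?
                       fmaxf(0.001f, 1.0f / (float)aa_samples) : user_thresh;
    }
    /* aa_samples = 256 -> min_samples = 16, threshold ~= 0.0039;
     * aa_samples = 16  -> min_samples = 4,  threshold = 0.0625.  */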
diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h
index 32d84c27072..9930e907aea 100644
--- a/intern/cycles/render/integrator.h
+++ b/intern/cycles/render/integrator.h
@@ -75,6 +75,9 @@ class Integrator : public Node {
bool sample_all_lights_indirect;
float light_sampling_threshold;
+ int adaptive_min_samples;
+ float adaptive_threshold;
+
enum Method {
BRANCHED_PATH = 0,
PATH = 1,
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index 4231403e39a..6bf2160f9fa 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -1103,6 +1103,10 @@ void Session::render(bool with_denoising)
task.need_finish_queue = params.progressive_refine;
task.integrator_branched = scene->integrator->method == Integrator::BRANCHED_PATH;
+ task.adaptive_sampling.use = (scene->integrator->sampling_pattern == SAMPLING_PATTERN_PMJ) &&
+ scene->dscene.data.film.pass_adaptive_aux_buffer;
+ task.adaptive_sampling.min_samples = scene->dscene.data.integrator.adaptive_min_samples;
+
/* Acquire render tiles by default. */
task.tile_types = RenderTile::PATH_TRACE;
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index 7f3614ccb19..8053e46b12e 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -56,6 +56,7 @@ class SessionParams {
int denoising_start_sample;
int pixel_size;
int threads;
+ bool adaptive_sampling;
bool use_profiling;
@@ -89,6 +90,7 @@ class SessionParams {
denoising_start_sample = 0;
pixel_size = 1;
threads = 0;
+ adaptive_sampling = false;
use_profiling = false;
@@ -117,6 +119,7 @@ class SessionParams {
progressive == params.progressive && experimental == params.experimental &&
tile_size == params.tile_size && start_resolution == params.start_resolution &&
pixel_size == params.pixel_size && threads == params.threads &&
+ adaptive_sampling == params.adaptive_sampling &&
use_profiling == params.use_profiling &&
display_buffer_linear == params.display_buffer_linear &&
cancel_timeout == params.cancel_timeout && reset_timeout == params.reset_timeout &&
diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h
index a8ea1dc925e..13d177d2b25 100644
--- a/intern/cycles/util/util_atomic.h
+++ b/intern/cycles/util/util_atomic.h
@@ -77,6 +77,7 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float
# define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x))
# define atomic_fetch_and_inc_uint32(p) atomic_inc((p))
# define atomic_fetch_and_dec_uint32(p) atomic_dec((p))
+# define atomic_fetch_and_or_uint32(p, x) atomic_or((p), (x))
# define CCL_LOCAL_MEM_FENCE CLK_LOCAL_MEM_FENCE
# define ccl_barrier(flags) barrier(flags)
@@ -91,6 +92,7 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float
# define atomic_fetch_and_sub_uint32(p, x) atomicSub((unsigned int *)(p), (unsigned int)(x))
# define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
# define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_sub_uint32((p), 1)
+# define atomic_fetch_and_or_uint32(p, x) atomicOr((unsigned int *)(p), (unsigned int)(x))
ccl_device_inline float atomic_compare_and_swap_float(volatile float *dest,
const float old_val,
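The new atomic_fetch_and_or_uint32 wrapper joins the existing add/inc/dec aliases so device code can set shared bit flags portably (atomic_or on OpenCL, atomicOr on CUDA). A usage sketch, assuming a CUDA translation unit with util_atomic.h in scope:

    extern "C" __global__ void mark_converged(unsigned int *mask,
                                              const int *converged, int n)
    {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n && converged[i])  /* set bit i without losing racing updates */
        atomic_fetch_and_or_uint32(&mask[i / 32], 1u << (i % 32));
    }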
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index 48e9983ac8f..8a1890ad319 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -101,6 +101,11 @@ ccl_device_inline size_t round_down(size_t x, size_t multiple)
return (x / multiple) * multiple;
}
+ccl_device_inline bool is_power_of_two(size_t x)
+{
+ return (x & (x - 1)) == 0;
+}
+
CCL_NAMESPACE_END
/* Vectorized types declaration. */
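Worth noting: with this bit trick, is_power_of_two(0) also returns true (0 & (0 - 1) == 0), so callers must guarantee a non-zero argument or special-case zero. A standalone check:

    #include <assert.h>
    #include <stddef.h>

    static int is_pow2(size_t x) { return (x & (x - 1)) == 0; }

    int main(void)
    {
      assert(is_pow2(1) && is_pow2(2) && is_pow2(64));
      assert(!is_pow2(3) && !is_pow2(12));
      assert(is_pow2(0));  /* edge case: zero passes the test too */
      return 0;
    }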