author    Stefan Werner <stefan.werner@tangent-animation.com>  2020-03-05 14:05:42 +0300
committer Stefan Werner <stefan.werner@tangent-animation.com>  2020-03-05 14:21:38 +0300
commit    51e898324de30c0985a80e5bc067358b5ccedbfc (patch)
tree      5efddead1b7ca5655f1d6d2422b59e7da51fe271 /intern/cycles
parent    4ccbbd308060f0330472828b317c59e054c9ee7b (diff)
Adaptive Sampling for Cycles.
This feature takes some inspiration from "RenderMan: An Advanced Path Tracing Architecture for Movie Rendering" and "A Hierarchical Automatic Stopping Condition for Monte Carlo Global Illumination".

The basic principle is as follows: While samples are being added to a pixel, the adaptive sampler writes half of the samples to a separate buffer. This gives it two separate estimates of the same pixel, and by comparing their difference it estimates convergence. Once convergence drops below a given threshold, the pixel is considered done.

When a pixel has not converged yet and needs more samples than the minimum, its immediate neighbors are also set to take more samples. This is done in order to more reliably detect sharp features such as caustics. A 3x3 box filter that is run periodically over the tile buffer is used for that purpose.

After a tile has finished rendering, the values of all passes are scaled as if they were rendered with the full number of samples. This way, any code operating on these buffers, for example the denoiser, does not need to be changed for per-pixel sample counts.

Reviewed By: brecht, #cycles

Differential Revision: https://developer.blender.org/D4686
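For reference, the stopping criterion summarized above is implemented in kernel_do_adaptive_stopping() (intern/cycles/kernel/kernel_adaptive_sampling.h in this patch). A minimal standalone sketch of the same per-pixel test, with illustrative names and plain C++ in place of the kernel types:

#include <cmath>

/* I is the pixel value accumulated over all samples, A the accumulation of
 * the even half of the samples scaled by two, so both estimate the same
 * quantity. The sample * 0.0001f term guards against division by zero for
 * black pixels. */
static bool pixel_converged(const float I[3], const float A[3],
                            int sample, float threshold)
{
  const float error = (std::fabs(I[0] - A[0]) + std::fabs(I[1] - A[1]) +
                       std::fabs(I[2] - A[2])) /
                      (sample * 0.0001f + std::sqrt(I[0] + I[1] + I[2]));
  return error < threshold * (float)sample;
}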
Diffstat (limited to 'intern/cycles')
-rw-r--r--intern/cycles/blender/addon/engine.py1
-rw-r--r--intern/cycles/blender/addon/properties.py25
-rw-r--r--intern/cycles/blender/addon/ui.py11
-rw-r--r--intern/cycles/blender/blender_session.cpp3
-rw-r--r--intern/cycles/blender/blender_sync.cpp31
-rw-r--r--intern/cycles/blender/blender_sync.h4
-rw-r--r--intern/cycles/device/cuda/device_cuda.h22
-rw-r--r--intern/cycles/device/cuda/device_cuda_impl.cpp126
-rw-r--r--intern/cycles/device/device_cpu.cpp62
-rw-r--r--intern/cycles/device/device_optix.cpp27
-rw-r--r--intern/cycles/device/device_split_kernel.cpp67
-rw-r--r--intern/cycles/device/device_split_kernel.h4
-rw-r--r--intern/cycles/device/device_task.cpp55
-rw-r--r--intern/cycles/device/device_task.h14
-rw-r--r--intern/cycles/device/opencl/device_opencl.h1
-rw-r--r--intern/cycles/device/opencl/device_opencl_impl.cpp10
-rw-r--r--intern/cycles/kernel/CMakeLists.txt9
-rw-r--r--intern/cycles/kernel/kernel_adaptive_sampling.h231
-rw-r--r--intern/cycles/kernel/kernel_passes.h39
-rw-r--r--intern/cycles/kernel/kernel_path.h9
-rw-r--r--intern/cycles/kernel/kernel_path_branched.h8
-rw-r--r--intern/cycles/kernel/kernel_types.h18
-rw-r--r--intern/cycles/kernel/kernel_work_stealing.h84
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_cpu.h4
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h8
-rw-r--r--intern/cycles/kernel/kernels/cuda/kernel.cu70
-rw-r--r--intern/cycles/kernel/kernels/cuda/kernel_split.cu8
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl23
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl23
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl23
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl23
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl4
-rw-r--r--intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h44
-rw-r--r--intern/cycles/kernel/split/kernel_adaptive_filter_x.h30
-rw-r--r--intern/cycles/kernel/split/kernel_adaptive_filter_y.h29
-rw-r--r--intern/cycles/kernel/split/kernel_adaptive_stopping.h37
-rw-r--r--intern/cycles/render/buffers.cpp21
-rw-r--r--intern/cycles/render/film.cpp14
-rw-r--r--intern/cycles/render/film.h2
-rw-r--r--intern/cycles/render/integrator.cpp20
-rw-r--r--intern/cycles/render/integrator.h3
-rw-r--r--intern/cycles/render/session.cpp4
-rw-r--r--intern/cycles/render/session.h3
-rw-r--r--intern/cycles/util/util_atomic.h2
-rw-r--r--intern/cycles/util/util_types.h5
45 files changed, 1215 insertions, 46 deletions
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index 7917edf8c88..afa573c8dc5 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -255,6 +255,7 @@ def list_render_passes(srl):
if crl.pass_debug_bvh_traversed_instances: yield ("Debug BVH Traversed Instances", "X", 'VALUE')
if crl.pass_debug_bvh_intersections: yield ("Debug BVH Intersections", "X", 'VALUE')
if crl.pass_debug_ray_bounces: yield ("Debug Ray Bounces", "X", 'VALUE')
+ if crl.pass_debug_sample_count: yield ("Debug Sample Count", "X", 'VALUE')
if crl.use_pass_volume_direct: yield ("VolumeDir", "RGB", 'COLOR')
if crl.use_pass_volume_indirect: yield ("VolumeInd", "RGB", 'COLOR')
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 77dc29e11e8..f2c40f509e8 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -350,6 +350,24 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
default=0.01,
)
+ use_adaptive_sampling: BoolProperty(
+ name="Use adaptive sampling",
+ description="Automatically determine the number of samples per pixel based on a variance estimation",
+ default=False,
+ )
+ adaptive_threshold: FloatProperty(
+ name="Adaptive Sampling Threshold",
+ description="Zero for automatic setting based on AA samples",
+ min=0.0, max=1.0,
+ default=0.0,
+ )
+ adaptive_min_samples: IntProperty(
+ name="Adaptive Min Samples",
+ description="Minimum AA samples for adaptive sampling. Zero for automatic setting based on AA samples",
+ min=0, max=4096,
+ default=0,
+ )
+
min_light_bounces: IntProperty(
name="Min Light Bounces",
description="Minimum number of light bounces. Setting this higher reduces noise in the first bounces, "
@@ -1298,7 +1316,12 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
default=False,
update=update_render_passes,
)
-
+ pass_debug_sample_count: BoolProperty(
+ name="Debug Sample Count",
+ description="Number of samples/camera rays per pixel",
+ default=False,
+ update=update_render_passes,
+ )
use_pass_volume_direct: BoolProperty(
name="Volume Direct",
description="Deliver direct volumetric scattering pass",
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index ed9e3a4c9cf..d04418fc957 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -190,6 +190,7 @@ class CYCLES_RENDER_PT_sampling(CyclesButtonsPanel, Panel):
col.prop(cscene, "aa_samples", text="Render")
col.prop(cscene, "preview_aa_samples", text="Viewport")
+ col.prop(cscene, "use_adaptive_sampling", text="Adaptive Sampling")
# Viewport denoising is currently only supported with OptiX
if show_optix_denoising(context):
col = layout.column()
@@ -247,7 +248,13 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel):
row.prop(cscene, "seed")
row.prop(cscene, "use_animated_seed", text="", icon='TIME')
- layout.prop(cscene, "sampling_pattern", text="Pattern")
+ col = layout.column(align=True)
+ col.active = not(cscene.use_adaptive_sampling)
+ col.prop(cscene, "sampling_pattern", text="Pattern")
+ col = layout.column(align=True)
+ col.active = cscene.use_adaptive_sampling
+ col.prop(cscene, "adaptive_min_samples", text="Adaptive Min Samples")
+ col.prop(cscene, "adaptive_threshold", text="Adaptive Threshold")
layout.prop(cscene, "use_square_samples")
@@ -813,6 +820,8 @@ class CYCLES_RENDER_PT_passes_data(CyclesButtonsPanel, Panel):
col.prop(cycles_view_layer, "denoising_store_passes", text="Denoising Data")
col = flow.column()
col.prop(cycles_view_layer, "pass_debug_render_time", text="Render Time")
+ col = flow.column()
+ col.prop(cycles_view_layer, "pass_debug_sample_count", text="Sample Count")
layout.separator()
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index 5cfb1200c7c..ac307743e48 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -470,7 +470,8 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
b_rlay_name = b_view_layer.name();
/* add passes */
- vector<Pass> passes = sync->sync_render_passes(b_rlay, b_view_layer);
+ vector<Pass> passes = sync->sync_render_passes(
+ b_rlay, b_view_layer, session_params.adaptive_sampling);
buffer_params.passes = passes;
PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles");
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 50442c6ebdc..8f00f9ccda0 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -296,6 +296,16 @@ void BlenderSync::sync_integrator()
integrator->sample_all_lights_indirect = get_boolean(cscene, "sample_all_lights_indirect");
integrator->light_sampling_threshold = get_float(cscene, "light_sampling_threshold");
+ if (RNA_boolean_get(&cscene, "use_adaptive_sampling")) {
+ integrator->sampling_pattern = SAMPLING_PATTERN_PMJ;
+ integrator->adaptive_min_samples = get_int(cscene, "adaptive_min_samples");
+ integrator->adaptive_threshold = get_float(cscene, "adaptive_threshold");
+ }
+ else {
+ integrator->adaptive_min_samples = INT_MAX;
+ integrator->adaptive_threshold = 0.0f;
+ }
+
int diffuse_samples = get_int(cscene, "diffuse_samples");
int glossy_samples = get_int(cscene, "glossy_samples");
int transmission_samples = get_int(cscene, "transmission_samples");
@@ -312,6 +322,8 @@ void BlenderSync::sync_integrator()
integrator->mesh_light_samples = mesh_light_samples * mesh_light_samples;
integrator->subsurface_samples = subsurface_samples * subsurface_samples;
integrator->volume_samples = volume_samples * volume_samples;
+ integrator->adaptive_min_samples = min(
+ integrator->adaptive_min_samples * integrator->adaptive_min_samples, INT_MAX);
}
else {
integrator->diffuse_samples = diffuse_samples;
@@ -484,6 +496,8 @@ PassType BlenderSync::get_pass_type(BL::RenderPass &b_pass)
MAP_PASS("Debug Ray Bounces", PASS_RAY_BOUNCES);
#endif
MAP_PASS("Debug Render Time", PASS_RENDER_TIME);
+ MAP_PASS("AdaptiveAuxBuffer", PASS_ADAPTIVE_AUX_BUFFER);
+ MAP_PASS("Debug Sample Count", PASS_SAMPLE_COUNT);
if (string_startswith(name, cryptomatte_prefix)) {
return PASS_CRYPTOMATTE;
}
@@ -519,7 +533,9 @@ int BlenderSync::get_denoising_pass(BL::RenderPass &b_pass)
return -1;
}
-vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_view_layer)
+vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay,
+ BL::ViewLayer &b_view_layer,
+ bool adaptive_sampling)
{
vector<Pass> passes;
@@ -595,6 +611,10 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLa
b_engine.add_pass("Debug Render Time", 1, "X", b_view_layer.name().c_str());
Pass::add(PASS_RENDER_TIME, passes, "Debug Render Time");
}
+ if (get_boolean(crp, "pass_debug_sample_count")) {
+ b_engine.add_pass("Debug Sample Count", 1, "X", b_view_layer.name().c_str());
+ Pass::add(PASS_SAMPLE_COUNT, passes, "Debug Sample Count");
+ }
if (get_boolean(crp, "use_pass_volume_direct")) {
b_engine.add_pass("VolumeDir", 3, "RGB", b_view_layer.name().c_str());
Pass::add(PASS_VOLUME_DIRECT, passes, "VolumeDir");
@@ -641,6 +661,13 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLa
CRYPT_ACCURATE);
}
+ if (adaptive_sampling) {
+ Pass::add(PASS_ADAPTIVE_AUX_BUFFER, passes);
+ if (!get_boolean(crp, "pass_debug_sample_count")) {
+ Pass::add(PASS_SAMPLE_COUNT, passes);
+ }
+ }
+
RNA_BEGIN (&crp, b_aov, "aovs") {
bool is_color = (get_enum(b_aov, "type") == 1);
string name = get_string(b_aov, "name");
@@ -880,6 +907,8 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
params.use_profiling = params.device.has_profiling && !b_engine.is_preview() && background &&
BlenderSession::print_render_stats;
+ params.adaptive_sampling = RNA_boolean_get(&cscene, "use_adaptive_sampling");
+
return params;
}
diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h
index 219a3615835..0a7174e407b 100644
--- a/intern/cycles/blender/blender_sync.h
+++ b/intern/cycles/blender/blender_sync.h
@@ -71,7 +71,9 @@ class BlenderSync {
int height,
void **python_thread_state);
void sync_view_layer(BL::SpaceView3D &b_v3d, BL::ViewLayer &b_view_layer);
- vector<Pass> sync_render_passes(BL::RenderLayer &b_render_layer, BL::ViewLayer &b_view_layer);
+ vector<Pass> sync_render_passes(BL::RenderLayer &b_render_layer,
+ BL::ViewLayer &b_view_layer,
+ bool adaptive_sampling);
void sync_integrator();
void sync_camera(BL::RenderSettings &b_render,
BL::Object &b_override,
diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h
index a825bd6b128..6a0b39434aa 100644
--- a/intern/cycles/device/cuda/device_cuda.h
+++ b/intern/cycles/device/cuda/device_cuda.h
@@ -82,6 +82,17 @@ class CUDADevice : public Device {
device_vector<TextureInfo> texture_info;
bool need_texture_info;
+ /* Kernels */
+ struct {
+ bool loaded;
+
+ CUfunction adaptive_stopping;
+ CUfunction adaptive_filter_x;
+ CUfunction adaptive_filter_y;
+ CUfunction adaptive_scale_samples;
+ int adaptive_num_threads_per_block;
+ } functions;
+
static bool have_precompiled_kernels();
virtual bool show_samples() const;
@@ -114,6 +125,8 @@ class CUDADevice : public Device {
virtual bool load_kernels(const DeviceRequestedFeatures &requested_features);
+ void load_functions();
+
void reserve_local_memory(const DeviceRequestedFeatures &requested_features);
void init_host_memory();
@@ -197,6 +210,15 @@ class CUDADevice : public Device {
void denoise(RenderTile &rtile, DenoisingTask &denoising);
+ void adaptive_sampling_filter(uint filter_sample,
+ WorkTile *wtile,
+ CUdeviceptr d_wtile,
+ CUstream stream = 0);
+ void adaptive_sampling_post(RenderTile &rtile,
+ WorkTile *wtile,
+ CUdeviceptr d_wtile,
+ CUstream stream = 0);
+
void path_trace(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles);
void film_convert(DeviceTask &task,
diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp
index 4a7c45d8b93..11dd9b69f10 100644
--- a/intern/cycles/device/cuda/device_cuda_impl.cpp
+++ b/intern/cycles/device/cuda/device_cuda_impl.cpp
@@ -208,6 +208,8 @@ CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool
map_host_used = 0;
can_map_host = 0;
+ functions.loaded = false;
+
/* Initialize CUDA. */
if (cuda_error(cuInit(0)))
return;
@@ -531,9 +533,42 @@ bool CUDADevice::load_kernels(const DeviceRequestedFeatures &requested_features)
reserve_local_memory(requested_features);
}
+ load_functions();
+
return (result == CUDA_SUCCESS);
}
+void CUDADevice::load_functions()
+{
+ /* TODO: load all functions here. */
+ if (functions.loaded) {
+ return;
+ }
+ functions.loaded = true;
+
+ cuda_assert(cuModuleGetFunction(
+ &functions.adaptive_stopping, cuModule, "kernel_cuda_adaptive_stopping"));
+ cuda_assert(cuModuleGetFunction(
+ &functions.adaptive_filter_x, cuModule, "kernel_cuda_adaptive_filter_x"));
+ cuda_assert(cuModuleGetFunction(
+ &functions.adaptive_filter_y, cuModule, "kernel_cuda_adaptive_filter_y"));
+ cuda_assert(cuModuleGetFunction(
+ &functions.adaptive_scale_samples, cuModule, "kernel_cuda_adaptive_scale_samples"));
+
+ cuda_assert(cuFuncSetCacheConfig(functions.adaptive_stopping, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_x, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_y, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(functions.adaptive_scale_samples, CU_FUNC_CACHE_PREFER_L1));
+
+ int unused_min_blocks;
+ cuda_assert(cuOccupancyMaxPotentialBlockSize(&unused_min_blocks,
+ &functions.adaptive_num_threads_per_block,
+ functions.adaptive_scale_samples,
+ NULL,
+ 0,
+ 0));
+}
+
void CUDADevice::reserve_local_memory(const DeviceRequestedFeatures &requested_features)
{
if (use_split_kernel()) {
@@ -1666,6 +1701,80 @@ void CUDADevice::denoise(RenderTile &rtile, DenoisingTask &denoising)
denoising.run_denoising(&rtile);
}
+void CUDADevice::adaptive_sampling_filter(uint filter_sample,
+ WorkTile *wtile,
+ CUdeviceptr d_wtile,
+ CUstream stream)
+{
+ const int num_threads_per_block = functions.adaptive_num_threads_per_block;
+
+ /* These are a series of tiny kernels because there is no grid synchronisation
+ * from within a kernel, so multiple kernel launches it is. */
+ uint total_work_size = wtile->h * wtile->w;
+ void *args2[] = {&d_wtile, &filter_sample, &total_work_size};
+ uint num_blocks = divide_up(total_work_size, num_threads_per_block);
+ cuda_assert(cuLaunchKernel(functions.adaptive_stopping,
+ num_blocks,
+ 1,
+ 1,
+ num_threads_per_block,
+ 1,
+ 1,
+ 0,
+ stream,
+ args2,
+ 0));
+ total_work_size = wtile->h;
+ num_blocks = divide_up(total_work_size, num_threads_per_block);
+ cuda_assert(cuLaunchKernel(functions.adaptive_filter_x,
+ num_blocks,
+ 1,
+ 1,
+ num_threads_per_block,
+ 1,
+ 1,
+ 0,
+ stream,
+ args2,
+ 0));
+ total_work_size = wtile->w;
+ num_blocks = divide_up(total_work_size, num_threads_per_block);
+ cuda_assert(cuLaunchKernel(functions.adaptive_filter_y,
+ num_blocks,
+ 1,
+ 1,
+ num_threads_per_block,
+ 1,
+ 1,
+ 0,
+ stream,
+ args2,
+ 0));
+}
+
+void CUDADevice::adaptive_sampling_post(RenderTile &rtile,
+ WorkTile *wtile,
+ CUdeviceptr d_wtile,
+ CUstream stream)
+{
+ const int num_threads_per_block = functions.adaptive_num_threads_per_block;
+ uint total_work_size = wtile->h * wtile->w;
+
+ void *args[] = {&d_wtile, &rtile.start_sample, &rtile.sample, &total_work_size};
+ uint num_blocks = divide_up(total_work_size, num_threads_per_block);
+ cuda_assert(cuLaunchKernel(functions.adaptive_scale_samples,
+ num_blocks,
+ 1,
+ 1,
+ num_threads_per_block,
+ 1,
+ 1,
+ 0,
+ stream,
+ args,
+ 0));
+}
+
void CUDADevice::path_trace(DeviceTask &task,
RenderTile &rtile,
device_vector<WorkTile> &work_tiles)
@@ -1715,6 +1824,9 @@ void CUDADevice::path_trace(DeviceTask &task,
}
uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
+ if (task.adaptive_sampling.use) {
+ step_samples = task.adaptive_sampling.align_static_samples(step_samples);
+ }
/* Render all samples. */
int start_sample = rtile.start_sample;
@@ -1736,6 +1848,12 @@ void CUDADevice::path_trace(DeviceTask &task,
cuda_assert(
cuLaunchKernel(cuPathTrace, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
+ /* Run the adaptive sampling kernels at selected samples aligned to step samples. */
+ uint filter_sample = sample + wtile->num_samples - 1;
+ if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
+ adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);
+ }
+
cuda_assert(cuCtxSynchronize());
/* Update progress. */
@@ -1747,6 +1865,14 @@ void CUDADevice::path_trace(DeviceTask &task,
break;
}
}
+
+ /* Finalize adaptive sampling. */
+ if (task.adaptive_sampling.use) {
+ CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
+ adaptive_sampling_post(rtile, wtile, d_work_tiles);
+ cuda_assert(cuCtxSynchronize());
+ task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
+ }
}
void CUDADevice::film_convert(DeviceTask &task,
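The launch configurations in adaptive_sampling_filter() and adaptive_sampling_post() above use Cycles' divide_up() helper; assuming the usual ceiling-division definition, a sketch:

/* Smallest number of blocks whose combined threads cover total_work_size.
 * (Illustrative restatement; the real helper lives in Cycles' util headers.) */
static inline unsigned divide_up(unsigned x, unsigned y)
{
  return (x + y - 1) / y;
}

For example, divide_up(1000, 256) == 4 launches 1024 threads; the extra 24 are rejected by the work_index < total_work_size guard inside the kernels.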
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 1c9d2227ac3..cf239ec39e2 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -34,6 +34,7 @@
#include "kernel/kernel_types.h"
#include "kernel/split/kernel_split_data.h"
#include "kernel/kernel_globals.h"
+#include "kernel/kernel_adaptive_sampling.h"
#include "kernel/filter/filter.h"
@@ -317,6 +318,10 @@ class CPUDevice : public Device {
REGISTER_SPLIT_KERNEL(next_iteration_setup);
REGISTER_SPLIT_KERNEL(indirect_subsurface);
REGISTER_SPLIT_KERNEL(buffer_update);
+ REGISTER_SPLIT_KERNEL(adaptive_stopping);
+ REGISTER_SPLIT_KERNEL(adaptive_filter_x);
+ REGISTER_SPLIT_KERNEL(adaptive_filter_y);
+ REGISTER_SPLIT_KERNEL(adaptive_adjust_samples);
#undef REGISTER_SPLIT_KERNEL
#undef KERNEL_FUNCTIONS
}
@@ -823,6 +828,50 @@ class CPUDevice : public Device {
return true;
}
+ bool adaptive_sampling_filter(KernelGlobals *kg, RenderTile &tile, int sample)
+ {
+ WorkTile wtile;
+ wtile.x = tile.x;
+ wtile.y = tile.y;
+ wtile.w = tile.w;
+ wtile.h = tile.h;
+ wtile.offset = tile.offset;
+ wtile.stride = tile.stride;
+ wtile.buffer = (float *)tile.buffer;
+
+ bool any = false;
+ for (int y = tile.y; y < tile.y + tile.h; ++y) {
+ any |= kernel_do_adaptive_filter_x(kg, y, &wtile);
+ }
+ for (int x = tile.x; x < tile.x + tile.w; ++x) {
+ any |= kernel_do_adaptive_filter_y(kg, x, &wtile);
+ }
+
+ return (!any);
+ }
+
+ void adaptive_sampling_post(const DeviceTask &task, const RenderTile &tile, KernelGlobals *kg)
+ {
+ float *render_buffer = (float *)tile.buffer;
+ for (int y = tile.y; y < tile.y + tile.h; y++) {
+ for (int x = tile.x; x < tile.x + tile.w; x++) {
+ int index = tile.offset + x + y * tile.stride;
+ ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
+ if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
+ buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
+ float sample_multiplier = tile.sample / max((float)tile.start_sample + 1.0f,
+ buffer[kernel_data.film.pass_sample_count]);
+ if (sample_multiplier != 1.0f) {
+ kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
+ }
+ }
+ else {
+ kernel_adaptive_post_adjust(kg, buffer, tile.sample / (tile.sample - 1.0f));
+ }
+ }
+ }
+ }
+
void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
{
const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE;
@@ -855,14 +904,25 @@ class CPUDevice : public Device {
path_trace_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
}
}
-
tile.sample = sample + 1;
task.update_progress(&tile, tile.w * tile.h);
+
+ if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) {
+ const bool stop = adaptive_sampling_filter(kg, tile, sample);
+ if (stop) {
+ tile.sample = end_sample;
+ break;
+ }
+ }
}
if (use_coverage) {
coverage.finalize();
}
+
+ if (task.adaptive_sampling.use) {
+ adaptive_sampling_post(task, tile, kg);
+ }
}
void denoise(DenoisingTask &denoising, RenderTile &tile)
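A worked example of the scaling in adaptive_sampling_post() above, with illustrative numbers: the tile finished at sample 256, while one pixel converged after 64 samples.

#include <algorithm>
#include <cassert>

int main()
{
  const float tile_sample = 256.0f;  /* sample count the tile finished with */
  const float start_sample = 0.0f;
  const float pixel_count = 64.0f;   /* pass_sample_count, sign restored */
  const float sample_multiplier =
      tile_sample / std::max(start_sample + 1.0f, pixel_count);
  assert(sample_multiplier == 4.0f); /* every pass value is scaled by 4 */
  return 0;
}

After the multiplication, the pixel's passes look as if they held 256 uniform samples, which is what lets the denoiser and other buffer consumers run unchanged.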
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
index 61a5c74f69e..30d624fdf7c 100644
--- a/intern/cycles/device/device_optix.cpp
+++ b/intern/cycles/device/device_optix.cpp
@@ -627,7 +627,11 @@ class OptiXDevice : public CUDADevice {
const int end_sample = rtile.start_sample + rtile.num_samples;
// Keep this number reasonable to avoid running into TDRs
- const int step_samples = (info.display_device ? 8 : 32);
+ int step_samples = (info.display_device ? 8 : 32);
+ if (task.adaptive_sampling.use) {
+ step_samples = task.adaptive_sampling.align_static_samples(step_samples);
+ }
+
// Offset into launch params buffer so that streams use separate data
device_ptr launch_params_ptr = launch_params.device_pointer +
thread_index * launch_params.data_elements;
@@ -638,10 +642,9 @@ class OptiXDevice : public CUDADevice {
// Copy work tile information to device
wtile.num_samples = min(step_samples, end_sample - sample);
wtile.start_sample = sample;
- check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, tile),
- &wtile,
- sizeof(wtile),
- cuda_stream[thread_index]));
+ device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
+ check_result_cuda(
+ cuMemcpyHtoDAsync(d_wtile_ptr, &wtile, sizeof(wtile), cuda_stream[thread_index]));
OptixShaderBindingTable sbt_params = {};
sbt_params.raygenRecord = sbt_data.device_pointer + PG_RGEN * sizeof(SbtRecord);
@@ -666,6 +669,12 @@ class OptiXDevice : public CUDADevice {
wtile.h,
1));
+ // Run the adaptive sampling kernels at selected samples aligned to step samples.
+ uint filter_sample = wtile.start_sample + wtile.num_samples - 1;
+ if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
+ adaptive_sampling_filter(filter_sample, &wtile, d_wtile_ptr, cuda_stream[thread_index]);
+ }
+
// Wait for launch to finish
check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
@@ -677,6 +686,14 @@ class OptiXDevice : public CUDADevice {
if (task.get_cancel() && !task.need_finish_queue)
return; // Cancel rendering
}
+
+ // Finalize adaptive sampling
+ if (task.adaptive_sampling.use) {
+ device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
+ adaptive_sampling_post(rtile, &wtile, d_wtile_ptr, cuda_stream[thread_index]);
+ check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
+ task.update_progress(&rtile, rtile.w * rtile.h * wtile.num_samples);
+ }
}
bool launch_denoise(DeviceTask &task, RenderTile &rtile)
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
index 42e597a34d7..f22d8761058 100644
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -55,6 +55,10 @@ DeviceSplitKernel::DeviceSplitKernel(Device *device)
kernel_next_iteration_setup = NULL;
kernel_indirect_subsurface = NULL;
kernel_buffer_update = NULL;
+ kernel_adaptive_stopping = NULL;
+ kernel_adaptive_filter_x = NULL;
+ kernel_adaptive_filter_y = NULL;
+ kernel_adaptive_adjust_samples = NULL;
}
DeviceSplitKernel::~DeviceSplitKernel()
@@ -83,6 +87,10 @@ DeviceSplitKernel::~DeviceSplitKernel()
delete kernel_next_iteration_setup;
delete kernel_indirect_subsurface;
delete kernel_buffer_update;
+ delete kernel_adaptive_stopping;
+ delete kernel_adaptive_filter_x;
+ delete kernel_adaptive_filter_y;
+ delete kernel_adaptive_adjust_samples;
}
bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures &requested_features)
@@ -114,6 +122,10 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures &requested_fe
LOAD_KERNEL(next_iteration_setup);
LOAD_KERNEL(indirect_subsurface);
LOAD_KERNEL(buffer_update);
+ LOAD_KERNEL(adaptive_stopping);
+ LOAD_KERNEL(adaptive_filter_x);
+ LOAD_KERNEL(adaptive_filter_y);
+ LOAD_KERNEL(adaptive_adjust_samples);
#undef LOAD_KERNEL
@@ -202,13 +214,21 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
/* initial guess to start rolling average */
const int initial_num_samples = 1;
/* approx number of samples per second */
- int samples_per_second = (avg_time_per_sample > 0.0) ?
- int(double(time_multiplier) / avg_time_per_sample) + 1 :
- initial_num_samples;
+ const int samples_per_second = (avg_time_per_sample > 0.0) ?
+ int(double(time_multiplier) / avg_time_per_sample) + 1 :
+ initial_num_samples;
RenderTile subtile = tile;
subtile.start_sample = tile.sample;
- subtile.num_samples = min(samples_per_second,
+ subtile.num_samples = samples_per_second;
+
+ if (task->adaptive_sampling.use) {
+ subtile.num_samples = task->adaptive_sampling.align_dynamic_samples(subtile.start_sample,
+ subtile.num_samples);
+ }
+
+ /* Don't go beyond requested number of samples. */
+ subtile.num_samples = min(subtile.num_samples,
tile.start_sample + tile.num_samples - tile.sample);
if (device->have_error()) {
@@ -302,6 +322,23 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
}
}
+ int filter_sample = tile.sample + subtile.num_samples - 1;
+ if (task->adaptive_sampling.use && task->adaptive_sampling.need_filter(filter_sample)) {
+ size_t buffer_size[2];
+ buffer_size[0] = round_up(tile.w, local_size[0]);
+ buffer_size[1] = round_up(tile.h, local_size[1]);
+ kernel_adaptive_stopping->enqueue(
+ KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
+ buffer_size[0] = round_up(tile.h, local_size[0]);
+ buffer_size[1] = round_up(1, local_size[1]);
+ kernel_adaptive_filter_x->enqueue(
+ KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
+ buffer_size[0] = round_up(tile.w, local_size[0]);
+ buffer_size[1] = round_up(1, local_size[1]);
+ kernel_adaptive_filter_y->enqueue(
+ KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
+ }
+
double time_per_sample = ((time_dt() - start_time) / subtile.num_samples);
if (avg_time_per_sample == 0.0) {
@@ -324,6 +361,28 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
}
}
+ if (task->adaptive_sampling.use) {
+ /* Reset the start samples. */
+ RenderTile subtile = tile;
+ subtile.start_sample = tile.start_sample;
+ subtile.num_samples = tile.sample - tile.start_sample;
+ enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
+ subtile,
+ num_global_elements,
+ kgbuffer,
+ kernel_data,
+ split_data,
+ ray_state,
+ queue_index,
+ use_queues_flag,
+ work_pool_wgs);
+ size_t buffer_size[2];
+ buffer_size[0] = round_up(tile.w, local_size[0]);
+ buffer_size[1] = round_up(tile.h, local_size[1]);
+ kernel_adaptive_adjust_samples->enqueue(
+ KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
+ }
+
return true;
}
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
index 6ff326bf214..9d6b9efdd62 100644
--- a/intern/cycles/device/device_split_kernel.h
+++ b/intern/cycles/device/device_split_kernel.h
@@ -75,6 +75,10 @@ class DeviceSplitKernel {
SplitKernelFunction *kernel_next_iteration_setup;
SplitKernelFunction *kernel_indirect_subsurface;
SplitKernelFunction *kernel_buffer_update;
+ SplitKernelFunction *kernel_adaptive_stopping;
+ SplitKernelFunction *kernel_adaptive_filter_x;
+ SplitKernelFunction *kernel_adaptive_filter_y;
+ SplitKernelFunction *kernel_adaptive_adjust_samples;
/* Global memory variables [porting]; These memory is used for
* co-operation between different kernels; Data written by one
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
index 36522b874ab..c36b1344c3b 100644
--- a/intern/cycles/device/device_task.cpp
+++ b/intern/cycles/device/device_task.cpp
@@ -136,4 +136,59 @@ void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples)
}
}
+/* Adaptive Sampling */
+
+AdaptiveSampling::AdaptiveSampling()
+ : use(true), adaptive_step(ADAPTIVE_SAMPLE_STEP), min_samples(0)
+{
+}
+
+/* Render samples in steps that align with the adaptive filtering. */
+int AdaptiveSampling::align_static_samples(int samples) const
+{
+ if (samples > adaptive_step) {
+ /* Make multiple of adaptive_step. */
+ while (samples % adaptive_step != 0) {
+ samples--;
+ }
+ }
+ else if (samples < adaptive_step) {
+ /* Make divisor of adaptive_step. */
+ while (adaptive_step % samples != 0) {
+ samples--;
+ }
+ }
+
+ return max(samples, 1);
+}
+
+/* Render samples in steps that align with the adaptive filtering, with the
+ * suggested number of samples dynamically changing. */
+int AdaptiveSampling::align_dynamic_samples(int offset, int samples) const
+{
+ /* Round so that we end up on multiples of adaptive_samples. */
+ samples += offset;
+
+ if (samples > adaptive_step) {
+ /* Make multiple of adaptive_step. */
+ while (samples % adaptive_step != 0) {
+ samples--;
+ }
+ }
+
+ samples -= offset;
+
+ return max(samples, 1);
+}
+
+bool AdaptiveSampling::need_filter(int sample) const
+{
+ if (sample > min_samples) {
+ return (sample & (adaptive_step - 1)) == (adaptive_step - 1);
+ }
+ else {
+ return false;
+ }
+}
+
CCL_NAMESPACE_END
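How the alignment above plays out with ADAPTIVE_SAMPLE_STEP = 4 (the value this patch defines in kernel_types.h); a standalone restatement with illustrative values:

#include <algorithm>
#include <cassert>

/* Sketch of align_static_samples() with the step fixed at 4. */
static int align_static(int samples, int step = 4)
{
  if (samples > step)
    samples -= samples % step; /* round down to a multiple of the step */
  else
    while (step % samples != 0)
      samples--;               /* round down to a divisor of the step */
  return std::max(samples, 1);
}

int main()
{
  assert(align_static(10) == 8);
  assert(align_static(4) == 4);
  assert(align_static(3) == 2);
  /* need_filter(): with min_samples = 0 it fires on samples 3, 7, 11, ...,
   * the last sample of each step, since (sample & 3) == 3 iff sample % 4 == 3. */
  return 0;
}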
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index 972f6131092..8c4e682adb1 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -62,6 +62,19 @@ class DenoiseParams {
}
};
+class AdaptiveSampling {
+ public:
+ AdaptiveSampling();
+
+ int align_static_samples(int samples) const;
+ int align_dynamic_samples(int offset, int samples) const;
+ bool need_filter(int sample) const;
+
+ bool use;
+ int adaptive_step;
+ int min_samples;
+};
+
class DeviceTask : public Task {
public:
typedef enum { RENDER, FILM_CONVERT, SHADER, DENOISE_BUFFER } Type;
@@ -115,6 +128,7 @@ class DeviceTask : public Task {
bool need_finish_queue;
bool integrator_branched;
+ AdaptiveSampling adaptive_sampling;
protected:
double last_update_time;
diff --git a/intern/cycles/device/opencl/device_opencl.h b/intern/cycles/device/opencl/device_opencl.h
index 61b1e3e3b6b..b761726b1ad 100644
--- a/intern/cycles/device/opencl/device_opencl.h
+++ b/intern/cycles/device/opencl/device_opencl.h
@@ -445,6 +445,7 @@ class OpenCLDevice : public Device {
device_ptr rgba_byte,
device_ptr rgba_half);
void shader(DeviceTask &task);
+ void update_adaptive(DeviceTask &task, RenderTile &tile, int sample);
void denoise(RenderTile &tile, DenoisingTask &denoising);
diff --git a/intern/cycles/device/opencl/device_opencl_impl.cpp b/intern/cycles/device/opencl/device_opencl_impl.cpp
index 68cdfd5238c..3dbe54b38aa 100644
--- a/intern/cycles/device/opencl/device_opencl_impl.cpp
+++ b/intern/cycles/device/opencl/device_opencl_impl.cpp
@@ -56,7 +56,11 @@ static const string SPLIT_BUNDLE_KERNELS =
"enqueue_inactive "
"next_iteration_setup "
"indirect_subsurface "
- "buffer_update";
+ "buffer_update "
+ "adaptive_stopping "
+ "adaptive_filter_x "
+ "adaptive_filter_y "
+ "adaptive_adjust_samples";
const string OpenCLDevice::get_opencl_program_name(const string &kernel_name)
{
@@ -283,6 +287,10 @@ void OpenCLDevice::OpenCLSplitPrograms::load_kernels(
ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(next_iteration_setup);
ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(indirect_subsurface);
ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(buffer_update);
+ ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_stopping);
+ ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_x);
+ ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_y);
+ ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_adjust_samples);
programs.push_back(&program_split);
# undef ADD_SPLIT_KERNEL_PROGRAM
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 566b6e3d191..0dd0da65f82 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -36,6 +36,10 @@ set(SRC_CUDA_KERNELS
)
set(SRC_OPENCL_KERNELS
+ kernels/opencl/kernel_adaptive_stopping.cl
+ kernels/opencl/kernel_adaptive_filter_x.cl
+ kernels/opencl/kernel_adaptive_filter_y.cl
+ kernels/opencl/kernel_adaptive_adjust_samples.cl
kernels/opencl/kernel_bake.cl
kernels/opencl/kernel_base.cl
kernels/opencl/kernel_displace.cl
@@ -94,6 +98,7 @@ set(SRC_BVH_HEADERS
set(SRC_HEADERS
kernel_accumulate.h
+ kernel_adaptive_sampling.h
kernel_bake.h
kernel_camera.h
kernel_color.h
@@ -324,6 +329,10 @@ set(SRC_UTIL_HEADERS
)
set(SRC_SPLIT_HEADERS
+ split/kernel_adaptive_adjust_samples.h
+ split/kernel_adaptive_filter_x.h
+ split/kernel_adaptive_filter_y.h
+ split/kernel_adaptive_stopping.h
split/kernel_branched.h
split/kernel_buffer_update.h
split/kernel_data_init.h
diff --git a/intern/cycles/kernel/kernel_adaptive_sampling.h b/intern/cycles/kernel/kernel_adaptive_sampling.h
new file mode 100644
index 00000000000..502b69e4f7f
--- /dev/null
+++ b/intern/cycles/kernel/kernel_adaptive_sampling.h
@@ -0,0 +1,231 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_ADAPTIVE_SAMPLING_H__
+#define __KERNEL_ADAPTIVE_SAMPLING_H__
+
+CCL_NAMESPACE_BEGIN
+
+/* Determines whether to continue sampling a given pixel or if it has sufficiently converged. */
+
+ccl_device void kernel_do_adaptive_stopping(KernelGlobals *kg,
+ ccl_global float *buffer,
+ int sample)
+{
+ /* TODO Stefan: Is this better in linear, sRGB or something else? */
+ float4 I = *((ccl_global float4 *)buffer);
+ float4 A = *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer);
+ /* The per pixel error as seen in section 2.1 of
+ * "A hierarchical automatic stopping condition for Monte Carlo global illumination"
+ * A small epsilon is added to the divisor to prevent division by zero. */
+ float error = (fabsf(I.x - A.x) + fabsf(I.y - A.y) + fabsf(I.z - A.z)) /
+ (sample * 0.0001f + sqrtf(I.x + I.y + I.z));
+ if (error < kernel_data.integrator.adaptive_threshold * (float)sample) {
+ /* Set the fourth component to non-zero value to indicate that this pixel has converged. */
+ buffer[kernel_data.film.pass_adaptive_aux_buffer + 3] += 1.0f;
+ }
+}
+
+/* Adjust the values of an adaptively sampled pixel. */
+
+ccl_device void kernel_adaptive_post_adjust(KernelGlobals *kg,
+ ccl_global float *buffer,
+ float sample_multiplier)
+{
+ *(ccl_global float4 *)(buffer) *= sample_multiplier;
+
+ /* Scale the aux pass too, this is necessary for progressive rendering to work properly. */
+ kernel_assert(kernel_data.film.pass_adaptive_aux_buffer);
+ *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer) *= sample_multiplier;
+
+#ifdef __PASSES__
+ int flag = kernel_data.film.pass_flag;
+
+ if (flag & PASSMASK(SHADOW))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_shadow) *= sample_multiplier;
+
+ if (flag & PASSMASK(MIST))
+ *(ccl_global float *)(buffer + kernel_data.film.pass_mist) *= sample_multiplier;
+
+ if (flag & PASSMASK(NORMAL))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_normal) *= sample_multiplier;
+
+ if (flag & PASSMASK(UV))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_uv) *= sample_multiplier;
+
+ if (flag & PASSMASK(MOTION)) {
+ *(ccl_global float4 *)(buffer + kernel_data.film.pass_motion) *= sample_multiplier;
+ *(ccl_global float *)(buffer + kernel_data.film.pass_motion_weight) *= sample_multiplier;
+ }
+
+ if (kernel_data.film.use_light_pass) {
+ int light_flag = kernel_data.film.light_pass_flag;
+
+ if (light_flag & PASSMASK(DIFFUSE_INDIRECT))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_indirect) *= sample_multiplier;
+ if (light_flag & PASSMASK(GLOSSY_INDIRECT))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_indirect) *= sample_multiplier;
+ if (light_flag & PASSMASK(TRANSMISSION_INDIRECT))
+ *(ccl_global float3 *)(buffer +
+ kernel_data.film.pass_transmission_indirect) *= sample_multiplier;
+ if (light_flag & PASSMASK(VOLUME_INDIRECT))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_indirect) *= sample_multiplier;
+ if (light_flag & PASSMASK(DIFFUSE_DIRECT))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_direct) *= sample_multiplier;
+ if (light_flag & PASSMASK(GLOSSY_DIRECT))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_direct) *= sample_multiplier;
+ if (light_flag & PASSMASK(TRANSMISSION_DIRECT))
+ *(ccl_global float3 *)(buffer +
+ kernel_data.film.pass_transmission_direct) *= sample_multiplier;
+ if (light_flag & PASSMASK(VOLUME_DIRECT))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_direct) *= sample_multiplier;
+
+ if (light_flag & PASSMASK(EMISSION))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_emission) *= sample_multiplier;
+ if (light_flag & PASSMASK(BACKGROUND))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_background) *= sample_multiplier;
+ if (light_flag & PASSMASK(AO))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_ao) *= sample_multiplier;
+
+ if (light_flag & PASSMASK(DIFFUSE_COLOR))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_color) *= sample_multiplier;
+ if (light_flag & PASSMASK(GLOSSY_COLOR))
+ *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_color) *= sample_multiplier;
+ if (light_flag & PASSMASK(TRANSMISSION_COLOR))
+ *(ccl_global float3 *)(buffer +
+ kernel_data.film.pass_transmission_color) *= sample_multiplier;
+ }
+#endif
+
+#ifdef __DENOISING_FEATURES__
+
+# define scale_float3_variance(buffer, offset, scale) \
+ *(buffer + offset) *= scale; \
+ *(buffer + offset + 1) *= scale; \
+ *(buffer + offset + 2) *= scale; \
+ *(buffer + offset + 3) *= scale * scale; \
+ *(buffer + offset + 4) *= scale * scale; \
+ *(buffer + offset + 5) *= scale * scale;
+
+# define scale_shadow_variance(buffer, offset, scale) \
+ *(buffer + offset) *= scale; \
+ *(buffer + offset + 1) *= scale; \
+ *(buffer + offset + 2) *= scale * scale;
+
+ if (kernel_data.film.pass_denoising_data) {
+ scale_shadow_variance(
+ buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_A, sample_multiplier);
+ scale_shadow_variance(
+ buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_B, sample_multiplier);
+ if (kernel_data.film.pass_denoising_clean) {
+ scale_float3_variance(
+ buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier);
+ *(buffer + kernel_data.film.pass_denoising_clean) *= sample_multiplier;
+ *(buffer + kernel_data.film.pass_denoising_clean + 1) *= sample_multiplier;
+ *(buffer + kernel_data.film.pass_denoising_clean + 2) *= sample_multiplier;
+ }
+ else {
+ scale_float3_variance(
+ buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier);
+ }
+ scale_float3_variance(
+ buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL, sample_multiplier);
+ scale_float3_variance(
+ buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO, sample_multiplier);
+ *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH) *= sample_multiplier;
+ *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH +
+ 1) *= sample_multiplier * sample_multiplier;
+ }
+#endif /* __DENOISING_FEATURES__ */
+
+ if (kernel_data.film.cryptomatte_passes) {
+ int num_slots = 0;
+ num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) ? 1 : 0;
+ num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) ? 1 : 0;
+ num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) ? 1 : 0;
+ num_slots = num_slots * 2 * kernel_data.film.cryptomatte_depth;
+ ccl_global float2 *id_buffer = (ccl_global float2 *)(buffer +
+ kernel_data.film.pass_cryptomatte);
+ for (int slot = 0; slot < num_slots; slot++) {
+ id_buffer[slot].y *= sample_multiplier;
+ }
+ }
+}
+
+/* This is a simple box filter in two passes.
+ * When a pixel demands more adaptive samples, let its neighboring pixels draw more samples too. */
+
+ccl_device bool kernel_do_adaptive_filter_x(KernelGlobals *kg, int y, ccl_global WorkTile *tile)
+{
+ bool any = false;
+ bool prev = false;
+ for (int x = tile->x; x < tile->x + tile->w; ++x) {
+ int index = tile->offset + x + y * tile->stride;
+ ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
+ ccl_global float4 *aux = (ccl_global float4 *)(buffer +
+ kernel_data.film.pass_adaptive_aux_buffer);
+ if (aux->w == 0.0f) {
+ any = true;
+ if (x > tile->x && !prev) {
+ index = index - 1;
+ buffer = tile->buffer + index * kernel_data.film.pass_stride;
+ aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer);
+ aux->w = 0.0f;
+ }
+ prev = true;
+ }
+ else {
+ if (prev) {
+ aux->w = 0.0f;
+ }
+ prev = false;
+ }
+ }
+ return any;
+}
+
+ccl_device bool kernel_do_adaptive_filter_y(KernelGlobals *kg, int x, ccl_global WorkTile *tile)
+{
+ bool prev = false;
+ bool any = false;
+ for (int y = tile->y; y < tile->y + tile->h; ++y) {
+ int index = tile->offset + x + y * tile->stride;
+ ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
+ ccl_global float4 *aux = (ccl_global float4 *)(buffer +
+ kernel_data.film.pass_adaptive_aux_buffer);
+ if (aux->w == 0.0f) {
+ any = true;
+ if (y > tile->y && !prev) {
+ index = index - tile->stride;
+ buffer = tile->buffer + index * kernel_data.film.pass_stride;
+ aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer);
+ aux->w = 0.0f;
+ }
+ prev = true;
+ }
+ else {
+ if (prev) {
+ aux->w = 0.0f;
+ }
+ prev = false;
+ }
+ }
+ return any;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __KERNEL_ADAPTIVE_SAMPLING_H__ */
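The two filter kernels above amount to a one-pixel dilation of the "not yet converged" mask (aux->w == 0.0f), applied along x and then along y, which together produce the 3x3 neighborhood growth described in the commit message. A minimal sketch of the per-row logic on a plain boolean mask (illustrative, not the kernel code; true means the pixel still needs samples):

#include <vector>

static bool dilate_1d(std::vector<bool> &mask)
{
  bool any = false;
  bool prev = false;
  for (size_t i = 0; i < mask.size(); ++i) {
    if (mask[i]) {
      any = true;
      if (i > 0 && !prev)
        mask[i - 1] = true; /* wake the neighbor before the run */
      prev = true;
    }
    else {
      if (prev)
        mask[i] = true;     /* wake the neighbor after the run */
      prev = false;
    }
  }
  return any; /* false once every pixel in the row has converged */
}

Running this over every row and then every column marks the converged pixels bordering each unconverged region, so they keep receiving samples along with it.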
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 187e8340c82..33ec05c6048 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -29,7 +29,9 @@ ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg,
if (kernel_data.film.pass_denoising_data == 0)
return;
- buffer += (sample & 1) ? DENOISING_PASS_SHADOW_B : DENOISING_PASS_SHADOW_A;
+ buffer += sample_is_even(kernel_data.integrator.sampling_pattern, sample) ?
+ DENOISING_PASS_SHADOW_B :
+ DENOISING_PASS_SHADOW_A;
path_total = ensure_finite(path_total);
path_total_shaded = ensure_finite(path_total_shaded);
@@ -386,6 +388,41 @@ ccl_device_inline void kernel_write_result(KernelGlobals *kg,
#ifdef __KERNEL_DEBUG__
kernel_write_debug_passes(kg, buffer, L);
#endif
+
+ /* Adaptive Sampling. Fill the additional buffer with the odd samples and calculate our stopping
+ criteria. This is the heuristic from "A hierarchical automatic stopping condition for Monte
+ Carlo global illumination" except that here it is applied per pixel and not in hierarchical
+ tiles. */
+ if (kernel_data.film.pass_adaptive_aux_buffer &&
+ kernel_data.integrator.adaptive_threshold > 0.0f) {
+ if (sample_is_even(kernel_data.integrator.sampling_pattern, sample)) {
+ kernel_write_pass_float4(buffer + kernel_data.film.pass_adaptive_aux_buffer,
+ make_float4(L_sum.x * 2.0f, L_sum.y * 2.0f, L_sum.z * 2.0f, 0.0f));
+ }
+#ifdef __KERNEL_CPU__
+ if (sample > kernel_data.integrator.adaptive_min_samples &&
+ (sample & (ADAPTIVE_SAMPLE_STEP - 1)) == (ADAPTIVE_SAMPLE_STEP - 1)) {
+ kernel_do_adaptive_stopping(kg, buffer, sample);
+ }
+#endif
+ }
+
+ /* Write the sample count as negative numbers initially to mark the samples as in progress.
+ * Once the tile has finished rendering, the sign gets flipped and all the pixel values
+ * are scaled as if they were taken at a uniform sample count. */
+ if (kernel_data.film.pass_sample_count) {
+ /* Make sure it's a negative number. In progressive refine mode, this bit gets flipped between
+ * passes. */
+#ifdef __ATOMIC_PASS_WRITE__
+ atomic_fetch_and_or_uint32((ccl_global uint *)(buffer + kernel_data.film.pass_sample_count),
+ 0x80000000);
+#else
+ if (buffer[kernel_data.film.pass_sample_count] > 0) {
+ buffer[kernel_data.film.pass_sample_count] *= -1.0f;
+ }
+#endif
+ kernel_write_pass_float(buffer + kernel_data.film.pass_sample_count, -1.0f);
+ }
}
CCL_NAMESPACE_END
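The __ATOMIC_PASS_WRITE__ branch above works because OR-ing 0x80000000 into a float's bit pattern sets the IEEE-754 sign bit: positive counts left over from a previous progressive-refine pass become negative, and already-negative counts are unchanged, so no compare-and-swap loop is needed. A non-atomic illustration:

#include <cassert>
#include <cstdint>
#include <cstring>

static float force_negative(float f)
{
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  bits |= 0x80000000u; /* set the sign bit */
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

int main()
{
  assert(force_negative(5.0f) == -5.0f);
  assert(force_negative(-3.0f) == -3.0f); /* already negative: unchanged */
  return 0;
}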
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 1a0b67275a7..bdd2703a894 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -31,6 +31,7 @@
#include "kernel/kernel_accumulate.h"
#include "kernel/kernel_shader.h"
#include "kernel/kernel_light.h"
+#include "kernel/kernel_adaptive_sampling.h"
#include "kernel/kernel_passes.h"
#if defined(__VOLUME__) || defined(__SUBSURFACE__)
@@ -656,6 +657,14 @@ ccl_device void kernel_path_trace(
buffer += index * pass_stride;
+ if (kernel_data.film.pass_adaptive_aux_buffer) {
+ ccl_global float4 *aux = (ccl_global float4 *)(buffer +
+ kernel_data.film.pass_adaptive_aux_buffer);
+ if (aux->w > 0.0f) {
+ return;
+ }
+ }
+
/* Initialize random numbers and sample ray. */
uint rng_hash;
Ray ray;
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
index f75e4ab4c97..0d5781fe3d1 100644
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -523,6 +523,14 @@ ccl_device void kernel_branched_path_trace(
buffer += index * pass_stride;
+ if (kernel_data.film.pass_adaptive_aux_buffer) {
+ ccl_global float4 *aux = (ccl_global float4 *)(buffer +
+ kernel_data.film.pass_adaptive_aux_buffer);
+ if (aux->w > 0.0f) {
+ return;
+ }
+ }
+
/* initialize random numbers and ray */
uint rng_hash;
Ray ray;
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 88c2d0d3196..c5be93e2cda 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -63,6 +63,11 @@ CCL_NAMESPACE_BEGIN
#define VOLUME_STACK_SIZE 32
+/* Adaptive sampling constants */
+#define ADAPTIVE_SAMPLE_STEP 4
+static_assert((ADAPTIVE_SAMPLE_STEP & (ADAPTIVE_SAMPLE_STEP - 1)) == 0,
+ "ADAPTIVE_SAMPLE_STEP must be power of two for bitwise operations to work");
+
/* Split kernel constants */
#define WORK_POOL_SIZE_GPU 64
#define WORK_POOL_SIZE_CPU 1
@@ -374,6 +379,8 @@ typedef enum PassType {
PASS_CRYPTOMATTE,
PASS_AOV_COLOR,
PASS_AOV_VALUE,
+ PASS_ADAPTIVE_AUX_BUFFER,
+ PASS_SAMPLE_COUNT,
PASS_CATEGORY_MAIN_END = 31,
PASS_MIST = 32,
@@ -1223,6 +1230,9 @@ typedef struct KernelFilm {
int cryptomatte_depth;
int pass_cryptomatte;
+ int pass_adaptive_aux_buffer;
+ int pass_sample_count;
+
int pass_mist;
float mist_start;
float mist_inv_depth;
@@ -1256,6 +1266,8 @@ typedef struct KernelFilm {
int display_divide_pass_stride;
int use_display_exposure;
int use_display_pass_alpha;
+
+ int pad3, pad4, pad5;
} KernelFilm;
static_assert_align(KernelFilm, 16);
@@ -1337,6 +1349,8 @@ typedef struct KernelIntegrator {
/* sampler */
int sampling_pattern;
int aa_samples;
+ int adaptive_min_samples;
+ float adaptive_threshold;
/* volume render */
int use_volumes;
@@ -1348,7 +1362,7 @@ typedef struct KernelIntegrator {
int max_closures;
- int pad1;
+ int pad1, pad2, pad3;
} KernelIntegrator;
static_assert_align(KernelIntegrator, 16);
@@ -1662,7 +1676,7 @@ typedef struct WorkTile {
uint start_sample;
uint num_samples;
- uint offset;
+ int offset;
uint stride;
ccl_global float *buffer;
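The static_assert on ADAPTIVE_SAMPLE_STEP above exists because the filter scheduling elsewhere in this patch tests (sample & (ADAPTIVE_SAMPLE_STEP - 1)) == ADAPTIVE_SAMPLE_STEP - 1, and that bitmask only matches sample % step == step - 1 when the step is a power of two. A quick check:

#include <cassert>

int main()
{
  const int step = 4; /* ADAPTIVE_SAMPLE_STEP */
  for (int sample = 0; sample < 64; sample++)
    assert(((sample & (step - 1)) == step - 1) == (sample % step == step - 1));
  /* Counterexample for a non-power-of-two step of 6: 7 & 5 == 5, yet 7 % 6 == 1. */
  return 0;
}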
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
index 799561a7466..c642d227e4b 100644
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -23,17 +23,41 @@ CCL_NAMESPACE_BEGIN
* Utility functions for work stealing
*/
+/* Map global work index to tile, pixel X/Y and sample. */
+ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
+ uint global_work_index,
+ ccl_private uint *x,
+ ccl_private uint *y,
+ ccl_private uint *sample)
+{
+#ifdef __KERNEL_CUDA__
+ /* Keeping threads for the same pixel together improves performance on CUDA. */
+ uint sample_offset = global_work_index % tile->num_samples;
+ uint pixel_offset = global_work_index / tile->num_samples;
+#else /* __KERNEL_CUDA__ */
+ uint tile_pixels = tile->w * tile->h;
+ uint sample_offset = global_work_index / tile_pixels;
+ uint pixel_offset = global_work_index - sample_offset * tile_pixels;
+#endif /* __KERNEL_CUDA__ */
+ uint y_offset = pixel_offset / tile->w;
+ uint x_offset = pixel_offset - y_offset * tile->w;
+
+ *x = tile->x + x_offset;
+ *y = tile->y + y_offset;
+ *sample = tile->start_sample + sample_offset;
+}
+
#ifdef __KERNEL_OPENCL__
# pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#endif
#ifdef __SPLIT_KERNEL__
/* Returns true if there is work */
-ccl_device bool get_next_work(KernelGlobals *kg,
- ccl_global uint *work_pools,
- uint total_work_size,
- uint ray_index,
- ccl_private uint *global_work_index)
+ccl_device bool get_next_work_item(KernelGlobals *kg,
+ ccl_global uint *work_pools,
+ uint total_work_size,
+ uint ray_index,
+ ccl_private uint *global_work_index)
{
/* With a small amount of work there may be more threads than work due to
* rounding up of global size, stop such threads immediately. */
@@ -56,31 +80,37 @@ ccl_device bool get_next_work(KernelGlobals *kg,
/* Test if all work for this pool is done. */
return (*global_work_index < total_work_size);
}
-#endif
-/* Map global work index to tile, pixel X/Y and sample. */
-ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
- uint global_work_index,
- ccl_private uint *x,
- ccl_private uint *y,
- ccl_private uint *sample)
+ccl_device bool get_next_work(KernelGlobals *kg,
+ ccl_global uint *work_pools,
+ uint total_work_size,
+ uint ray_index,
+ ccl_private uint *global_work_index)
{
-#ifdef __KERNEL_CUDA__
- /* Keeping threads for the same pixel together improves performance on CUDA. */
- uint sample_offset = global_work_index % tile->num_samples;
- uint pixel_offset = global_work_index / tile->num_samples;
-#else /* __KERNEL_CUDA__ */
- uint tile_pixels = tile->w * tile->h;
- uint sample_offset = global_work_index / tile_pixels;
- uint pixel_offset = global_work_index - sample_offset * tile_pixels;
-#endif /* __KERNEL_CUDA__ */
- uint y_offset = pixel_offset / tile->w;
- uint x_offset = pixel_offset - y_offset * tile->w;
-
- *x = tile->x + x_offset;
- *y = tile->y + y_offset;
- *sample = tile->start_sample + sample_offset;
+ bool got_work = false;
+ if (kernel_data.film.pass_adaptive_aux_buffer) {
+ do {
+ got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index);
+ if (got_work) {
+ ccl_global WorkTile *tile = &kernel_split_params.tile;
+ uint x, y, sample;
+ get_work_pixel(tile, *global_work_index, &x, &y, &sample);
+ uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride;
+ ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
+ ccl_global float4 *aux = (ccl_global float4 *)(buffer +
+ kernel_data.film.pass_adaptive_aux_buffer);
+ if (aux->w == 0.0f) {
+ break;
+ }
+ }
+ } while (got_work);
+ }
+ else {
+ got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index);
+ }
+ return got_work;
}
+#endif
CCL_NAMESPACE_END
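The two index decompositions in get_work_pixel() above differ only in which dimension varies fastest: on CUDA, consecutive work indices stay on the same pixel; on other devices they sweep across pixels for a fixed sample. A quick check with illustrative tile dimensions:

#include <cassert>

int main()
{
  const unsigned w = 3, h = 1, num_samples = 4;
  const unsigned tile_pixels = w * h;
  const unsigned idx = 5;

  /* CUDA mapping: threads for the same pixel are adjacent. */
  assert(idx % num_samples == 1); /* sample_offset */
  assert(idx / num_samples == 1); /* pixel_offset */

  /* Other devices: threads for the same sample are adjacent. */
  assert(idx / tile_pixels == 1);                       /* sample_offset */
  assert(idx - (idx / tile_pixels) * tile_pixels == 2); /* pixel_offset */
  return 0;
}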
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
index f5d981fb71a..683f4b88d79 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
@@ -89,5 +89,9 @@ DECLARE_SPLIT_KERNEL_FUNCTION(enqueue_inactive)
DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update)
+DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_stopping)
+DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x)
+DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y)
+DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples)
#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index 9ca3f46b5b6..96b2bf11132 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -58,6 +58,10 @@
# include "kernel/split/kernel_next_iteration_setup.h"
# include "kernel/split/kernel_indirect_subsurface.h"
# include "kernel/split/kernel_buffer_update.h"
+# include "kernel/split/kernel_adaptive_stopping.h"
+# include "kernel/split/kernel_adaptive_filter_x.h"
+# include "kernel/split/kernel_adaptive_filter_y.h"
+# include "kernel/split/kernel_adaptive_adjust_samples.h"
# endif /* __SPLIT_KERNEL__ */
#else
# define STUB_ASSERT(arch, name) \
@@ -204,6 +208,10 @@ DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_stopping)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples)
#endif /* __SPLIT_KERNEL__ */
#undef KERNEL_STUB
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index af311027f78..c4c810c6a82 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -33,6 +33,7 @@
#include "kernel/kernel_path_branched.h"
#include "kernel/kernel_bake.h"
#include "kernel/kernel_work_stealing.h"
+#include "kernel/kernel_adaptive_sampling.h"
/* kernels */
extern "C" __global__ void
@@ -83,6 +84,75 @@ kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size)
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_adaptive_stopping(WorkTile *tile, int sample, uint total_work_size)
+{
+ int work_index = ccl_global_id(0);
+ bool thread_is_active = work_index < total_work_size;
+ KernelGlobals kg;
+ if(thread_is_active && kernel_data.film.pass_adaptive_aux_buffer) {
+ uint x = tile->x + work_index % tile->w;
+ uint y = tile->y + work_index / tile->w;
+ int index = tile->offset + x + y * tile->stride;
+ ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
+ kernel_do_adaptive_stopping(&kg, buffer, sample);
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_adaptive_filter_x(WorkTile *tile, int sample, uint)
+{
+ KernelGlobals kg;
+ if (kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) {
+ if (ccl_global_id(0) < tile->h) {
+ int y = tile->y + ccl_global_id(0);
+ kernel_do_adaptive_filter_x(&kg, y, tile);
+ }
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_adaptive_filter_y(WorkTile *tile, int sample, uint)
+{
+ KernelGlobals kg;
+ if (kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) {
+ if (ccl_global_id(0) < tile->w) {
+ int x = tile->x + ccl_global_id(0);
+ kernel_do_adaptive_filter_y(&kg, x, tile);
+ }
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_adaptive_scale_samples(WorkTile *tile, int start_sample, int sample, uint total_work_size)
+{
+ if (kernel_data.film.pass_adaptive_aux_buffer) {
+ int work_index = ccl_global_id(0);
+ bool thread_is_active = work_index < total_work_size;
+ KernelGlobals kg;
+ if (thread_is_active) {
+ uint x = tile->x + work_index % tile->w;
+ uint y = tile->y + work_index / tile->w;
+ int index = tile->offset + x + y * tile->stride;
+ ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
+ if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
+ buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
+ float sample_multiplier = sample / max((float)start_sample + 1.0f, buffer[kernel_data.film.pass_sample_count]);
+ if (sample_multiplier != 1.0f) {
+ kernel_adaptive_post_adjust(&kg, buffer, sample_multiplier);
+ }
+ }
+ else {
+ kernel_adaptive_post_adjust(&kg, buffer, sample / (sample - 1.0f));
+ }
+ }
+ }
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
{
int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
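The kernel_cuda_adaptive_scale_samples kernel above is the rescaling post-pass: while rendering, a pixel that converges after n samples has its sample count negated as a "converged, not yet rescaled" flag; here the count is flipped back positive and every pass of that pixel is multiplied by sample / n, so downstream consumers can keep dividing by the full sample count. As a worked example, a tile rendered to sample 250 with a pixel that stopped after 50 samples gets a multiplier of 250 / 50 = 5. A standalone restatement of the multiplier (names are hypothetical):

    #include <algorithm>

    float adaptive_sample_multiplier(int start_sample, int n_total, float n_taken)
    {
      /* Same clamp as the kernel: never fewer than start_sample + 1 samples. */
      float n = std::max((float)start_sample + 1.0f, n_taken);
      return (float)n_total / n; /* e.g. 250 total, 50 taken -> 5.0 */
    }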
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
index 43b3d0aa0e6..95ad7599cf1 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
@@ -43,6 +43,10 @@
#include "kernel/split/kernel_next_iteration_setup.h"
#include "kernel/split/kernel_indirect_subsurface.h"
#include "kernel/split/kernel_buffer_update.h"
+#include "kernel/split/kernel_adaptive_stopping.h"
+#include "kernel/split/kernel_adaptive_filter_x.h"
+#include "kernel/split/kernel_adaptive_filter_y.h"
+#include "kernel/split/kernel_adaptive_adjust_samples.h"
#include "kernel/kernel_film.h"
@@ -121,6 +125,10 @@ DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_stopping)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y)
+DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples)
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl
new file mode 100644
index 00000000000..ebdb99d4730
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_adaptive_adjust_samples.h"
+
+#define KERNEL_NAME adaptive_adjust_samples
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl
new file mode 100644
index 00000000000..76d82d4184e
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_adaptive_filter_x.h"
+
+#define KERNEL_NAME adaptive_filter_x
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl
new file mode 100644
index 00000000000..1e6d15ba0f2
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_adaptive_filter_y.h"
+
+#define KERNEL_NAME adaptive_filter_y
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl
new file mode 100644
index 00000000000..51de0059667
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_adaptive_stopping.h"
+
+#define KERNEL_NAME adaptive_stopping
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl b/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl
index 6041f13b52b..c3b7b09460a 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl
@@ -28,3 +28,7 @@
#include "kernel/kernels/opencl/kernel_next_iteration_setup.cl"
#include "kernel/kernels/opencl/kernel_indirect_subsurface.cl"
#include "kernel/kernels/opencl/kernel_buffer_update.cl"
+#include "kernel/kernels/opencl/kernel_adaptive_stopping.cl"
+#include "kernel/kernels/opencl/kernel_adaptive_filter_x.cl"
+#include "kernel/kernels/opencl/kernel_adaptive_filter_y.cl"
+#include "kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl"
diff --git a/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h b/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h
new file mode 100644
index 00000000000..60ebf415970
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_adaptive_adjust_samples(KernelGlobals *kg)
+{
+ int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ if (pixel_index < kernel_split_params.tile.w * kernel_split_params.tile.h) {
+ int x = kernel_split_params.tile.x + pixel_index % kernel_split_params.tile.w;
+ int y = kernel_split_params.tile.y + pixel_index / kernel_split_params.tile.w;
+ int buffer_offset = (kernel_split_params.tile.offset + x +
+ y * kernel_split_params.tile.stride) *
+ kernel_data.film.pass_stride;
+ ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
+ int sample = kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples;
+ if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
+ buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
+ float sample_multiplier = sample / max((float)kernel_split_params.tile.start_sample + 1.0f,
+ buffer[kernel_data.film.pass_sample_count]);
+ if (sample_multiplier != 1.0f) {
+ kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
+ }
+ }
+ else {
+ kernel_adaptive_post_adjust(kg, buffer, sample / (sample - 1.0f));
+ }
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_adaptive_filter_x.h b/intern/cycles/kernel/split/kernel_adaptive_filter_x.h
new file mode 100644
index 00000000000..93f41f7ced4
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_adaptive_filter_x.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_adaptive_filter_x(KernelGlobals *kg)
+{
+ int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ if (pixel_index < kernel_split_params.tile.h &&
+ kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
+ kernel_data.integrator.adaptive_min_samples) {
+ int y = kernel_split_params.tile.y + pixel_index;
+ kernel_do_adaptive_filter_x(kg, y, &kernel_split_params.tile);
+ }
+}
+
+CCL_NAMESPACE_END
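kernel_adaptive_filter_x launches one thread per tile row once the minimum sample count is reached, and kernel_adaptive_filter_y does the same per column, so the two passes act as a separable neighborhood filter over the per-pixel convergence flags. The body of kernel_do_adaptive_filter_x is not part of this diff; purely as an assumption for illustration, a race-free row pass that keeps sampling any pixel adjacent to an unconverged one could look like:

    /* Hypothetical sketch, not the actual kernel_do_adaptive_filter_x.
     * Reading from an input row and writing to an output row avoids the
     * cascading reactivation an in-place update would cause. */
    void filter_row(const bool *converged_in, bool *converged_out, int width)
    {
      for (int x = 0; x < width; x++) {
        bool prev = (x > 0) ? converged_in[x - 1] : true;
        bool next = (x < width - 1) ? converged_in[x + 1] : true;
        /* A pixel stays converged only if both neighbors are too. */
        converged_out[x] = converged_in[x] && prev && next;
      }
    }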
diff --git a/intern/cycles/kernel/split/kernel_adaptive_filter_y.h b/intern/cycles/kernel/split/kernel_adaptive_filter_y.h
new file mode 100644
index 00000000000..eca53d079ec
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_adaptive_filter_y.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_adaptive_filter_y(KernelGlobals *kg)
+{
+ int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ if (pixel_index < kernel_split_params.tile.w &&
+ kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
+ kernel_data.integrator.adaptive_min_samples) {
+ int x = kernel_split_params.tile.x + pixel_index;
+ kernel_do_adaptive_filter_y(kg, x, &kernel_split_params.tile);
+ }
+}
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_adaptive_stopping.h b/intern/cycles/kernel/split/kernel_adaptive_stopping.h
new file mode 100644
index 00000000000..c8eb1ebd705
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_adaptive_stopping.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_adaptive_stopping(KernelGlobals *kg)
+{
+ int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ if (pixel_index < kernel_split_params.tile.w * kernel_split_params.tile.h &&
+ kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
+ kernel_data.integrator.adaptive_min_samples) {
+ int x = kernel_split_params.tile.x + pixel_index % kernel_split_params.tile.w;
+ int y = kernel_split_params.tile.y + pixel_index / kernel_split_params.tile.w;
+ int buffer_offset = (kernel_split_params.tile.offset + x +
+ y * kernel_split_params.tile.stride) *
+ kernel_data.film.pass_stride;
+ ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
+ kernel_do_adaptive_stopping(kg,
+ buffer,
+ kernel_split_params.tile.start_sample +
+ kernel_split_params.tile.num_samples - 1);
+ }
+}
+CCL_NAMESPACE_END
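All of the split kernels above compute the same buffer address before touching a pixel: render buffers are row-major, each pixel owns pass_stride floats, the row stride may exceed the tile width, and offset rebases (x, y) from image to buffer coordinates. The recurring expression, factored out for clarity (a sketch; no such helper exists in the sources):

    inline float *pixel_buffer(float *buffer, int offset, int x, int y,
                               int stride, int pass_stride)
    {
      return buffer + (offset + x + y * stride) * pass_stride;
    }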
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp
index 41e1b73fdac..19d3f00bf82 100644
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -260,6 +260,22 @@ bool RenderBuffers::get_pass_rect(
return false;
}
+ float *sample_count = NULL;
+ if (name == "Combined") {
+ int sample_offset = 0;
+ for (size_t j = 0; j < params.passes.size(); j++) {
+ Pass &pass = params.passes[j];
+ if (pass.type != PASS_SAMPLE_COUNT) {
+ sample_offset += pass.components;
+ continue;
+ }
+ else {
+ sample_count = buffer.data() + sample_offset;
+ break;
+ }
+ }
+ }
+
int pass_offset = 0;
for (size_t j = 0; j < params.passes.size(); j++) {
@@ -420,6 +436,11 @@ bool RenderBuffers::get_pass_rect(
}
else {
for (int i = 0; i < size; i++, in += pass_stride, pixels += 4) {
+ if (sample_count && sample_count[i * pass_stride] < 0.0f) {
+ scale = (pass.filter) ? -1.0f / (sample_count[i * pass_stride]) : 1.0f;
+ scale_exposure = (pass.exposure) ? scale * exposure : scale;
+ }
+
float4 f = make_float4(in[0], in[1], in[2], in[3]);
pixels[0] = f.x * scale_exposure;
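The buffers.cpp change lets the Combined pass be read before the rescaling post-pass has run, for example while a tile is still rendering: a negative PASS_SAMPLE_COUNT value means the pixel converged after |value| samples and its sums were never rescaled, so get_pass_rect derives a per-pixel scale on the fly instead of using the uniform one. A simplified sketch of that decision (the real code also distinguishes filtered passes via pass.filter):

    /* stored_count is the raw PASS_SAMPLE_COUNT value for one pixel. */
    float combined_scale(float stored_count, float uniform_scale)
    {
      if (stored_count < 0.0f) {
        return -1.0f / stored_count; /* e.g. -50.0f -> 0.02f */
      }
      return uniform_scale; /* pixel already holds fully scaled sums */
    }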
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index 172ea3dd31b..48d9c97e0fb 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -183,6 +183,13 @@ void Pass::add(PassType type, vector<Pass> &passes, const char *name)
case PASS_CRYPTOMATTE:
pass.components = 4;
break;
+ case PASS_ADAPTIVE_AUX_BUFFER:
+ pass.components = 4;
+ break;
+ case PASS_SAMPLE_COUNT:
+ pass.components = 1;
+ pass.exposure = false;
+ break;
case PASS_AOV_COLOR:
pass.components = 4;
break;
@@ -311,6 +318,7 @@ NODE_DEFINE(Film)
SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false);
SOCKET_BOOLEAN(denoising_prefiltered_pass, "Generate Denoising Prefiltered Pass", false);
SOCKET_INT(denoising_flags, "Denoising Flags", 0);
+ SOCKET_BOOLEAN(use_adaptive_sampling, "Use Adaptive Sampling", false);
return type;
}
@@ -482,6 +490,12 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
kfilm->pass_stride;
have_cryptomatte = true;
break;
+ case PASS_ADAPTIVE_AUX_BUFFER:
+ kfilm->pass_adaptive_aux_buffer = kfilm->pass_stride;
+ break;
+ case PASS_SAMPLE_COUNT:
+ kfilm->pass_sample_count = kfilm->pass_stride;
+ break;
case PASS_AOV_COLOR:
if (!have_aov_color) {
kfilm->pass_aov_color = kfilm->pass_stride;
diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h
index 95e54cb54d8..aae8fb404b0 100644
--- a/intern/cycles/render/film.h
+++ b/intern/cycles/render/film.h
@@ -81,6 +81,8 @@ class Film : public Node {
CryptomatteType cryptomatte_passes;
int cryptomatte_depth;
+ bool use_adaptive_sampling;
+
bool need_update;
Film();
diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp
index f289e11fe14..ee1aa5988bf 100644
--- a/intern/cycles/render/integrator.cpp
+++ b/intern/cycles/render/integrator.cpp
@@ -27,6 +27,7 @@
#include "kernel/kernel_types.h"
#include "util/util_foreach.h"
+#include "util/util_logging.h"
#include "util/util_hash.h"
CCL_NAMESPACE_BEGIN
@@ -69,6 +70,9 @@ NODE_DEFINE(Integrator)
SOCKET_INT(volume_samples, "Volume Samples", 1);
SOCKET_INT(start_sample, "Start Sample", 0);
+ SOCKET_FLOAT(adaptive_threshold, "Adaptive Threshold", 0.0f);
+ SOCKET_INT(adaptive_min_samples, "Adaptive Min Samples", 0);
+
SOCKET_BOOLEAN(sample_all_lights_direct, "Sample All Lights Direct", true);
SOCKET_BOOLEAN(sample_all_lights_indirect, "Sample All Lights Indirect", true);
SOCKET_FLOAT(light_sampling_threshold, "Light Sampling Threshold", 0.05f);
@@ -178,6 +182,22 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
kintegrator->sampling_pattern = sampling_pattern;
kintegrator->aa_samples = aa_samples;
+ if (aa_samples > 0 && adaptive_min_samples == 0) {
+ kintegrator->adaptive_min_samples = max(4, (int)sqrtf(aa_samples));
+ VLOG(1) << "Cycles adaptive sampling: automatic min samples = "
+ << kintegrator->adaptive_min_samples;
+ }
+ else {
+ kintegrator->adaptive_min_samples = max(4, adaptive_min_samples);
+ }
+ if (aa_samples > 0 && adaptive_threshold == 0.0f) {
+ kintegrator->adaptive_threshold = max(0.001f, 1.0f / (float)aa_samples);
+ VLOG(1) << "Cycles adaptive sampling: automatic threshold = "
+ << kintegrator->adaptive_threshold;
+ }
+ else {
+ kintegrator->adaptive_threshold = adaptive_threshold;
+ }
if (light_sampling_threshold > 0.0f) {
kintegrator->light_inv_rr_threshold = 1.0f / light_sampling_threshold;
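When the user leaves either setting at 0, Integrator::device_update derives defaults from the AA sample count: the minimum sample count becomes max(4, sqrt(aa_samples)) and the threshold becomes max(0.001, 1 / aa_samples). With 256 AA samples, for example, that yields a minimum of 16 samples per pixel and a threshold of about 0.0039. The same rules, restated standalone:

    #include <algorithm>
    #include <cmath>

    int default_adaptive_min_samples(int aa_samples)
    {
      return std::max(4, (int)std::sqrt((float)aa_samples)); /* 256 -> 16 */
    }

    float default_adaptive_threshold(int aa_samples)
    {
      return std::max(0.001f, 1.0f / (float)aa_samples); /* 256 -> ~0.0039 */
    }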
diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h
index 32d84c27072..9930e907aea 100644
--- a/intern/cycles/render/integrator.h
+++ b/intern/cycles/render/integrator.h
@@ -75,6 +75,9 @@ class Integrator : public Node {
bool sample_all_lights_indirect;
float light_sampling_threshold;
+ int adaptive_min_samples;
+ float adaptive_threshold;
+
enum Method {
BRANCHED_PATH = 0,
PATH = 1,
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index 4231403e39a..6bf2160f9fa 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -1103,6 +1103,10 @@ void Session::render(bool with_denoising)
task.need_finish_queue = params.progressive_refine;
task.integrator_branched = scene->integrator->method == Integrator::BRANCHED_PATH;
+ task.adaptive_sampling.use = (scene->integrator->sampling_pattern == SAMPLING_PATTERN_PMJ) &&
+ scene->dscene.data.film.pass_adaptive_aux_buffer;
+ task.adaptive_sampling.min_samples = scene->dscene.data.integrator.adaptive_min_samples;
+
/* Acquire render tiles by default. */
task.tile_types = RenderTile::PATH_TRACE;
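Note the gating in Session::render: adaptive sampling only engages when the integrator uses the progressive multi-jitter (PMJ) sampling pattern and the film actually allocated the adaptive aux pass. Restated as a predicate (a sketch, not an existing function; the enum here is illustrative):

    enum SamplingPattern { SAMPLING_PATTERN_SOBOL, SAMPLING_PATTERN_CMJ, SAMPLING_PATTERN_PMJ };

    bool adaptive_sampling_active(SamplingPattern pattern, int pass_adaptive_aux_buffer)
    {
      return pattern == SAMPLING_PATTERN_PMJ && pass_adaptive_aux_buffer != 0;
    }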
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index 7f3614ccb19..8053e46b12e 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -56,6 +56,7 @@ class SessionParams {
int denoising_start_sample;
int pixel_size;
int threads;
+ bool adaptive_sampling;
bool use_profiling;
@@ -89,6 +90,7 @@ class SessionParams {
denoising_start_sample = 0;
pixel_size = 1;
threads = 0;
+ adaptive_sampling = false;
use_profiling = false;
@@ -117,6 +119,7 @@ class SessionParams {
progressive == params.progressive && experimental == params.experimental &&
tile_size == params.tile_size && start_resolution == params.start_resolution &&
pixel_size == params.pixel_size && threads == params.threads &&
+ adaptive_sampling == params.adaptive_sampling &&
use_profiling == params.use_profiling &&
display_buffer_linear == params.display_buffer_linear &&
cancel_timeout == params.cancel_timeout && reset_timeout == params.reset_timeout &&
diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h
index a8ea1dc925e..13d177d2b25 100644
--- a/intern/cycles/util/util_atomic.h
+++ b/intern/cycles/util/util_atomic.h
@@ -77,6 +77,7 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float
# define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x))
# define atomic_fetch_and_inc_uint32(p) atomic_inc((p))
# define atomic_fetch_and_dec_uint32(p) atomic_dec((p))
+# define atomic_fetch_and_or_uint32(p, x) atomic_or((p), (x))
# define CCL_LOCAL_MEM_FENCE CLK_LOCAL_MEM_FENCE
# define ccl_barrier(flags) barrier(flags)
@@ -91,6 +92,7 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float
# define atomic_fetch_and_sub_uint32(p, x) atomicSub((unsigned int *)(p), (unsigned int)(x))
# define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
# define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_sub_uint32((p), 1)
+# define atomic_fetch_and_or_uint32(p, x) atomicOr((unsigned int *)(p), (unsigned int)(x))
ccl_device_inline float atomic_compare_and_swap_float(volatile float *dest,
const float old_val,
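The new atomic_fetch_and_or_uint32 macro gives kernel code a portable atomic OR, mapped to atomic_or on OpenCL and atomicOr on CUDA as the two hunks show. OR-ing a flag bit into a word shared by many threads must be a single read-modify-write, or concurrent writers can silently drop each other's bits. A CPU-side analogue for illustration:

    #include <atomic>
    #include <cstdint>

    void set_flag_bit(std::atomic<uint32_t> &flags, uint32_t bit)
    {
      flags.fetch_or(bit); /* atomic read-modify-write, no lost updates */
    }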
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index 48e9983ac8f..8a1890ad319 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -101,6 +101,11 @@ ccl_device_inline size_t round_down(size_t x, size_t multiple)
return (x / multiple) * multiple;
}
+ccl_device_inline bool is_power_of_two(size_t x)
+{
+ return (x & (x - 1)) == 0;
+}
+
CCL_NAMESPACE_END
/* Vectorized types declaration. */
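One note on the new helper: x & (x - 1) clears the lowest set bit, so is_power_of_two returns true exactly when at most one bit is set, and that includes x == 0. Callers therefore have to tolerate zero passing the test. A quick standalone check of the edge cases:

    #include <cassert>
    #include <cstddef>

    /* Restatement of the helper above with the x == 0 case made explicit. */
    static bool is_pow2(size_t x) { return (x & (x - 1)) == 0; }

    int main()
    {
      assert(is_pow2(1) && is_pow2(64)); /* single bit set */
      assert(!is_pow2(6));               /* 0b0110 & 0b0101 == 0b0100 */
      assert(is_pow2(0));                /* zero passes too */
      return 0;
    }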