45 files changed, 1215 insertions(+), 46 deletions(-)
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py index 7917edf8c88..afa573c8dc5 100644 --- a/intern/cycles/blender/addon/engine.py +++ b/intern/cycles/blender/addon/engine.py @@ -255,6 +255,7 @@ def list_render_passes(srl): if crl.pass_debug_bvh_traversed_instances: yield ("Debug BVH Traversed Instances", "X", 'VALUE') if crl.pass_debug_bvh_intersections: yield ("Debug BVH Intersections", "X", 'VALUE') if crl.pass_debug_ray_bounces: yield ("Debug Ray Bounces", "X", 'VALUE') + if crl.pass_debug_sample_count: yield ("Debug Sample Count", "X", 'VALUE') if crl.use_pass_volume_direct: yield ("VolumeDir", "RGB", 'COLOR') if crl.use_pass_volume_indirect: yield ("VolumeInd", "RGB", 'COLOR') diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 77dc29e11e8..f2c40f509e8 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -350,6 +350,24 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): default=0.01, ) + use_adaptive_sampling: BoolProperty( + name="Use adaptive sampling", + description="Automatically determine the number of samples per pixel based on a variance estimation", + default=False, + ) + adaptive_threshold: FloatProperty( + name="Adaptive Sampling Threshold", + description="Zero for automatic setting based on AA samples", + min=0.0, max=1.0, + default=0.0, + ) + adaptive_min_samples: IntProperty( + name="Adaptive Min Samples", + description="Minimum AA samples for adaptive sampling. Zero for automatic setting based on AA samples", + min=0, max=4096, + default=0, + ) + min_light_bounces: IntProperty( name="Min Light Bounces", description="Minimum number of light bounces. Setting this higher reduces noise in the first bounces, " @@ -1298,7 +1316,12 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup): default=False, update=update_render_passes, ) - + pass_debug_sample_count: BoolProperty( + name="Debug Sample Count", + description="Number of samples/camera rays per pixel", + default=False, + update=update_render_passes, + ) use_pass_volume_direct: BoolProperty( name="Volume Direct", description="Deliver direct volumetric scattering pass", diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index ed9e3a4c9cf..d04418fc957 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -190,6 +190,7 @@ class CYCLES_RENDER_PT_sampling(CyclesButtonsPanel, Panel): col.prop(cscene, "aa_samples", text="Render") col.prop(cscene, "preview_aa_samples", text="Viewport") + col.prop(cscene, "use_adaptive_sampling", text="Adaptive Sampling") # Viewport denoising is currently only supported with OptiX if show_optix_denoising(context): col = layout.column() @@ -247,7 +248,13 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel): row.prop(cscene, "seed") row.prop(cscene, "use_animated_seed", text="", icon='TIME') - layout.prop(cscene, "sampling_pattern", text="Pattern") + col = layout.column(align=True) + col.active = not(cscene.use_adaptive_sampling) + col.prop(cscene, "sampling_pattern", text="Pattern") + col = layout.column(align=True) + col.active = cscene.use_adaptive_sampling + col.prop(cscene, "adaptive_min_samples", text="Adaptive Min Samples") + col.prop(cscene, "adaptive_threshold", text="Adaptive Threshold") layout.prop(cscene, "use_square_samples") @@ -813,6 +820,8 @@ class CYCLES_RENDER_PT_passes_data(CyclesButtonsPanel, Panel): col.prop(cycles_view_layer, 
"denoising_store_passes", text="Denoising Data") col = flow.column() col.prop(cycles_view_layer, "pass_debug_render_time", text="Render Time") + col = flow.column() + col.prop(cycles_view_layer, "pass_debug_sample_count", text="Sample Count") layout.separator() diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp index 5cfb1200c7c..ac307743e48 100644 --- a/intern/cycles/blender/blender_session.cpp +++ b/intern/cycles/blender/blender_session.cpp @@ -470,7 +470,8 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_) b_rlay_name = b_view_layer.name(); /* add passes */ - vector<Pass> passes = sync->sync_render_passes(b_rlay, b_view_layer); + vector<Pass> passes = sync->sync_render_passes( + b_rlay, b_view_layer, session_params.adaptive_sampling); buffer_params.passes = passes; PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles"); diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index 50442c6ebdc..8f00f9ccda0 100644 --- a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -296,6 +296,16 @@ void BlenderSync::sync_integrator() integrator->sample_all_lights_indirect = get_boolean(cscene, "sample_all_lights_indirect"); integrator->light_sampling_threshold = get_float(cscene, "light_sampling_threshold"); + if (RNA_boolean_get(&cscene, "use_adaptive_sampling")) { + integrator->sampling_pattern = SAMPLING_PATTERN_PMJ; + integrator->adaptive_min_samples = get_int(cscene, "adaptive_min_samples"); + integrator->adaptive_threshold = get_float(cscene, "adaptive_threshold"); + } + else { + integrator->adaptive_min_samples = INT_MAX; + integrator->adaptive_threshold = 0.0f; + } + int diffuse_samples = get_int(cscene, "diffuse_samples"); int glossy_samples = get_int(cscene, "glossy_samples"); int transmission_samples = get_int(cscene, "transmission_samples"); @@ -312,6 +322,8 @@ void BlenderSync::sync_integrator() integrator->mesh_light_samples = mesh_light_samples * mesh_light_samples; integrator->subsurface_samples = subsurface_samples * subsurface_samples; integrator->volume_samples = volume_samples * volume_samples; + integrator->adaptive_min_samples = min( + integrator->adaptive_min_samples * integrator->adaptive_min_samples, INT_MAX); } else { integrator->diffuse_samples = diffuse_samples; @@ -484,6 +496,8 @@ PassType BlenderSync::get_pass_type(BL::RenderPass &b_pass) MAP_PASS("Debug Ray Bounces", PASS_RAY_BOUNCES); #endif MAP_PASS("Debug Render Time", PASS_RENDER_TIME); + MAP_PASS("AdaptiveAuxBuffer", PASS_ADAPTIVE_AUX_BUFFER); + MAP_PASS("Debug Sample Count", PASS_SAMPLE_COUNT); if (string_startswith(name, cryptomatte_prefix)) { return PASS_CRYPTOMATTE; } @@ -519,7 +533,9 @@ int BlenderSync::get_denoising_pass(BL::RenderPass &b_pass) return -1; } -vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_view_layer) +vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, + BL::ViewLayer &b_view_layer, + bool adaptive_sampling) { vector<Pass> passes; @@ -595,6 +611,10 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLa b_engine.add_pass("Debug Render Time", 1, "X", b_view_layer.name().c_str()); Pass::add(PASS_RENDER_TIME, passes, "Debug Render Time"); } + if (get_boolean(crp, "pass_debug_sample_count")) { + b_engine.add_pass("Debug Sample Count", 1, "X", b_view_layer.name().c_str()); + Pass::add(PASS_SAMPLE_COUNT, passes, "Debug Sample Count"); + } if (get_boolean(crp, 
"use_pass_volume_direct")) { b_engine.add_pass("VolumeDir", 3, "RGB", b_view_layer.name().c_str()); Pass::add(PASS_VOLUME_DIRECT, passes, "VolumeDir"); @@ -641,6 +661,13 @@ vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLa CRYPT_ACCURATE); } + if (adaptive_sampling) { + Pass::add(PASS_ADAPTIVE_AUX_BUFFER, passes); + if (!get_boolean(crp, "pass_debug_sample_count")) { + Pass::add(PASS_SAMPLE_COUNT, passes); + } + } + RNA_BEGIN (&crp, b_aov, "aovs") { bool is_color = (get_enum(b_aov, "type") == 1); string name = get_string(b_aov, "name"); @@ -880,6 +907,8 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine, params.use_profiling = params.device.has_profiling && !b_engine.is_preview() && background && BlenderSession::print_render_stats; + params.adaptive_sampling = RNA_boolean_get(&cscene, "use_adaptive_sampling"); + return params; } diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h index 219a3615835..0a7174e407b 100644 --- a/intern/cycles/blender/blender_sync.h +++ b/intern/cycles/blender/blender_sync.h @@ -71,7 +71,9 @@ class BlenderSync { int height, void **python_thread_state); void sync_view_layer(BL::SpaceView3D &b_v3d, BL::ViewLayer &b_view_layer); - vector<Pass> sync_render_passes(BL::RenderLayer &b_render_layer, BL::ViewLayer &b_view_layer); + vector<Pass> sync_render_passes(BL::RenderLayer &b_render_layer, + BL::ViewLayer &b_view_layer, + bool adaptive_sampling); void sync_integrator(); void sync_camera(BL::RenderSettings &b_render, BL::Object &b_override, diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h index a825bd6b128..6a0b39434aa 100644 --- a/intern/cycles/device/cuda/device_cuda.h +++ b/intern/cycles/device/cuda/device_cuda.h @@ -82,6 +82,17 @@ class CUDADevice : public Device { device_vector<TextureInfo> texture_info; bool need_texture_info; + /* Kernels */ + struct { + bool loaded; + + CUfunction adaptive_stopping; + CUfunction adaptive_filter_x; + CUfunction adaptive_filter_y; + CUfunction adaptive_scale_samples; + int adaptive_num_threads_per_block; + } functions; + static bool have_precompiled_kernels(); virtual bool show_samples() const; @@ -114,6 +125,8 @@ class CUDADevice : public Device { virtual bool load_kernels(const DeviceRequestedFeatures &requested_features); + void load_functions(); + void reserve_local_memory(const DeviceRequestedFeatures &requested_features); void init_host_memory(); @@ -197,6 +210,15 @@ class CUDADevice : public Device { void denoise(RenderTile &rtile, DenoisingTask &denoising); + void adaptive_sampling_filter(uint filter_sample, + WorkTile *wtile, + CUdeviceptr d_wtile, + CUstream stream = 0); + void adaptive_sampling_post(RenderTile &rtile, + WorkTile *wtile, + CUdeviceptr d_wtile, + CUstream stream = 0); + void path_trace(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles); void film_convert(DeviceTask &task, diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp index 4a7c45d8b93..11dd9b69f10 100644 --- a/intern/cycles/device/cuda/device_cuda_impl.cpp +++ b/intern/cycles/device/cuda/device_cuda_impl.cpp @@ -208,6 +208,8 @@ CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool map_host_used = 0; can_map_host = 0; + functions.loaded = false; + /* Intialize CUDA. 
*/ if (cuda_error(cuInit(0))) return; @@ -531,9 +533,42 @@ bool CUDADevice::load_kernels(const DeviceRequestedFeatures &requested_features) reserve_local_memory(requested_features); } + load_functions(); + return (result == CUDA_SUCCESS); } +void CUDADevice::load_functions() +{ + /* TODO: load all functions here. */ + if (functions.loaded) { + return; + } + functions.loaded = true; + + cuda_assert(cuModuleGetFunction( + &functions.adaptive_stopping, cuModule, "kernel_cuda_adaptive_stopping")); + cuda_assert(cuModuleGetFunction( + &functions.adaptive_filter_x, cuModule, "kernel_cuda_adaptive_filter_x")); + cuda_assert(cuModuleGetFunction( + &functions.adaptive_filter_y, cuModule, "kernel_cuda_adaptive_filter_y")); + cuda_assert(cuModuleGetFunction( + &functions.adaptive_scale_samples, cuModule, "kernel_cuda_adaptive_scale_samples")); + + cuda_assert(cuFuncSetCacheConfig(functions.adaptive_stopping, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_x, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_y, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(functions.adaptive_scale_samples, CU_FUNC_CACHE_PREFER_L1)); + + int unused_min_blocks; + cuda_assert(cuOccupancyMaxPotentialBlockSize(&unused_min_blocks, + &functions.adaptive_num_threads_per_block, + functions.adaptive_scale_samples, + NULL, + 0, + 0)); +} + void CUDADevice::reserve_local_memory(const DeviceRequestedFeatures &requested_features) { if (use_split_kernel()) { @@ -1666,6 +1701,80 @@ void CUDADevice::denoise(RenderTile &rtile, DenoisingTask &denoising) denoising.run_denoising(&rtile); } +void CUDADevice::adaptive_sampling_filter(uint filter_sample, + WorkTile *wtile, + CUdeviceptr d_wtile, + CUstream stream) +{ + const int num_threads_per_block = functions.adaptive_num_threads_per_block; + + /* These are a series of tiny kernels because there is no grid synchronisation + * from within a kernel, so multiple kernel launches it is.*/ + uint total_work_size = wtile->h * wtile->w; + void *args2[] = {&d_wtile, &filter_sample, &total_work_size}; + uint num_blocks = divide_up(total_work_size, num_threads_per_block); + cuda_assert(cuLaunchKernel(functions.adaptive_stopping, + num_blocks, + 1, + 1, + num_threads_per_block, + 1, + 1, + 0, + stream, + args2, + 0)); + total_work_size = wtile->h; + num_blocks = divide_up(total_work_size, num_threads_per_block); + cuda_assert(cuLaunchKernel(functions.adaptive_filter_x, + num_blocks, + 1, + 1, + num_threads_per_block, + 1, + 1, + 0, + stream, + args2, + 0)); + total_work_size = wtile->w; + num_blocks = divide_up(total_work_size, num_threads_per_block); + cuda_assert(cuLaunchKernel(functions.adaptive_filter_y, + num_blocks, + 1, + 1, + num_threads_per_block, + 1, + 1, + 0, + stream, + args2, + 0)); +} + +void CUDADevice::adaptive_sampling_post(RenderTile &rtile, + WorkTile *wtile, + CUdeviceptr d_wtile, + CUstream stream) +{ + const int num_threads_per_block = functions.adaptive_num_threads_per_block; + uint total_work_size = wtile->h * wtile->w; + + void *args[] = {&d_wtile, &rtile.start_sample, &rtile.sample, &total_work_size}; + uint num_blocks = divide_up(total_work_size, num_threads_per_block); + cuda_assert(cuLaunchKernel(functions.adaptive_scale_samples, + num_blocks, + 1, + 1, + num_threads_per_block, + 1, + 1, + 0, + stream, + args, + 0)); +} + void CUDADevice::path_trace(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles) @@ -1715,6 +1824,9 @@ void 
CUDADevice::path_trace(DeviceTask &task, } uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); + if (task.adaptive_sampling.use) { + step_samples = task.adaptive_sampling.align_static_samples(step_samples); + } /* Render all samples. */ int start_sample = rtile.start_sample; @@ -1736,6 +1848,12 @@ void CUDADevice::path_trace(DeviceTask &task, cuda_assert( cuLaunchKernel(cuPathTrace, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); + /* Run the adaptive sampling kernels at selected samples aligned to step samples. */ + uint filter_sample = sample + wtile->num_samples - 1; + if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { + adaptive_sampling_filter(filter_sample, wtile, d_work_tiles); + } + cuda_assert(cuCtxSynchronize()); /* Update progress. */ @@ -1747,6 +1865,14 @@ void CUDADevice::path_trace(DeviceTask &task, break; } } + + /* Finalize adaptive sampling. */ + if (task.adaptive_sampling.use) { + CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer; + adaptive_sampling_post(rtile, wtile, d_work_tiles); + cuda_assert(cuCtxSynchronize()); + task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); + } } void CUDADevice::film_convert(DeviceTask &task, diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 1c9d2227ac3..cf239ec39e2 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -34,6 +34,7 @@ #include "kernel/kernel_types.h" #include "kernel/split/kernel_split_data.h" #include "kernel/kernel_globals.h" +#include "kernel/kernel_adaptive_sampling.h" #include "kernel/filter/filter.h" @@ -317,6 +318,10 @@ class CPUDevice : public Device { REGISTER_SPLIT_KERNEL(next_iteration_setup); REGISTER_SPLIT_KERNEL(indirect_subsurface); REGISTER_SPLIT_KERNEL(buffer_update); + REGISTER_SPLIT_KERNEL(adaptive_stopping); + REGISTER_SPLIT_KERNEL(adaptive_filter_x); + REGISTER_SPLIT_KERNEL(adaptive_filter_y); + REGISTER_SPLIT_KERNEL(adaptive_adjust_samples); #undef REGISTER_SPLIT_KERNEL #undef KERNEL_FUNCTIONS } @@ -823,6 +828,50 @@ class CPUDevice : public Device { return true; } + bool adaptive_sampling_filter(KernelGlobals *kg, RenderTile &tile, int sample) + { + WorkTile wtile; + wtile.x = tile.x; + wtile.y = tile.y; + wtile.w = tile.w; + wtile.h = tile.h; + wtile.offset = tile.offset; + wtile.stride = tile.stride; + wtile.buffer = (float *)tile.buffer; + + bool any = false; + for (int y = tile.y; y < tile.y + tile.h; ++y) { + any |= kernel_do_adaptive_filter_x(kg, y, &wtile); + } + for (int x = tile.x; x < tile.x + tile.w; ++x) { + any |= kernel_do_adaptive_filter_y(kg, x, &wtile); + } + + return (!any); + } + + void adaptive_sampling_post(const DeviceTask &task, const RenderTile &tile, KernelGlobals *kg) + { + float *render_buffer = (float *)tile.buffer; + for (int y = tile.y; y < tile.y + tile.h; y++) { + for (int x = tile.x; x < tile.x + tile.w; x++) { + int index = tile.offset + x + y * tile.stride; + ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride; + if (buffer[kernel_data.film.pass_sample_count] < 0.0f) { + buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count]; + float sample_multiplier = tile.sample / max((float)tile.start_sample + 1.0f, + buffer[kernel_data.film.pass_sample_count]); + if (sample_multiplier != 1.0f) { + kernel_adaptive_post_adjust(kg, buffer, sample_multiplier); + } + } + else { + kernel_adaptive_post_adjust(kg, buffer, 
tile.sample / (tile.sample - 1.0f)); + } + } + } + } + void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg) { const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE; @@ -855,14 +904,25 @@ class CPUDevice : public Device { path_trace_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride); } } - tile.sample = sample + 1; task.update_progress(&tile, tile.w * tile.h); + + if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) { + const bool stop = adaptive_sampling_filter(kg, tile, sample); + if (stop) { + tile.sample = end_sample; + break; + } + } } if (use_coverage) { coverage.finalize(); } + + if (task.adaptive_sampling.use) { + adaptive_sampling_post(task, tile, kg); + } } void denoise(DenoisingTask &denoising, RenderTile &tile) diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp index 61a5c74f69e..30d624fdf7c 100644 --- a/intern/cycles/device/device_optix.cpp +++ b/intern/cycles/device/device_optix.cpp @@ -627,7 +627,11 @@ class OptiXDevice : public CUDADevice { const int end_sample = rtile.start_sample + rtile.num_samples; // Keep this number reasonable to avoid running into TDRs - const int step_samples = (info.display_device ? 8 : 32); + int step_samples = (info.display_device ? 8 : 32); + if (task.adaptive_sampling.use) { + step_samples = task.adaptive_sampling.align_static_samples(step_samples); + } + // Offset into launch params buffer so that streams use separate data device_ptr launch_params_ptr = launch_params.device_pointer + thread_index * launch_params.data_elements; @@ -638,10 +642,9 @@ class OptiXDevice : public CUDADevice { // Copy work tile information to device wtile.num_samples = min(step_samples, end_sample - sample); wtile.start_sample = sample; - check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, tile), - &wtile, - sizeof(wtile), - cuda_stream[thread_index])); + device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile); + check_result_cuda( + cuMemcpyHtoDAsync(d_wtile_ptr, &wtile, sizeof(wtile), cuda_stream[thread_index])); OptixShaderBindingTable sbt_params = {}; sbt_params.raygenRecord = sbt_data.device_pointer + PG_RGEN * sizeof(SbtRecord); @@ -666,6 +669,12 @@ class OptiXDevice : public CUDADevice { wtile.h, 1)); + // Run the adaptive sampling kernels at selected samples aligned to step samples. 
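// (Illustrative: ADAPTIVE_SAMPLE_STEP is 4 in kernel_types.h, and need_filter(s)
// holds when s > adaptive_min_samples and (s & 3) == 3, so the filter runs after
// every fourth sample once the minimum is reached; the align_static_samples()
// call above keeps step_samples a multiple or divisor of 4 so that a launch
// never straddles one of those filter boundaries.)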
+ uint filter_sample = wtile.start_sample + wtile.num_samples - 1; + if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { + adaptive_sampling_filter(filter_sample, &wtile, d_wtile_ptr, cuda_stream[thread_index]); + } + // Wait for launch to finish check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); @@ -677,6 +686,14 @@ class OptiXDevice : public CUDADevice { if (task.get_cancel() && !task.need_finish_queue) return; // Cancel rendering } + + // Finalize adaptive sampling + if (task.adaptive_sampling.use) { + device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile); + adaptive_sampling_post(rtile, &wtile, d_wtile_ptr, cuda_stream[thread_index]); + check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); + task.update_progress(&rtile, rtile.w * rtile.h * wtile.num_samples); + } } bool launch_denoise(DeviceTask &task, RenderTile &rtile) diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp index 42e597a34d7..f22d8761058 100644 --- a/intern/cycles/device/device_split_kernel.cpp +++ b/intern/cycles/device/device_split_kernel.cpp @@ -55,6 +55,10 @@ DeviceSplitKernel::DeviceSplitKernel(Device *device) kernel_next_iteration_setup = NULL; kernel_indirect_subsurface = NULL; kernel_buffer_update = NULL; + kernel_adaptive_stopping = NULL; + kernel_adaptive_filter_x = NULL; + kernel_adaptive_filter_y = NULL; + kernel_adaptive_adjust_samples = NULL; } DeviceSplitKernel::~DeviceSplitKernel() @@ -83,6 +87,10 @@ DeviceSplitKernel::~DeviceSplitKernel() delete kernel_next_iteration_setup; delete kernel_indirect_subsurface; delete kernel_buffer_update; + delete kernel_adaptive_stopping; + delete kernel_adaptive_filter_x; + delete kernel_adaptive_filter_y; + delete kernel_adaptive_adjust_samples; } bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures &requested_features) @@ -114,6 +122,10 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures &requested_fe LOAD_KERNEL(next_iteration_setup); LOAD_KERNEL(indirect_subsurface); LOAD_KERNEL(buffer_update); + LOAD_KERNEL(adaptive_stopping); + LOAD_KERNEL(adaptive_filter_x); + LOAD_KERNEL(adaptive_filter_y); + LOAD_KERNEL(adaptive_adjust_samples); #undef LOAD_KERNEL @@ -202,13 +214,21 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, /* initial guess to start rolling average */ const int initial_num_samples = 1; /* approx number of samples per second */ - int samples_per_second = (avg_time_per_sample > 0.0) ? - int(double(time_multiplier) / avg_time_per_sample) + 1 : - initial_num_samples; + const int samples_per_second = (avg_time_per_sample > 0.0) ? + int(double(time_multiplier) / avg_time_per_sample) + 1 : + initial_num_samples; RenderTile subtile = tile; subtile.start_sample = tile.sample; - subtile.num_samples = min(samples_per_second, + subtile.num_samples = samples_per_second; + + if (task->adaptive_sampling.use) { + subtile.num_samples = task->adaptive_sampling.align_dynamic_samples(subtile.start_sample, + subtile.num_samples); + } + + /* Don't go beyond requested number of samples. 
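* The samples-per-second estimate above may exceed what is left in the tile,
* so clamp to the tile's requested sample range here.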
*/ + subtile.num_samples = min(subtile.num_samples, tile.start_sample + tile.num_samples - tile.sample); if (device->have_error()) { @@ -302,6 +322,23 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, } } + int filter_sample = tile.sample + subtile.num_samples - 1; + if (task->adaptive_sampling.use && task->adaptive_sampling.need_filter(filter_sample)) { + size_t buffer_size[2]; + buffer_size[0] = round_up(tile.w, local_size[0]); + buffer_size[1] = round_up(tile.h, local_size[1]); + kernel_adaptive_stopping->enqueue( + KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data); + buffer_size[0] = round_up(tile.h, local_size[0]); + buffer_size[1] = round_up(1, local_size[1]); + kernel_adaptive_filter_x->enqueue( + KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data); + buffer_size[0] = round_up(tile.w, local_size[0]); + buffer_size[1] = round_up(1, local_size[1]); + kernel_adaptive_filter_y->enqueue( + KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data); + } + double time_per_sample = ((time_dt() - start_time) / subtile.num_samples); if (avg_time_per_sample == 0.0) { @@ -324,6 +361,28 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, } } + if (task->adaptive_sampling.use) { + /* Reset the start samples. */ + RenderTile subtile = tile; + subtile.start_sample = tile.start_sample; + subtile.num_samples = tile.sample - tile.start_sample; + enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size), + subtile, + num_global_elements, + kgbuffer, + kernel_data, + split_data, + ray_state, + queue_index, + use_queues_flag, + work_pool_wgs); + size_t buffer_size[2]; + buffer_size[0] = round_up(tile.w, local_size[0]); + buffer_size[1] = round_up(tile.h, local_size[1]); + kernel_adaptive_adjust_samples->enqueue( + KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data); + } + return true; } diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h index 6ff326bf214..9d6b9efdd62 100644 --- a/intern/cycles/device/device_split_kernel.h +++ b/intern/cycles/device/device_split_kernel.h @@ -75,6 +75,10 @@ class DeviceSplitKernel { SplitKernelFunction *kernel_next_iteration_setup; SplitKernelFunction *kernel_indirect_subsurface; SplitKernelFunction *kernel_buffer_update; + SplitKernelFunction *kernel_adaptive_stopping; + SplitKernelFunction *kernel_adaptive_filter_x; + SplitKernelFunction *kernel_adaptive_filter_y; + SplitKernelFunction *kernel_adaptive_adjust_samples; /* Global memory variables [porting]; These memory is used for * co-operation between different kernels; Data written by one diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp index 36522b874ab..c36b1344c3b 100644 --- a/intern/cycles/device/device_task.cpp +++ b/intern/cycles/device/device_task.cpp @@ -136,4 +136,59 @@ void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples) } } +/* Adaptive Sampling */ + +AdaptiveSampling::AdaptiveSampling() + : use(true), adaptive_step(ADAPTIVE_SAMPLE_STEP), min_samples(0) +{ +} + +/* Render samples in steps that align with the adaptive filtering. */ +int AdaptiveSampling::align_static_samples(int samples) const +{ + if (samples > adaptive_step) { + /* Make multiple of adaptive_step. */ + while (samples % adaptive_step != 0) { + samples--; + } + } + else if (samples < adaptive_step) { + /* Make divisor of adaptive_step. 
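* (e.g. with adaptive_step == 4, a request of 3 samples becomes 2, so that
* consecutive step boundaries still coincide with filter samples.)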
*/ + while (adaptive_step % samples != 0) { + samples--; + } + } + + return max(samples, 1); +} + +/* Render samples in steps that align with the adaptive filtering, with the + * suggested number of samples dynamically changing. */ +int AdaptiveSampling::align_dynamic_samples(int offset, int samples) const +{ + /* Round so that we end up on multiples of adaptive_samples. */ + samples += offset; + + if (samples > adaptive_step) { + /* Make multiple of adaptive_step. */ + while (samples % adaptive_step != 0) { + samples--; + } + } + + samples -= offset; + + return max(samples, 1); +} + +bool AdaptiveSampling::need_filter(int sample) const +{ + if (sample > min_samples) { + return (sample & (adaptive_step - 1)) == (adaptive_step - 1); + } + else { + return false; + } +} + CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index 972f6131092..8c4e682adb1 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -62,6 +62,19 @@ class DenoiseParams { } }; +class AdaptiveSampling { + public: + AdaptiveSampling(); + + int align_static_samples(int samples) const; + int align_dynamic_samples(int offset, int samples) const; + bool need_filter(int sample) const; + + bool use; + int adaptive_step; + int min_samples; +}; + class DeviceTask : public Task { public: typedef enum { RENDER, FILM_CONVERT, SHADER, DENOISE_BUFFER } Type; @@ -115,6 +128,7 @@ class DeviceTask : public Task { bool need_finish_queue; bool integrator_branched; + AdaptiveSampling adaptive_sampling; protected: double last_update_time; diff --git a/intern/cycles/device/opencl/device_opencl.h b/intern/cycles/device/opencl/device_opencl.h index 61b1e3e3b6b..b761726b1ad 100644 --- a/intern/cycles/device/opencl/device_opencl.h +++ b/intern/cycles/device/opencl/device_opencl.h @@ -445,6 +445,7 @@ class OpenCLDevice : public Device { device_ptr rgba_byte, device_ptr rgba_half); void shader(DeviceTask &task); + void update_adaptive(DeviceTask &task, RenderTile &tile, int sample); void denoise(RenderTile &tile, DenoisingTask &denoising); diff --git a/intern/cycles/device/opencl/device_opencl_impl.cpp b/intern/cycles/device/opencl/device_opencl_impl.cpp index 68cdfd5238c..3dbe54b38aa 100644 --- a/intern/cycles/device/opencl/device_opencl_impl.cpp +++ b/intern/cycles/device/opencl/device_opencl_impl.cpp @@ -56,7 +56,11 @@ static const string SPLIT_BUNDLE_KERNELS = "enqueue_inactive " "next_iteration_setup " "indirect_subsurface " - "buffer_update"; + "buffer_update " + "adaptive_stopping " + "adaptive_filter_x " + "adaptive_filter_y " + "adaptive_adjust_samples"; const string OpenCLDevice::get_opencl_program_name(const string &kernel_name) { @@ -283,6 +287,10 @@ void OpenCLDevice::OpenCLSplitPrograms::load_kernels( ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(next_iteration_setup); ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(indirect_subsurface); ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(buffer_update); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_stopping); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_x); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_y); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_adjust_samples); programs.push_back(&program_split); # undef ADD_SPLIT_KERNEL_PROGRAM diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 566b6e3d191..0dd0da65f82 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -36,6 +36,10 @@ set(SRC_CUDA_KERNELS ) set(SRC_OPENCL_KERNELS + 
kernels/opencl/kernel_adaptive_stopping.cl + kernels/opencl/kernel_adaptive_filter_x.cl + kernels/opencl/kernel_adaptive_filter_y.cl + kernels/opencl/kernel_adaptive_adjust_samples.cl kernels/opencl/kernel_bake.cl kernels/opencl/kernel_base.cl kernels/opencl/kernel_displace.cl @@ -94,6 +98,7 @@ set(SRC_BVH_HEADERS set(SRC_HEADERS kernel_accumulate.h + kernel_adaptive_sampling.h kernel_bake.h kernel_camera.h kernel_color.h @@ -324,6 +329,10 @@ set(SRC_UTIL_HEADERS ) set(SRC_SPLIT_HEADERS + split/kernel_adaptive_adjust_samples.h + split/kernel_adaptive_filter_x.h + split/kernel_adaptive_filter_y.h + split/kernel_adaptive_stopping.h split/kernel_branched.h split/kernel_buffer_update.h split/kernel_data_init.h diff --git a/intern/cycles/kernel/kernel_adaptive_sampling.h b/intern/cycles/kernel/kernel_adaptive_sampling.h new file mode 100644 index 00000000000..502b69e4f7f --- /dev/null +++ b/intern/cycles/kernel/kernel_adaptive_sampling.h @@ -0,0 +1,231 @@ +/* + * Copyright 2019 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __KERNEL_ADAPTIVE_SAMPLING_H__ +#define __KERNEL_ADAPTIVE_SAMPLING_H__ + +CCL_NAMESPACE_BEGIN + +/* Determines whether to continue sampling a given pixel or if it has sufficiently converged. */ + +ccl_device void kernel_do_adaptive_stopping(KernelGlobals *kg, + ccl_global float *buffer, + int sample) +{ + /* TODO Stefan: Is this better in linear, sRGB or something else? */ + float4 I = *((ccl_global float4 *)buffer); + float4 A = *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer); + /* The per pixel error as seen in section 2.1 of + * "A hierarchical automatic stopping condition for Monte Carlo global illumination" + * A small epsilon is added to the divisor to prevent division by zero. */ + float error = (fabsf(I.x - A.x) + fabsf(I.y - A.y) + fabsf(I.z - A.z)) / + (sample * 0.0001f + sqrtf(I.x + I.y + I.z)); + if (error < kernel_data.integrator.adaptive_threshold * (float)sample) { + /* Set the fourth component to non-zero value to indicate that this pixel has converged. */ + buffer[kernel_data.film.pass_adaptive_aux_buffer + 3] += 1.0f; + } +} + +/* Adjust the values of an adaptively sampled pixel. */ + +ccl_device void kernel_adaptive_post_adjust(KernelGlobals *kg, + ccl_global float *buffer, + float sample_multiplier) +{ + *(ccl_global float4 *)(buffer) *= sample_multiplier; + + /* Scale the aux pass too, this is necessary for progressive rendering to work properly. 
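* All passes hold unnormalized running sums, so multiplying every pass by the
* same factor makes the pixel look as if it had been rendered with a uniform
* sample count.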
*/ + kernel_assert(kernel_data.film.pass_adaptive_aux_buffer); + *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer) *= sample_multiplier; + +#ifdef __PASSES__ + int flag = kernel_data.film.pass_flag; + + if (flag & PASSMASK(SHADOW)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_shadow) *= sample_multiplier; + + if (flag & PASSMASK(MIST)) + *(ccl_global float *)(buffer + kernel_data.film.pass_mist) *= sample_multiplier; + + if (flag & PASSMASK(NORMAL)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_normal) *= sample_multiplier; + + if (flag & PASSMASK(UV)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_uv) *= sample_multiplier; + + if (flag & PASSMASK(MOTION)) { + *(ccl_global float4 *)(buffer + kernel_data.film.pass_motion) *= sample_multiplier; + *(ccl_global float *)(buffer + kernel_data.film.pass_motion_weight) *= sample_multiplier; + } + + if (kernel_data.film.use_light_pass) { + int light_flag = kernel_data.film.light_pass_flag; + + if (light_flag & PASSMASK(DIFFUSE_INDIRECT)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_indirect) *= sample_multiplier; + if (light_flag & PASSMASK(GLOSSY_INDIRECT)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_indirect) *= sample_multiplier; + if (light_flag & PASSMASK(TRANSMISSION_INDIRECT)) + *(ccl_global float3 *)(buffer + + kernel_data.film.pass_transmission_indirect) *= sample_multiplier; + if (light_flag & PASSMASK(VOLUME_INDIRECT)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_indirect) *= sample_multiplier; + if (light_flag & PASSMASK(DIFFUSE_DIRECT)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_direct) *= sample_multiplier; + if (light_flag & PASSMASK(GLOSSY_DIRECT)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_direct) *= sample_multiplier; + if (light_flag & PASSMASK(TRANSMISSION_DIRECT)) + *(ccl_global float3 *)(buffer + + kernel_data.film.pass_transmission_direct) *= sample_multiplier; + if (light_flag & PASSMASK(VOLUME_DIRECT)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_direct) *= sample_multiplier; + + if (light_flag & PASSMASK(EMISSION)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_emission) *= sample_multiplier; + if (light_flag & PASSMASK(BACKGROUND)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_background) *= sample_multiplier; + if (light_flag & PASSMASK(AO)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_ao) *= sample_multiplier; + + if (light_flag & PASSMASK(DIFFUSE_COLOR)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_color) *= sample_multiplier; + if (light_flag & PASSMASK(GLOSSY_COLOR)) + *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_color) *= sample_multiplier; + if (light_flag & PASSMASK(TRANSMISSION_COLOR)) + *(ccl_global float3 *)(buffer + + kernel_data.film.pass_transmission_color) *= sample_multiplier; + } +#endif + +#ifdef __DENOISING_FEATURES__ + +# define scale_float3_variance(buffer, offset, scale) \ + *(buffer + offset) *= scale; \ + *(buffer + offset + 1) *= scale; \ + *(buffer + offset + 2) *= scale; \ + *(buffer + offset + 3) *= scale * scale; \ + *(buffer + offset + 4) *= scale * scale; \ + *(buffer + offset + 5) *= scale * scale; + +# define scale_shadow_variance(buffer, offset, scale) \ + *(buffer + offset) *= scale; \ + *(buffer + offset + 1) *= scale; \ + *(buffer + offset + 2) *= scale * scale; + + if (kernel_data.film.pass_denoising_data) { + scale_shadow_variance( + buffer, 
kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_A, sample_multiplier); + scale_shadow_variance( + buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_B, sample_multiplier); + if (kernel_data.film.pass_denoising_clean) { + scale_float3_variance( + buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier); + *(buffer + kernel_data.film.pass_denoising_clean) *= sample_multiplier; + *(buffer + kernel_data.film.pass_denoising_clean + 1) *= sample_multiplier; + *(buffer + kernel_data.film.pass_denoising_clean + 2) *= sample_multiplier; + } + else { + scale_float3_variance( + buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier); + } + scale_float3_variance( + buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL, sample_multiplier); + scale_float3_variance( + buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO, sample_multiplier); + *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH) *= sample_multiplier; + *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH + + 1) *= sample_multiplier * sample_multiplier; + } +#endif /* __DENOISING_FEATURES__ */ + + if (kernel_data.film.cryptomatte_passes) { + int num_slots = 0; + num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) ? 1 : 0; + num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) ? 1 : 0; + num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) ? 1 : 0; + num_slots = num_slots * 2 * kernel_data.film.cryptomatte_depth; + ccl_global float2 *id_buffer = (ccl_global float2 *)(buffer + + kernel_data.film.pass_cryptomatte); + for (int slot = 0; slot < num_slots; slot++) { + id_buffer[slot].y *= sample_multiplier; + } + } +} + +/* This is a simple box filter in two passes. + * When a pixel demands more adaptive samples, let its neighboring pixels draw more samples too. 
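* Convergence is tracked in the w component of the aux pass: the x pass
* re-activates a converged pixel whenever a horizontal neighbour is still
* sampling, and the y pass then repeats the same along columns.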
*/ + +ccl_device bool kernel_do_adaptive_filter_x(KernelGlobals *kg, int y, ccl_global WorkTile *tile) +{ + bool any = false; + bool prev = false; + for (int x = tile->x; x < tile->x + tile->w; ++x) { + int index = tile->offset + x + y * tile->stride; + ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride; + ccl_global float4 *aux = (ccl_global float4 *)(buffer + + kernel_data.film.pass_adaptive_aux_buffer); + if (aux->w == 0.0f) { + any = true; + if (x > tile->x && !prev) { + index = index - 1; + buffer = tile->buffer + index * kernel_data.film.pass_stride; + aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer); + aux->w = 0.0f; + } + prev = true; + } + else { + if (prev) { + aux->w = 0.0f; + } + prev = false; + } + } + return any; +} + +ccl_device bool kernel_do_adaptive_filter_y(KernelGlobals *kg, int x, ccl_global WorkTile *tile) +{ + bool prev = false; + bool any = false; + for (int y = tile->y; y < tile->y + tile->h; ++y) { + int index = tile->offset + x + y * tile->stride; + ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride; + ccl_global float4 *aux = (ccl_global float4 *)(buffer + + kernel_data.film.pass_adaptive_aux_buffer); + if (aux->w == 0.0f) { + any = true; + if (y > tile->y && !prev) { + index = index - tile->stride; + buffer = tile->buffer + index * kernel_data.film.pass_stride; + aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer); + aux->w = 0.0f; + } + prev = true; + } + else { + if (prev) { + aux->w = 0.0f; + } + prev = false; + } + } + return any; +} + +CCL_NAMESPACE_END + +#endif /* __KERNEL_ADAPTIVE_SAMPLING_H__ */ diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h index 187e8340c82..33ec05c6048 100644 --- a/intern/cycles/kernel/kernel_passes.h +++ b/intern/cycles/kernel/kernel_passes.h @@ -29,7 +29,9 @@ ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, if (kernel_data.film.pass_denoising_data == 0) return; - buffer += (sample & 1) ? DENOISING_PASS_SHADOW_B : DENOISING_PASS_SHADOW_A; + buffer += sample_is_even(kernel_data.integrator.sampling_pattern, sample) ? + DENOISING_PASS_SHADOW_B : + DENOISING_PASS_SHADOW_A; path_total = ensure_finite(path_total); path_total_shaded = ensure_finite(path_total_shaded); @@ -386,6 +388,41 @@ ccl_device_inline void kernel_write_result(KernelGlobals *kg, #ifdef __KERNEL_DEBUG__ kernel_write_debug_passes(kg, buffer, L); #endif + + /* Adaptive Sampling. Fill the additional buffer with the odd samples and calculate our stopping + criteria. This is the heuristic from "A hierarchical automatic stopping condition for Monte + Carlo global illumination" except that here it is applied per pixel and not in hierarchical + tiles. */ + if (kernel_data.film.pass_adaptive_aux_buffer && + kernel_data.integrator.adaptive_threshold > 0.0f) { + if (sample_is_even(kernel_data.integrator.sampling_pattern, sample)) { + kernel_write_pass_float4(buffer + kernel_data.film.pass_adaptive_aux_buffer, + make_float4(L_sum.x * 2.0f, L_sum.y * 2.0f, L_sum.z * 2.0f, 0.0f)); + } +#ifdef __KERNEL_CPU__ + if (sample > kernel_data.integrator.adaptive_min_samples && + (sample & (ADAPTIVE_SAMPLE_STEP - 1)) == (ADAPTIVE_SAMPLE_STEP - 1)) { + kernel_do_adaptive_stopping(kg, buffer, sample); + } +#endif + } + + /* Write the sample count as negative numbers initially to mark the samples as in progress. 
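* (Marking is done by OR-ing in the float sign bit, so it can also be applied
* atomically when __ATOMIC_PASS_WRITE__ is defined, see below.)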
+ * Once the tile has finished rendering, the sign gets flipped and all the pixel values + * are scaled as if they were taken at a uniform sample count. */ + if (kernel_data.film.pass_sample_count) { + /* Make sure it's a negative number. In progressive refine mode, this bit gets flipped between + * passes. */ +#ifdef __ATOMIC_PASS_WRITE__ + atomic_fetch_and_or_uint32((ccl_global uint *)(buffer + kernel_data.film.pass_sample_count), + 0x80000000); +#else + if (buffer[kernel_data.film.pass_sample_count] > 0) { + buffer[kernel_data.film.pass_sample_count] *= -1.0f; + } +#endif + kernel_write_pass_float(buffer + kernel_data.film.pass_sample_count, -1.0f); + } } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index 1a0b67275a7..bdd2703a894 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -31,6 +31,7 @@ #include "kernel/kernel_accumulate.h" #include "kernel/kernel_shader.h" #include "kernel/kernel_light.h" +#include "kernel/kernel_adaptive_sampling.h" #include "kernel/kernel_passes.h" #if defined(__VOLUME__) || defined(__SUBSURFACE__) @@ -656,6 +657,14 @@ ccl_device void kernel_path_trace( buffer += index * pass_stride; + if (kernel_data.film.pass_adaptive_aux_buffer) { + ccl_global float4 *aux = (ccl_global float4 *)(buffer + + kernel_data.film.pass_adaptive_aux_buffer); + if (aux->w > 0.0f) { + return; + } + } + /* Initialize random numbers and sample ray. */ uint rng_hash; Ray ray; diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h index f75e4ab4c97..0d5781fe3d1 100644 --- a/intern/cycles/kernel/kernel_path_branched.h +++ b/intern/cycles/kernel/kernel_path_branched.h @@ -523,6 +523,14 @@ ccl_device void kernel_branched_path_trace( buffer += index * pass_stride; + if (kernel_data.film.pass_adaptive_aux_buffer) { + ccl_global float4 *aux = (ccl_global float4 *)(buffer + + kernel_data.film.pass_adaptive_aux_buffer); + if (aux->w > 0.0f) { + return; + } + } + /* initialize random numbers and ray */ uint rng_hash; Ray ray; diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 88c2d0d3196..c5be93e2cda 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -63,6 +63,11 @@ CCL_NAMESPACE_BEGIN #define VOLUME_STACK_SIZE 32 +/* Adaptive sampling constants */ +#define ADAPTIVE_SAMPLE_STEP 4 +static_assert((ADAPTIVE_SAMPLE_STEP & (ADAPTIVE_SAMPLE_STEP - 1)) == 0, + "ADAPTIVE_SAMPLE_STEP must be power of two for bitwise operations to work"); + /* Split kernel constants */ #define WORK_POOL_SIZE_GPU 64 #define WORK_POOL_SIZE_CPU 1 @@ -374,6 +379,8 @@ typedef enum PassType { PASS_CRYPTOMATTE, PASS_AOV_COLOR, PASS_AOV_VALUE, + PASS_ADAPTIVE_AUX_BUFFER, + PASS_SAMPLE_COUNT, PASS_CATEGORY_MAIN_END = 31, PASS_MIST = 32, @@ -1223,6 +1230,9 @@ typedef struct KernelFilm { int cryptomatte_depth; int pass_cryptomatte; + int pass_adaptive_aux_buffer; + int pass_sample_count; + int pass_mist; float mist_start; float mist_inv_depth; @@ -1256,6 +1266,8 @@ typedef struct KernelFilm { int display_divide_pass_stride; int use_display_exposure; int use_display_pass_alpha; + + int pad3, pad4, pad5; } KernelFilm; static_assert_align(KernelFilm, 16); @@ -1337,6 +1349,8 @@ typedef struct KernelIntegrator { /* sampler */ int sampling_pattern; int aa_samples; + int adaptive_min_samples; + float adaptive_threshold; /* volume render */ int use_volumes; @@ -1348,7 +1362,7 @@ typedef struct KernelIntegrator { 
int max_closures; - int pad1; + int pad1, pad2, pad3; } KernelIntegrator; static_assert_align(KernelIntegrator, 16); @@ -1662,7 +1676,7 @@ typedef struct WorkTile { uint start_sample; uint num_samples; - uint offset; + int offset; uint stride; ccl_global float *buffer; diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h index 799561a7466..c642d227e4b 100644 --- a/intern/cycles/kernel/kernel_work_stealing.h +++ b/intern/cycles/kernel/kernel_work_stealing.h @@ -23,17 +23,41 @@ CCL_NAMESPACE_BEGIN * Utility functions for work stealing */ +/* Map global work index to tile, pixel X/Y and sample. */ +ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile, + uint global_work_index, + ccl_private uint *x, + ccl_private uint *y, + ccl_private uint *sample) +{ +#ifdef __KERNEL_CUDA__ + /* Keeping threads for the same pixel together improves performance on CUDA. */ + uint sample_offset = global_work_index % tile->num_samples; + uint pixel_offset = global_work_index / tile->num_samples; +#else /* __KERNEL_CUDA__ */ + uint tile_pixels = tile->w * tile->h; + uint sample_offset = global_work_index / tile_pixels; + uint pixel_offset = global_work_index - sample_offset * tile_pixels; +#endif /* __KERNEL_CUDA__ */ + uint y_offset = pixel_offset / tile->w; + uint x_offset = pixel_offset - y_offset * tile->w; + + *x = tile->x + x_offset; + *y = tile->y + y_offset; + *sample = tile->start_sample + sample_offset; +} + #ifdef __KERNEL_OPENCL__ # pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #endif #ifdef __SPLIT_KERNEL__ /* Returns true if there is work */ -ccl_device bool get_next_work(KernelGlobals *kg, - ccl_global uint *work_pools, - uint total_work_size, - uint ray_index, - ccl_private uint *global_work_index) +ccl_device bool get_next_work_item(KernelGlobals *kg, + ccl_global uint *work_pools, + uint total_work_size, + uint ray_index, + ccl_private uint *global_work_index) { /* With a small amount of work there may be more threads than work due to * rounding up of global size, stop such threads immediately. */ @@ -56,31 +80,37 @@ ccl_device bool get_next_work(KernelGlobals *kg, /* Test if all work for this pool is done. */ return (*global_work_index < total_work_size); } -#endif -/* Map global work index to tile, pixel X/Y and sample. */ -ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile, - uint global_work_index, - ccl_private uint *x, - ccl_private uint *y, - ccl_private uint *sample) +ccl_device bool get_next_work(KernelGlobals *kg, + ccl_global uint *work_pools, + uint total_work_size, + uint ray_index, + ccl_private uint *global_work_index) { -#ifdef __KERNEL_CUDA__ - /* Keeping threads for the same pixel together improves performance on CUDA. 
*/ - uint sample_offset = global_work_index % tile->num_samples; - uint pixel_offset = global_work_index / tile->num_samples; -#else /* __KERNEL_CUDA__ */ - uint tile_pixels = tile->w * tile->h; - uint sample_offset = global_work_index / tile_pixels; - uint pixel_offset = global_work_index - sample_offset * tile_pixels; -#endif /* __KERNEL_CUDA__ */ - uint y_offset = pixel_offset / tile->w; - uint x_offset = pixel_offset - y_offset * tile->w; - - *x = tile->x + x_offset; - *y = tile->y + y_offset; - *sample = tile->start_sample + sample_offset; + bool got_work = false; + if (kernel_data.film.pass_adaptive_aux_buffer) { + do { + got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index); + if (got_work) { + ccl_global WorkTile *tile = &kernel_split_params.tile; + uint x, y, sample; + get_work_pixel(tile, *global_work_index, &x, &y, &sample); + uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride; + ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset; + ccl_global float4 *aux = (ccl_global float4 *)(buffer + + kernel_data.film.pass_adaptive_aux_buffer); + if (aux->w == 0.0f) { + break; + } + } + } while (got_work); + } + else { + got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index); + } + return got_work; } +#endif CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h index f5d981fb71a..683f4b88d79 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h @@ -89,5 +89,9 @@ DECLARE_SPLIT_KERNEL_FUNCTION(enqueue_inactive) DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup) DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update) +DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_stopping) +DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x) +DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y) +DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples) #undef KERNEL_ARCH diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h index 9ca3f46b5b6..96b2bf11132 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h @@ -58,6 +58,10 @@ # include "kernel/split/kernel_next_iteration_setup.h" # include "kernel/split/kernel_indirect_subsurface.h" # include "kernel/split/kernel_buffer_update.h" +# include "kernel/split/kernel_adaptive_stopping.h" +# include "kernel/split/kernel_adaptive_filter_x.h" +# include "kernel/split/kernel_adaptive_filter_y.h" +# include "kernel/split/kernel_adaptive_adjust_samples.h" # endif /* __SPLIT_KERNEL__ */ #else # define STUB_ASSERT(arch, name) \ @@ -204,6 +208,10 @@ DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint) +DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_stopping) +DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x) +DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y) +DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples) #endif /* __SPLIT_KERNEL__ */ #undef KERNEL_STUB diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu index af311027f78..c4c810c6a82 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel.cu @@ 
-33,6 +33,7 @@ #include "kernel/kernel_path_branched.h" #include "kernel/kernel_bake.h" #include "kernel/kernel_work_stealing.h" +#include "kernel/kernel_adaptive_sampling.h" /* kernels */ extern "C" __global__ void @@ -83,6 +84,75 @@ kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size) extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_adaptive_stopping(WorkTile *tile, int sample, uint total_work_size) +{ + int work_index = ccl_global_id(0); + bool thread_is_active = work_index < total_work_size; + KernelGlobals kg; + if(thread_is_active && kernel_data.film.pass_adaptive_aux_buffer) { + uint x = tile->x + work_index % tile->w; + uint y = tile->y + work_index / tile->w; + int index = tile->offset + x + y * tile->stride; + ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride; + kernel_do_adaptive_stopping(&kg, buffer, sample); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_adaptive_filter_x(WorkTile *tile, int sample, uint) +{ + KernelGlobals kg; + if(kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) { + if(ccl_global_id(0) < tile->h) { + int y = tile->y + ccl_global_id(0); + kernel_do_adaptive_filter_x(&kg, y, tile); + } + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_adaptive_filter_y(WorkTile *tile, int sample, uint) +{ + KernelGlobals kg; + if(kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) { + if(ccl_global_id(0) < tile->w) { + int x = tile->x + ccl_global_id(0); + kernel_do_adaptive_filter_y(&kg, x, tile); + } + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_adaptive_scale_samples(WorkTile *tile, int start_sample, int sample, uint total_work_size) +{ + if(kernel_data.film.pass_adaptive_aux_buffer) { + int work_index = ccl_global_id(0); + bool thread_is_active = work_index < total_work_size; + KernelGlobals kg; + if(thread_is_active) { + uint x = tile->x + work_index % tile->w; + uint y = tile->y + work_index / tile->w; + int index = tile->offset + x + y * tile->stride; + ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride; + if(buffer[kernel_data.film.pass_sample_count] < 0.0f) { + buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count]; + float sample_multiplier = sample / max((float)start_sample + 1.0f, buffer[kernel_data.film.pass_sample_count]); + if(sample_multiplier != 1.0f) { + kernel_adaptive_post_adjust(&kg, buffer, sample_multiplier); + } + } + else { + kernel_adaptive_post_adjust(&kg, buffer, sample / (sample - 1.0f)); + } + } + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) { int x = sx + blockDim.x*blockIdx.x + threadIdx.x; diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu index 43b3d0aa0e6..95ad7599cf1 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu @@ -43,6 +43,10 @@ #include "kernel/split/kernel_next_iteration_setup.h" #include 
"kernel/split/kernel_indirect_subsurface.h" #include "kernel/split/kernel_buffer_update.h" +#include "kernel/split/kernel_adaptive_stopping.h" +#include "kernel/split/kernel_adaptive_filter_x.h" +#include "kernel/split/kernel_adaptive_filter_y.h" +#include "kernel/split/kernel_adaptive_adjust_samples.h" #include "kernel/kernel_film.h" @@ -121,6 +125,10 @@ DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint) +DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_stopping) +DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x) +DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y) +DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples) extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl new file mode 100644 index 00000000000..ebdb99d4730 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl @@ -0,0 +1,23 @@ +/* + * Copyright 2019 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel/kernel_compat_opencl.h" +#include "kernel/split/kernel_split_common.h" +#include "kernel/split/kernel_adaptive_adjust_samples.h" + +#define KERNEL_NAME adaptive_adjust_samples +#include "kernel/kernels/opencl/kernel_split_function.h" +#undef KERNEL_NAME diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl new file mode 100644 index 00000000000..76d82d4184e --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl @@ -0,0 +1,23 @@ +/* + * Copyright 2019 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl
new file mode 100644
index 00000000000..76d82d4184e
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_adaptive_filter_x.h"
+
+#define KERNEL_NAME adaptive_filter_x
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl
new file mode 100644
index 00000000000..1e6d15ba0f2
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_adaptive_filter_y.h"
+
+#define KERNEL_NAME adaptive_filter_y
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
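filter_x launches one work item per row of the tile and filter_y one per column, which suggests a separable neighborhood filter over the per-pixel convergence flags; the actual test lives in kernel_do_adaptive_filter_x/y in kernel_adaptive_sampling.h, outside this diff. Assuming it behaves like a dilation of the still-active set, the row pass would look roughly like the sketch below (dilate_rows and the char mask are inventions for illustration).

    #include <vector>

    // One horizontal pass: a pixel next to an active (not yet converged)
    // pixel becomes active again; the vertical pass is the transpose.
    void dilate_rows(std::vector<char> &active, int w, int h)
    {
      for (int y = 0; y < h; y++) {
        char *row = &active[y * w];
        std::vector<char> snap(row, row + w);  // snapshot keeps the pass order-independent
        for (int x = 0; x < w; x++) {
          if ((x > 0 && snap[x - 1]) || (x + 1 < w && snap[x + 1]))
            row[x] = 1;
        }
      }
    }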
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl
new file mode 100644
index 00000000000..51de0059667
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_adaptive_stopping.h"
+
+#define KERNEL_NAME adaptive_stopping
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl b/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl
index 6041f13b52b..c3b7b09460a 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl
@@ -28,3 +28,7 @@
 #include "kernel/kernels/opencl/kernel_next_iteration_setup.cl"
 #include "kernel/kernels/opencl/kernel_indirect_subsurface.cl"
 #include "kernel/kernels/opencl/kernel_buffer_update.cl"
+#include "kernel/kernels/opencl/kernel_adaptive_stopping.cl"
+#include "kernel/kernels/opencl/kernel_adaptive_filter_x.cl"
+#include "kernel/kernels/opencl/kernel_adaptive_filter_y.cl"
+#include "kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl"
diff --git a/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h b/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h
new file mode 100644
index 00000000000..60ebf415970
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_adaptive_adjust_samples(KernelGlobals *kg)
+{
+  int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+  if (pixel_index < kernel_split_params.tile.w * kernel_split_params.tile.h) {
+    int x = kernel_split_params.tile.x + pixel_index % kernel_split_params.tile.w;
+    int y = kernel_split_params.tile.y + pixel_index / kernel_split_params.tile.w;
+    int buffer_offset = (kernel_split_params.tile.offset + x +
+                         y * kernel_split_params.tile.stride) *
+                        kernel_data.film.pass_stride;
+    ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
+    int sample = kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples;
+    if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
+      buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
+      float sample_multiplier = sample / max((float)kernel_split_params.tile.start_sample + 1.0f,
+                                             buffer[kernel_data.film.pass_sample_count]);
+      if (sample_multiplier != 1.0f) {
+        kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
+      }
+    }
+    else {
+      kernel_adaptive_post_adjust(kg, buffer, sample / (sample - 1.0f));
+    }
+  }
+}
+
+CCL_NAMESPACE_END
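A worked example of the adjustment above: a tile rendering samples 0 through 15 (start_sample 0, num_samples 16) has a pixel that converged after 8 samples, so its sample-count pass holds -8. The multiplier is 16 / max(1, 8) = 2, and doubling every accumulated pass means the uniform divide by 16 downstream still yields the pixel's 8-sample mean. The same arithmetic as a self-checking snippet (the values are hypothetical):

    #include <algorithm>
    #include <cassert>

    int main()
    {
      float accum = 4.0f;   // accumulated pass value after 8 samples
      float count = -8.0f;  // negative: pixel was stopped early
      int start_sample = 0, sample = 16;

      count = -count;
      float multiplier = sample / std::max((float)start_sample + 1.0f, count);
      accum *= multiplier;  // per-component work of kernel_adaptive_post_adjust

      assert(accum / sample == 0.5f);  // same mean as 4.0f over 8 samples
      return 0;
    }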
diff --git a/intern/cycles/kernel/split/kernel_adaptive_filter_x.h b/intern/cycles/kernel/split/kernel_adaptive_filter_x.h
new file mode 100644
index 00000000000..93f41f7ced4
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_adaptive_filter_x.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_adaptive_filter_x(KernelGlobals *kg)
+{
+  int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+  if (pixel_index < kernel_split_params.tile.h &&
+      kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
+          kernel_data.integrator.adaptive_min_samples) {
+    int y = kernel_split_params.tile.y + pixel_index;
+    kernel_do_adaptive_filter_x(kg, y, &kernel_split_params.tile);
+  }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_adaptive_filter_y.h b/intern/cycles/kernel/split/kernel_adaptive_filter_y.h
new file mode 100644
index 00000000000..eca53d079ec
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_adaptive_filter_y.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_adaptive_filter_y(KernelGlobals *kg)
+{
+  int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+  if (pixel_index < kernel_split_params.tile.w &&
+      kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
+          kernel_data.integrator.adaptive_min_samples) {
+    int x = kernel_split_params.tile.x + pixel_index;
+    kernel_do_adaptive_filter_y(kg, x, &kernel_split_params.tile);
+  }
+}
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_adaptive_stopping.h b/intern/cycles/kernel/split/kernel_adaptive_stopping.h
new file mode 100644
index 00000000000..c8eb1ebd705
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_adaptive_stopping.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_adaptive_stopping(KernelGlobals *kg)
+{
+  int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+  if (pixel_index < kernel_split_params.tile.w * kernel_split_params.tile.h &&
+      kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
+          kernel_data.integrator.adaptive_min_samples) {
+    int x = kernel_split_params.tile.x + pixel_index % kernel_split_params.tile.w;
+    int y = kernel_split_params.tile.y + pixel_index / kernel_split_params.tile.w;
+    int buffer_offset = (kernel_split_params.tile.offset + x +
+                         y * kernel_split_params.tile.stride) *
+                        kernel_data.film.pass_stride;
+    ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
+    kernel_do_adaptive_stopping(kg,
+                                buffer,
+                                kernel_split_params.tile.start_sample +
+                                    kernel_split_params.tile.num_samples - 1);
+  }
+}
+CCL_NAMESPACE_END
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp
index 41e1b73fdac..19d3f00bf82 100644
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -260,6 +260,22 @@ bool RenderBuffers::get_pass_rect(
     return false;
   }
 
+  float *sample_count = NULL;
+  if (name == "Combined") {
+    int sample_offset = 0;
+    for (size_t j = 0; j < params.passes.size(); j++) {
+      Pass &pass = params.passes[j];
+      if (pass.type != PASS_SAMPLE_COUNT) {
+        sample_offset += pass.components;
+        continue;
+      }
+      else {
+        sample_count = buffer.data() + sample_offset;
+        break;
+      }
+    }
+  }
+
   int pass_offset = 0;
 
   for (size_t j = 0; j < params.passes.size(); j++) {
@@ -420,6 +436,11 @@ bool RenderBuffers::get_pass_rect(
       }
       else {
        for (int i = 0; i < size; i++, in += pass_stride, pixels += 4) {
+          if (sample_count && sample_count[i * pass_stride] < 0.0f) {
+            scale = (pass.filter) ? -1.0f / (sample_count[i * pass_stride]) : 1.0f;
+            scale_exposure = (pass.exposure) ? scale * exposure : scale;
+          }
+
           float4 f = make_float4(in[0], in[1], in[2], in[3]);
 
           pixels[0] = f.x * scale_exposure;
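The buffers.cpp change handles readback while a tile's pixels still carry unadjusted values: a pixel whose sample-count pass is negative stopped early and has not been rescaled yet, so the Combined pass must be averaged by that pixel's own count rather than the tile's sample count. Because the stored value is negative, -1.0f / sample_count yields the positive per-pixel scale. A compact restatement (display_scale is a name invented for this sketch):

    // Per-pixel display scale for the Combined pass, following get_pass_rect.
    float display_scale(float sample_count_value, float tile_scale, bool filter_pass)
    {
      if (sample_count_value < 0.0f)  // e.g. -8.0f after stopping at 8 samples
        return filter_pass ? -1.0f / sample_count_value : 1.0f;  // 1.0f / 8.0f
      return tile_scale;  // usual 1 / num_samples path
    }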
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index 172ea3dd31b..48d9c97e0fb 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -183,6 +183,13 @@ void Pass::add(PassType type, vector<Pass> &passes, const char *name)
     case PASS_CRYPTOMATTE:
       pass.components = 4;
       break;
+    case PASS_ADAPTIVE_AUX_BUFFER:
+      pass.components = 4;
+      break;
+    case PASS_SAMPLE_COUNT:
+      pass.components = 1;
+      pass.exposure = false;
+      break;
     case PASS_AOV_COLOR:
       pass.components = 4;
       break;
@@ -311,6 +318,7 @@ NODE_DEFINE(Film)
   SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false);
   SOCKET_BOOLEAN(denoising_prefiltered_pass, "Generate Denoising Prefiltered Pass", false);
   SOCKET_INT(denoising_flags, "Denoising Flags", 0);
+  SOCKET_BOOLEAN(use_adaptive_sampling, "Use Adaptive Sampling", false);
 
   return type;
 }
@@ -482,6 +490,12 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
             kfilm->pass_stride;
         have_cryptomatte = true;
         break;
+      case PASS_ADAPTIVE_AUX_BUFFER:
+        kfilm->pass_adaptive_aux_buffer = kfilm->pass_stride;
+        break;
+      case PASS_SAMPLE_COUNT:
+        kfilm->pass_sample_count = kfilm->pass_stride;
+        break;
       case PASS_AOV_COLOR:
         if (!have_aov_color) {
           kfilm->pass_aov_color = kfilm->pass_stride;
diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h
index 95e54cb54d8..aae8fb404b0 100644
--- a/intern/cycles/render/film.h
+++ b/intern/cycles/render/film.h
@@ -81,6 +81,8 @@ class Film : public Node {
   CryptomatteType cryptomatte_passes;
   int cryptomatte_depth;
 
+  bool use_adaptive_sampling;
+
   bool need_update;
 
   Film();
diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp
index f289e11fe14..ee1aa5988bf 100644
--- a/intern/cycles/render/integrator.cpp
+++ b/intern/cycles/render/integrator.cpp
@@ -27,6 +27,7 @@
 #include "kernel/kernel_types.h"
 
 #include "util/util_foreach.h"
+#include "util/util_logging.h"
 #include "util/util_hash.h"
 
 CCL_NAMESPACE_BEGIN
@@ -69,6 +70,9 @@ NODE_DEFINE(Integrator)
   SOCKET_INT(volume_samples, "Volume Samples", 1);
   SOCKET_INT(start_sample, "Start Sample", 0);
 
+  SOCKET_FLOAT(adaptive_threshold, "Adaptive Threshold", 0.0f);
+  SOCKET_INT(adaptive_min_samples, "Adaptive Min Samples", 0);
+
   SOCKET_BOOLEAN(sample_all_lights_direct, "Sample All Lights Direct", true);
   SOCKET_BOOLEAN(sample_all_lights_indirect, "Sample All Lights Indirect", true);
   SOCKET_FLOAT(light_sampling_threshold, "Light Sampling Threshold", 0.05f);
@@ -178,6 +182,22 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene)
   kintegrator->sampling_pattern = sampling_pattern;
   kintegrator->aa_samples = aa_samples;
+  if (aa_samples > 0 && adaptive_min_samples == 0) {
+    kintegrator->adaptive_min_samples = max(4, (int)sqrtf(aa_samples));
+    VLOG(1) << "Cycles adaptive sampling: automatic min samples = "
+            << kintegrator->adaptive_min_samples;
+  }
+  else {
+    kintegrator->adaptive_min_samples = max(4, adaptive_min_samples);
+  }
+  if (aa_samples > 0 && adaptive_threshold == 0.0f) {
+    kintegrator->adaptive_threshold = max(0.001f, 1.0f / (float)aa_samples);
+    VLOG(1) << "Cycles adaptive sampling: automatic threshold = "
+            << kintegrator->adaptive_threshold;
+  }
+  else {
+    kintegrator->adaptive_threshold = adaptive_threshold;
+  }
 
   if (light_sampling_threshold > 0.0f) {
     kintegrator->light_inv_rr_threshold = 1.0f / light_sampling_threshold;
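The two integrator sockets double as an automatic mode: leaving them at zero derives both values from the AA sample count. For example, with 256 AA samples the kernel receives adaptive_min_samples = max(4, sqrt(256)) = 16 and adaptive_threshold = max(0.001, 1/256), roughly 0.0039; lower sample counts produce a looser threshold. The derivation extracted into a standalone snippet:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    // Reproduces the automatic defaults from Integrator::device_update.
    int main()
    {
      int aa_samples = 256;  // example value
      int adaptive_min_samples = std::max(4, (int)std::sqrt((float)aa_samples));
      float adaptive_threshold = std::max(0.001f, 1.0f / (float)aa_samples);
      std::printf("min samples = %d, threshold = %f\n",
                  adaptive_min_samples, adaptive_threshold);
      // Prints: min samples = 16, threshold = 0.003906
    }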
diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h
index 32d84c27072..9930e907aea 100644
--- a/intern/cycles/render/integrator.h
+++ b/intern/cycles/render/integrator.h
@@ -75,6 +75,9 @@ class Integrator : public Node {
   bool sample_all_lights_indirect;
   float light_sampling_threshold;
 
+  int adaptive_min_samples;
+  float adaptive_threshold;
+
   enum Method {
     BRANCHED_PATH = 0,
     PATH = 1,
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index 4231403e39a..6bf2160f9fa 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -1103,6 +1103,10 @@ void Session::render(bool with_denoising)
   task.need_finish_queue = params.progressive_refine;
   task.integrator_branched = scene->integrator->method == Integrator::BRANCHED_PATH;
 
+  task.adaptive_sampling.use = (scene->integrator->sampling_pattern == SAMPLING_PATTERN_PMJ) &&
+                               scene->dscene.data.film.pass_adaptive_aux_buffer;
+  task.adaptive_sampling.min_samples = scene->dscene.data.integrator.adaptive_min_samples;
+
   /* Acquire render tiles by default. */
   task.tile_types = RenderTile::PATH_TRACE;
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index 7f3614ccb19..8053e46b12e 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -56,6 +56,7 @@ class SessionParams {
   int denoising_start_sample;
   int pixel_size;
   int threads;
+  bool adaptive_sampling;
 
   bool use_profiling;
 
@@ -89,6 +90,7 @@ class SessionParams {
     denoising_start_sample = 0;
     pixel_size = 1;
     threads = 0;
+    adaptive_sampling = false;
 
     use_profiling = false;
 
@@ -117,6 +119,7 @@ class SessionParams {
            progressive == params.progressive && experimental == params.experimental &&
            tile_size == params.tile_size && start_resolution == params.start_resolution &&
            pixel_size == params.pixel_size && threads == params.threads &&
+           adaptive_sampling == params.adaptive_sampling &&
           use_profiling == params.use_profiling &&
           display_buffer_linear == params.display_buffer_linear &&
           cancel_timeout == params.cancel_timeout && reset_timeout == params.reset_timeout &&
diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h
index a8ea1dc925e..13d177d2b25 100644
--- a/intern/cycles/util/util_atomic.h
+++ b/intern/cycles/util/util_atomic.h
@@ -77,6 +77,7 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float
 #  define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x))
 #  define atomic_fetch_and_inc_uint32(p) atomic_inc((p))
 #  define atomic_fetch_and_dec_uint32(p) atomic_dec((p))
+#  define atomic_fetch_and_or_uint32(p, x) atomic_or((p), (x))
 
 #  define CCL_LOCAL_MEM_FENCE CLK_LOCAL_MEM_FENCE
 #  define ccl_barrier(flags) barrier(flags)
@@ -91,6 +92,7 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float
 #  define atomic_fetch_and_sub_uint32(p, x) atomicSub((unsigned int *)(p), (unsigned int)(x))
 #  define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
 #  define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_sub_uint32((p), 1)
+#  define atomic_fetch_and_or_uint32(p, x) atomicOr((unsigned int *)(p), (unsigned int)(x))
 
 ccl_device_inline float atomic_compare_and_swap_float(volatile float *dest,
                                                       const float old_val,
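atomic_fetch_and_or_uint32 is the one new primitive, mapped to atomic_or on OpenCL and atomicOr on CUDA. An atomic OR lets many threads set bits in a shared word without losing updates, a natural way for the adaptive kernels to publish per-pixel flags (an assumption here, since the call sites are in kernel_adaptive_sampling.h, outside this diff). The semantics in portable C++:

    #include <atomic>
    #include <cstdio>
    #include <thread>

    int main()
    {
      // Two threads OR their bit into one word; the result is the union of
      // the bits no matter how the operations interleave.
      std::atomic<unsigned> flags{0};
      std::thread a([&] { flags.fetch_or(1u << 0); });
      std::thread b([&] { flags.fetch_or(1u << 3); });
      a.join();
      b.join();
      std::printf("flags = 0x%x\n", flags.load());  // always 0x9
    }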
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index 48e9983ac8f..8a1890ad319 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -101,6 +101,11 @@ ccl_device_inline size_t round_down(size_t x, size_t multiple)
   return (x / multiple) * multiple;
 }
 
+ccl_device_inline bool is_power_of_two(size_t x)
+{
+  return (x & (x - 1)) == 0;
+}
+
 CCL_NAMESPACE_END
 
 /* Vectorized types declaration. */
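is_power_of_two relies on the standard bit trick: a power of two has a single set bit, and x & (x - 1) clears the lowest set bit, so the result is zero exactly for powers of two. Note the expression also accepts x == 0, which callers presumably never pass. A quick check:

    #include <cassert>
    #include <cstddef>

    static bool is_power_of_two(std::size_t x)  // same expression as the helper
    {
      return (x & (x - 1)) == 0;
    }

    int main()
    {
      assert(is_power_of_two(64));   // 0b1000000
      assert(!is_power_of_two(96));  // 0b1100000 & 0b1011111 == 0b1000000
      assert(is_power_of_two(0));    // edge case: zero also passes the test
    }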