git.blender.org/blender.git
author    Brecht Van Lommel <brecht@blender.org>  2021-09-20 18:59:20 +0300
committer Brecht Van Lommel <brecht@blender.org>  2021-09-21 15:55:54 +0300
commit    08031197250aeecbaca3803254e6f25b8c7b7b37 (patch)
tree      6fe7ab045f0dc0a423d6557c4073f34309ef4740 /intern/cycles/device
parent    fa6b1007bad065440950cd67deb16a04f368856f (diff)
Cycles: merge of cycles-x branch, a major update to the renderer

This includes much improved GPU rendering performance, viewport interactivity, new shadow
catcher, revamped sampling settings, subsurface scattering anisotropy, new GPU volume
sampling, improved PMJ sampling pattern, and more.

Some features have also been removed or changed, breaking backwards compatibility,
including the removal of the OpenCL backend, for which alternatives are under development.

Release notes and code docs:
https://wiki.blender.org/wiki/Reference/Release_Notes/3.0/Cycles
https://wiki.blender.org/wiki/Source/Render/Cycles

Credits:
* Sergey Sharybin
* Brecht Van Lommel
* Patrick Mours (OptiX backend)
* Christophe Hery (subsurface scattering anisotropy)
* William Leeson (PMJ sampling pattern)
* Alaska (various fixes and tweaks)
* Thomas Dinges (various fixes)

For the full commit history, see the cycles-x branch. This squashes together all the changes,
since intermediate changes would often fail to build or pass tests.

Ref T87839, T87837, T87836
Fixes T90734, T89353, T80267, T77185, T69800
Diffstat (limited to 'intern/cycles/device')
-rw-r--r--  intern/cycles/device/CMakeLists.txt  110
-rw-r--r--  intern/cycles/device/cpu/device.cpp  64
-rw-r--r--  intern/cycles/device/cpu/device.h  35
-rw-r--r--  intern/cycles/device/cpu/device_impl.cpp  481
-rw-r--r--  intern/cycles/device/cpu/device_impl.h  99
-rw-r--r--  intern/cycles/device/cpu/kernel.cpp  61
-rw-r--r--  intern/cycles/device/cpu/kernel.h  111
-rw-r--r--  intern/cycles/device/cpu/kernel_function.h  124
-rw-r--r--  intern/cycles/device/cpu/kernel_thread_globals.cpp  85
-rw-r--r--  intern/cycles/device/cpu/kernel_thread_globals.h  57
-rw-r--r--  intern/cycles/device/cuda/device.cpp (renamed from intern/cycles/device/device_cuda.cpp)  51
-rw-r--r--  intern/cycles/device/cuda/device.h  37
-rw-r--r--  intern/cycles/device/cuda/device_cuda.h  270
-rw-r--r--  intern/cycles/device/cuda/device_cuda_impl.cpp  2714
-rw-r--r--  intern/cycles/device/cuda/device_impl.cpp  1370
-rw-r--r--  intern/cycles/device/cuda/device_impl.h  155
-rw-r--r--  intern/cycles/device/cuda/graphics_interop.cpp  102
-rw-r--r--  intern/cycles/device/cuda/graphics_interop.h  66
-rw-r--r--  intern/cycles/device/cuda/kernel.cpp  69
-rw-r--r--  intern/cycles/device/cuda/kernel.h  56
-rw-r--r--  intern/cycles/device/cuda/queue.cpp  220
-rw-r--r--  intern/cycles/device/cuda/queue.h  67
-rw-r--r--  intern/cycles/device/cuda/util.cpp  61
-rw-r--r--  intern/cycles/device/cuda/util.h  65
-rw-r--r--  intern/cycles/device/device.cpp  476
-rw-r--r--  intern/cycles/device/device.h  366
-rw-r--r--  intern/cycles/device/device_cpu.cpp  1680
-rw-r--r--  intern/cycles/device/device_denoise.cpp  88
-rw-r--r--  intern/cycles/device/device_denoise.h  110
-rw-r--r--  intern/cycles/device/device_denoising.cpp  353
-rw-r--r--  intern/cycles/device/device_denoising.h  197
-rw-r--r--  intern/cycles/device/device_graphics_interop.cpp  21
-rw-r--r--  intern/cycles/device/device_graphics_interop.h  55
-rw-r--r--  intern/cycles/device/device_intern.h  58
-rw-r--r--  intern/cycles/device/device_kernel.cpp  157
-rw-r--r--  intern/cycles/device/device_kernel.h  33
-rw-r--r--  intern/cycles/device/device_memory.cpp  7
-rw-r--r--  intern/cycles/device/device_memory.h  136
-rw-r--r--  intern/cycles/device/device_multi.cpp  826
-rw-r--r--  intern/cycles/device/device_network.cpp  812
-rw-r--r--  intern/cycles/device/device_network.h  490
-rw-r--r--  intern/cycles/device/device_opencl.cpp  245
-rw-r--r--  intern/cycles/device/device_optix.cpp  1936
-rw-r--r--  intern/cycles/device/device_queue.cpp  87
-rw-r--r--  intern/cycles/device/device_queue.h  113
-rw-r--r--  intern/cycles/device/device_split_kernel.cpp  389
-rw-r--r--  intern/cycles/device/device_split_kernel.h  145
-rw-r--r--  intern/cycles/device/device_task.cpp  182
-rw-r--r--  intern/cycles/device/device_task.h  188
-rw-r--r--  intern/cycles/device/dummy/device.cpp (renamed from intern/cycles/device/device_dummy.cpp)  24
-rw-r--r--  intern/cycles/device/dummy/device.h  31
-rw-r--r--  intern/cycles/device/multi/device.cpp  423
-rw-r--r--  intern/cycles/device/multi/device.h  31
-rw-r--r--  intern/cycles/device/opencl/device_opencl.h  658
-rw-r--r--  intern/cycles/device/opencl/device_opencl_impl.cpp  2113
-rw-r--r--  intern/cycles/device/opencl/memory_manager.cpp  264
-rw-r--r--  intern/cycles/device/opencl/memory_manager.h  105
-rw-r--r--  intern/cycles/device/opencl/opencl_util.cpp  1326
-rw-r--r--  intern/cycles/device/optix/device.cpp  105
-rw-r--r--  intern/cycles/device/optix/device.h  35
-rw-r--r--  intern/cycles/device/optix/device_impl.cpp  1573
-rw-r--r--  intern/cycles/device/optix/device_impl.h  186
-rw-r--r--  intern/cycles/device/optix/queue.cpp  144
-rw-r--r--  intern/cycles/device/optix/queue.h  39
-rw-r--r--  intern/cycles/device/optix/util.h  45
65 files changed, 6970 insertions, 15812 deletions
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 928249931a3..d18f4360aef 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -36,49 +36,70 @@ endif()
set(SRC
device.cpp
- device_cpu.cpp
- device_cuda.cpp
- device_denoising.cpp
- device_dummy.cpp
+ device_denoise.cpp
+ device_graphics_interop.cpp
+ device_kernel.cpp
device_memory.cpp
- device_multi.cpp
- device_opencl.cpp
- device_optix.cpp
- device_split_kernel.cpp
- device_task.cpp
+ device_queue.cpp
+)
+
+set(SRC_CPU
+ cpu/device.cpp
+ cpu/device.h
+ cpu/device_impl.cpp
+ cpu/device_impl.h
+ cpu/kernel.cpp
+ cpu/kernel.h
+ cpu/kernel_function.h
+ cpu/kernel_thread_globals.cpp
+ cpu/kernel_thread_globals.h
)
set(SRC_CUDA
- cuda/device_cuda.h
- cuda/device_cuda_impl.cpp
+ cuda/device.cpp
+ cuda/device.h
+ cuda/device_impl.cpp
+ cuda/device_impl.h
+ cuda/graphics_interop.cpp
+ cuda/graphics_interop.h
+ cuda/kernel.cpp
+ cuda/kernel.h
+ cuda/queue.cpp
+ cuda/queue.h
+ cuda/util.cpp
+ cuda/util.h
)
-set(SRC_OPENCL
- opencl/device_opencl.h
- opencl/device_opencl_impl.cpp
- opencl/memory_manager.h
- opencl/memory_manager.cpp
- opencl/opencl_util.cpp
+set(SRC_DUMMY
+ dummy/device.cpp
+ dummy/device.h
)
-if(WITH_CYCLES_NETWORK)
- list(APPEND SRC
- device_network.cpp
- )
-endif()
+set(SRC_MULTI
+ multi/device.cpp
+ multi/device.h
+)
+
+set(SRC_OPTIX
+ optix/device.cpp
+ optix/device.h
+ optix/device_impl.cpp
+ optix/device_impl.h
+ optix/queue.cpp
+ optix/queue.h
+ optix/util.h
+)
set(SRC_HEADERS
device.h
- device_denoising.h
+ device_denoise.h
+ device_graphics_interop.h
device_memory.h
- device_intern.h
- device_network.h
- device_split_kernel.h
- device_task.h
+ device_kernel.h
+ device_queue.h
)
set(LIB
- cycles_render
cycles_kernel
cycles_util
${CYCLES_GL_LIBRARIES}
@@ -95,15 +116,7 @@ else()
endif()
add_definitions(${GL_DEFINITIONS})
-if(WITH_CYCLES_NETWORK)
- add_definitions(-DWITH_NETWORK)
-endif()
-if(WITH_CYCLES_DEVICE_OPENCL)
- list(APPEND LIB
- extern_clew
- )
- add_definitions(-DWITH_OPENCL)
-endif()
+
if(WITH_CYCLES_DEVICE_CUDA)
add_definitions(-DWITH_CUDA)
endif()
@@ -115,18 +128,27 @@ if(WITH_CYCLES_DEVICE_MULTI)
endif()
if(WITH_OPENIMAGEDENOISE)
- add_definitions(-DWITH_OPENIMAGEDENOISE)
- add_definitions(-DOIDN_STATIC_LIB)
- list(APPEND INC_SYS
- ${OPENIMAGEDENOISE_INCLUDE_DIRS}
- )
list(APPEND LIB
${OPENIMAGEDENOISE_LIBRARIES}
- ${TBB_LIBRARIES}
)
endif()
include_directories(${INC})
include_directories(SYSTEM ${INC_SYS})
-cycles_add_library(cycles_device "${LIB}" ${SRC} ${SRC_CUDA} ${SRC_OPENCL} ${SRC_HEADERS})
+cycles_add_library(cycles_device "${LIB}"
+ ${SRC}
+ ${SRC_CPU}
+ ${SRC_CUDA}
+ ${SRC_DUMMY}
+ ${SRC_MULTI}
+ ${SRC_OPTIX}
+ ${SRC_HEADERS}
+)
+
+source_group("cpu" FILES ${SRC_CPU})
+source_group("cuda" FILES ${SRC_CUDA})
+source_group("dummy" FILES ${SRC_DUMMY})
+source_group("multi" FILES ${SRC_MULTI})
+source_group("optix" FILES ${SRC_OPTIX})
+source_group("common" FILES ${SRC} ${SRC_HEADERS})
diff --git a/intern/cycles/device/cpu/device.cpp b/intern/cycles/device/cpu/device.cpp
new file mode 100644
index 00000000000..68ca8e8bb22
--- /dev/null
+++ b/intern/cycles/device/cpu/device.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/cpu/device.h"
+#include "device/cpu/device_impl.h"
+
+/* Used for `info.denoisers`. */
+/* TODO(sergey): The denoisers should probably be moved completely out of the device into their
+ * own class. But until then, keep the API consistent with how it worked before. */
+#include "util/util_openimagedenoise.h"
+
+CCL_NAMESPACE_BEGIN
+
+Device *device_cpu_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+{
+ return new CPUDevice(info, stats, profiler);
+}
+
+void device_cpu_info(vector<DeviceInfo> &devices)
+{
+ DeviceInfo info;
+
+ info.type = DEVICE_CPU;
+ info.description = system_cpu_brand_string();
+ info.id = "CPU";
+ info.num = 0;
+ info.has_osl = true;
+ info.has_half_images = true;
+ info.has_nanovdb = true;
+ info.has_profiling = true;
+ if (openimagedenoise_supported()) {
+ info.denoisers |= DENOISER_OPENIMAGEDENOISE;
+ }
+
+ devices.insert(devices.begin(), info);
+}
+
+string device_cpu_capabilities()
+{
+ string capabilities = "";
+ capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
+ capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
+ capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
+ capabilities += system_cpu_support_avx() ? "AVX " : "";
+ capabilities += system_cpu_support_avx2() ? "AVX2" : "";
+ if (!capabilities.empty() && capabilities.back() == ' ')
+ capabilities.resize(capabilities.size() - 1);
+ return capabilities;
+}
+
+CCL_NAMESPACE_END
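
An aside on the capability string built above: tokens are appended with trailing spaces and the last space is trimmed at the end. Below is a standalone sketch of the same pattern; the `has_*` flags are hypothetical stand-ins for the `system_cpu_support_*()` checks, fixed here so the sketch runs anywhere.

#include <iostream>
#include <string>

/* Hypothetical stand-ins for system_cpu_support_*(); fixed values for the sketch. */
static bool has_sse2 = true, has_sse3 = true, has_sse41 = true;
static bool has_avx = false, has_avx2 = false;

static std::string cpu_capabilities()
{
  std::string capabilities;
  capabilities += has_sse2 ? "SSE2 " : "";
  capabilities += has_sse3 ? "SSE3 " : "";
  capabilities += has_sse41 ? "SSE41 " : "";
  capabilities += has_avx ? "AVX " : "";
  capabilities += has_avx2 ? "AVX2" : "";
  /* Trim the trailing space left behind when AVX2 is not in the list. */
  if (!capabilities.empty() && capabilities.back() == ' ')
    capabilities.pop_back();
  return capabilities;
}

int main()
{
  std::cout << cpu_capabilities() << "\n"; /* Prints "SSE2 SSE3 SSE41" for these flags. */
}

The empty-string guard only matters in the theoretical case where no SIMD token is reported; in practice at least SSE2 is present on the x86 CPUs these kernels target.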
diff --git a/intern/cycles/device/cpu/device.h b/intern/cycles/device/cpu/device.h
new file mode 100644
index 00000000000..9cb2e80068d
--- /dev/null
+++ b/intern/cycles/device/cpu/device.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_string.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+Device *device_cpu_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+void device_cpu_info(vector<DeviceInfo> &devices);
+
+string device_cpu_capabilities();
+
+CCL_NAMESPACE_END
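
This header is the entire public surface of the CPU backend: an info query plus a factory. A hedged usage sketch follows, assuming it is compiled inside the Cycles source tree; the wrapper function name is invented for illustration.

#include "device/cpu/device.h"

CCL_NAMESPACE_BEGIN

/* Hypothetical helper: enumerate CPU devices and create the first (and only) one. */
Device *create_cpu_device(Stats &stats, Profiler &profiler)
{
  vector<DeviceInfo> devices;
  device_cpu_info(devices); /* Fills in exactly one DeviceInfo for the CPU. */

  /* Unlike the GPU backends, the CPU backend is always compiled in, so there is
   * no device_cpu_init() gate and creation cannot fail. */
  return device_cpu_create(devices[0], stats, profiler);
}

CCL_NAMESPACE_END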
diff --git a/intern/cycles/device/cpu/device_impl.cpp b/intern/cycles/device/cpu/device_impl.cpp
new file mode 100644
index 00000000000..3b0db6bdd0e
--- /dev/null
+++ b/intern/cycles/device/cpu/device_impl.cpp
@@ -0,0 +1,481 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/cpu/device_impl.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+/* So ImathMath is included before our kernel_cpu_compat. */
+#ifdef WITH_OSL
+/* So no context pollution happens from indirectly included windows.h */
+# include "util/util_windows.h"
+# include <OSL/oslexec.h>
+#endif
+
+#ifdef WITH_EMBREE
+# include <embree3/rtcore.h>
+#endif
+
+#include "device/cpu/kernel.h"
+#include "device/cpu/kernel_thread_globals.h"
+
+#include "device/device.h"
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+#include "kernel/device/cpu/kernel.h"
+#include "kernel/kernel_types.h"
+
+#include "kernel/osl/osl_shader.h"
+#include "kernel/osl/osl_globals.h"
+// clang-format on
+
+#include "bvh/bvh_embree.h"
+
+#include "render/buffers.h"
+
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_function.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_opengl.h"
+#include "util/util_openimagedenoise.h"
+#include "util/util_optimization.h"
+#include "util/util_progress.h"
+#include "util/util_system.h"
+#include "util/util_task.h"
+#include "util/util_thread.h"
+
+CCL_NAMESPACE_BEGIN
+
+CPUDevice::CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_)
+ : Device(info_, stats_, profiler_), texture_info(this, "__texture_info", MEM_GLOBAL)
+{
+ /* Pick any kernel; all of them are supposed to have the same level of microarchitecture
+ * optimization. */
+ VLOG(1) << "Will be using " << kernels.integrator_init_from_camera.get_uarch_name()
+ << " kernels.";
+
+ if (info.cpu_threads == 0) {
+ info.cpu_threads = TaskScheduler::num_threads();
+ }
+
+#ifdef WITH_OSL
+ kernel_globals.osl = &osl_globals;
+#endif
+#ifdef WITH_EMBREE
+ embree_device = rtcNewDevice("verbose=0");
+#endif
+ need_texture_info = false;
+}
+
+CPUDevice::~CPUDevice()
+{
+#ifdef WITH_EMBREE
+ rtcReleaseDevice(embree_device);
+#endif
+
+ texture_info.free();
+}
+
+bool CPUDevice::show_samples() const
+{
+ return (info.cpu_threads == 1);
+}
+
+BVHLayoutMask CPUDevice::get_bvh_layout_mask() const
+{
+ BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2;
+#ifdef WITH_EMBREE
+ bvh_layout_mask |= BVH_LAYOUT_EMBREE;
+#endif /* WITH_EMBREE */
+ return bvh_layout_mask;
+}
+
+bool CPUDevice::load_texture_info()
+{
+ if (!need_texture_info) {
+ return false;
+ }
+
+ texture_info.copy_to_device();
+ need_texture_info = false;
+
+ return true;
+}
+
+void CPUDevice::mem_alloc(device_memory &mem)
+{
+ if (mem.type == MEM_TEXTURE) {
+ assert(!"mem_alloc not supported for textures.");
+ }
+ else if (mem.type == MEM_GLOBAL) {
+ assert(!"mem_alloc not supported for global memory.");
+ }
+ else {
+ if (mem.name) {
+ VLOG(1) << "Buffer allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+ }
+
+ if (mem.type == MEM_DEVICE_ONLY) {
+ assert(!mem.host_pointer);
+ size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES;
+ void *data = util_aligned_malloc(mem.memory_size(), alignment);
+ mem.device_pointer = (device_ptr)data;
+ }
+ else {
+ mem.device_pointer = (device_ptr)mem.host_pointer;
+ }
+
+ mem.device_size = mem.memory_size();
+ stats.mem_alloc(mem.device_size);
+ }
+}
+
+void CPUDevice::mem_copy_to(device_memory &mem)
+{
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ global_alloc(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ tex_alloc((device_texture &)mem);
+ }
+ else {
+ if (!mem.device_pointer) {
+ mem_alloc(mem);
+ }
+
+ /* Copy is a no-op: the device pointer aliases host memory. */
+ }
+}
+
+void CPUDevice::mem_copy_from(
+ device_memory & /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/)
+{
+ /* no-op */
+}
+
+void CPUDevice::mem_zero(device_memory &mem)
+{
+ if (!mem.device_pointer) {
+ mem_alloc(mem);
+ }
+
+ if (mem.device_pointer) {
+ memset((void *)mem.device_pointer, 0, mem.memory_size());
+ }
+}
+
+void CPUDevice::mem_free(device_memory &mem)
+{
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ }
+ else if (mem.device_pointer) {
+ if (mem.type == MEM_DEVICE_ONLY) {
+ util_aligned_free((void *)mem.device_pointer);
+ }
+ mem.device_pointer = 0;
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
+ }
+}
+
+device_ptr CPUDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
+{
+ return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
+}
+
+void CPUDevice::const_copy_to(const char *name, void *host, size_t size)
+{
+#ifdef WITH_EMBREE
+ if (strcmp(name, "__data") == 0) {
+ assert(size <= sizeof(KernelData));
+
+ /* Update the scene handle, since it is different for each device in multi-device setups. */
+ KernelData *const data = (KernelData *)host;
+ data->bvh.scene = embree_scene;
+ }
+#endif
+ kernel_const_copy(&kernel_globals, name, host, size);
+}
+
+void CPUDevice::global_alloc(device_memory &mem)
+{
+ VLOG(1) << "Global memory allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+
+ kernel_global_memory_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size);
+
+ mem.device_pointer = (device_ptr)mem.host_pointer;
+ mem.device_size = mem.memory_size();
+ stats.mem_alloc(mem.device_size);
+}
+
+void CPUDevice::global_free(device_memory &mem)
+{
+ if (mem.device_pointer) {
+ mem.device_pointer = 0;
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
+ }
+}
+
+void CPUDevice::tex_alloc(device_texture &mem)
+{
+ VLOG(1) << "Texture allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+
+ mem.device_pointer = (device_ptr)mem.host_pointer;
+ mem.device_size = mem.memory_size();
+ stats.mem_alloc(mem.device_size);
+
+ const uint slot = mem.slot;
+ if (slot >= texture_info.size()) {
+ /* Allocate some slots in advance, to reduce the number of re-allocations. */
+ texture_info.resize(slot + 128);
+ }
+
+ texture_info[slot] = mem.info;
+ texture_info[slot].data = (uint64_t)mem.host_pointer;
+ need_texture_info = true;
+}
+
+void CPUDevice::tex_free(device_texture &mem)
+{
+ if (mem.device_pointer) {
+ mem.device_pointer = 0;
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
+ need_texture_info = true;
+ }
+}
+
+void CPUDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
+{
+#ifdef WITH_EMBREE
+ if (bvh->params.bvh_layout == BVH_LAYOUT_EMBREE ||
+ bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) {
+ BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh);
+ if (refit) {
+ bvh_embree->refit(progress);
+ }
+ else {
+ bvh_embree->build(progress, &stats, embree_device);
+ }
+
+ if (bvh->params.top_level) {
+ embree_scene = bvh_embree->scene;
+ }
+ }
+ else
+#endif
+ Device::build_bvh(bvh, progress, refit);
+}
+
+#if 0
+void CPUDevice::render(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
+{
+ const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE;
+
+ scoped_timer timer(&tile.buffers->render_time);
+
+ Coverage coverage(kg, tile);
+ if (use_coverage) {
+ coverage.init_path_trace();
+ }
+
+ float *render_buffer = (float *)tile.buffer;
+ int start_sample = tile.start_sample;
+ int end_sample = tile.start_sample + tile.num_samples;
+
+ /* Needed for Embree. */
+ SIMD_SET_FLUSH_TO_ZERO;
+
+ for (int sample = start_sample; sample < end_sample; sample++) {
+ if (task.get_cancel() || TaskPool::canceled()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+
+ if (tile.stealing_state == RenderTile::CAN_BE_STOLEN && task.get_tile_stolen()) {
+ tile.stealing_state = RenderTile::WAS_STOLEN;
+ break;
+ }
+
+ if (tile.task == RenderTile::PATH_TRACE) {
+ for (int y = tile.y; y < tile.y + tile.h; y++) {
+ for (int x = tile.x; x < tile.x + tile.w; x++) {
+ if (use_coverage) {
+ coverage.init_pixel(x, y);
+ }
+ kernels.path_trace(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
+ }
+ }
+ }
+ else {
+ for (int y = tile.y; y < tile.y + tile.h; y++) {
+ for (int x = tile.x; x < tile.x + tile.w; x++) {
+ kernels.bake(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
+ }
+ }
+ }
+ tile.sample = sample + 1;
+
+ if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) {
+ const bool stop = adaptive_sampling_filter(kg, tile, sample);
+ if (stop) {
+ const int num_progress_samples = end_sample - sample;
+ tile.sample = end_sample;
+ task.update_progress(&tile, tile.w * tile.h * num_progress_samples);
+ break;
+ }
+ }
+
+ task.update_progress(&tile, tile.w * tile.h);
+ }
+ if (use_coverage) {
+ coverage.finalize();
+ }
+
+ if (task.adaptive_sampling.use && (tile.stealing_state != RenderTile::WAS_STOLEN)) {
+ adaptive_sampling_post(tile, kg);
+ }
+}
+
+void CPUDevice::thread_render(DeviceTask &task)
+{
+ if (TaskPool::canceled()) {
+ if (task.need_finish_queue == false)
+ return;
+ }
+
+ /* allocate buffer for kernel globals */
+ CPUKernelThreadGlobals kg(kernel_globals, get_cpu_osl_memory());
+
+ profiler.add_state(&kg.profiler);
+
+ /* NLM denoiser. */
+ DenoisingTask *denoising = NULL;
+
+ /* OpenImageDenoise: we can only denoise with one thread at a time, so to
+ * avoid waiting with mutex locks in the denoiser, we let only a single
+ * thread acquire denoising tiles. */
+ uint tile_types = task.tile_types;
+ bool hold_denoise_lock = false;
+ if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
+ if (!oidn_task_lock.try_lock()) {
+ tile_types &= ~RenderTile::DENOISE;
+ hold_denoise_lock = true;
+ }
+ }
+
+ RenderTile tile;
+ while (task.acquire_tile(this, tile, tile_types)) {
+ if (tile.task == RenderTile::PATH_TRACE) {
+ render(task, tile, &kg);
+ }
+ else if (tile.task == RenderTile::BAKE) {
+ render(task, tile, &kg);
+ }
+ else if (tile.task == RenderTile::DENOISE) {
+ denoise_openimagedenoise(task, tile);
+ task.update_progress(&tile, tile.w * tile.h);
+ }
+
+ task.release_tile(tile);
+
+ if (TaskPool::canceled()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+ }
+
+ if (hold_denoise_lock) {
+ oidn_task_lock.unlock();
+ }
+
+ profiler.remove_state(&kg.profiler);
+
+ delete denoising;
+}
+
+void CPUDevice::thread_denoise(DeviceTask &task)
+{
+ RenderTile tile;
+ tile.x = task.x;
+ tile.y = task.y;
+ tile.w = task.w;
+ tile.h = task.h;
+ tile.buffer = task.buffer;
+ tile.sample = task.sample + task.num_samples;
+ tile.num_samples = task.num_samples;
+ tile.start_sample = task.sample;
+ tile.offset = task.offset;
+ tile.stride = task.stride;
+ tile.buffers = task.buffers;
+
+ denoise_openimagedenoise(task, tile);
+
+ task.update_progress(&tile, tile.w * tile.h);
+}
+#endif
+
+const CPUKernels *CPUDevice::get_cpu_kernels() const
+{
+ return &kernels;
+}
+
+void CPUDevice::get_cpu_kernel_thread_globals(
+ vector<CPUKernelThreadGlobals> &kernel_thread_globals)
+{
+ /* Ensure latest texture info is loaded into kernel globals before returning. */
+ load_texture_info();
+
+ kernel_thread_globals.clear();
+ void *osl_memory = get_cpu_osl_memory();
+ for (int i = 0; i < info.cpu_threads; i++) {
+ kernel_thread_globals.emplace_back(kernel_globals, osl_memory, profiler);
+ }
+}
+
+void *CPUDevice::get_cpu_osl_memory()
+{
+#ifdef WITH_OSL
+ return &osl_globals;
+#else
+ return NULL;
+#endif
+}
+
+bool CPUDevice::load_kernels(const uint /*kernel_features*/)
+{
+ return true;
+}
+
+CCL_NAMESPACE_END
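
One idea runs through all the memory paths above: on the CPU the device pointer usually just aliases host memory, so `mem_copy_to()`/`mem_copy_from()` degenerate to no-ops and only `MEM_DEVICE_ONLY` buffers own a separate aligned allocation. A toy C++17 illustration of that aliasing; `ToyMemory` and the functions are hypothetical, not Cycles types.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>

/* Hypothetical stand-in for device_memory, reduced to what the sketch needs. */
struct ToyMemory {
  void *host_pointer = nullptr;
  uintptr_t device_pointer = 0;
  bool device_only = false;
  std::size_t size = 0;
};

static void toy_mem_alloc(ToyMemory &mem)
{
  if (mem.device_only) {
    /* Device-only buffers own a private allocation (std::aligned_alloc stands in
     * for util_aligned_malloc; the size must be a multiple of the alignment). */
    mem.device_pointer = (uintptr_t)std::aligned_alloc(16, mem.size);
  }
  else {
    /* Otherwise the "device" pointer simply aliases host memory,
     * which is why a later copy-to is a no-op. */
    mem.device_pointer = (uintptr_t)mem.host_pointer;
  }
}

static void toy_mem_free(ToyMemory &mem)
{
  if (mem.device_only && mem.device_pointer) {
    std::free((void *)mem.device_pointer); /* Only device-only buffers own memory. */
  }
  mem.device_pointer = 0;
}

int main()
{
  char host[64];
  ToyMemory mem = {host, 0, false, sizeof(host)};
  toy_mem_alloc(mem);
  assert((void *)mem.device_pointer == host); /* Aliased: nothing to copy. */
  toy_mem_free(mem);
}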
diff --git a/intern/cycles/device/cpu/device_impl.h b/intern/cycles/device/cpu/device_impl.h
new file mode 100644
index 00000000000..7d222808652
--- /dev/null
+++ b/intern/cycles/device/cpu/device_impl.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+/* So ImathMath is included before our kernel_cpu_compat. */
+#ifdef WITH_OSL
+/* So no context pollution happens from indirectly included windows.h */
+# include "util/util_windows.h"
+# include <OSL/oslexec.h>
+#endif
+
+#ifdef WITH_EMBREE
+# include <embree3/rtcore.h>
+#endif
+
+#include "device/cpu/kernel.h"
+#include "device/device.h"
+#include "device/device_memory.h"
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/kernel.h"
+#include "kernel/device/cpu/globals.h"
+
+#include "kernel/osl/osl_shader.h"
+#include "kernel/osl/osl_globals.h"
+// clang-format on
+
+CCL_NAMESPACE_BEGIN
+
+class CPUDevice : public Device {
+ public:
+ KernelGlobals kernel_globals;
+
+ device_vector<TextureInfo> texture_info;
+ bool need_texture_info;
+
+#ifdef WITH_OSL
+ OSLGlobals osl_globals;
+#endif
+#ifdef WITH_EMBREE
+ RTCScene embree_scene = NULL;
+ RTCDevice embree_device;
+#endif
+
+ CPUKernels kernels;
+
+ CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_);
+ ~CPUDevice();
+
+ virtual bool show_samples() const override;
+
+ virtual BVHLayoutMask get_bvh_layout_mask() const override;
+
+ /* Returns true if the texture info was copied to the device (meaning some further
+ * re-initialization might be needed). */
+ bool load_texture_info();
+
+ virtual void mem_alloc(device_memory &mem) override;
+ virtual void mem_copy_to(device_memory &mem) override;
+ virtual void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override;
+ virtual void mem_zero(device_memory &mem) override;
+ virtual void mem_free(device_memory &mem) override;
+ virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override;
+
+ virtual void const_copy_to(const char *name, void *host, size_t size) override;
+
+ void global_alloc(device_memory &mem);
+ void global_free(device_memory &mem);
+
+ void tex_alloc(device_texture &mem);
+ void tex_free(device_texture &mem);
+
+ void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
+
+ virtual const CPUKernels *get_cpu_kernels() const override;
+ virtual void get_cpu_kernel_thread_globals(
+ vector<CPUKernelThreadGlobals> &kernel_thread_globals) override;
+ virtual void *get_cpu_osl_memory() override;
+
+ protected:
+ virtual bool load_kernels(uint /*kernel_features*/) override;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/cpu/kernel.cpp b/intern/cycles/device/cpu/kernel.cpp
new file mode 100644
index 00000000000..0ab58ff8600
--- /dev/null
+++ b/intern/cycles/device/cpu/kernel.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/cpu/kernel.h"
+
+#include "kernel/device/cpu/kernel.h"
+
+CCL_NAMESPACE_BEGIN
+
+#define KERNEL_FUNCTIONS(name) \
+ KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \
+ KERNEL_NAME_EVAL(cpu_sse3, name), KERNEL_NAME_EVAL(cpu_sse41, name), \
+ KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name)
+
+#define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name))
+
+CPUKernels::CPUKernels()
+ : /* Integrator. */
+ REGISTER_KERNEL(integrator_init_from_camera),
+ REGISTER_KERNEL(integrator_init_from_bake),
+ REGISTER_KERNEL(integrator_intersect_closest),
+ REGISTER_KERNEL(integrator_intersect_shadow),
+ REGISTER_KERNEL(integrator_intersect_subsurface),
+ REGISTER_KERNEL(integrator_intersect_volume_stack),
+ REGISTER_KERNEL(integrator_shade_background),
+ REGISTER_KERNEL(integrator_shade_light),
+ REGISTER_KERNEL(integrator_shade_shadow),
+ REGISTER_KERNEL(integrator_shade_surface),
+ REGISTER_KERNEL(integrator_shade_volume),
+ REGISTER_KERNEL(integrator_megakernel),
+ /* Shader evaluation. */
+ REGISTER_KERNEL(shader_eval_displace),
+ REGISTER_KERNEL(shader_eval_background),
+ /* Adaptive sampling. */
+ REGISTER_KERNEL(adaptive_sampling_convergence_check),
+ REGISTER_KERNEL(adaptive_sampling_filter_x),
+ REGISTER_KERNEL(adaptive_sampling_filter_y),
+ /* Cryptomatte. */
+ REGISTER_KERNEL(cryptomatte_postprocess),
+ /* Bake. */
+ REGISTER_KERNEL(bake)
+{
+}
+
+#undef REGISTER_KERNEL
+#undef KERNEL_FUNCTIONS
+
+CCL_NAMESPACE_END
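
The two macros above are dense, so here is a runnable miniature of the same registration trick, shrunk from six microarchitecture variants to two. All names are hypothetical, invented for the demo; only the token-pasting mechanics mirror the diff.

#include <cstdio>

#define KERNEL_NAME_JOIN(arch, name) kernel_##arch##_##name
#define KERNEL_FUNCTIONS(name) KERNEL_NAME_JOIN(cpu, name), KERNEL_NAME_JOIN(cpu_avx2, name)
#define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name))

static void kernel_cpu_demo() { std::puts("default"); }
static void kernel_cpu_avx2_demo() { std::puts("avx2"); }

struct MiniKernels {
  /* Stand-in for CPUKernelFunction: hold both pointers, call the chosen one. */
  struct Fn {
    void (*def)();
    void (*avx2)();
    Fn(void (*d)(), void (*a)()) : def(d), avx2(a) {}
    void operator()(bool has_avx2) const { (has_avx2 ? avx2 : def)(); }
  };

  Fn demo;

  /* REGISTER_KERNEL(demo) expands to: demo(kernel_cpu_demo, kernel_cpu_avx2_demo) */
  MiniKernels() : REGISTER_KERNEL(demo) {}
};

int main()
{
  MiniKernels kernels;
  kernels.demo(/*has_avx2=*/true); /* Prints "avx2". */
}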
diff --git a/intern/cycles/device/cpu/kernel.h b/intern/cycles/device/cpu/kernel.h
new file mode 100644
index 00000000000..54b18308544
--- /dev/null
+++ b/intern/cycles/device/cpu/kernel.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "device/cpu/kernel_function.h"
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct KernelGlobals;
+struct IntegratorStateCPU;
+struct TileInfo;
+
+class CPUKernels {
+ public:
+ /* Integrator. */
+
+ using IntegratorFunction =
+ CPUKernelFunction<void (*)(const KernelGlobals *kg, IntegratorStateCPU *state)>;
+ using IntegratorShadeFunction = CPUKernelFunction<void (*)(
+ const KernelGlobals *kg, IntegratorStateCPU *state, ccl_global float *render_buffer)>;
+ using IntegratorInitFunction = CPUKernelFunction<bool (*)(const KernelGlobals *kg,
+ IntegratorStateCPU *state,
+ KernelWorkTile *tile,
+ ccl_global float *render_buffer)>;
+
+ IntegratorInitFunction integrator_init_from_camera;
+ IntegratorInitFunction integrator_init_from_bake;
+ IntegratorFunction integrator_intersect_closest;
+ IntegratorFunction integrator_intersect_shadow;
+ IntegratorFunction integrator_intersect_subsurface;
+ IntegratorFunction integrator_intersect_volume_stack;
+ IntegratorShadeFunction integrator_shade_background;
+ IntegratorShadeFunction integrator_shade_light;
+ IntegratorShadeFunction integrator_shade_shadow;
+ IntegratorShadeFunction integrator_shade_surface;
+ IntegratorShadeFunction integrator_shade_volume;
+ IntegratorShadeFunction integrator_megakernel;
+
+ /* Shader evaluation. */
+
+ using ShaderEvalFunction = CPUKernelFunction<void (*)(
+ const KernelGlobals *kg, const KernelShaderEvalInput *, float4 *, const int)>;
+
+ ShaderEvalFunction shader_eval_displace;
+ ShaderEvalFunction shader_eval_background;
+
+ /* Adaptive stopping. */
+
+ using AdaptiveSamplingConvergenceCheckFunction =
+ CPUKernelFunction<bool (*)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int y,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride)>;
+
+ using AdaptiveSamplingFilterXFunction =
+ CPUKernelFunction<void (*)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int y,
+ int start_x,
+ int width,
+ int offset,
+ int stride)>;
+
+ using AdaptiveSamplingFilterYFunction =
+ CPUKernelFunction<void (*)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int start_y,
+ int height,
+ int offset,
+ int stride)>;
+
+ AdaptiveSamplingConvergenceCheckFunction adaptive_sampling_convergence_check;
+
+ AdaptiveSamplingFilterXFunction adaptive_sampling_filter_x;
+ AdaptiveSamplingFilterYFunction adaptive_sampling_filter_y;
+
+ /* Cryptomatte. */
+
+ using CryptomattePostprocessFunction = CPUKernelFunction<void (*)(
+ const KernelGlobals *kg, ccl_global float *render_buffer, int pixel_index)>;
+
+ CryptomattePostprocessFunction cryptomatte_postprocess;
+
+ /* Bake. */
+
+ CPUKernelFunction<void (*)(const KernelGlobals *, float *, int, int, int, int, int)> bake;
+
+ CPUKernels();
+};
+
+CCL_NAMESPACE_END
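
Each member above behaves like a plain function, so a call site pays only a single indirect call. A hedged sketch of such a call site, assuming `device`, `kg`, `state`, `tile` and `render_buffer` were initialized elsewhere in the Cycles tree; the signatures follow the typedefs in this header.

const CPUKernels &kernels = *device->get_cpu_kernels();

if (kernels.integrator_init_from_camera(kg, state, tile, render_buffer)) {
  kernels.integrator_intersect_closest(kg, state);
  kernels.integrator_shade_surface(kg, state, render_buffer);
}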
diff --git a/intern/cycles/device/cpu/kernel_function.h b/intern/cycles/device/cpu/kernel_function.h
new file mode 100644
index 00000000000..aa18720cc24
--- /dev/null
+++ b/intern/cycles/device/cpu/kernel_function.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_debug.h"
+#include "util/util_system.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* A wrapper around the per-microarchitecture variants of a kernel function.
+ *
+ * Provides a function-call-like API which gets routed to the most suitable implementation.
+ *
+ * For example, on a computer which only supports up to SSE4.1, the kernel_sse41 variant will
+ * be used. */
+template<typename FunctionType> class CPUKernelFunction {
+ public:
+ CPUKernelFunction(FunctionType kernel_default,
+ FunctionType kernel_sse2,
+ FunctionType kernel_sse3,
+ FunctionType kernel_sse41,
+ FunctionType kernel_avx,
+ FunctionType kernel_avx2)
+ {
+ kernel_info_ = get_best_kernel_info(
+ kernel_default, kernel_sse2, kernel_sse3, kernel_sse41, kernel_avx, kernel_avx2);
+ }
+
+ template<typename... Args> inline auto operator()(Args... args) const
+ {
+ assert(kernel_info_.kernel);
+
+ return kernel_info_.kernel(args...);
+ }
+
+ const char *get_uarch_name() const
+ {
+ return kernel_info_.uarch_name;
+ }
+
+ protected:
+ /* Helper class which allows passing a human-readable microarchitecture name together with the
+ * function pointer. */
+ class KernelInfo {
+ public:
+ KernelInfo() : KernelInfo("", nullptr)
+ {
+ }
+
+ /* TODO(sergey): Use a string view to get higher-level functionality (e.g. comparison) without
+ * memory allocation. */
+ KernelInfo(const char *uarch_name, FunctionType kernel)
+ : uarch_name(uarch_name), kernel(kernel)
+ {
+ }
+
+ const char *uarch_name;
+ FunctionType kernel;
+ };
+
+ KernelInfo get_best_kernel_info(FunctionType kernel_default,
+ FunctionType kernel_sse2,
+ FunctionType kernel_sse3,
+ FunctionType kernel_sse41,
+ FunctionType kernel_avx,
+ FunctionType kernel_avx2)
+ {
+ /* Silence warnings about unused variables when compiling without some architectures. */
+ (void)kernel_sse2;
+ (void)kernel_sse3;
+ (void)kernel_sse41;
+ (void)kernel_avx;
+ (void)kernel_avx2;
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+ if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) {
+ return KernelInfo("AVX2", kernel_avx2);
+ }
+#endif
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+ if (DebugFlags().cpu.has_avx() && system_cpu_support_avx()) {
+ return KernelInfo("AVX", kernel_avx);
+ }
+#endif
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+ if (DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) {
+ return KernelInfo("SSE4.1", kernel_sse41);
+ }
+#endif
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+ if (DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) {
+ return KernelInfo("SSE3", kernel_sse3);
+ }
+#endif
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+ if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
+ return KernelInfo("SSE2", kernel_sse2);
+ }
+#endif
+
+ return KernelInfo("default", kernel_default);
+ }
+
+ KernelInfo kernel_info_;
+};
+
+CCL_NAMESPACE_END
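
The key property of `CPUKernelFunction` is that `get_best_kernel_info()` runs once, in the constructor; after that, every invocation is a stored-pointer call with no per-call branching. A self-contained miniature with two variants, where a plain boolean stands in for the `DebugFlags()`/`system_cpu_support_*()` checks:

#include <cassert>
#include <cstdio>

template<typename FunctionType> class MiniKernelFunction {
 public:
  MiniKernelFunction(FunctionType kernel_default, FunctionType kernel_avx2, bool cpu_has_avx2)
  {
    /* Mirror of get_best_kernel_info(): walk from the widest ISA down, once. */
    if (cpu_has_avx2) {
      uarch_name_ = "AVX2";
      kernel_ = kernel_avx2;
    }
    else {
      uarch_name_ = "default";
      kernel_ = kernel_default;
    }
  }

  template<typename... Args> auto operator()(Args... args) const
  {
    assert(kernel_);
    return kernel_(args...);
  }

  const char *get_uarch_name() const
  {
    return uarch_name_;
  }

 private:
  const char *uarch_name_;
  FunctionType kernel_;
};

static int scale_default(int x) { return 2 * x; }
static int scale_avx2(int x) { return 2 * x; } /* Pretend this one is vectorized. */

int main()
{
  MiniKernelFunction<int (*)(int)> scale(scale_default, scale_avx2, /*cpu_has_avx2=*/true);
  std::printf("%s: %d\n", scale.get_uarch_name(), scale(21)); /* Prints "AVX2: 42". */
}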
diff --git a/intern/cycles/device/cpu/kernel_thread_globals.cpp b/intern/cycles/device/cpu/kernel_thread_globals.cpp
new file mode 100644
index 00000000000..988b00cd1f0
--- /dev/null
+++ b/intern/cycles/device/cpu/kernel_thread_globals.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/cpu/kernel_thread_globals.h"
+
+// clang-format off
+#include "kernel/osl/osl_shader.h"
+#include "kernel/osl/osl_globals.h"
+// clang-format on
+
+#include "util/util_profiling.h"
+
+CCL_NAMESPACE_BEGIN
+
+CPUKernelThreadGlobals::CPUKernelThreadGlobals(const KernelGlobals &kernel_globals,
+ void *osl_globals_memory,
+ Profiler &cpu_profiler)
+ : KernelGlobals(kernel_globals), cpu_profiler_(cpu_profiler)
+{
+ reset_runtime_memory();
+
+#ifdef WITH_OSL
+ OSLShader::thread_init(this, reinterpret_cast<OSLGlobals *>(osl_globals_memory));
+#else
+ (void)osl_globals_memory;
+#endif
+}
+
+CPUKernelThreadGlobals::CPUKernelThreadGlobals(CPUKernelThreadGlobals &&other) noexcept
+ : KernelGlobals(std::move(other)), cpu_profiler_(other.cpu_profiler_)
+{
+ other.reset_runtime_memory();
+}
+
+CPUKernelThreadGlobals::~CPUKernelThreadGlobals()
+{
+#ifdef WITH_OSL
+ OSLShader::thread_free(this);
+#endif
+}
+
+CPUKernelThreadGlobals &CPUKernelThreadGlobals::operator=(CPUKernelThreadGlobals &&other)
+{
+ if (this == &other) {
+ return *this;
+ }
+
+ *static_cast<KernelGlobals *>(this) = *static_cast<KernelGlobals *>(&other);
+
+ other.reset_runtime_memory();
+
+ return *this;
+}
+
+void CPUKernelThreadGlobals::reset_runtime_memory()
+{
+#ifdef WITH_OSL
+ osl = nullptr;
+#endif
+}
+
+void CPUKernelThreadGlobals::start_profiling()
+{
+ cpu_profiler_.add_state(&profiler);
+}
+
+void CPUKernelThreadGlobals::stop_profiling()
+{
+ cpu_profiler_.remove_state(&profiler);
+}
+
+CCL_NAMESPACE_END
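
The move operations above exist so a `vector<CPUKernelThreadGlobals>` can grow without double-freeing per-thread OSL state: `reset_runtime_memory()` nulls the moved-from object's `osl` pointer, so its destructor's `OSLShader::thread_free()` has nothing to release. A toy version of the same ownership rule; `Resource` is hypothetical.

#include <utility>

struct Resource {
  int *handle = nullptr;

  Resource() : handle(new int(42)) {}

  /* Mirror of the move constructor above: steal the pointer, then reset the
   * moved-from object so it no longer claims ownership. */
  Resource(Resource &&other) noexcept : handle(other.handle)
  {
    other.handle = nullptr;
  }

  ~Resource()
  {
    delete handle; /* Safe on moved-from objects: deleting nullptr is a no-op. */
  }
};

int main()
{
  Resource a;
  Resource b(std::move(a)); /* 'a' gives up the handle; no double free at scope exit. */
}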
diff --git a/intern/cycles/device/cpu/kernel_thread_globals.h b/intern/cycles/device/cpu/kernel_thread_globals.h
new file mode 100644
index 00000000000..d005c3bb56c
--- /dev/null
+++ b/intern/cycles/device/cpu/kernel_thread_globals.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Profiler;
+
+/* A special class which extends `KernelGlobals` with ownership of any resource which is not
+ * thread-safe to share between threads. Every worker thread which needs to operate on
+ * `KernelGlobals` must initialize its own copy of this object.
+ *
+ * NOTE: Only a minimal subset of objects is copied: `KernelData` is never copied. This means
+ * that no unnecessary data duplication happens when using this object. */
+class CPUKernelThreadGlobals : public KernelGlobals {
+ public:
+ /* TODO(sergey): Would be nice to have a properly typed OSLGlobals even when building
+ * without OSL support. That would avoid the need for those untyped pointers and casts. */
+ CPUKernelThreadGlobals(const KernelGlobals &kernel_globals,
+ void *osl_globals_memory,
+ Profiler &cpu_profiler);
+
+ ~CPUKernelThreadGlobals();
+
+ CPUKernelThreadGlobals(const CPUKernelThreadGlobals &other) = delete;
+ CPUKernelThreadGlobals(CPUKernelThreadGlobals &&other) noexcept;
+
+ CPUKernelThreadGlobals &operator=(const CPUKernelThreadGlobals &other) = delete;
+ CPUKernelThreadGlobals &operator=(CPUKernelThreadGlobals &&other);
+
+ void start_profiling();
+ void stop_profiling();
+
+ protected:
+ void reset_runtime_memory();
+
+ Profiler &cpu_profiler_;
+};
+
+CCL_NAMESPACE_END
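
Putting the thread-globals pieces together: the device hands out one `CPUKernelThreadGlobals` per worker, and each worker brackets its kernel work with `start_profiling()`/`stop_profiling()`. A hedged sketch of that pattern, assuming compilation inside the Cycles tree; the plain `std::thread` driver is illustrative, not the scheduler Cycles actually uses.

#include <thread>

#include "device/cpu/device_impl.h"

CCL_NAMESPACE_BEGIN

/* Hypothetical driver: one worker thread per entry in kernel_thread_globals. */
void run_workers(CPUDevice *device)
{
  vector<CPUKernelThreadGlobals> kernel_thread_globals;
  device->get_cpu_kernel_thread_globals(kernel_thread_globals);

  vector<std::thread> workers;
  for (int i = 0; i < device->info.cpu_threads; i++) {
    workers.emplace_back([&kernel_thread_globals, i]() {
      CPUKernelThreadGlobals *kg = &kernel_thread_globals[i];
      kg->start_profiling();
      /* ... run integrator kernels against kg ... */
      kg->stop_profiling();
    });
  }
  for (std::thread &worker : workers) {
    worker.join();
  }
}

CCL_NAMESPACE_END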
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/cuda/device.cpp
index 2e225ecfaf8..84becd6d081 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/cuda/device.cpp
@@ -14,21 +14,25 @@
* limitations under the License.
*/
-#ifdef WITH_CUDA
+#include "device/cuda/device.h"
+
+#include "util/util_logging.h"
-# include "device/cuda/device_cuda.h"
+#ifdef WITH_CUDA
+# include "device/cuda/device_impl.h"
# include "device/device.h"
-# include "device/device_intern.h"
-# include "util/util_logging.h"
# include "util/util_string.h"
# include "util/util_windows.h"
+#endif /* WITH_CUDA */
CCL_NAMESPACE_BEGIN
bool device_cuda_init()
{
-# ifdef WITH_CUDA_DYNLOAD
+#if !defined(WITH_CUDA)
+ return false;
+#elif defined(WITH_CUDA_DYNLOAD)
static bool initialized = false;
static bool result = false;
@@ -59,16 +63,27 @@ bool device_cuda_init()
}
return result;
-# else /* WITH_CUDA_DYNLOAD */
+#else /* WITH_CUDA_DYNLOAD */
return true;
-# endif /* WITH_CUDA_DYNLOAD */
+#endif /* WITH_CUDA_DYNLOAD */
}
-Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
+Device *device_cuda_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
{
- return new CUDADevice(info, stats, profiler, background);
+#ifdef WITH_CUDA
+ return new CUDADevice(info, stats, profiler);
+#else
+ (void)info;
+ (void)stats;
+ (void)profiler;
+
+ LOG(FATAL) << "Request to create CUDA device without compiled-in support. Should never happen.";
+
+ return nullptr;
+#endif
}
+#ifdef WITH_CUDA
static CUresult device_cuda_safe_init()
{
# ifdef _WIN32
@@ -86,9 +101,11 @@ static CUresult device_cuda_safe_init()
return cuInit(0);
# endif
}
+#endif /* WITH_CUDA */
void device_cuda_info(vector<DeviceInfo> &devices)
{
+#ifdef WITH_CUDA
CUresult result = device_cuda_safe_init();
if (result != CUDA_SUCCESS) {
if (result != CUDA_ERROR_NO_DEVICE)
@@ -129,9 +146,9 @@ void device_cuda_info(vector<DeviceInfo> &devices)
info.has_half_images = (major >= 3);
info.has_nanovdb = true;
- info.has_volume_decoupled = false;
- info.has_adaptive_stop_per_sample = false;
- info.denoisers = DENOISER_NLM;
+ info.denoisers = 0;
+
+ info.has_gpu_queue = true;
/* Check if the device has P2P access to any other device in the system. */
for (int peer_num = 0; peer_num < count && !info.has_peer_memory; peer_num++) {
@@ -182,10 +199,14 @@ void device_cuda_info(vector<DeviceInfo> &devices)
if (!display_devices.empty())
devices.insert(devices.end(), display_devices.begin(), display_devices.end());
+#else /* WITH_CUDA */
+ (void)devices;
+#endif /* WITH_CUDA */
}
string device_cuda_capabilities()
{
+#ifdef WITH_CUDA
CUresult result = device_cuda_safe_init();
if (result != CUDA_SUCCESS) {
if (result != CUDA_ERROR_NO_DEVICE) {
@@ -310,8 +331,10 @@ string device_cuda_capabilities()
}
return capabilities;
+
+#else /* WITH_CUDA */
+ return "";
+#endif /* WITH_CUDA */
}
CCL_NAMESPACE_END
-
-#endif
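
The reshuffled guards above implement one contract for optional backends: the query functions always exist and degrade gracefully when support is compiled out (`device_cuda_init()` returns false, `device_cuda_info()` adds nothing, `device_cuda_capabilities()` returns an empty string), while the factory aborts loudly because callers are expected to gate on `device_cuda_init()` first. A condensed sketch of that contract for a hypothetical `WITH_FOO` backend; `FooDevice` and `foo_runtime_available()` are invented names.

bool device_foo_init()
{
#ifdef WITH_FOO
  return foo_runtime_available(); /* Compiled in, but may still fail at runtime. */
#else
  return false; /* Compiled out: report unavailable instead of crashing. */
#endif
}

Device *device_foo_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
{
#ifdef WITH_FOO
  return new FooDevice(info, stats, profiler);
#else
  (void)info;
  (void)stats;
  (void)profiler;
  /* Reaching this is a caller bug: device_foo_init() returned false. */
  LOG(FATAL) << "Request to create FOO device without compiled-in support.";
  return nullptr;
#endif
}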
diff --git a/intern/cycles/device/cuda/device.h b/intern/cycles/device/cuda/device.h
new file mode 100644
index 00000000000..b0484904d1a
--- /dev/null
+++ b/intern/cycles/device/cuda/device.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_string.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+bool device_cuda_init();
+
+Device *device_cuda_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+void device_cuda_info(vector<DeviceInfo> &devices);
+
+string device_cuda_capabilities();
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h
deleted file mode 100644
index c3271c3cfcf..00000000000
--- a/intern/cycles/device/cuda/device_cuda.h
+++ /dev/null
@@ -1,270 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_CUDA
-
-# include "device/device.h"
-# include "device/device_denoising.h"
-# include "device/device_split_kernel.h"
-
-# include "util/util_map.h"
-# include "util/util_task.h"
-
-# ifdef WITH_CUDA_DYNLOAD
-# include "cuew.h"
-# else
-# include "util/util_opengl.h"
-# include <cuda.h>
-# include <cudaGL.h>
-# endif
-
-CCL_NAMESPACE_BEGIN
-
-class CUDASplitKernel;
-
-class CUDADevice : public Device {
-
- friend class CUDASplitKernelFunction;
- friend class CUDASplitKernel;
- friend class CUDAContextScope;
-
- public:
- DedicatedTaskPool task_pool;
- CUdevice cuDevice;
- CUcontext cuContext;
- CUmodule cuModule, cuFilterModule;
- size_t device_texture_headroom;
- size_t device_working_headroom;
- bool move_texture_to_host;
- size_t map_host_used;
- size_t map_host_limit;
- int can_map_host;
- int pitch_alignment;
- int cuDevId;
- int cuDevArchitecture;
- bool first_error;
- CUDASplitKernel *split_kernel;
-
- struct CUDAMem {
- CUDAMem() : texobject(0), array(0), use_mapped_host(false)
- {
- }
-
- CUtexObject texobject;
- CUarray array;
-
- /* If true, a mapped host memory in shared_pointer is being used. */
- bool use_mapped_host;
- };
- typedef map<device_memory *, CUDAMem> CUDAMemMap;
- CUDAMemMap cuda_mem_map;
- thread_mutex cuda_mem_map_mutex;
-
- struct PixelMem {
- GLuint cuPBO;
- CUgraphicsResource cuPBOresource;
- GLuint cuTexId;
- int w, h;
- };
- map<device_ptr, PixelMem> pixel_mem_map;
-
- /* Bindless Textures */
- device_vector<TextureInfo> texture_info;
- bool need_texture_info;
-
- /* Kernels */
- struct {
- bool loaded;
-
- CUfunction adaptive_stopping;
- CUfunction adaptive_filter_x;
- CUfunction adaptive_filter_y;
- CUfunction adaptive_scale_samples;
- int adaptive_num_threads_per_block;
- } functions;
-
- static bool have_precompiled_kernels();
-
- virtual bool show_samples() const override;
-
- virtual BVHLayoutMask get_bvh_layout_mask() const override;
-
- void set_error(const string &error) override;
-
- CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_);
-
- virtual ~CUDADevice();
-
- bool support_device(const DeviceRequestedFeatures & /*requested_features*/);
-
- bool check_peer_access(Device *peer_device) override;
-
- bool use_adaptive_compilation();
-
- bool use_split_kernel();
-
- virtual string compile_kernel_get_common_cflags(
- const DeviceRequestedFeatures &requested_features, bool filter = false, bool split = false);
-
- string compile_kernel(const DeviceRequestedFeatures &requested_features,
- const char *name,
- const char *base = "cuda",
- bool force_ptx = false);
-
- virtual bool load_kernels(const DeviceRequestedFeatures &requested_features) override;
-
- void load_functions();
-
- void reserve_local_memory(const DeviceRequestedFeatures &requested_features);
-
- void init_host_memory();
-
- void load_texture_info();
-
- void move_textures_to_host(size_t size, bool for_texture);
-
- CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
-
- void generic_copy_to(device_memory &mem);
-
- void generic_free(device_memory &mem);
-
- void mem_alloc(device_memory &mem) override;
-
- void mem_copy_to(device_memory &mem) override;
-
- void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override;
-
- void mem_zero(device_memory &mem) override;
-
- void mem_free(device_memory &mem) override;
-
- device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override;
-
- virtual void const_copy_to(const char *name, void *host, size_t size) override;
-
- void global_alloc(device_memory &mem);
-
- void global_free(device_memory &mem);
-
- void tex_alloc(device_texture &mem);
-
- void tex_free(device_texture &mem);
-
- bool denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task);
-
- bool denoising_construct_transform(DenoisingTask *task);
-
- bool denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task);
-
- bool denoising_solve(device_ptr output_ptr, DenoisingTask *task);
-
- bool denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect,
- DenoisingTask *task);
-
- bool denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task);
-
- bool denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task);
-
- bool denoising_write_feature(int out_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task);
-
- bool denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task);
-
- void denoise(RenderTile &rtile, DenoisingTask &denoising);
-
- void adaptive_sampling_filter(uint filter_sample,
- WorkTile *wtile,
- CUdeviceptr d_wtile,
- CUstream stream = 0);
- void adaptive_sampling_post(RenderTile &rtile,
- WorkTile *wtile,
- CUdeviceptr d_wtile,
- CUstream stream = 0);
-
- void render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles);
-
- void film_convert(DeviceTask &task,
- device_ptr buffer,
- device_ptr rgba_byte,
- device_ptr rgba_half);
-
- void shader(DeviceTask &task);
-
- CUdeviceptr map_pixels(device_ptr mem);
-
- void unmap_pixels(device_ptr mem);
-
- void pixels_alloc(device_memory &mem);
-
- void pixels_copy_from(device_memory &mem, int y, int w, int h);
-
- void pixels_free(device_memory &mem);
-
- void draw_pixels(device_memory &mem,
- int y,
- int w,
- int h,
- int width,
- int height,
- int dx,
- int dy,
- int dw,
- int dh,
- bool transparent,
- const DeviceDrawParams &draw_params) override;
-
- void thread_run(DeviceTask &task);
-
- virtual void task_add(DeviceTask &task) override;
-
- virtual void task_wait() override;
-
- virtual void task_cancel() override;
-};
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp
deleted file mode 100644
index 2d2fcb38705..00000000000
--- a/intern/cycles/device/cuda/device_cuda_impl.cpp
+++ /dev/null
@@ -1,2714 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_CUDA
-
-# include <climits>
-# include <limits.h>
-# include <stdio.h>
-# include <stdlib.h>
-# include <string.h>
-
-# include "device/cuda/device_cuda.h"
-# include "device/device_intern.h"
-# include "device/device_split_kernel.h"
-
-# include "render/buffers.h"
-
-# include "kernel/filter/filter_defines.h"
-
-# include "util/util_debug.h"
-# include "util/util_foreach.h"
-# include "util/util_logging.h"
-# include "util/util_map.h"
-# include "util/util_md5.h"
-# include "util/util_opengl.h"
-# include "util/util_path.h"
-# include "util/util_string.h"
-# include "util/util_system.h"
-# include "util/util_time.h"
-# include "util/util_types.h"
-# include "util/util_windows.h"
-
-# include "kernel/split/kernel_split_data_types.h"
-
-CCL_NAMESPACE_BEGIN
-
-# ifndef WITH_CUDA_DYNLOAD
-
-/* Transparently implement some functions, so majority of the file does not need
- * to worry about difference between dynamically loaded and linked CUDA at all.
- */
-
-namespace {
-
-const char *cuewErrorString(CUresult result)
-{
- /* We can only give error code here without major code duplication, that
- * should be enough since dynamic loading is only being disabled by folks
- * who knows what they're doing anyway.
- *
- * NOTE: Avoid call from several threads.
- */
- static string error;
- error = string_printf("%d", result);
- return error.c_str();
-}
-
-const char *cuewCompilerPath()
-{
- return CYCLES_CUDA_NVCC_EXECUTABLE;
-}
-
-int cuewCompilerVersion()
-{
- return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10);
-}
-
-} /* namespace */
-# endif /* WITH_CUDA_DYNLOAD */
-
-class CUDADevice;
-
-class CUDASplitKernel : public DeviceSplitKernel {
- CUDADevice *device;
-
- public:
- explicit CUDASplitKernel(CUDADevice *device);
-
- virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads);
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory &kernel_globals,
- device_memory &kernel_data_,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs);
-
- virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &);
- virtual int2 split_kernel_local_size();
- virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task);
-};
-
-/* Utility to push/pop CUDA context. */
-class CUDAContextScope {
- public:
- CUDAContextScope(CUDADevice *device);
- ~CUDAContextScope();
-
- private:
- CUDADevice *device;
-};
-
-bool CUDADevice::have_precompiled_kernels()
-{
- string cubins_path = path_get("lib");
- return path_exists(cubins_path);
-}
-
-bool CUDADevice::show_samples() const
-{
- /* The CUDADevice only processes one tile at a time, so showing samples is fine. */
- return true;
-}
-
-BVHLayoutMask CUDADevice::get_bvh_layout_mask() const
-{
- return BVH_LAYOUT_BVH2;
-}
-
-void CUDADevice::set_error(const string &error)
-{
- Device::set_error(error);
-
- if (first_error) {
- fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
- fprintf(stderr,
- "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
- first_error = false;
- }
-}
-
-# define cuda_assert(stmt) \
- { \
- CUresult result = stmt; \
- if (result != CUDA_SUCCESS) { \
- const char *name = cuewErrorString(result); \
- set_error(string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \
- } \
- } \
- (void)0
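The trailing (void)0 makes the macro behave like a single statement that requires a terminating semicolon. A self-contained sketch of the same shape, with a plain int and a hypothetical check_assert() standing in for CUresult and cuda_assert():

  #include <cstdio>

  #define check_assert(stmt) \
    { \
      int result = (stmt); \
      if (result != 0) { \
        fprintf(stderr, "%s failed with code %d\n", #stmt, result); \
      } \
    } \
    (void)0

  int main()
  {
    check_assert(1 - 1); /* Succeeds, prints nothing. */
    check_assert(2 - 1); /* Prints a diagnostic naming the failing expression. */
  }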
-
-CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
- : Device(info, stats, profiler, background_), texture_info(this, "__texture_info", MEM_GLOBAL)
-{
- first_error = true;
- background = background_;
-
- cuDevId = info.num;
- cuDevice = 0;
- cuContext = 0;
-
- cuModule = 0;
- cuFilterModule = 0;
-
- split_kernel = NULL;
-
- need_texture_info = false;
-
- device_texture_headroom = 0;
- device_working_headroom = 0;
- move_texture_to_host = false;
- map_host_limit = 0;
- map_host_used = 0;
- can_map_host = 0;
- pitch_alignment = 0;
-
- functions.loaded = false;
-
- /* Initialize CUDA. */
- CUresult result = cuInit(0);
- if (result != CUDA_SUCCESS) {
- set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result)));
- return;
- }
-
- /* Setup device and context. */
- result = cuDeviceGet(&cuDevice, cuDevId);
- if (result != CUDA_SUCCESS) {
- set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)",
- cuewErrorString(result)));
- return;
- }
-
- /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
- * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
- * so we can predict which memory to map to host. */
- cuda_assert(
- cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
-
- cuda_assert(cuDeviceGetAttribute(
- &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
-
- unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
- if (can_map_host) {
- ctx_flags |= CU_CTX_MAP_HOST;
- init_host_memory();
- }
-
- /* Create context. */
- if (background) {
- result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
- }
- else {
- result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice);
-
- if (result != CUDA_SUCCESS) {
- result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
- background = true;
- }
- }
-
- if (result != CUDA_SUCCESS) {
- set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result)));
- return;
- }
-
- int major, minor;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
- cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
- cuDevArchitecture = major * 100 + minor * 10;
-
- /* Pop context set by cuCtxCreate. */
- cuCtxPopCurrent(NULL);
-}
-
-CUDADevice::~CUDADevice()
-{
- task_pool.cancel();
-
- delete split_kernel;
-
- texture_info.free();
-
- cuda_assert(cuCtxDestroy(cuContext));
-}
-
-bool CUDADevice::support_device(const DeviceRequestedFeatures & /*requested_features*/)
-{
- int major, minor;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
- cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
-
- /* We only support sm_30 and above. */
- if (major < 3) {
- set_error(string_printf(
- "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor));
- return false;
- }
-
- return true;
-}
-
-bool CUDADevice::check_peer_access(Device *peer_device)
-{
- if (peer_device == this) {
- return false;
- }
- if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) {
- return false;
- }
-
- CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device);
-
- int can_access = 0;
- cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice));
- if (can_access == 0) {
- return false;
- }
-
- /* Ensure array access over the link is possible as well (for 3D textures). */
- cuda_assert(cuDeviceGetP2PAttribute(&can_access,
- CU_DEVICE_P2P_ATTRIBUTE_ARRAY_ACCESS_ACCESS_SUPPORTED,
- cuDevice,
- peer_device_cuda->cuDevice));
- if (can_access == 0) {
- return false;
- }
-
- /* Enable peer access in both directions. */
- {
- const CUDAContextScope scope(this);
- CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0);
- if (result != CUDA_SUCCESS) {
- set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
- cuewErrorString(result)));
- return false;
- }
- }
- {
- const CUDAContextScope scope(peer_device_cuda);
- CUresult result = cuCtxEnablePeerAccess(cuContext, 0);
- if (result != CUDA_SUCCESS) {
- set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
- cuewErrorString(result)));
- return false;
- }
- }
-
- return true;
-}
-
-bool CUDADevice::use_adaptive_compilation()
-{
- return DebugFlags().cuda.adaptive_compile;
-}
-
-bool CUDADevice::use_split_kernel()
-{
- return DebugFlags().cuda.split_kernel;
-}
-
-/* Common NVCC flags which stay the same regardless of shading model or kernel
- * sources md5, and depend only on the compiler and compilation settings.
- */
-string CUDADevice::compile_kernel_get_common_cflags(
- const DeviceRequestedFeatures &requested_features, bool filter, bool split)
-{
- const int machine = system_cpu_bits();
- const string source_path = path_get("source");
- const string include_path = source_path;
- string cflags = string_printf(
- "-m%d "
- "--ptxas-options=\"-v\" "
- "--use_fast_math "
- "-DNVCC "
- "-I\"%s\"",
- machine,
- include_path.c_str());
- if (!filter && use_adaptive_compilation()) {
- cflags += " " + requested_features.get_build_options();
- }
- const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
- if (extra_cflags) {
- cflags += string(" ") + string(extra_cflags);
- }
-
- if (split) {
- cflags += " -D__SPLIT__";
- }
-
-# ifdef WITH_NANOVDB
- cflags += " -DWITH_NANOVDB";
-# endif
-
- return cflags;
-}
-
-string CUDADevice::compile_kernel(const DeviceRequestedFeatures &requested_features,
- const char *name,
- const char *base,
- bool force_ptx)
-{
- /* Compute kernel name. */
- int major, minor;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
- cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
-
- /* Attempt to use kernel provided with Blender. */
- if (!use_adaptive_compilation()) {
- if (!force_ptx) {
- const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
- VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
- if (path_exists(cubin)) {
- VLOG(1) << "Using precompiled kernel.";
- return cubin;
- }
- }
-
- /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */
- int ptx_major = major, ptx_minor = minor;
- while (ptx_major >= 3) {
- const string ptx = path_get(
- string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor));
- VLOG(1) << "Testing for pre-compiled kernel " << ptx << ".";
- if (path_exists(ptx)) {
- VLOG(1) << "Using precompiled kernel.";
- return ptx;
- }
-
- if (ptx_minor > 0) {
- ptx_minor--;
- }
- else {
- ptx_major--;
- ptx_minor = 9;
- }
- }
- }
-
- /* Try to use locally compiled kernel. */
- string source_path = path_get("source");
- const string source_md5 = path_files_md5_hash(source_path);
-
- /* We include cflags in the md5, so that changing the CUDA toolkit or other
- * compiler command line arguments makes sure the cubin gets re-built.
- */
- string common_cflags = compile_kernel_get_common_cflags(
- requested_features, strstr(name, "filter") != NULL, strstr(name, "split") != NULL);
- const string kernel_md5 = util_md5_string(source_md5 + common_cflags);
-
- const char *const kernel_ext = force_ptx ? "ptx" : "cubin";
- const char *const kernel_arch = force_ptx ? "compute" : "sm";
- const string cubin_file = string_printf(
- "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext);
- const string cubin = path_cache_get(path_join("kernels", cubin_file));
- VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
- if (path_exists(cubin)) {
- VLOG(1) << "Using locally compiled kernel.";
- return cubin;
- }
-
-# ifdef _WIN32
- if (!use_adaptive_compilation() && have_precompiled_kernels()) {
- if (major < 3) {
- set_error(
- string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. "
- "Your GPU is not supported.",
- major,
- minor));
- }
- else {
- set_error(
- string_printf("CUDA binary kernel for this graphics card compute "
- "capability (%d.%d) not found.",
- major,
- minor));
- }
- return string();
- }
-# endif
-
- /* Compile. */
- const char *const nvcc = cuewCompilerPath();
- if (nvcc == NULL) {
- set_error(
- "CUDA nvcc compiler not found. "
- "Install CUDA toolkit in default location.");
- return string();
- }
-
- const int nvcc_cuda_version = cuewCompilerVersion();
- VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << ".";
- if (nvcc_cuda_version < 101) {
- printf(
- "Unsupported CUDA version %d.%d detected, "
- "you need CUDA 10.1 or newer.\n",
- nvcc_cuda_version / 10,
- nvcc_cuda_version % 10);
- return string();
- }
- else if (!(nvcc_cuda_version == 101 || nvcc_cuda_version == 102 || nvcc_cuda_version == 111 ||
- nvcc_cuda_version == 112 || nvcc_cuda_version == 113 || nvcc_cuda_version == 114)) {
- printf(
- "CUDA version %d.%d detected, build may succeed but only "
- "CUDA 10.1 to 11.4 are officially supported.\n",
- nvcc_cuda_version / 10,
- nvcc_cuda_version % 10);
- }
-
- double starttime = time_dt();
-
- path_create_directories(cubin);
-
- source_path = path_join(path_join(source_path, "kernel"),
- path_join("kernels", path_join(base, string_printf("%s.cu", name))));
-
- string command = string_printf(
- "\"%s\" "
- "-arch=%s_%d%d "
- "--%s \"%s\" "
- "-o \"%s\" "
- "%s",
- nvcc,
- kernel_arch,
- major,
- minor,
- kernel_ext,
- source_path.c_str(),
- cubin.c_str(),
- common_cflags.c_str());
-
- printf("Compiling CUDA kernel ...\n%s\n", command.c_str());
-
-# ifdef _WIN32
- command = "call " + command;
-# endif
- if (system(command.c_str()) != 0) {
- set_error(
- "Failed to execute compilation command, "
- "see console for details.");
- return string();
- }
-
- /* Verify if compilation succeeded */
- if (!path_exists(cubin)) {
- set_error(
- "CUDA kernel compilation failed, "
- "see console for details.");
- return string();
- }
-
- printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
-
- return cubin;
-}
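The cache file name above folds both the kernel source hash and the full compiler flags into one key, so any toolkit or flag change forces a rebuild. A minimal sketch of that keying scheme; kernel_cache_name() is hypothetical and std::hash stands in for the MD5 helpers (util_md5_string() is not reproduced here):

  #include <cstdio>
  #include <functional>
  #include <string>

  /* Illustrative stand-in for the md5-based cache key; std::hash is not MD5. */
  static std::string kernel_cache_name(const std::string &source_hash,
                                       const std::string &cflags,
                                       int major,
                                       int minor)
  {
    const size_t key = std::hash<std::string>{}(source_hash + cflags);
    char buf[128];
    snprintf(buf, sizeof(buf), "cycles_kernel_sm_%d%d_%zx.cubin", major, minor, key);
    return buf;
  }

  int main()
  {
    /* Changing only the flags yields a different cache file name. */
    printf("%s\n", kernel_cache_name("abc123", "-m64 --use_fast_math", 7, 5).c_str());
    printf("%s\n", kernel_cache_name("abc123", "-m64", 7, 5).c_str());
  }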
-
-bool CUDADevice::load_kernels(const DeviceRequestedFeatures &requested_features)
-{
- /* TODO(sergey): Support kernel re-load for CUDA devices.
- *
- * Currently re-loading the kernel will invalidate memory pointers,
- * causing problems in cuCtxSynchronize.
- */
- if (cuFilterModule && cuModule) {
- VLOG(1) << "Skipping kernel reload, not currently supported.";
- return true;
- }
-
- /* check if cuda init succeeded */
- if (cuContext == 0)
- return false;
-
- /* check if GPU is supported */
- if (!support_device(requested_features))
- return false;
-
- /* get kernel */
- const char *kernel_name = use_split_kernel() ? "kernel_split" : "kernel";
- string cubin = compile_kernel(requested_features, kernel_name);
- if (cubin.empty())
- return false;
-
- const char *filter_name = "filter";
- string filter_cubin = compile_kernel(requested_features, filter_name);
- if (filter_cubin.empty())
- return false;
-
- /* open module */
- CUDAContextScope scope(this);
-
- string cubin_data;
- CUresult result;
-
- if (path_read_text(cubin, cubin_data))
- result = cuModuleLoadData(&cuModule, cubin_data.c_str());
- else
- result = CUDA_ERROR_FILE_NOT_FOUND;
-
- if (result != CUDA_SUCCESS)
- set_error(string_printf(
- "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result)));
-
- if (path_read_text(filter_cubin, cubin_data))
- result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
- else
- result = CUDA_ERROR_FILE_NOT_FOUND;
-
- if (result != CUDA_SUCCESS)
- set_error(string_printf("Failed to load CUDA kernel from '%s' (%s)",
- filter_cubin.c_str(),
- cuewErrorString(result)));
-
- if (result == CUDA_SUCCESS) {
- reserve_local_memory(requested_features);
- }
-
- load_functions();
-
- return (result == CUDA_SUCCESS);
-}
-
-void CUDADevice::load_functions()
-{
- /* TODO: load all functions here. */
- if (functions.loaded) {
- return;
- }
- functions.loaded = true;
-
- cuda_assert(cuModuleGetFunction(
- &functions.adaptive_stopping, cuModule, "kernel_cuda_adaptive_stopping"));
- cuda_assert(cuModuleGetFunction(
- &functions.adaptive_filter_x, cuModule, "kernel_cuda_adaptive_filter_x"));
- cuda_assert(cuModuleGetFunction(
- &functions.adaptive_filter_y, cuModule, "kernel_cuda_adaptive_filter_y"));
- cuda_assert(cuModuleGetFunction(
- &functions.adaptive_scale_samples, cuModule, "kernel_cuda_adaptive_scale_samples"));
-
- cuda_assert(cuFuncSetCacheConfig(functions.adaptive_stopping, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_x, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_y, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(functions.adaptive_scale_samples, CU_FUNC_CACHE_PREFER_L1));
-
- int unused_min_blocks;
- cuda_assert(cuOccupancyMaxPotentialBlockSize(&unused_min_blocks,
- &functions.adaptive_num_threads_per_block,
- functions.adaptive_scale_samples,
- NULL,
- 0,
- 0));
-}
-
-void CUDADevice::reserve_local_memory(const DeviceRequestedFeatures &requested_features)
-{
- if (use_split_kernel()) {
- /* The split kernel mostly uses global memory and adaptive compilation,
- * so it is currently difficult to predict how much local memory is needed. */
- return;
- }
-
- /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
- * needed for kernel launches, so that we can reliably figure out when
- * to allocate scene data in mapped host memory. */
- CUDAContextScope scope(this);
-
- size_t total = 0, free_before = 0, free_after = 0;
- cuMemGetInfo(&free_before, &total);
-
- /* Get kernel function. */
- CUfunction cuRender;
-
- if (requested_features.use_baking) {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
- }
- else if (requested_features.use_integrator_branched) {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
- }
-
- cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
-
- int min_blocks, num_threads_per_block;
- cuda_assert(
- cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
-
- /* Launch the kernel; using just 1 block appears sufficient to reserve
- * memory for all multiprocessors. It would still be good to do this in
- * parallel for the multi-GPU case to make it faster. */
- CUdeviceptr d_work_tiles = 0;
- uint total_work_size = 0;
-
- void *args[] = {&d_work_tiles, &total_work_size};
-
- cuda_assert(cuLaunchKernel(cuRender, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
-
- cuda_assert(cuCtxSynchronize());
-
- cuMemGetInfo(&free_after, &total);
- VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after)
- << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
-
-# if 0
- /* For testing mapped host memory, fill up device memory. */
- const size_t keep_mb = 1024;
-
- while (free_after > keep_mb * 1024 * 1024LL) {
- CUdeviceptr tmp;
- cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
- cuMemGetInfo(&free_after, &total);
- }
-# endif
-}
-
-void CUDADevice::init_host_memory()
-{
- /* Limit amount of host mapped memory, because allocating too much can
- * cause system instability. Leave at least half or 4 GB of system
- * memory free, whichever is smaller. */
- size_t default_limit = 4 * 1024 * 1024 * 1024LL;
- size_t system_ram = system_physical_ram();
-
- if (system_ram > 0) {
- if (system_ram / 2 > default_limit) {
- map_host_limit = system_ram - default_limit;
- }
- else {
- map_host_limit = system_ram / 2;
- }
- }
- else {
- VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
- map_host_limit = 0;
- }
-
- /* Amount of device memory to keep free after texture memory
- * and working memory allocations, respectively. We set the working
- * memory limit headroom lower so that some space is left after all
- * texture memory allocations. */
- device_working_headroom = 32 * 1024 * 1024LL; // 32MB
- device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
-
- VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
- << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
-}
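Worked example of the rule above, assuming a 16 GB machine: half of RAM (8 GB) exceeds the 4 GB default, so 4 GB is left free and the limit becomes 12 GB.

  #include <cstdint>
  #include <cstdio>

  int main()
  {
    const uint64_t GiB = 1024ull * 1024 * 1024;
    const uint64_t default_limit = 4 * GiB;
    const uint64_t system_ram = 16 * GiB; /* Assumed for illustration. */
    const uint64_t map_host_limit = (system_ram / 2 > default_limit) ?
                                        system_ram - default_limit :
                                        system_ram / 2;
    printf("map_host_limit = %llu GiB\n", (unsigned long long)(map_host_limit / GiB));
    /* -> 12 GiB */
  }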
-
-void CUDADevice::load_texture_info()
-{
- if (need_texture_info) {
- /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
- * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
- need_texture_info = false;
- texture_info.copy_to_device();
- }
-}
-
-void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
-{
- /* Break out of recursive call, which can happen when moving memory on a multi device. */
- static bool any_device_moving_textures_to_host = false;
- if (any_device_moving_textures_to_host) {
- return;
- }
-
- /* Signal to reallocate textures in host memory only. */
- move_texture_to_host = true;
-
- while (size > 0) {
- /* Find suitable memory allocation to move. */
- device_memory *max_mem = NULL;
- size_t max_size = 0;
- bool max_is_image = false;
-
- thread_scoped_lock lock(cuda_mem_map_mutex);
- foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
- device_memory &mem = *pair.first;
- CUDAMem *cmem = &pair.second;
-
- /* Can only move textures allocated on this device (and not those from
- * peer devices), and need to ignore memory that is already on the host. */
- if (!mem.is_resident(this) || cmem->use_mapped_host) {
- continue;
- }
-
- bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
- (&mem != &texture_info);
- bool is_image = is_texture && (mem.data_height > 1);
-
- /* Can't move this type of memory. */
- if (!is_texture || cmem->array) {
- continue;
- }
-
- /* When making room for a texture allocation, only move image textures. */
- if (for_texture && !is_image) {
- continue;
- }
-
- /* Try to move largest allocation, prefer moving images. */
- if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
- max_is_image = is_image;
- max_size = mem.device_size;
- max_mem = &mem;
- }
- }
- lock.unlock();
-
- /* Move to host memory. This part is mutex protected since
- * multiple CUDA devices could be moving the memory. The
- * first one will do it, and the rest will adopt the pointer. */
- if (max_mem) {
- VLOG(1) << "Move memory from device to host: " << max_mem->name;
-
- static thread_mutex move_mutex;
- thread_scoped_lock lock(move_mutex);
-
- any_device_moving_textures_to_host = true;
-
- /* Potentially need to call back into multi device, so pointer mapping
- * and peer devices are updated. This is also necessary since the device
- * pointer may just be a key here, so it cannot be accessed and freed directly.
- * Unfortunately it does mean that memory is reallocated on all other
- * devices as well, which is potentially dangerous when still in use (since
- * a thread rendering on another device would only be caught in this mutex
- * if it happens to do an allocation at the same time as well). */
- max_mem->device_copy_to();
- size = (max_size >= size) ? 0 : size - max_size;
-
- any_device_moving_textures_to_host = false;
- }
- else {
- break;
- }
- }
-
- /* Unset flag before texture info is reloaded, since it should stay in device memory. */
- move_texture_to_host = false;
-
- /* Update texture info array with new pointers. */
- load_texture_info();
-}
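The selection rule in the loop above, isolated: image textures win over non-image textures, and size breaks ties within each class. A small sketch under assumed allocation names and sizes:

  #include <cstdio>
  #include <vector>

  struct Alloc {
    const char *name;
    size_t size;
    bool is_image;
  };

  int main()
  {
    const std::vector<Alloc> allocs = {
        {"color_lut", 4096, false},
        {"env.hdr", 1 << 20, true}, /* Image: preferred despite being smaller. */
        {"mesh_data", 1 << 22, false},
    };
    const Alloc *best = nullptr;
    for (const Alloc &a : allocs) {
      if (!best || a.is_image > best->is_image ||
          (a.is_image == best->is_image && a.size > best->size)) {
        best = &a;
      }
    }
    printf("move %s to host first\n", best->name); /* -> env.hdr */
  }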
-
-CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
-{
- CUDAContextScope scope(this);
-
- CUdeviceptr device_pointer = 0;
- size_t size = mem.memory_size() + pitch_padding;
-
- CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
- const char *status = "";
-
- /* First try allocating in device memory, respecting headroom. We make
- * an exception for texture info. It is small and frequently accessed,
- * so treat it as working memory.
- *
- * If there is not enough room for working memory, we will try to move
- * textures to host memory, assuming the performance impact would have
- * been worse for working memory. */
- bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
- bool is_image = is_texture && (mem.data_height > 1);
-
- size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
-
- size_t total = 0, free = 0;
- cuMemGetInfo(&free, &total);
-
- /* Move textures to host memory if needed. */
- if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
- move_textures_to_host(size + headroom - free, is_texture);
- cuMemGetInfo(&free, &total);
- }
-
- /* Allocate in device memory. */
- if (!move_texture_to_host && (size + headroom) < free) {
- mem_alloc_result = cuMemAlloc(&device_pointer, size);
- if (mem_alloc_result == CUDA_SUCCESS) {
- status = " in device memory";
- }
- }
-
- /* Fall back to mapped host memory if needed and possible. */
-
- void *shared_pointer = 0;
-
- if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) {
- if (mem.shared_pointer) {
- /* Another device already allocated host memory. */
- mem_alloc_result = CUDA_SUCCESS;
- shared_pointer = mem.shared_pointer;
- }
- else if (map_host_used + size < map_host_limit) {
- /* Allocate host memory ourselves. */
- mem_alloc_result = cuMemHostAlloc(
- &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
-
- assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
- (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
- }
-
- if (mem_alloc_result == CUDA_SUCCESS) {
- cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
- map_host_used += size;
- status = " in host memory";
- }
- }
-
- if (mem_alloc_result != CUDA_SUCCESS) {
- if (mem.type == MEM_DEVICE_ONLY) {
- status = " failed, out of device memory";
- set_error("System is out of GPU memory");
- }
- else {
- status = " failed, out of device and host memory";
- set_error("System is out of GPU and shared host memory");
- }
- }
-
- if (mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")" << status;
- }
-
- mem.device_pointer = (device_ptr)device_pointer;
- mem.device_size = size;
- stats.mem_alloc(size);
-
- if (!mem.device_pointer) {
- return NULL;
- }
-
- /* Insert into map of allocations. */
- thread_scoped_lock lock(cuda_mem_map_mutex);
- CUDAMem *cmem = &cuda_mem_map[&mem];
- if (shared_pointer != 0) {
- /* Replace host pointer with our host allocation. Only works if
- * CUDA memory layout is the same and has no pitch padding. Also
- * does not work if we move textures to host during a render,
- * since other devices might be using the memory. */
-
- if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
- mem.host_pointer != shared_pointer) {
- memcpy(shared_pointer, mem.host_pointer, size);
-
- /* A call to device_memory::host_free() should be preceded by
- * a call to device_memory::device_free() for host memory
- * allocated by a device to be handled properly. Two exceptions
- * are here and a call in OptiXDevice::generic_alloc(), where
- * the current host memory can be assumed to be allocated by
- * device_memory::host_alloc(), not by a device. */
-
- mem.host_free();
- mem.host_pointer = shared_pointer;
- }
- mem.shared_pointer = shared_pointer;
- mem.shared_counter++;
- cmem->use_mapped_host = true;
- }
- else {
- cmem->use_mapped_host = false;
- }
-
- return cmem;
-}
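A minimal sketch of the mapped-host fallback used above: pinned, device-mapped, write-combined host memory that the GPU addresses directly over the bus. Error handling is omitted, alloc_mapped_host() is an illustrative name, and a context created with CU_CTX_MAP_HOST is assumed to be current:

  #include <cstddef>
  #include <cuda.h>

  static CUdeviceptr alloc_mapped_host(size_t size, void **host_out)
  {
    /* Pinned host memory, visible to the device, optimized for CPU writes. */
    cuMemHostAlloc(host_out, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);

    /* The device-side address aliasing the same physical pages. */
    CUdeviceptr device_pointer = 0;
    cuMemHostGetDevicePointer(&device_pointer, *host_out, 0);
    return device_pointer;
  }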
-
-void CUDADevice::generic_copy_to(device_memory &mem)
-{
- if (!mem.host_pointer || !mem.device_pointer) {
- return;
- }
-
- /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
- * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
- * mem.host_pointer. */
- thread_scoped_lock lock(cuda_mem_map_mutex);
- if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
- const CUDAContextScope scope(this);
- cuda_assert(
- cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
- }
-}
-
-void CUDADevice::generic_free(device_memory &mem)
-{
- if (mem.device_pointer) {
- CUDAContextScope scope(this);
- thread_scoped_lock lock(cuda_mem_map_mutex);
- const CUDAMem &cmem = cuda_mem_map[&mem];
-
- /* If cmem.use_mapped_host is true, reference counting is used
- * to safely free a mapped host memory. */
-
- if (cmem.use_mapped_host) {
- assert(mem.shared_pointer);
- if (mem.shared_pointer) {
- assert(mem.shared_counter > 0);
- if (--mem.shared_counter == 0) {
- if (mem.host_pointer == mem.shared_pointer) {
- mem.host_pointer = 0;
- }
- cuMemFreeHost(mem.shared_pointer);
- mem.shared_pointer = 0;
- }
- }
- map_host_used -= mem.device_size;
- }
- else {
- /* Free device memory. */
- cuda_assert(cuMemFree(mem.device_pointer));
- }
-
- stats.mem_free(mem.device_size);
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- cuda_mem_map.erase(cuda_mem_map.find(&mem));
- }
-}
-
-void CUDADevice::mem_alloc(device_memory &mem)
-{
- if (mem.type == MEM_PIXELS && !background) {
- pixels_alloc(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- assert(!"mem_alloc not supported for textures.");
- }
- else if (mem.type == MEM_GLOBAL) {
- assert(!"mem_alloc not supported for global memory.");
- }
- else {
- generic_alloc(mem);
- }
-}
-
-void CUDADevice::mem_copy_to(device_memory &mem)
-{
- if (mem.type == MEM_PIXELS) {
- assert(!"mem_copy_to not supported for pixels.");
- }
- else if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- global_alloc(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- tex_alloc((device_texture &)mem);
- }
- else {
- if (!mem.device_pointer) {
- generic_alloc(mem);
- }
- generic_copy_to(mem);
- }
-}
-
-void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
-{
- if (mem.type == MEM_PIXELS && !background) {
- pixels_copy_from(mem, y, w, h);
- }
- else if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
- assert(!"mem_copy_from not supported for textures.");
- }
- else if (mem.host_pointer) {
- const size_t size = elem * w * h;
- const size_t offset = elem * y * w;
-
- if (mem.device_pointer) {
- const CUDAContextScope scope(this);
- cuda_assert(cuMemcpyDtoH(
- (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
- }
- else {
- memset((char *)mem.host_pointer + offset, 0, size);
- }
- }
-}
-
-void CUDADevice::mem_zero(device_memory &mem)
-{
- if (!mem.device_pointer) {
- mem_alloc(mem);
- }
- if (!mem.device_pointer) {
- return;
- }
-
- /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
- * regardless of mem.host_pointer and mem.shared_pointer. */
- thread_scoped_lock lock(cuda_mem_map_mutex);
- if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
- const CUDAContextScope scope(this);
- cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
- }
- else if (mem.host_pointer) {
- memset(mem.host_pointer, 0, mem.memory_size());
- }
-}
-
-void CUDADevice::mem_free(device_memory &mem)
-{
- if (mem.type == MEM_PIXELS && !background) {
- pixels_free(mem);
- }
- else if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- }
- else {
- generic_free(mem);
- }
-}
-
-device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
-{
- return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
-}
-
-void CUDADevice::const_copy_to(const char *name, void *host, size_t size)
-{
- CUDAContextScope scope(this);
- CUdeviceptr mem;
- size_t bytes;
-
- cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
- // assert(bytes == size);
- cuda_assert(cuMemcpyHtoD(mem, host, size));
-}
-
-void CUDADevice::global_alloc(device_memory &mem)
-{
- if (mem.is_resident(this)) {
- generic_alloc(mem);
- generic_copy_to(mem);
- }
-
- const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
-}
-
-void CUDADevice::global_free(device_memory &mem)
-{
- if (mem.is_resident(this) && mem.device_pointer) {
- generic_free(mem);
- }
-}
-
-void CUDADevice::tex_alloc(device_texture &mem)
-{
- CUDAContextScope scope(this);
-
- /* General variables for both architectures */
- string bind_name = mem.name;
- size_t dsize = datatype_size(mem.data_type);
- size_t size = mem.memory_size();
-
- CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
- switch (mem.info.extension) {
- case EXTENSION_REPEAT:
- address_mode = CU_TR_ADDRESS_MODE_WRAP;
- break;
- case EXTENSION_EXTEND:
- address_mode = CU_TR_ADDRESS_MODE_CLAMP;
- break;
- case EXTENSION_CLIP:
- address_mode = CU_TR_ADDRESS_MODE_BORDER;
- break;
- default:
- assert(0);
- break;
- }
-
- CUfilter_mode filter_mode;
- if (mem.info.interpolation == INTERPOLATION_CLOSEST) {
- filter_mode = CU_TR_FILTER_MODE_POINT;
- }
- else {
- filter_mode = CU_TR_FILTER_MODE_LINEAR;
- }
-
- /* Image Texture Storage */
- CUarray_format_enum format;
- switch (mem.data_type) {
- case TYPE_UCHAR:
- format = CU_AD_FORMAT_UNSIGNED_INT8;
- break;
- case TYPE_UINT16:
- format = CU_AD_FORMAT_UNSIGNED_INT16;
- break;
- case TYPE_UINT:
- format = CU_AD_FORMAT_UNSIGNED_INT32;
- break;
- case TYPE_INT:
- format = CU_AD_FORMAT_SIGNED_INT32;
- break;
- case TYPE_FLOAT:
- format = CU_AD_FORMAT_FLOAT;
- break;
- case TYPE_HALF:
- format = CU_AD_FORMAT_HALF;
- break;
- default:
- assert(0);
- return;
- }
-
- CUDAMem *cmem = NULL;
- CUarray array_3d = NULL;
- size_t src_pitch = mem.data_width * dsize * mem.data_elements;
- size_t dst_pitch = src_pitch;
-
- if (!mem.is_resident(this)) {
- thread_scoped_lock lock(cuda_mem_map_mutex);
- cmem = &cuda_mem_map[&mem];
- cmem->texobject = 0;
-
- if (mem.data_depth > 1) {
- array_3d = (CUarray)mem.device_pointer;
- cmem->array = array_3d;
- }
- else if (mem.data_height > 0) {
- dst_pitch = align_up(src_pitch, pitch_alignment);
- }
- }
- else if (mem.data_depth > 1) {
- /* 3D texture using an array, as there is no API for linear memory. */
- CUDA_ARRAY3D_DESCRIPTOR desc;
-
- desc.Width = mem.data_width;
- desc.Height = mem.data_height;
- desc.Depth = mem.data_depth;
- desc.Format = format;
- desc.NumChannels = mem.data_elements;
- desc.Flags = 0;
-
- VLOG(1) << "Array 3D allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- cuda_assert(cuArray3DCreate(&array_3d, &desc));
-
- if (!array_3d) {
- return;
- }
-
- CUDA_MEMCPY3D param;
- memset(&param, 0, sizeof(param));
- param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
- param.dstArray = array_3d;
- param.srcMemoryType = CU_MEMORYTYPE_HOST;
- param.srcHost = mem.host_pointer;
- param.srcPitch = src_pitch;
- param.WidthInBytes = param.srcPitch;
- param.Height = mem.data_height;
- param.Depth = mem.data_depth;
-
- cuda_assert(cuMemcpy3D(&param));
-
- mem.device_pointer = (device_ptr)array_3d;
- mem.device_size = size;
- stats.mem_alloc(size);
-
- thread_scoped_lock lock(cuda_mem_map_mutex);
- cmem = &cuda_mem_map[&mem];
- cmem->texobject = 0;
- cmem->array = array_3d;
- }
- else if (mem.data_height > 0) {
- /* 2D texture, using pitch aligned linear memory. */
- dst_pitch = align_up(src_pitch, pitch_alignment);
- size_t dst_size = dst_pitch * mem.data_height;
-
- cmem = generic_alloc(mem, dst_size - mem.memory_size());
- if (!cmem) {
- return;
- }
-
- CUDA_MEMCPY2D param;
- memset(&param, 0, sizeof(param));
- param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
- param.dstDevice = mem.device_pointer;
- param.dstPitch = dst_pitch;
- param.srcMemoryType = CU_MEMORYTYPE_HOST;
- param.srcHost = mem.host_pointer;
- param.srcPitch = src_pitch;
- param.WidthInBytes = param.srcPitch;
- param.Height = mem.data_height;
-
- cuda_assert(cuMemcpy2DUnaligned(&param));
- }
- else {
- /* 1D texture, using linear memory. */
- cmem = generic_alloc(mem);
- if (!cmem) {
- return;
- }
-
- cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
- }
-
- /* Resize once */
- const uint slot = mem.slot;
- if (slot >= texture_info.size()) {
- /* Allocate some slots in advance, to reduce the number
- * of re-allocations. */
- texture_info.resize(slot + 128);
- }
-
- /* Set Mapping and tag that we need to (re-)upload to device */
- texture_info[slot] = mem.info;
- need_texture_info = true;
-
- if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
- mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
- /* Kepler+, bindless textures. */
- CUDA_RESOURCE_DESC resDesc;
- memset(&resDesc, 0, sizeof(resDesc));
-
- if (array_3d) {
- resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
- resDesc.res.array.hArray = array_3d;
- resDesc.flags = 0;
- }
- else if (mem.data_height > 0) {
- resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
- resDesc.res.pitch2D.devPtr = mem.device_pointer;
- resDesc.res.pitch2D.format = format;
- resDesc.res.pitch2D.numChannels = mem.data_elements;
- resDesc.res.pitch2D.height = mem.data_height;
- resDesc.res.pitch2D.width = mem.data_width;
- resDesc.res.pitch2D.pitchInBytes = dst_pitch;
- }
- else {
- resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
- resDesc.res.linear.devPtr = mem.device_pointer;
- resDesc.res.linear.format = format;
- resDesc.res.linear.numChannels = mem.data_elements;
- resDesc.res.linear.sizeInBytes = mem.device_size;
- }
-
- CUDA_TEXTURE_DESC texDesc;
- memset(&texDesc, 0, sizeof(texDesc));
- texDesc.addressMode[0] = address_mode;
- texDesc.addressMode[1] = address_mode;
- texDesc.addressMode[2] = address_mode;
- texDesc.filterMode = filter_mode;
- texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
-
- thread_scoped_lock lock(cuda_mem_map_mutex);
- cmem = &cuda_mem_map[&mem];
-
- cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
-
- texture_info[slot].data = (uint64_t)cmem->texobject;
- }
- else {
- texture_info[slot].data = (uint64_t)mem.device_pointer;
- }
-}
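The 2D path above pads each row so the device can sample it: align_up() rounds the source pitch up to the texture pitch alignment queried in the constructor. A standalone sketch assuming a 32-byte alignment (a common value) and the usual power-of-two round-up:

  #include <cstddef>
  #include <cstdio>

  /* Power-of-two round-up, matching the pitch padding used above. */
  static size_t align_up(size_t value, size_t alignment)
  {
    return (value + alignment - 1) & ~(alignment - 1);
  }

  int main()
  {
    const size_t src_pitch = 300 * sizeof(float); /* 1200-byte rows. */
    const size_t dst_pitch = align_up(src_pitch, 32);
    printf("src %zu -> dst %zu bytes per row\n", src_pitch, dst_pitch); /* -> 1216 */
  }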
-
-void CUDADevice::tex_free(device_texture &mem)
-{
- if (mem.device_pointer) {
- CUDAContextScope scope(this);
- thread_scoped_lock lock(cuda_mem_map_mutex);
- const CUDAMem &cmem = cuda_mem_map[&mem];
-
- if (cmem.texobject) {
- /* Free bindless texture. */
- cuTexObjectDestroy(cmem.texobject);
- }
-
- if (!mem.is_resident(this)) {
- /* Do not free memory here, since it was allocated on a different device. */
- cuda_mem_map.erase(cuda_mem_map.find(&mem));
- }
- else if (cmem.array) {
- /* Free array. */
- cuArrayDestroy(cmem.array);
- stats.mem_free(mem.device_size);
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- cuda_mem_map.erase(cuda_mem_map.find(&mem));
- }
- else {
- lock.unlock();
- generic_free(mem);
- }
- }
-}
-
-# define CUDA_GET_BLOCKSIZE(func, w, h) \
- int threads_per_block; \
- cuda_assert( \
- cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
- int threads = (int)sqrt((float)threads_per_block); \
- int xblocks = ((w) + threads - 1) / threads; \
- int yblocks = ((h) + threads - 1) / threads;
-
-# define CUDA_LAUNCH_KERNEL(func, args) \
- cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0));
-
-/* Similar to the above, but for 1-dimensional blocks. */
-# define CUDA_GET_BLOCKSIZE_1D(func, w, h) \
- int threads_per_block; \
- cuda_assert( \
- cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
- int xblocks = ((w) + threads_per_block - 1) / threads_per_block; \
- int yblocks = h;
-
-# define CUDA_LAUNCH_KERNEL_1D(func, args) \
- cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads_per_block, 1, 1, 0, 0, args, 0));
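What the 2D variant computes for concrete numbers: a kernel reporting 1024 max threads per block yields 32x32 thread blocks, so a 1920x1080 target launches a 60x34 grid (illustrative values only):

  #include <cmath>
  #include <cstdio>

  int main()
  {
    const int threads_per_block = 1024; /* From CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK. */
    const int w = 1920, h = 1080;
    const int threads = (int)sqrt((float)threads_per_block); /* 32 */
    const int xblocks = (w + threads - 1) / threads;          /* 60 */
    const int yblocks = (h + threads - 1) / threads;          /* 34 */
    printf("grid %d x %d of %d x %d threads\n", xblocks, yblocks, threads, threads);
  }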
-
-bool CUDADevice::denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- int stride = task->buffer.stride;
- int w = task->buffer.width;
- int h = task->buffer.h;
- int r = task->nlm_state.r;
- int f = task->nlm_state.f;
- float a = task->nlm_state.a;
- float k_2 = task->nlm_state.k_2;
-
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2 * r + 1) * (2 * r + 1);
- int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
- int frame_offset = 0;
-
- if (have_error())
- return false;
-
- CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
- CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
- CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts;
- CUdeviceptr scale_ptr = 0;
-
- cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float) * pass_stride));
- cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float) * pass_stride));
-
- {
- CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
- cuda_assert(cuModuleGetFunction(
- &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
- cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
- cuda_assert(cuModuleGetFunction(
- &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
- cuda_assert(cuModuleGetFunction(
- &cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
-
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
-
- CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts);
-
- void *calc_difference_args[] = {&guide_ptr,
- &variance_ptr,
- &scale_ptr,
- &difference,
- &w,
- &h,
- &stride,
- &pass_stride,
- &r,
- &channel_offset,
- &frame_offset,
- &a,
- &k_2};
- void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
- void *calc_weight_args[] = {
- &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
- void *update_output_args[] = {&blurDifference,
- &image_ptr,
- &out_ptr,
- &weightAccum,
- &w,
- &h,
- &stride,
- &pass_stride,
- &channel_offset,
- &r,
- &f};
-
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
- }
-
- {
- CUfunction cuNLMNormalize;
- cuda_assert(
- cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
- cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
- void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
- CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
- CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
- cuda_assert(cuCtxSynchronize());
- }
-
- return !have_error();
-}
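The three pointers above carve one temporary allocation into consecutive regions: difference at offset 0, blurDifference one region later, and weightAccum two regions later, where a region holds pass_stride floats per window shift. Worked offsets for an assumed radius and pass stride:

  #include <cstdio>

  int main()
  {
    const int r = 4;                                  /* Assumed NLM radius. */
    const int pass_stride = 64 * 64;                  /* Assumed pixels per pass. */
    const int num_shifts = (2 * r + 1) * (2 * r + 1); /* 81 window shifts. */
    const size_t region = sizeof(float) * pass_stride * num_shifts;
    printf("difference @ 0, blurDifference @ %zu, weightAccum @ %zu\n", region, 2 * region);
  }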
-
-bool CUDADevice::denoising_construct_transform(DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterConstructTransform;
- cuda_assert(cuModuleGetFunction(
- &cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
- CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h);
-
- void *args[] = {&task->buffer.mem.device_pointer,
- &task->tile_info_mem.device_pointer,
- &task->storage.transform.device_pointer,
- &task->storage.rank.device_pointer,
- &task->filter_area,
- &task->rect,
- &task->radius,
- &task->pca_threshold,
- &task->buffer.pass_stride,
- &task->buffer.frame_stride,
- &task->buffer.use_time};
- CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- int r = task->radius;
- int f = 4;
- float a = 1.0f;
- float k_2 = task->nlm_k_2;
-
- int w = task->reconstruction_state.source_w;
- int h = task->reconstruction_state.source_h;
- int stride = task->buffer.stride;
- int frame_offset = frame * task->buffer.frame_stride;
- int t = task->tile_info->frames[frame];
-
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2 * r + 1) * (2 * r + 1);
-
- if (have_error())
- return false;
-
- CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
- CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
-
- CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
- cuda_assert(cuModuleGetFunction(
- &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
- cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
- cuda_assert(
- cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
- cuda_assert(cuModuleGetFunction(
- &cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
-
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
-
- CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
- task->reconstruction_state.source_w * task->reconstruction_state.source_h,
- num_shifts);
-
- void *calc_difference_args[] = {&color_ptr,
- &color_variance_ptr,
- &scale_ptr,
- &difference,
- &w,
- &h,
- &stride,
- &pass_stride,
- &r,
- &pass_stride,
- &frame_offset,
- &a,
- &k_2};
- void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
- void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
- void *construct_gramian_args[] = {&t,
- &blurDifference,
- &task->buffer.mem.device_pointer,
- &task->storage.transform.device_pointer,
- &task->storage.rank.device_pointer,
- &task->storage.XtWX.device_pointer,
- &task->storage.XtWY.device_pointer,
- &task->reconstruction_state.filter_window,
- &w,
- &h,
- &stride,
- &pass_stride,
- &r,
- &f,
- &frame_offset,
- &task->buffer.use_time};
-
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task)
-{
- CUfunction cuFinalize;
- cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
- cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
- void *finalize_args[] = {&output_ptr,
- &task->storage.rank.device_pointer,
- &task->storage.XtWX.device_pointer,
- &task->storage.XtWY.device_pointer,
- &task->filter_area,
- &task->reconstruction_state.buffer_params.x,
- &task->render_buffer.samples};
- CUDA_GET_BLOCKSIZE(
- cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h);
- CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterCombineHalves;
- cuda_assert(cuModuleGetFunction(
- &cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(
- cuFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r};
- CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterDivideShadow;
- cuda_assert(cuModuleGetFunction(
- &cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(
- cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {&task->render_buffer.samples,
- &task->tile_info_mem.device_pointer,
- &a_ptr,
- &b_ptr,
- &sample_variance_ptr,
- &sv_variance_ptr,
- &buffer_variance_ptr,
- &task->rect,
- &task->render_buffer.pass_stride,
- &task->render_buffer.offset};
- CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterGetFeature;
- cuda_assert(
- cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {&task->render_buffer.samples,
- &task->tile_info_mem.device_pointer,
- &mean_offset,
- &variance_offset,
- &mean_ptr,
- &variance_ptr,
- &scale,
- &task->rect,
- &task->render_buffer.pass_stride,
- &task->render_buffer.offset};
- CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_write_feature(int out_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterWriteFeature;
- cuda_assert(cuModuleGetFunction(
- &cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w);
-
- void *args[] = {&task->render_buffer.samples,
- &task->reconstruction_state.buffer_params,
- &task->filter_area,
- &from_ptr,
- &buffer_ptr,
- &out_offset,
- &task->rect};
- CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterDetectOutliers;
- cuda_assert(cuModuleGetFunction(
- &cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(
- cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {
- &image_ptr, &variance_ptr, &depth_ptr, &output_ptr, &task->rect, &task->buffer.pass_stride};
-
- CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-void CUDADevice::denoise(RenderTile &rtile, DenoisingTask &denoising)
-{
- denoising.functions.construct_transform = function_bind(
- &CUDADevice::denoising_construct_transform, this, &denoising);
- denoising.functions.accumulate = function_bind(
- &CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
- denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising);
- denoising.functions.divide_shadow = function_bind(
- &CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.non_local_means = function_bind(
- &CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
- denoising.functions.combine_halves = function_bind(
- &CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(
- &CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.write_feature = function_bind(
- &CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising);
- denoising.functions.detect_outliers = function_bind(
- &CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
-
- denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
- denoising.render_buffer.samples = rtile.sample;
- denoising.buffer.gpu_temporary_mem = true;
-
- denoising.run_denoising(rtile);
-}
-
-void CUDADevice::adaptive_sampling_filter(uint filter_sample,
- WorkTile *wtile,
- CUdeviceptr d_wtile,
- CUstream stream)
-{
- const int num_threads_per_block = functions.adaptive_num_threads_per_block;
-
- /* These are a series of tiny kernels because there is no grid synchronization
- * from within a kernel, so multiple kernel launches are required. */
- uint total_work_size = wtile->h * wtile->w;
- void *args2[] = {&d_wtile, &filter_sample, &total_work_size};
- uint num_blocks = divide_up(total_work_size, num_threads_per_block);
- cuda_assert(cuLaunchKernel(functions.adaptive_stopping,
- num_blocks,
- 1,
- 1,
- num_threads_per_block,
- 1,
- 1,
- 0,
- stream,
- args2,
- 0));
- total_work_size = wtile->h;
- num_blocks = divide_up(total_work_size, num_threads_per_block);
- cuda_assert(cuLaunchKernel(functions.adaptive_filter_x,
- num_blocks,
- 1,
- 1,
- num_threads_per_block,
- 1,
- 1,
- 0,
- stream,
- args2,
- 0));
- total_work_size = wtile->w;
- num_blocks = divide_up(total_work_size, num_threads_per_block);
- cuda_assert(cuLaunchKernel(functions.adaptive_filter_y,
- num_blocks,
- 1,
- 1,
- num_threads_per_block,
- 1,
- 1,
- 0,
- stream,
- args2,
- 0));
-}
-
-void CUDADevice::adaptive_sampling_post(RenderTile &rtile,
- WorkTile *wtile,
- CUdeviceptr d_wtile,
- CUstream stream)
-{
- const int num_threads_per_block = functions.adaptive_num_threads_per_block;
- uint total_work_size = wtile->h * wtile->w;
-
- void *args[] = {&d_wtile, &rtile.start_sample, &rtile.sample, &total_work_size};
- uint num_blocks = divide_up(total_work_size, num_threads_per_block);
- cuda_assert(cuLaunchKernel(functions.adaptive_scale_samples,
- num_blocks,
- 1,
- 1,
- num_threads_per_block,
- 1,
- 1,
- 0,
- stream,
- args,
- 0));
-}
-
-void CUDADevice::render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles)
-{
- scoped_timer timer(&rtile.buffers->render_time);
-
- if (have_error())
- return;
-
- CUDAContextScope scope(this);
- CUfunction cuRender;
-
- /* Get kernel function. */
- if (rtile.task == RenderTile::BAKE) {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
- }
- else if (task.integrator_branched) {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
- }
-
- if (have_error()) {
- return;
- }
-
- cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
-
- /* Allocate work tile. */
- work_tiles.alloc(1);
-
- WorkTile *wtile = work_tiles.data();
- wtile->x = rtile.x;
- wtile->y = rtile.y;
- wtile->w = rtile.w;
- wtile->h = rtile.h;
- wtile->offset = rtile.offset;
- wtile->stride = rtile.stride;
- wtile->buffer = (float *)(CUdeviceptr)rtile.buffer;
-
- /* Prepare work size. More step samples render faster, but for now we
- * remain conservative for GPUs connected to a display to avoid driver
- * timeouts and display freezing. */
- int min_blocks, num_threads_per_block;
- cuda_assert(
- cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
- if (!info.display_device) {
- min_blocks *= 8;
- }
-
- uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
-
- /* Render all samples. */
- int start_sample = rtile.start_sample;
- int end_sample = rtile.start_sample + rtile.num_samples;
-
- for (int sample = start_sample; sample < end_sample;) {
- /* Setup and copy work tile to device. */
- wtile->start_sample = sample;
- wtile->num_samples = step_samples;
- if (task.adaptive_sampling.use) {
- wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
- }
- wtile->num_samples = min(wtile->num_samples, end_sample - sample);
- work_tiles.copy_to_device();
-
- CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
- uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
- uint num_blocks = divide_up(total_work_size, num_threads_per_block);
-
- /* Launch kernel. */
- void *args[] = {&d_work_tiles, &total_work_size};
-
- cuda_assert(
- cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
-
- /* Run the adaptive sampling kernels at selected samples aligned to step samples. */
- uint filter_sample = sample + wtile->num_samples - 1;
- if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
- adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);
- }
-
- cuda_assert(cuCtxSynchronize());
-
- /* Update progress. */
- sample += wtile->num_samples;
- rtile.sample = sample;
- task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
-
- if (task.get_cancel()) {
- if (task.need_finish_queue == false)
- break;
- }
- }
-
- /* Finalize adaptive sampling. */
- if (task.adaptive_sampling.use) {
- CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
- adaptive_sampling_post(rtile, wtile, d_work_tiles);
- cuda_assert(cuCtxSynchronize());
- task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
- }
-}
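The step-sample heuristic above, with illustrative occupancy numbers: enough samples are batched per launch to fill min_blocks blocks of work, and headless devices multiply that by 8 since there is no display watchdog to trip:

  #include <cstdio>

  static unsigned divide_up(unsigned x, unsigned y)
  {
    return (x + y - 1) / y;
  }

  int main()
  {
    unsigned min_blocks = 40; /* Assumed from cuOccupancyMaxPotentialBlockSize. */
    const unsigned num_threads_per_block = 256;
    min_blocks *= 8; /* Not a display device. */
    const unsigned tile_pixels = 256 * 256;
    printf("step_samples = %u\n", divide_up(min_blocks * num_threads_per_block, tile_pixels));
    /* -> 2 samples per kernel launch for a 256x256 tile. */
  }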
-
-void CUDADevice::film_convert(DeviceTask &task,
- device_ptr buffer,
- device_ptr rgba_byte,
- device_ptr rgba_half)
-{
- if (have_error())
- return;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilmConvert;
- CUdeviceptr d_rgba = map_pixels((rgba_byte) ? rgba_byte : rgba_half);
- CUdeviceptr d_buffer = (CUdeviceptr)buffer;
-
- /* get kernel function */
- if (rgba_half) {
- cuda_assert(
- cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"));
- }
-
- float sample_scale = 1.0f / (task.sample + 1);
-
- /* pass in parameters */
- void *args[] = {&d_rgba,
- &d_buffer,
- &sample_scale,
- &task.x,
- &task.y,
- &task.w,
- &task.h,
- &task.offset,
- &task.stride};
-
- /* launch kernel */
- int threads_per_block;
- cuda_assert(cuFuncGetAttribute(
- &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert));
-
- int xthreads = (int)sqrt(threads_per_block);
- int ythreads = (int)sqrt(threads_per_block);
- int xblocks = (task.w + xthreads - 1) / xthreads;
- int yblocks = (task.h + ythreads - 1) / ythreads;
-
- cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1));
-
- cuda_assert(cuLaunchKernel(cuFilmConvert,
- xblocks,
- yblocks,
- 1, /* blocks */
- xthreads,
- ythreads,
- 1, /* threads */
- 0,
- 0,
- args,
- 0));
-
- unmap_pixels((rgba_byte) ? rgba_byte : rgba_half);
-
- cuda_assert(cuCtxSynchronize());
-}
-
-void CUDADevice::shader(DeviceTask &task)
-{
- if (have_error())
- return;
-
- CUDAContextScope scope(this);
-
- CUfunction cuShader;
- CUdeviceptr d_input = (CUdeviceptr)task.shader_input;
- CUdeviceptr d_output = (CUdeviceptr)task.shader_output;
-
- /* get kernel function */
- if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
- cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background"));
- }
-
- /* do tasks in smaller chunks, so we can cancel them */
- const int shader_chunk_size = 65536;
- const int start = task.shader_x;
- const int end = task.shader_x + task.shader_w;
- int offset = task.offset;
-
- bool canceled = false;
- for (int sample = 0; sample < task.num_samples && !canceled; sample++) {
- for (int shader_x = start; shader_x < end; shader_x += shader_chunk_size) {
- int shader_w = min(shader_chunk_size, end - shader_x);
-
- /* pass in parameters */
- void *args[8];
- int arg = 0;
- args[arg++] = &d_input;
- args[arg++] = &d_output;
- args[arg++] = &task.shader_eval_type;
- if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
- args[arg++] = &task.shader_filter;
- }
- args[arg++] = &shader_x;
- args[arg++] = &shader_w;
- args[arg++] = &offset;
- args[arg++] = &sample;
-
- /* launch kernel */
- int threads_per_block;
- cuda_assert(cuFuncGetAttribute(
- &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
-
- int xblocks = (shader_w + threads_per_block - 1) / threads_per_block;
-
- cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuLaunchKernel(cuShader,
- xblocks,
- 1,
- 1, /* blocks */
- threads_per_block,
- 1,
- 1, /* threads */
- 0,
- 0,
- args,
- 0));
-
- cuda_assert(cuCtxSynchronize());
-
- if (task.get_cancel()) {
- canceled = true;
- break;
- }
- }
-
- task.update_progress(NULL);
- }
-}
-
-CUdeviceptr CUDADevice::map_pixels(device_ptr mem)
-{
- if (!background) {
- PixelMem pmem = pixel_mem_map[mem];
- CUdeviceptr buffer;
-
- size_t bytes;
- cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0));
- cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource));
-
- return buffer;
- }
-
- return (CUdeviceptr)mem;
-}
-
-void CUDADevice::unmap_pixels(device_ptr mem)
-{
- if (!background) {
- PixelMem pmem = pixel_mem_map[mem];
-
- cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0));
- }
-}
-
-void CUDADevice::pixels_alloc(device_memory &mem)
-{
- PixelMem pmem;
-
- pmem.w = mem.data_width;
- pmem.h = mem.data_height;
-
- CUDAContextScope scope(this);
-
- glGenBuffers(1, &pmem.cuPBO);
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
- if (mem.data_type == TYPE_HALF)
- glBufferData(
- GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(GLhalf) * 4, NULL, GL_DYNAMIC_DRAW);
- else
- glBufferData(
- GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(uint8_t) * 4, NULL, GL_DYNAMIC_DRAW);
-
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
- glActiveTexture(GL_TEXTURE0);
- glGenTextures(1, &pmem.cuTexId);
- glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
- if (mem.data_type == TYPE_HALF)
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
- else
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
- glBindTexture(GL_TEXTURE_2D, 0);
-
- CUresult result = cuGraphicsGLRegisterBuffer(
- &pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
-
- if (result == CUDA_SUCCESS) {
- mem.device_pointer = pmem.cuTexId;
- pixel_mem_map[mem.device_pointer] = pmem;
-
- mem.device_size = mem.memory_size();
- stats.mem_alloc(mem.device_size);
-
- return;
- }
- else {
- /* failed to register buffer, fall back to no interop */
- glDeleteBuffers(1, &pmem.cuPBO);
- glDeleteTextures(1, &pmem.cuTexId);
-
- background = true;
- }
-}
-
-void CUDADevice::pixels_copy_from(device_memory &mem, int y, int w, int h)
-{
- PixelMem pmem = pixel_mem_map[mem.device_pointer];
-
- CUDAContextScope scope(this);
-
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
- uchar *pixels = (uchar *)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
- size_t offset = sizeof(uchar) * 4 * y * w;
- memcpy((uchar *)mem.host_pointer + offset, pixels + offset, sizeof(uchar) * 4 * w * h);
- glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-}
-
-void CUDADevice::pixels_free(device_memory &mem)
-{
- if (mem.device_pointer) {
- PixelMem pmem = pixel_mem_map[mem.device_pointer];
-
- CUDAContextScope scope(this);
-
- cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
- glDeleteBuffers(1, &pmem.cuPBO);
- glDeleteTextures(1, &pmem.cuTexId);
-
- pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
- mem.device_pointer = 0;
-
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- }
-}
-
-void CUDADevice::draw_pixels(device_memory &mem,
- int y,
- int w,
- int h,
- int width,
- int height,
- int dx,
- int dy,
- int dw,
- int dh,
- bool transparent,
- const DeviceDrawParams &draw_params)
-{
- assert(mem.type == MEM_PIXELS);
-
- if (!background) {
- const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL);
- PixelMem pmem = pixel_mem_map[mem.device_pointer];
- float *vpointer;
-
- CUDAContextScope scope(this);
-
- /* for multi devices, this assumes the inefficient approach of allocating
- * all pixels on the device even though we only render to a subset */
- size_t offset = 4 * y * w;
-
- if (mem.data_type == TYPE_HALF)
- offset *= sizeof(GLhalf);
- else
- offset *= sizeof(uint8_t);
-
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
- glActiveTexture(GL_TEXTURE0);
- glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
- if (mem.data_type == TYPE_HALF) {
- glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void *)offset);
- }
- else {
- glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void *)offset);
- }
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
- if (transparent) {
- glEnable(GL_BLEND);
- glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
- }
-
- GLint shader_program;
- if (use_fallback_shader) {
- if (!bind_fallback_display_space_shader(dw, dh)) {
- return;
- }
- shader_program = fallback_shader_program;
- }
- else {
- draw_params.bind_display_space_shader_cb();
- glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program);
- }
-
- if (!vertex_buffer) {
- glGenBuffers(1, &vertex_buffer);
- }
-
- glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
- /* invalidate old contents -
- * avoids stalling if buffer is still waiting in queue to be rendered */
- glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
-
- vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
-
- if (vpointer) {
- /* texture coordinate - vertex pair */
- vpointer[0] = 0.0f;
- vpointer[1] = 0.0f;
- vpointer[2] = dx;
- vpointer[3] = dy;
-
- vpointer[4] = (float)w / (float)pmem.w;
- vpointer[5] = 0.0f;
- vpointer[6] = (float)width + dx;
- vpointer[7] = dy;
-
- vpointer[8] = (float)w / (float)pmem.w;
- vpointer[9] = (float)h / (float)pmem.h;
- vpointer[10] = (float)width + dx;
- vpointer[11] = (float)height + dy;
-
- vpointer[12] = 0.0f;
- vpointer[13] = (float)h / (float)pmem.h;
- vpointer[14] = dx;
- vpointer[15] = (float)height + dy;
-
- glUnmapBuffer(GL_ARRAY_BUFFER);
- }
-
- GLuint vertex_array_object;
- GLuint position_attribute, texcoord_attribute;
-
- glGenVertexArrays(1, &vertex_array_object);
- glBindVertexArray(vertex_array_object);
-
- texcoord_attribute = glGetAttribLocation(shader_program, "texCoord");
- position_attribute = glGetAttribLocation(shader_program, "pos");
-
- glEnableVertexAttribArray(texcoord_attribute);
- glEnableVertexAttribArray(position_attribute);
-
- glVertexAttribPointer(
- texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
- glVertexAttribPointer(position_attribute,
- 2,
- GL_FLOAT,
- GL_FALSE,
- 4 * sizeof(float),
- (const GLvoid *)(sizeof(float) * 2));
-
- glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
-
- if (use_fallback_shader) {
- glUseProgram(0);
- }
- else {
- draw_params.unbind_display_space_shader_cb();
- }
-
- if (transparent) {
- glDisable(GL_BLEND);
- }
-
- glBindTexture(GL_TEXTURE_2D, 0);
-
- return;
- }
-
- Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params);
-}
-
-void CUDADevice::thread_run(DeviceTask &task)
-{
- CUDAContextScope scope(this);
-
- if (task.type == DeviceTask::RENDER) {
- DeviceRequestedFeatures requested_features;
- if (use_split_kernel()) {
- if (split_kernel == NULL) {
- split_kernel = new CUDASplitKernel(this);
- split_kernel->load_kernels(requested_features);
- }
- }
-
- device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
-
- /* keep rendering tiles until done */
- RenderTile tile;
- DenoisingTask denoising(this, task);
-
- while (task.acquire_tile(this, tile, task.tile_types)) {
- if (tile.task == RenderTile::PATH_TRACE) {
- if (use_split_kernel()) {
- device_only_memory<uchar> void_buffer(this, "void_buffer");
- split_kernel->path_trace(task, tile, void_buffer, void_buffer);
- }
- else {
- render(task, tile, work_tiles);
- }
- }
- else if (tile.task == RenderTile::BAKE) {
- render(task, tile, work_tiles);
- }
- else if (tile.task == RenderTile::DENOISE) {
- tile.sample = tile.start_sample + tile.num_samples;
-
- denoise(tile, denoising);
-
- task.update_progress(&tile, tile.w * tile.h);
- }
-
- task.release_tile(tile);
-
- if (task.get_cancel()) {
- if (task.need_finish_queue == false)
- break;
- }
- }
-
- work_tiles.free();
- }
- else if (task.type == DeviceTask::SHADER) {
- shader(task);
-
- cuda_assert(cuCtxSynchronize());
- }
- else if (task.type == DeviceTask::DENOISE_BUFFER) {
- RenderTile tile;
- tile.x = task.x;
- tile.y = task.y;
- tile.w = task.w;
- tile.h = task.h;
- tile.buffer = task.buffer;
- tile.sample = task.sample + task.num_samples;
- tile.num_samples = task.num_samples;
- tile.start_sample = task.sample;
- tile.offset = task.offset;
- tile.stride = task.stride;
- tile.buffers = task.buffers;
-
- DenoisingTask denoising(this, task);
- denoise(tile, denoising);
- task.update_progress(&tile, tile.w * tile.h);
- }
-}
-
-void CUDADevice::task_add(DeviceTask &task)
-{
- CUDAContextScope scope(this);
-
- /* Load texture info. */
- load_texture_info();
-
- /* Synchronize all memory copies before executing task. */
- cuda_assert(cuCtxSynchronize());
-
- if (task.type == DeviceTask::FILM_CONVERT) {
- /* must be done in main thread due to opengl access */
- film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
- }
- else {
- task_pool.push([=] {
- DeviceTask task_copy = task;
- thread_run(task_copy);
- });
- }
-}
-
-void CUDADevice::task_wait()
-{
- task_pool.wait();
-}
-
-void CUDADevice::task_cancel()
-{
- task_pool.cancel();
-}
-
-/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class
- * now that the definition of that class is complete
- */
-# undef cuda_assert
-# define cuda_assert(stmt) \
- { \
- CUresult result = stmt; \
- if (result != CUDA_SUCCESS) { \
- const char *name = cuewErrorString(result); \
- device->set_error( \
- string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \
- } \
- } \
- (void)0
-
-/* CUDA context scope. */
-
-CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device)
-{
- cuda_assert(cuCtxPushCurrent(device->cuContext));
-}
-
-CUDAContextScope::~CUDAContextScope()
-{
- cuda_assert(cuCtxPopCurrent(NULL));
-}
-
-/* split kernel */
-
-class CUDASplitKernelFunction : public SplitKernelFunction {
- CUDADevice *device;
- CUfunction func;
-
- public:
- CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func)
- {
- }
-
- /* enqueue the kernel, returns false if there is an error */
- bool enqueue(const KernelDimensions &dim, device_memory & /*kg*/, device_memory & /*data*/)
- {
- return enqueue(dim, NULL);
- }
-
- /* enqueue the kernel, returns false if there is an error */
- bool enqueue(const KernelDimensions &dim, void *args[])
- {
- if (device->have_error())
- return false;
-
- CUDAContextScope scope(device);
-
- /* we ignore dim.local_size for now, as this is faster */
- int threads_per_block;
- cuda_assert(
- cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
-
- int xblocks = (dim.global_size[0] * dim.global_size[1] + threads_per_block - 1) /
- threads_per_block;
-
- cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
-
- cuda_assert(cuLaunchKernel(func,
- xblocks,
- 1,
- 1, /* blocks */
- threads_per_block,
- 1,
- 1, /* threads */
- 0,
- 0,
- args,
- 0));
-
- return !device->have_error();
- }
-};
-
-CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
-{
-}
-
-uint64_t CUDASplitKernel::state_buffer_size(device_memory & /*kg*/,
- device_memory & /*data*/,
- size_t num_threads)
-{
- CUDAContextScope scope(device);
-
- device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
- size_buffer.alloc(1);
- size_buffer.zero_to_device();
-
- uint threads = num_threads;
- CUdeviceptr d_size = (CUdeviceptr)size_buffer.device_pointer;
-
- struct args_t {
- uint *num_threads;
- CUdeviceptr *size;
- };
-
- args_t args = {&threads, &d_size};
-
- CUfunction state_buffer_size;
- cuda_assert(
- cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
-
- cuda_assert(cuLaunchKernel(state_buffer_size, 1, 1, 1, 1, 1, 1, 0, 0, (void **)&args, 0));
-
- size_buffer.copy_from_device(0, 1, 1);
- size_t size = size_buffer[0];
- size_buffer.free();
-
- return size;
-}
-
-bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory & /*kernel_globals*/,
- device_memory & /*kernel_data*/,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs)
-{
- CUDAContextScope scope(device);
-
- CUdeviceptr d_split_data = (CUdeviceptr)split_data.device_pointer;
- CUdeviceptr d_ray_state = (CUdeviceptr)ray_state.device_pointer;
- CUdeviceptr d_queue_index = (CUdeviceptr)queue_index.device_pointer;
- CUdeviceptr d_use_queues_flag = (CUdeviceptr)use_queues_flag.device_pointer;
- CUdeviceptr d_work_pool_wgs = (CUdeviceptr)work_pool_wgs.device_pointer;
-
- CUdeviceptr d_buffer = (CUdeviceptr)rtile.buffer;
-
- int end_sample = rtile.start_sample + rtile.num_samples;
- int queue_size = dim.global_size[0] * dim.global_size[1];
-
- struct args_t {
- CUdeviceptr *split_data_buffer;
- int *num_elements;
- CUdeviceptr *ray_state;
- int *start_sample;
- int *end_sample;
- int *sx;
- int *sy;
- int *sw;
- int *sh;
- int *offset;
- int *stride;
- CUdeviceptr *queue_index;
- int *queuesize;
- CUdeviceptr *use_queues_flag;
- CUdeviceptr *work_pool_wgs;
- int *num_samples;
- CUdeviceptr *buffer;
- };
-
- args_t args = {&d_split_data,
- &num_global_elements,
- &d_ray_state,
- &rtile.start_sample,
- &end_sample,
- &rtile.x,
- &rtile.y,
- &rtile.w,
- &rtile.h,
- &rtile.offset,
- &rtile.stride,
- &d_queue_index,
- &queue_size,
- &d_use_queues_flag,
- &d_work_pool_wgs,
- &rtile.num_samples,
- &d_buffer};
-
- CUfunction data_init;
- cuda_assert(
- cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
- if (device->have_error()) {
- return false;
- }
-
- CUDASplitKernelFunction(device, data_init).enqueue(dim, (void **)&args);
-
- return !device->have_error();
-}
-
-SplitKernelFunction *CUDASplitKernel::get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &)
-{
- const CUDAContextScope scope(device);
-
- CUfunction func;
- const CUresult result = cuModuleGetFunction(
- &func, device->cuModule, (string("kernel_cuda_") + kernel_name).data());
- if (result != CUDA_SUCCESS) {
- device->set_error(string_printf("Could not find kernel \"kernel_cuda_%s\" in module (%s)",
- kernel_name.data(),
- cuewErrorString(result)));
- return NULL;
- }
-
- return new CUDASplitKernelFunction(device, func);
-}
-
-int2 CUDASplitKernel::split_kernel_local_size()
-{
- return make_int2(32, 1);
-}
-
-int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg,
- device_memory &data,
- DeviceTask & /*task*/)
-{
- CUDAContextScope scope(device);
- size_t free;
- size_t total;
-
- cuda_assert(cuMemGetInfo(&free, &total));
-
- VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(free)
- << " bytes. (" << string_human_readable_size(free) << ").";
-
- size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2);
- size_t side = round_down((int)sqrt(num_elements), 32);
- int2 global_size = make_int2(side, round_down(num_elements / side, 16));
- VLOG(1) << "Global size: " << global_size << ".";
- return global_size;
-}
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp
new file mode 100644
index 00000000000..37fab8f8293
--- /dev/null
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -0,0 +1,1370 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include <climits>
+# include <stdio.h>
+# include <stdlib.h>
+# include <string.h>
+
+# include "device/cuda/device_impl.h"
+
+# include "render/buffers.h"
+
+# include "util/util_debug.h"
+# include "util/util_foreach.h"
+# include "util/util_logging.h"
+# include "util/util_map.h"
+# include "util/util_md5.h"
+# include "util/util_opengl.h"
+# include "util/util_path.h"
+# include "util/util_string.h"
+# include "util/util_system.h"
+# include "util/util_time.h"
+# include "util/util_types.h"
+# include "util/util_windows.h"
+
+CCL_NAMESPACE_BEGIN
+
+class CUDADevice;
+
+bool CUDADevice::have_precompiled_kernels()
+{
+ string cubins_path = path_get("lib");
+ return path_exists(cubins_path);
+}
+
+bool CUDADevice::show_samples() const
+{
+ /* The CUDADevice only processes one tile at a time, so showing samples is fine. */
+ return true;
+}
+
+BVHLayoutMask CUDADevice::get_bvh_layout_mask() const
+{
+ return BVH_LAYOUT_BVH2;
+}
+
+void CUDADevice::set_error(const string &error)
+{
+ Device::set_error(error);
+
+ if (first_error) {
+ fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
+ fprintf(stderr,
+ "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
+ first_error = false;
+ }
+}
+
+CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+ : Device(info, stats, profiler), texture_info(this, "__texture_info", MEM_GLOBAL)
+{
+ first_error = true;
+
+ cuDevId = info.num;
+ cuDevice = 0;
+ cuContext = 0;
+
+ cuModule = 0;
+
+ need_texture_info = false;
+
+ device_texture_headroom = 0;
+ device_working_headroom = 0;
+ move_texture_to_host = false;
+ map_host_limit = 0;
+ map_host_used = 0;
+ can_map_host = 0;
+ pitch_alignment = 0;
+
+ /* Initialize CUDA. */
+ CUresult result = cuInit(0);
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result)));
+ return;
+ }
+
+ /* Setup device and context. */
+ result = cuDeviceGet(&cuDevice, cuDevId);
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)",
+ cuewErrorString(result)));
+ return;
+ }
+
+ /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
+ * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
+ * so we can predict which memory to map to host. */
+ cuda_assert(
+ cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
+
+ cuda_assert(cuDeviceGetAttribute(
+ &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
+
+ unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
+ if (can_map_host) {
+ ctx_flags |= CU_CTX_MAP_HOST;
+ init_host_memory();
+ }
+
+ /* Create context. */
+ result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
+
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result)));
+ return;
+ }
+
+ int major, minor;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+ cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
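+ /* Encode the compute capability as a single integer, e.g. 6.1 becomes 610. */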
+ cuDevArchitecture = major * 100 + minor * 10;
+
+ /* Pop context set by cuCtxCreate. */
+ cuCtxPopCurrent(NULL);
+}
+
+CUDADevice::~CUDADevice()
+{
+ texture_info.free();
+
+ cuda_assert(cuCtxDestroy(cuContext));
+}
+
+bool CUDADevice::support_device(const uint /*kernel_features*/)
+{
+ int major, minor;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+ cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+
+ /* We only support sm_30 and above */
+ if (major < 3) {
+ set_error(string_printf(
+ "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor));
+ return false;
+ }
+
+ return true;
+}
+
+bool CUDADevice::check_peer_access(Device *peer_device)
+{
+ if (peer_device == this) {
+ return false;
+ }
+ if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) {
+ return false;
+ }
+
+ CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device);
+
+ int can_access = 0;
+ cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice));
+ if (can_access == 0) {
+ return false;
+ }
+
+ /* Ensure array access over the link is possible as well (for 3D textures). */
+ cuda_assert(cuDeviceGetP2PAttribute(&can_access,
+ CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED,
+ cuDevice,
+ peer_device_cuda->cuDevice));
+ if (can_access == 0) {
+ return false;
+ }
+
+ /* Enable peer access in both directions. */
+ {
+ const CUDAContextScope scope(this);
+ CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0);
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
+ cuewErrorString(result)));
+ return false;
+ }
+ }
+ {
+ const CUDAContextScope scope(peer_device_cuda);
+ CUresult result = cuCtxEnablePeerAccess(cuContext, 0);
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
+ cuewErrorString(result)));
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool CUDADevice::use_adaptive_compilation()
+{
+ return DebugFlags().cuda.adaptive_compile;
+}
+
+/* Common NVCC flags which stay the same regardless of shading model or
+ * kernel sources md5, and only depend on the compiler or compilation settings.
+ */
+string CUDADevice::compile_kernel_get_common_cflags(const uint kernel_features)
+{
+ const int machine = system_cpu_bits();
+ const string source_path = path_get("source");
+ const string include_path = source_path;
+ string cflags = string_printf(
+ "-m%d "
+ "--ptxas-options=\"-v\" "
+ "--use_fast_math "
+ "-DNVCC "
+ "-I\"%s\"",
+ machine,
+ include_path.c_str());
+ if (use_adaptive_compilation()) {
+ cflags += " -D__KERNEL_FEATURES__=" + to_string(kernel_features);
+ }
+ const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
+ if (extra_cflags) {
+ cflags += string(" ") + string(extra_cflags);
+ }
+
+# ifdef WITH_NANOVDB
+ cflags += " -DWITH_NANOVDB";
+# endif
+
+ return cflags;
+}
+
+string CUDADevice::compile_kernel(const uint kernel_features,
+ const char *name,
+ const char *base,
+ bool force_ptx)
+{
+ /* Compute kernel name. */
+ int major, minor;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+ cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+
+ /* Attempt to use kernel provided with Blender. */
+ if (!use_adaptive_compilation()) {
+ if (!force_ptx) {
+ const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
+ VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
+ if (path_exists(cubin)) {
+ VLOG(1) << "Using precompiled kernel.";
+ return cubin;
+ }
+ }
+
+ /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */
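+ /* For example, an sm_75 device probes compute_75, then compute_74, and so on down to compute_30. */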
+ int ptx_major = major, ptx_minor = minor;
+ while (ptx_major >= 3) {
+ const string ptx = path_get(
+ string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor));
+ VLOG(1) << "Testing for pre-compiled kernel " << ptx << ".";
+ if (path_exists(ptx)) {
+ VLOG(1) << "Using precompiled kernel.";
+ return ptx;
+ }
+
+ if (ptx_minor > 0) {
+ ptx_minor--;
+ }
+ else {
+ ptx_major--;
+ ptx_minor = 9;
+ }
+ }
+ }
+
+ /* Try to use locally compiled kernel. */
+ string source_path = path_get("source");
+ const string source_md5 = path_files_md5_hash(source_path);
+
+ /* We include the cflags in the md5 hash so that changing the CUDA toolkit
+ * or other compiler command line arguments forces the cubin to be re-built.
+ */
+ string common_cflags = compile_kernel_get_common_cflags(kernel_features);
+ const string kernel_md5 = util_md5_string(source_md5 + common_cflags);
+
+ const char *const kernel_ext = force_ptx ? "ptx" : "cubin";
+ const char *const kernel_arch = force_ptx ? "compute" : "sm";
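+ /* e.g. "cycles_kernel_sm_75_<md5>.cubin" for a non-PTX build (hash shown as a placeholder). */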
+ const string cubin_file = string_printf(
+ "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext);
+ const string cubin = path_cache_get(path_join("kernels", cubin_file));
+ VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
+ if (path_exists(cubin)) {
+ VLOG(1) << "Using locally compiled kernel.";
+ return cubin;
+ }
+
+# ifdef _WIN32
+ if (!use_adaptive_compilation() && have_precompiled_kernels()) {
+ if (major < 3) {
+ set_error(
+ string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. "
+ "Your GPU is not supported.",
+ major,
+ minor));
+ }
+ else {
+ set_error(
+ string_printf("CUDA binary kernel for this graphics card compute "
+ "capability (%d.%d) not found.",
+ major,
+ minor));
+ }
+ return string();
+ }
+# endif
+
+ /* Compile. */
+ const char *const nvcc = cuewCompilerPath();
+ if (nvcc == NULL) {
+ set_error(
+ "CUDA nvcc compiler not found. "
+ "Install CUDA toolkit in default location.");
+ return string();
+ }
+
+ const int nvcc_cuda_version = cuewCompilerVersion();
+ VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << ".";
+ if (nvcc_cuda_version < 101) {
+ printf(
+ "Unsupported CUDA version %d.%d detected, "
+ "you need CUDA 10.1 or newer.\n",
+ nvcc_cuda_version / 10,
+ nvcc_cuda_version % 10);
+ return string();
+ }
+ else if (!(nvcc_cuda_version == 101 || nvcc_cuda_version == 102 || nvcc_cuda_version == 111 ||
+ nvcc_cuda_version == 112 || nvcc_cuda_version == 113 || nvcc_cuda_version == 114)) {
+ printf(
+ "CUDA version %d.%d detected, build may succeed but only "
+ "CUDA 10.1 to 11.4 are officially supported.\n",
+ nvcc_cuda_version / 10,
+ nvcc_cuda_version % 10);
+ }
+
+ double starttime = time_dt();
+
+ path_create_directories(cubin);
+
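+ /* With the default base this resolves to "<source>/kernel/device/cuda/kernel.cu". */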
+ source_path = path_join(path_join(source_path, "kernel"),
+ path_join("device", path_join(base, string_printf("%s.cu", name))));
+
+ string command = string_printf(
+ "\"%s\" "
+ "-arch=%s_%d%d "
+ "--%s \"%s\" "
+ "-o \"%s\" "
+ "%s",
+ nvcc,
+ kernel_arch,
+ major,
+ minor,
+ kernel_ext,
+ source_path.c_str(),
+ cubin.c_str(),
+ common_cflags.c_str());
+
+ printf("Compiling CUDA kernel ...\n%s\n", command.c_str());
+
+# ifdef _WIN32
+ command = "call " + command;
+# endif
+ if (system(command.c_str()) != 0) {
+ set_error(
+ "Failed to execute compilation command, "
+ "see console for details.");
+ return string();
+ }
+
+ /* Verify if compilation succeeded */
+ if (!path_exists(cubin)) {
+ set_error(
+ "CUDA kernel compilation failed, "
+ "see console for details.");
+ return string();
+ }
+
+ printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
+
+ return cubin;
+}
+
+bool CUDADevice::load_kernels(const uint kernel_features)
+{
+ /* TODO(sergey): Support kernel re-loading for CUDA devices.
+ *
+ * Currently re-loading the kernel will invalidate memory pointers,
+ * causing problems in cuCtxSynchronize.
+ */
+ if (cuModule) {
+ VLOG(1) << "Skipping kernel reload, not currently supported.";
+ return true;
+ }
+
+ /* check if cuda init succeeded */
+ if (cuContext == 0)
+ return false;
+
+ /* check if GPU is supported */
+ if (!support_device(kernel_features))
+ return false;
+
+ /* get kernel */
+ const char *kernel_name = "kernel";
+ string cubin = compile_kernel(kernel_features, kernel_name);
+ if (cubin.empty())
+ return false;
+
+ /* open module */
+ CUDAContextScope scope(this);
+
+ string cubin_data;
+ CUresult result;
+
+ if (path_read_text(cubin, cubin_data))
+ result = cuModuleLoadData(&cuModule, cubin_data.c_str());
+ else
+ result = CUDA_ERROR_FILE_NOT_FOUND;
+
+ if (result != CUDA_SUCCESS)
+ set_error(string_printf(
+ "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result)));
+
+ if (result == CUDA_SUCCESS) {
+ kernels.load(this);
+ reserve_local_memory(kernel_features);
+ }
+
+ return (result == CUDA_SUCCESS);
+}
+
+void CUDADevice::reserve_local_memory(const uint /* kernel_features */)
+{
+ /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
+ * needed for kernel launches, so that we can reliably figure out when
+ * to allocate scene data in mapped host memory. */
+ size_t total = 0, free_before = 0, free_after = 0;
+
+ {
+ CUDAContextScope scope(this);
+ cuMemGetInfo(&free_before, &total);
+ }
+
+ {
+ /* Use the biggest kernel for estimation. */
+ const DeviceKernel test_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE;
+
+ /* Launch the kernel; using just 1 block appears sufficient to reserve memory
+ * for all multiprocessors. It would still be good to do this in parallel across
+ * devices in the multi-GPU case, to make it faster. */
+ CUDADeviceQueue queue(this);
+
+ void *d_path_index = nullptr;
+ void *d_render_buffer = nullptr;
+ int d_work_size = 0;
+ void *args[] = {&d_path_index, &d_render_buffer, &d_work_size};
+
+ queue.init_execution();
+ queue.enqueue(test_kernel, 1, args);
+ queue.synchronize();
+ }
+
+ {
+ CUDAContextScope scope(this);
+ cuMemGetInfo(&free_after, &total);
+ }
+
+ VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after)
+ << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
+
+# if 0
+ /* For testing mapped host memory, fill up device memory. */
+ const size_t keep_mb = 1024;
+
+ while (free_after > keep_mb * 1024 * 1024LL) {
+ CUdeviceptr tmp;
+ cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
+ cuMemGetInfo(&free_after, &total);
+ }
+# endif
+}
+
+void CUDADevice::init_host_memory()
+{
+ /* Limit amount of host mapped memory, because allocating too much can
+ * cause system instability. Leave at least half or 4 GB of system
+ * memory free, whichever is smaller. */
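+ /* For example: with 16 GB of system RAM the limit is 16 - 4 = 12 GB; with 6 GB it is 6 / 2 = 3 GB. */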
+ size_t default_limit = 4 * 1024 * 1024 * 1024LL;
+ size_t system_ram = system_physical_ram();
+
+ if (system_ram > 0) {
+ if (system_ram / 2 > default_limit) {
+ map_host_limit = system_ram - default_limit;
+ }
+ else {
+ map_host_limit = system_ram / 2;
+ }
+ }
+ else {
+ VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
+ map_host_limit = 0;
+ }
+
+ /* Amount of device memory to keep free after texture memory
+ * and working memory allocations respectively. We set the working
+ * memory headroom lower so that some space is left after all
+ * texture memory allocations. */
+ device_working_headroom = 32 * 1024 * 1024LL; // 32MB
+ device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
+
+ VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
+ << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
+}
+
+void CUDADevice::load_texture_info()
+{
+ if (need_texture_info) {
+ /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
+ * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
+ need_texture_info = false;
+ texture_info.copy_to_device();
+ }
+}
+
+void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
+{
+ /* Break out of recursive call, which can happen when moving memory on a multi device. */
+ static bool any_device_moving_textures_to_host = false;
+ if (any_device_moving_textures_to_host) {
+ return;
+ }
+
+ /* Signal to reallocate textures in host memory only. */
+ move_texture_to_host = true;
+
+ while (size > 0) {
+ /* Find suitable memory allocation to move. */
+ device_memory *max_mem = NULL;
+ size_t max_size = 0;
+ bool max_is_image = false;
+
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
+ device_memory &mem = *pair.first;
+ CUDAMem *cmem = &pair.second;
+
+ /* Can only move textures allocated on this device (and not those from
+ * peer devices), and need to ignore memory that is already on the host. */
+ if (!mem.is_resident(this) || cmem->use_mapped_host) {
+ continue;
+ }
+
+ bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
+ (&mem != &texture_info);
+ bool is_image = is_texture && (mem.data_height > 1);
+
+ /* Can't move this type of memory. */
+ if (!is_texture || cmem->array) {
+ continue;
+ }
+
+ /* For other textures, only move image textures. */
+ if (for_texture && !is_image) {
+ continue;
+ }
+
+ /* Try to move largest allocation, prefer moving images. */
+ if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
+ max_is_image = is_image;
+ max_size = mem.device_size;
+ max_mem = &mem;
+ }
+ }
+ lock.unlock();
+
+ /* Move to host memory. This part is mutex protected since
+ * multiple CUDA devices could be moving the memory. The
+ * first one will do it, and the rest will adopt the pointer. */
+ if (max_mem) {
+ VLOG(1) << "Move memory from device to host: " << max_mem->name;
+
+ static thread_mutex move_mutex;
+ thread_scoped_lock lock(move_mutex);
+
+ any_device_moving_textures_to_host = true;
+
+ /* Potentially need to call back into multi device, so pointer mapping
+ * and peer devices are updated. This is also necessary since the device
+ * pointer may just be a key here, so cannot be accessed and freed directly.
+ * Unfortunately it does mean that memory is reallocated on all other
+ * devices as well, which is potentially dangerous when still in use (since
+ * a thread rendering on another device would only be caught in this mutex
+ * if it so happens to do an allocation at the same time as well). */
+ max_mem->device_copy_to();
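+ /* Reduce the request by the amount just moved; the loop continues until it is satisfied or nothing movable remains. */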
+ size = (max_size >= size) ? 0 : size - max_size;
+
+ any_device_moving_textures_to_host = false;
+ }
+ else {
+ break;
+ }
+ }
+
+ /* Unset flag before texture info is reloaded, since it should stay in device memory. */
+ move_texture_to_host = false;
+
+ /* Update texture info array with new pointers. */
+ load_texture_info();
+}
+
+CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
+{
+ CUDAContextScope scope(this);
+
+ CUdeviceptr device_pointer = 0;
+ size_t size = mem.memory_size() + pitch_padding;
+
+ CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
+ const char *status = "";
+
+ /* First try allocating in device memory, respecting headroom. We make
+ * an exception for texture info. It is small and frequently accessed,
+ * so treat it as working memory.
+ *
+ * If there is not enough room for working memory, we will try to move
+ * textures to host memory, assuming the performance impact would have
+ * been worse for working memory. */
+ bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
+ bool is_image = is_texture && (mem.data_height > 1);
+
+ size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
+
+ size_t total = 0, free = 0;
+ cuMemGetInfo(&free, &total);
+
+ /* Move textures to host memory if needed. */
+ if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
+ move_textures_to_host(size + headroom - free, is_texture);
+ cuMemGetInfo(&free, &total);
+ }
+
+ /* Allocate in device memory. */
+ if (!move_texture_to_host && (size + headroom) < free) {
+ mem_alloc_result = cuMemAlloc(&device_pointer, size);
+ if (mem_alloc_result == CUDA_SUCCESS) {
+ status = " in device memory";
+ }
+ }
+
+ /* Fall back to mapped host memory if needed and possible. */
+
+ void *shared_pointer = 0;
+
+ if (mem_alloc_result != CUDA_SUCCESS && can_map_host) {
+ if (mem.shared_pointer) {
+ /* Another device already allocated host memory. */
+ mem_alloc_result = CUDA_SUCCESS;
+ shared_pointer = mem.shared_pointer;
+ }
+ else if (map_host_used + size < map_host_limit) {
+ /* Allocate host memory ourselves. */
+ mem_alloc_result = cuMemHostAlloc(
+ &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
+
+ assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
+ (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
+ }
+
+ if (mem_alloc_result == CUDA_SUCCESS) {
+ cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
+ map_host_used += size;
+ status = " in host memory";
+ }
+ }
+
+ if (mem_alloc_result != CUDA_SUCCESS) {
+ status = " failed, out of device and host memory";
+ set_error("System is out of GPU and shared host memory");
+ }
+
+ if (mem.name) {
+ VLOG(1) << "Buffer allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")" << status;
+ }
+
+ mem.device_pointer = (device_ptr)device_pointer;
+ mem.device_size = size;
+ stats.mem_alloc(size);
+
+ if (!mem.device_pointer) {
+ return NULL;
+ }
+
+ /* Insert into map of allocations. */
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ CUDAMem *cmem = &cuda_mem_map[&mem];
+ if (shared_pointer != 0) {
+ /* Replace host pointer with our host allocation. Only works if
+ * CUDA memory layout is the same and has no pitch padding. Also
+ * does not work if we move textures to host during a render,
+ * since other devices might be using the memory. */
+
+ if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
+ mem.host_pointer != shared_pointer) {
+ memcpy(shared_pointer, mem.host_pointer, size);
+
+ /* A call to device_memory::host_free() should be preceded by
+ * a call to device_memory::device_free() for host memory
+ * allocated by a device to be handled properly. Two exceptions
+ * are here and a call in OptiXDevice::generic_alloc(), where
+ * the current host memory can be assumed to be allocated by
+ * device_memory::host_alloc(), not by a device. */
+
+ mem.host_free();
+ mem.host_pointer = shared_pointer;
+ }
+ mem.shared_pointer = shared_pointer;
+ mem.shared_counter++;
+ cmem->use_mapped_host = true;
+ }
+ else {
+ cmem->use_mapped_host = false;
+ }
+
+ return cmem;
+}
+
+void CUDADevice::generic_copy_to(device_memory &mem)
+{
+ if (!mem.host_pointer || !mem.device_pointer) {
+ return;
+ }
+
+ /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
+ * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
+ * mem.host_pointer. */
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
+ const CUDAContextScope scope(this);
+ cuda_assert(
+ cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
+ }
+}
+
+void CUDADevice::generic_free(device_memory &mem)
+{
+ if (mem.device_pointer) {
+ CUDAContextScope scope(this);
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ const CUDAMem &cmem = cuda_mem_map[&mem];
+
+ /* If cmem.use_mapped_host is true, reference counting is used
+ * to safely free the mapped host memory. */
+
+ if (cmem.use_mapped_host) {
+ assert(mem.shared_pointer);
+ if (mem.shared_pointer) {
+ assert(mem.shared_counter > 0);
+ if (--mem.shared_counter == 0) {
+ if (mem.host_pointer == mem.shared_pointer) {
+ mem.host_pointer = 0;
+ }
+ cuMemFreeHost(mem.shared_pointer);
+ mem.shared_pointer = 0;
+ }
+ }
+ map_host_used -= mem.device_size;
+ }
+ else {
+ /* Free device memory. */
+ cuda_assert(cuMemFree(mem.device_pointer));
+ }
+
+ stats.mem_free(mem.device_size);
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+
+ cuda_mem_map.erase(cuda_mem_map.find(&mem));
+ }
+}
+
+void CUDADevice::mem_alloc(device_memory &mem)
+{
+ if (mem.type == MEM_TEXTURE) {
+ assert(!"mem_alloc not supported for textures.");
+ }
+ else if (mem.type == MEM_GLOBAL) {
+ assert(!"mem_alloc not supported for global memory.");
+ }
+ else {
+ generic_alloc(mem);
+ }
+}
+
+void CUDADevice::mem_copy_to(device_memory &mem)
+{
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ global_alloc(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ tex_alloc((device_texture &)mem);
+ }
+ else {
+ if (!mem.device_pointer) {
+ generic_alloc(mem);
+ }
+ generic_copy_to(mem);
+ }
+}
+
+void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
+{
+ if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
+ assert(!"mem_copy_from not supported for textures.");
+ }
+ else if (mem.host_pointer) {
+ const size_t size = elem * w * h;
+ const size_t offset = elem * y * w;
+
+ if (mem.device_pointer) {
+ const CUDAContextScope scope(this);
+ cuda_assert(cuMemcpyDtoH(
+ (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
+ }
+ else {
+ memset((char *)mem.host_pointer + offset, 0, size);
+ }
+ }
+}
+
+void CUDADevice::mem_zero(device_memory &mem)
+{
+ if (!mem.device_pointer) {
+ mem_alloc(mem);
+ }
+ if (!mem.device_pointer) {
+ return;
+ }
+
+ /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
+ * regardless of mem.host_pointer and mem.shared_pointer. */
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
+ const CUDAContextScope scope(this);
+ cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
+ }
+ else if (mem.host_pointer) {
+ memset(mem.host_pointer, 0, mem.memory_size());
+ }
+}
+
+void CUDADevice::mem_free(device_memory &mem)
+{
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ }
+ else {
+ generic_free(mem);
+ }
+}
+
+device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
+{
+ return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
+}
+
+void CUDADevice::const_copy_to(const char *name, void *host, size_t size)
+{
+ CUDAContextScope scope(this);
+ CUdeviceptr mem;
+ size_t bytes;
+
+ cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
+ // assert(bytes == size);
+ cuda_assert(cuMemcpyHtoD(mem, host, size));
+}
+
+void CUDADevice::global_alloc(device_memory &mem)
+{
+ if (mem.is_resident(this)) {
+ generic_alloc(mem);
+ generic_copy_to(mem);
+ }
+
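+ /* Kernels reference global memory by name: copy the device pointer value into the module constant of the same name. */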
+ const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
+}
+
+void CUDADevice::global_free(device_memory &mem)
+{
+ if (mem.is_resident(this) && mem.device_pointer) {
+ generic_free(mem);
+ }
+}
+
+void CUDADevice::tex_alloc(device_texture &mem)
+{
+ CUDAContextScope scope(this);
+
+ /* General variables for both architectures */
+ string bind_name = mem.name;
+ size_t dsize = datatype_size(mem.data_type);
+ size_t size = mem.memory_size();
+
+ CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
+ switch (mem.info.extension) {
+ case EXTENSION_REPEAT:
+ address_mode = CU_TR_ADDRESS_MODE_WRAP;
+ break;
+ case EXTENSION_EXTEND:
+ address_mode = CU_TR_ADDRESS_MODE_CLAMP;
+ break;
+ case EXTENSION_CLIP:
+ address_mode = CU_TR_ADDRESS_MODE_BORDER;
+ break;
+ default:
+ assert(0);
+ break;
+ }
+
+ CUfilter_mode filter_mode;
+ if (mem.info.interpolation == INTERPOLATION_CLOSEST) {
+ filter_mode = CU_TR_FILTER_MODE_POINT;
+ }
+ else {
+ filter_mode = CU_TR_FILTER_MODE_LINEAR;
+ }
+
+ /* Image Texture Storage */
+ CUarray_format_enum format;
+ switch (mem.data_type) {
+ case TYPE_UCHAR:
+ format = CU_AD_FORMAT_UNSIGNED_INT8;
+ break;
+ case TYPE_UINT16:
+ format = CU_AD_FORMAT_UNSIGNED_INT16;
+ break;
+ case TYPE_UINT:
+ format = CU_AD_FORMAT_UNSIGNED_INT32;
+ break;
+ case TYPE_INT:
+ format = CU_AD_FORMAT_SIGNED_INT32;
+ break;
+ case TYPE_FLOAT:
+ format = CU_AD_FORMAT_FLOAT;
+ break;
+ case TYPE_HALF:
+ format = CU_AD_FORMAT_HALF;
+ break;
+ default:
+ assert(0);
+ return;
+ }
+
+ CUDAMem *cmem = NULL;
+ CUarray array_3d = NULL;
+ size_t src_pitch = mem.data_width * dsize * mem.data_elements;
+ size_t dst_pitch = src_pitch;
+
+ if (!mem.is_resident(this)) {
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ cmem = &cuda_mem_map[&mem];
+ cmem->texobject = 0;
+
+ if (mem.data_depth > 1) {
+ array_3d = (CUarray)mem.device_pointer;
+ cmem->array = array_3d;
+ }
+ else if (mem.data_height > 0) {
+ dst_pitch = align_up(src_pitch, pitch_alignment);
+ }
+ }
+ else if (mem.data_depth > 1) {
+ /* 3D texture using array, there is no API for linear memory. */
+ CUDA_ARRAY3D_DESCRIPTOR desc;
+
+ desc.Width = mem.data_width;
+ desc.Height = mem.data_height;
+ desc.Depth = mem.data_depth;
+ desc.Format = format;
+ desc.NumChannels = mem.data_elements;
+ desc.Flags = 0;
+
+ VLOG(1) << "Array 3D allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+
+ cuda_assert(cuArray3DCreate(&array_3d, &desc));
+
+ if (!array_3d) {
+ return;
+ }
+
+ CUDA_MEMCPY3D param;
+ memset(&param, 0, sizeof(param));
+ param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+ param.dstArray = array_3d;
+ param.srcMemoryType = CU_MEMORYTYPE_HOST;
+ param.srcHost = mem.host_pointer;
+ param.srcPitch = src_pitch;
+ param.WidthInBytes = param.srcPitch;
+ param.Height = mem.data_height;
+ param.Depth = mem.data_depth;
+
+ cuda_assert(cuMemcpy3D(&param));
+
+ mem.device_pointer = (device_ptr)array_3d;
+ mem.device_size = size;
+ stats.mem_alloc(size);
+
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ cmem = &cuda_mem_map[&mem];
+ cmem->texobject = 0;
+ cmem->array = array_3d;
+ }
+ else if (mem.data_height > 0) {
+ /* 2D texture, using pitch aligned linear memory. */
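+ /* e.g. with pitch_alignment 32, a 333 pixel wide uchar4 texture has src_pitch 1332, rounded up to dst_pitch 1344. */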
+ dst_pitch = align_up(src_pitch, pitch_alignment);
+ size_t dst_size = dst_pitch * mem.data_height;
+
+ cmem = generic_alloc(mem, dst_size - mem.memory_size());
+ if (!cmem) {
+ return;
+ }
+
+ CUDA_MEMCPY2D param;
+ memset(&param, 0, sizeof(param));
+ param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+ param.dstDevice = mem.device_pointer;
+ param.dstPitch = dst_pitch;
+ param.srcMemoryType = CU_MEMORYTYPE_HOST;
+ param.srcHost = mem.host_pointer;
+ param.srcPitch = src_pitch;
+ param.WidthInBytes = param.srcPitch;
+ param.Height = mem.data_height;
+
+ cuda_assert(cuMemcpy2DUnaligned(&param));
+ }
+ else {
+ /* 1D texture, using linear memory. */
+ cmem = generic_alloc(mem);
+ if (!cmem) {
+ return;
+ }
+
+ cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
+ }
+
+ /* Resize once */
+ const uint slot = mem.slot;
+ if (slot >= texture_info.size()) {
+ /* Allocate some slots in advance, to reduce amount
+ * of re-allocations. */
+ texture_info.resize(slot + 128);
+ }
+
+ /* Set the mapping, and tag that we need to (re-)upload to the device. */
+ texture_info[slot] = mem.info;
+ need_texture_info = true;
+
+ if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
+ mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
+ /* Kepler+, bindless textures. */
+ CUDA_RESOURCE_DESC resDesc;
+ memset(&resDesc, 0, sizeof(resDesc));
+
+ if (array_3d) {
+ resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+ resDesc.res.array.hArray = array_3d;
+ resDesc.flags = 0;
+ }
+ else if (mem.data_height > 0) {
+ resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
+ resDesc.res.pitch2D.devPtr = mem.device_pointer;
+ resDesc.res.pitch2D.format = format;
+ resDesc.res.pitch2D.numChannels = mem.data_elements;
+ resDesc.res.pitch2D.height = mem.data_height;
+ resDesc.res.pitch2D.width = mem.data_width;
+ resDesc.res.pitch2D.pitchInBytes = dst_pitch;
+ }
+ else {
+ resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
+ resDesc.res.linear.devPtr = mem.device_pointer;
+ resDesc.res.linear.format = format;
+ resDesc.res.linear.numChannels = mem.data_elements;
+ resDesc.res.linear.sizeInBytes = mem.device_size;
+ }
+
+ CUDA_TEXTURE_DESC texDesc;
+ memset(&texDesc, 0, sizeof(texDesc));
+ texDesc.addressMode[0] = address_mode;
+ texDesc.addressMode[1] = address_mode;
+ texDesc.addressMode[2] = address_mode;
+ texDesc.filterMode = filter_mode;
+ texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ cmem = &cuda_mem_map[&mem];
+
+ cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
+
+ texture_info[slot].data = (uint64_t)cmem->texobject;
+ }
+ else {
+ texture_info[slot].data = (uint64_t)mem.device_pointer;
+ }
+}
+
+void CUDADevice::tex_free(device_texture &mem)
+{
+ if (mem.device_pointer) {
+ CUDAContextScope scope(this);
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ const CUDAMem &cmem = cuda_mem_map[&mem];
+
+ if (cmem.texobject) {
+ /* Free bindless texture. */
+ cuTexObjectDestroy(cmem.texobject);
+ }
+
+ if (!mem.is_resident(this)) {
+ /* Do not free memory here, since it was allocated on a different device. */
+ cuda_mem_map.erase(cuda_mem_map.find(&mem));
+ }
+ else if (cmem.array) {
+ /* Free array. */
+ cuArrayDestroy(cmem.array);
+ stats.mem_free(mem.device_size);
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+
+ cuda_mem_map.erase(cuda_mem_map.find(&mem));
+ }
+ else {
+ lock.unlock();
+ generic_free(mem);
+ }
+ }
+}
+
+# if 0
+void CUDADevice::render(DeviceTask &task,
+ RenderTile &rtile,
+ device_vector<KernelWorkTile> &work_tiles)
+{
+ scoped_timer timer(&rtile.buffers->render_time);
+
+ if (have_error())
+ return;
+
+ CUDAContextScope scope(this);
+ CUfunction cuRender;
+
+ /* Get kernel function. */
+ if (rtile.task == RenderTile::BAKE) {
+ cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
+ }
+ else {
+ cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
+ }
+
+ if (have_error()) {
+ return;
+ }
+
+ cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
+
+ /* Allocate work tile. */
+ work_tiles.alloc(1);
+
+ KernelWorkTile *wtile = work_tiles.data();
+ wtile->x = rtile.x;
+ wtile->y = rtile.y;
+ wtile->w = rtile.w;
+ wtile->h = rtile.h;
+ wtile->offset = rtile.offset;
+ wtile->stride = rtile.stride;
+ wtile->buffer = (float *)(CUdeviceptr)rtile.buffer;
+
+ /* Prepare work size. More step samples render faster, but for now we
+ * remain conservative for GPUs connected to a display to avoid driver
+ * timeouts and display freezing. */
+ int min_blocks, num_threads_per_block;
+ cuda_assert(
+ cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
+ if (!info.display_device) {
+ min_blocks *= 8;
+ }
+
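+ /* Pick enough samples per launch to roughly saturate the device: min_blocks * threads of work spread over the tile's pixels. */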
+ uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
+
+ /* Render all samples. */
+ uint start_sample = rtile.start_sample;
+ uint end_sample = rtile.start_sample + rtile.num_samples;
+
+ for (int sample = start_sample; sample < end_sample;) {
+ /* Setup and copy work tile to device. */
+ wtile->start_sample = sample;
+ wtile->num_samples = step_samples;
+ if (task.adaptive_sampling.use) {
+ wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
+ }
+ wtile->num_samples = min(wtile->num_samples, end_sample - sample);
+ work_tiles.copy_to_device();
+
+ CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
+ uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
+ uint num_blocks = divide_up(total_work_size, num_threads_per_block);
+
+ /* Launch kernel. */
+ void *args[] = {&d_work_tiles, &total_work_size};
+
+ cuda_assert(
+ cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
+
+ /* Run the adaptive sampling kernels at selected samples aligned to step samples. */
+ uint filter_sample = sample + wtile->num_samples - 1;
+ if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
+ adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);
+ }
+
+ cuda_assert(cuCtxSynchronize());
+
+ /* Update progress. */
+ sample += wtile->num_samples;
+ rtile.sample = sample;
+ task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
+
+ if (task.get_cancel()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+ }
+
+ /* Finalize adaptive sampling. */
+ if (task.adaptive_sampling.use) {
+ CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
+ adaptive_sampling_post(rtile, wtile, d_work_tiles);
+ cuda_assert(cuCtxSynchronize());
+ task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
+ }
+}
+
+void CUDADevice::thread_run(DeviceTask &task)
+{
+ CUDAContextScope scope(this);
+
+ if (task.type == DeviceTask::RENDER) {
+ device_vector<KernelWorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
+
+ /* keep rendering tiles until done */
+ RenderTile tile;
+ DenoisingTask denoising(this, task);
+
+ while (task.acquire_tile(this, tile, task.tile_types)) {
+ if (tile.task == RenderTile::PATH_TRACE) {
+ render(task, tile, work_tiles);
+ }
+ else if (tile.task == RenderTile::BAKE) {
+ render(task, tile, work_tiles);
+ }
+
+ task.release_tile(tile);
+
+ if (task.get_cancel()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+ }
+
+ work_tiles.free();
+ }
+}
+# endif
+
+unique_ptr<DeviceQueue> CUDADevice::gpu_queue_create()
+{
+ return make_unique<CUDADeviceQueue>(this);
+}
+
+bool CUDADevice::should_use_graphics_interop()
+{
+ /* Check whether this device is part of the OpenGL context.
+ *
+ * Using a CUDA device that is not part of the OpenGL context for graphics interoperability
+ * is possible, but empirical measurements show it can be considerably slower than a naive
+ * pixel copy. */
+
+ CUDAContextScope scope(this);
+
+ int num_all_devices = 0;
+ cuda_assert(cuDeviceGetCount(&num_all_devices));
+
+ if (num_all_devices == 0) {
+ return false;
+ }
+
+ vector<CUdevice> gl_devices(num_all_devices);
+ uint num_gl_devices;
+ cuGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, CU_GL_DEVICE_LIST_ALL);
+
+ for (CUdevice gl_device : gl_devices) {
+ if (gl_device == cuDevice) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+int CUDADevice::get_num_multiprocessors()
+{
+ return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0);
+}
+
+int CUDADevice::get_max_num_threads_per_multiprocessor()
+{
+ return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 0);
+}
+
+bool CUDADevice::get_device_attribute(CUdevice_attribute attribute, int *value)
+{
+ CUDAContextScope scope(this);
+
+ return cuDeviceGetAttribute(value, attribute, cuDevice) == CUDA_SUCCESS;
+}
+
+int CUDADevice::get_device_default_attribute(CUdevice_attribute attribute, int default_value)
+{
+ int value = 0;
+ if (!get_device_attribute(attribute, &value)) {
+ return default_value;
+ }
+ return value;
+}
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/cuda/device_impl.h b/intern/cycles/device/cuda/device_impl.h
new file mode 100644
index 00000000000..6b27db54ab4
--- /dev/null
+++ b/intern/cycles/device/cuda/device_impl.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/cuda/kernel.h"
+# include "device/cuda/queue.h"
+# include "device/cuda/util.h"
+# include "device/device.h"
+
+# include "util/util_map.h"
+
+# ifdef WITH_CUDA_DYNLOAD
+# include "cuew.h"
+# else
+# include "util/util_opengl.h"
+# include <cuda.h>
+# include <cudaGL.h>
+# endif
+
+CCL_NAMESPACE_BEGIN
+
+class DeviceQueue;
+
+class CUDADevice : public Device {
+
+ friend class CUDAContextScope;
+
+ public:
+ CUdevice cuDevice;
+ CUcontext cuContext;
+ CUmodule cuModule;
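+  /* Memory management: headroom to keep free on the device, and limits for
+   * mapped host memory. */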
+ size_t device_texture_headroom;
+ size_t device_working_headroom;
+ bool move_texture_to_host;
+ size_t map_host_used;
+ size_t map_host_limit;
+ int can_map_host;
+ int pitch_alignment;
+ int cuDevId;
+ int cuDevArchitecture;
+ bool first_error;
+
+ struct CUDAMem {
+ CUDAMem() : texobject(0), array(0), use_mapped_host(false)
+ {
+ }
+
+ CUtexObject texobject;
+ CUarray array;
+
+    /* If true, mapped host memory in shared_pointer is being used. */
+ bool use_mapped_host;
+ };
+ typedef map<device_memory *, CUDAMem> CUDAMemMap;
+ CUDAMemMap cuda_mem_map;
+ thread_mutex cuda_mem_map_mutex;
+
+ /* Bindless Textures */
+ device_vector<TextureInfo> texture_info;
+ bool need_texture_info;
+
+ CUDADeviceKernels kernels;
+
+ static bool have_precompiled_kernels();
+
+ virtual bool show_samples() const override;
+
+ virtual BVHLayoutMask get_bvh_layout_mask() const override;
+
+ void set_error(const string &error) override;
+
+ CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+ virtual ~CUDADevice();
+
+ bool support_device(const uint /*kernel_features*/);
+
+ bool check_peer_access(Device *peer_device) override;
+
+ bool use_adaptive_compilation();
+
+ virtual string compile_kernel_get_common_cflags(const uint kernel_features);
+
+ string compile_kernel(const uint kernel_features,
+ const char *name,
+ const char *base = "cuda",
+ bool force_ptx = false);
+
+ virtual bool load_kernels(const uint kernel_features) override;
+
+ void reserve_local_memory(const uint kernel_features);
+
+ void init_host_memory();
+
+ void load_texture_info();
+
+ void move_textures_to_host(size_t size, bool for_texture);
+
+ CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
+
+ void generic_copy_to(device_memory &mem);
+
+ void generic_free(device_memory &mem);
+
+ void mem_alloc(device_memory &mem) override;
+
+ void mem_copy_to(device_memory &mem) override;
+
+ void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override;
+
+ void mem_zero(device_memory &mem) override;
+
+ void mem_free(device_memory &mem) override;
+
+ device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override;
+
+ virtual void const_copy_to(const char *name, void *host, size_t size) override;
+
+ void global_alloc(device_memory &mem);
+
+ void global_free(device_memory &mem);
+
+ void tex_alloc(device_texture &mem);
+
+ void tex_free(device_texture &mem);
+
+ virtual bool should_use_graphics_interop() override;
+
+ virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
+
+ int get_num_multiprocessors();
+ int get_max_num_threads_per_multiprocessor();
+
+ protected:
+ bool get_device_attribute(CUdevice_attribute attribute, int *value);
+ int get_device_default_attribute(CUdevice_attribute attribute, int default_value);
+};
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/cuda/graphics_interop.cpp b/intern/cycles/device/cuda/graphics_interop.cpp
new file mode 100644
index 00000000000..e8ca8b90eae
--- /dev/null
+++ b/intern/cycles/device/cuda/graphics_interop.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/cuda/graphics_interop.h"
+
+# include "device/cuda/device_impl.h"
+# include "device/cuda/util.h"
+
+CCL_NAMESPACE_BEGIN
+
+CUDADeviceGraphicsInterop::CUDADeviceGraphicsInterop(CUDADeviceQueue *queue)
+ : queue_(queue), device_(static_cast<CUDADevice *>(queue->device))
+{
+}
+
+CUDADeviceGraphicsInterop::~CUDADeviceGraphicsInterop()
+{
+ CUDAContextScope scope(device_);
+
+ if (cu_graphics_resource_) {
+ cuda_device_assert(device_, cuGraphicsUnregisterResource(cu_graphics_resource_));
+ }
+}
+
+void CUDADeviceGraphicsInterop::set_destination(
+ const DeviceGraphicsInteropDestination &destination)
+{
+ const int64_t new_buffer_area = int64_t(destination.buffer_width) * destination.buffer_height;
+
+ need_clear_ = destination.need_clear;
+
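+  /* Re-registration is only needed when the PBO handle or its size changed. */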
+ if (opengl_pbo_id_ == destination.opengl_pbo_id && buffer_area_ == new_buffer_area) {
+ return;
+ }
+
+ CUDAContextScope scope(device_);
+
+ if (cu_graphics_resource_) {
+ cuda_device_assert(device_, cuGraphicsUnregisterResource(cu_graphics_resource_));
+ }
+
+ const CUresult result = cuGraphicsGLRegisterBuffer(
+ &cu_graphics_resource_, destination.opengl_pbo_id, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
+ if (result != CUDA_SUCCESS) {
+ LOG(ERROR) << "Error registering OpenGL buffer: " << cuewErrorString(result);
+ }
+
+ opengl_pbo_id_ = destination.opengl_pbo_id;
+ buffer_area_ = new_buffer_area;
+}
+
+device_ptr CUDADeviceGraphicsInterop::map()
+{
+ if (!cu_graphics_resource_) {
+ return 0;
+ }
+
+ CUDAContextScope scope(device_);
+
+ CUdeviceptr cu_buffer;
+ size_t bytes;
+
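+  /* Map the graphics resource and retrieve the device pointer to use on the queue's stream. */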
+ cuda_device_assert(device_, cuGraphicsMapResources(1, &cu_graphics_resource_, queue_->stream()));
+ cuda_device_assert(
+ device_, cuGraphicsResourceGetMappedPointer(&cu_buffer, &bytes, cu_graphics_resource_));
+
+ if (need_clear_) {
+ cuda_device_assert(
+ device_, cuMemsetD8Async(static_cast<CUdeviceptr>(cu_buffer), 0, bytes, queue_->stream()));
+
+ need_clear_ = false;
+ }
+
+ return static_cast<device_ptr>(cu_buffer);
+}
+
+void CUDADeviceGraphicsInterop::unmap()
+{
+ CUDAContextScope scope(device_);
+
+ cuda_device_assert(device_,
+ cuGraphicsUnmapResources(1, &cu_graphics_resource_, queue_->stream()));
+}
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/cuda/graphics_interop.h b/intern/cycles/device/cuda/graphics_interop.h
new file mode 100644
index 00000000000..8a70c8aa71d
--- /dev/null
+++ b/intern/cycles/device/cuda/graphics_interop.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/device_graphics_interop.h"
+
+# ifdef WITH_CUDA_DYNLOAD
+# include "cuew.h"
+# else
+# include <cuda.h>
+# endif
+
+CCL_NAMESPACE_BEGIN
+
+class CUDADevice;
+class CUDADeviceQueue;
+
+class CUDADeviceGraphicsInterop : public DeviceGraphicsInterop {
+ public:
+ explicit CUDADeviceGraphicsInterop(CUDADeviceQueue *queue);
+
+ CUDADeviceGraphicsInterop(const CUDADeviceGraphicsInterop &other) = delete;
+ CUDADeviceGraphicsInterop(CUDADeviceGraphicsInterop &&other) noexcept = delete;
+
+ ~CUDADeviceGraphicsInterop();
+
+ CUDADeviceGraphicsInterop &operator=(const CUDADeviceGraphicsInterop &other) = delete;
+ CUDADeviceGraphicsInterop &operator=(CUDADeviceGraphicsInterop &&other) = delete;
+
+ virtual void set_destination(const DeviceGraphicsInteropDestination &destination) override;
+
+ virtual device_ptr map() override;
+ virtual void unmap() override;
+
+ protected:
+ CUDADeviceQueue *queue_ = nullptr;
+ CUDADevice *device_ = nullptr;
+
+ /* OpenGL PBO which is currently registered as the destination for the CUDA buffer. */
+ uint opengl_pbo_id_ = 0;
+ /* Buffer area in pixels of the corresponding PBO. */
+ int64_t buffer_area_ = 0;
+
+ /* The destination was requested to be cleared. */
+ bool need_clear_ = false;
+
+ CUgraphicsResource cu_graphics_resource_ = nullptr;
+};
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/cuda/kernel.cpp b/intern/cycles/device/cuda/kernel.cpp
new file mode 100644
index 00000000000..0ed20ddf8e6
--- /dev/null
+++ b/intern/cycles/device/cuda/kernel.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/cuda/kernel.h"
+# include "device/cuda/device_impl.h"
+
+CCL_NAMESPACE_BEGIN
+
+void CUDADeviceKernels::load(CUDADevice *device)
+{
+ CUmodule cuModule = device->cuModule;
+
+ for (int i = 0; i < (int)DEVICE_KERNEL_NUM; i++) {
+ CUDADeviceKernel &kernel = kernels_[i];
+
+ /* No megakernel used for GPU. */
+ if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+ continue;
+ }
+
+ const std::string function_name = std::string("kernel_gpu_") +
+ device_kernel_as_string((DeviceKernel)i);
+ cuda_device_assert(device,
+ cuModuleGetFunction(&kernel.function, cuModule, function_name.c_str()));
+
+ if (kernel.function) {
+ cuda_device_assert(device, cuFuncSetCacheConfig(kernel.function, CU_FUNC_CACHE_PREFER_L1));
+
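+      /* Query an occupancy-optimal launch configuration for this kernel. */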
+ cuda_device_assert(
+ device,
+ cuOccupancyMaxPotentialBlockSize(
+ &kernel.min_blocks, &kernel.num_threads_per_block, kernel.function, NULL, 0, 0));
+ }
+ else {
+ LOG(ERROR) << "Unable to load kernel " << function_name;
+ }
+ }
+
+ loaded = true;
+}
+
+const CUDADeviceKernel &CUDADeviceKernels::get(DeviceKernel kernel) const
+{
+ return kernels_[(int)kernel];
+}
+
+bool CUDADeviceKernels::available(DeviceKernel kernel) const
+{
+ return kernels_[(int)kernel].function != nullptr;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/cuda/kernel.h b/intern/cycles/device/cuda/kernel.h
new file mode 100644
index 00000000000..b489547a350
--- /dev/null
+++ b/intern/cycles/device/cuda/kernel.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_CUDA
+
+# include "device/device_kernel.h"
+
+# ifdef WITH_CUDA_DYNLOAD
+# include "cuew.h"
+# else
+# include <cuda.h>
+# endif
+
+CCL_NAMESPACE_BEGIN
+
+class CUDADevice;
+
+/* CUDA kernel and associated occupancy information. */
+class CUDADeviceKernel {
+ public:
+ CUfunction function = nullptr;
+
+ int num_threads_per_block = 0;
+ int min_blocks = 0;
+};
+
+/* Cache of CUDA kernels for each DeviceKernel. */
+class CUDADeviceKernels {
+ public:
+ void load(CUDADevice *device);
+ const CUDADeviceKernel &get(DeviceKernel kernel) const;
+ bool available(DeviceKernel kernel) const;
+
+ protected:
+ CUDADeviceKernel kernels_[DEVICE_KERNEL_NUM];
+ bool loaded = false;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
new file mode 100644
index 00000000000..b7f86c10553
--- /dev/null
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -0,0 +1,220 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/cuda/queue.h"
+
+# include "device/cuda/device_impl.h"
+# include "device/cuda/graphics_interop.h"
+# include "device/cuda/kernel.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* CUDADeviceQueue */
+
+CUDADeviceQueue::CUDADeviceQueue(CUDADevice *device)
+ : DeviceQueue(device), cuda_device_(device), cuda_stream_(nullptr)
+{
+ const CUDAContextScope scope(cuda_device_);
+ cuda_device_assert(cuda_device_, cuStreamCreate(&cuda_stream_, CU_STREAM_NON_BLOCKING));
+}
+
+CUDADeviceQueue::~CUDADeviceQueue()
+{
+ const CUDAContextScope scope(cuda_device_);
+ cuStreamDestroy(cuda_stream_);
+}
+
+int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const
+{
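+  /* Use roughly 16 path states per hardware thread, with a lower bound of 2^20 states. */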
+ int num_states = max(cuda_device_->get_num_multiprocessors() *
+ cuda_device_->get_max_num_threads_per_multiprocessor() * 16,
+ 1048576);
+
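+  /* Allow overriding the state count through an environment variable, for debugging and tuning. */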
+ const char *factor_str = getenv("CYCLES_CONCURRENT_STATES_FACTOR");
+ if (factor_str) {
+ num_states = max((int)(num_states * atof(factor_str)), 1024);
+ }
+
+ VLOG(3) << "GPU queue concurrent states: " << num_states << ", using up to "
+ << string_human_readable_size(num_states * state_size);
+
+ return num_states;
+}
+
+int CUDADeviceQueue::num_concurrent_busy_states() const
+{
+ const int max_num_threads = cuda_device_->get_num_multiprocessors() *
+ cuda_device_->get_max_num_threads_per_multiprocessor();
+
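+  /* Fall back to a fixed count when the device attributes could not be queried. */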
+ if (max_num_threads == 0) {
+ return 65536;
+ }
+
+ return 4 * max_num_threads;
+}
+
+void CUDADeviceQueue::init_execution()
+{
+ /* Synchronize all textures and memory copies before executing task. */
+ CUDAContextScope scope(cuda_device_);
+ cuda_device_->load_texture_info();
+ cuda_device_assert(cuda_device_, cuCtxSynchronize());
+
+ debug_init_execution();
+}
+
+bool CUDADeviceQueue::kernel_available(DeviceKernel kernel) const
+{
+ return cuda_device_->kernels.available(kernel);
+}
+
+bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
+{
+ if (cuda_device_->have_error()) {
+ return false;
+ }
+
+ debug_enqueue(kernel, work_size);
+
+ const CUDAContextScope scope(cuda_device_);
+ const CUDADeviceKernel &cuda_kernel = cuda_device_->kernels.get(kernel);
+
+ /* Compute kernel launch parameters. */
+ const int num_threads_per_block = cuda_kernel.num_threads_per_block;
+ const int num_blocks = divide_up(work_size, num_threads_per_block);
+
+ int shared_mem_bytes = 0;
+
+ switch (kernel) {
+ case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+      /* See parallel_active_index.h for why this amount of shared memory is needed. */
+ shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);
+ break;
+
+ default:
+ break;
+ }
+
+ /* Launch kernel. */
+ cuda_device_assert(cuda_device_,
+ cuLaunchKernel(cuda_kernel.function,
+ num_blocks,
+ 1,
+ 1,
+ num_threads_per_block,
+ 1,
+ 1,
+ shared_mem_bytes,
+ cuda_stream_,
+ args,
+ 0));
+
+ return !(cuda_device_->have_error());
+}
+
+bool CUDADeviceQueue::synchronize()
+{
+ if (cuda_device_->have_error()) {
+ return false;
+ }
+
+ const CUDAContextScope scope(cuda_device_);
+ cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
+ debug_synchronize();
+
+ return !(cuda_device_->have_error());
+}
+
+void CUDADeviceQueue::zero_to_device(device_memory &mem)
+{
+ assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+ if (mem.memory_size() == 0) {
+ return;
+ }
+
+ /* Allocate on demand. */
+ if (mem.device_pointer == 0) {
+ cuda_device_->mem_alloc(mem);
+ }
+
+ /* Zero memory on device. */
+ assert(mem.device_pointer != 0);
+
+ const CUDAContextScope scope(cuda_device_);
+ cuda_device_assert(
+ cuda_device_,
+ cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_));
+}
+
+void CUDADeviceQueue::copy_to_device(device_memory &mem)
+{
+ assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+ if (mem.memory_size() == 0) {
+ return;
+ }
+
+ /* Allocate on demand. */
+ if (mem.device_pointer == 0) {
+ cuda_device_->mem_alloc(mem);
+ }
+
+ assert(mem.device_pointer != 0);
+ assert(mem.host_pointer != nullptr);
+
+ /* Copy memory to device. */
+ const CUDAContextScope scope(cuda_device_);
+ cuda_device_assert(
+ cuda_device_,
+ cuMemcpyHtoDAsync(
+ (CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_));
+}
+
+void CUDADeviceQueue::copy_from_device(device_memory &mem)
+{
+ assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+ if (mem.memory_size() == 0) {
+ return;
+ }
+
+ assert(mem.device_pointer != 0);
+ assert(mem.host_pointer != nullptr);
+
+ /* Copy memory from device. */
+ const CUDAContextScope scope(cuda_device_);
+ cuda_device_assert(
+ cuda_device_,
+ cuMemcpyDtoHAsync(
+ mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_));
+}
+
+unique_ptr<DeviceGraphicsInterop> CUDADeviceQueue::graphics_interop_create()
+{
+ return make_unique<CUDADeviceGraphicsInterop>(this);
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h
new file mode 100644
index 00000000000..62e3aa3d6c2
--- /dev/null
+++ b/intern/cycles/device/cuda/queue.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_CUDA
+
+# include "device/device_kernel.h"
+# include "device/device_memory.h"
+# include "device/device_queue.h"
+
+# include "device/cuda/util.h"
+
+CCL_NAMESPACE_BEGIN
+
+class CUDADevice;
+class device_memory;
+
+/* Base class for CUDA queues. */
+class CUDADeviceQueue : public DeviceQueue {
+ public:
+ CUDADeviceQueue(CUDADevice *device);
+ ~CUDADeviceQueue();
+
+ virtual int num_concurrent_states(const size_t state_size) const override;
+ virtual int num_concurrent_busy_states() const override;
+
+ virtual void init_execution() override;
+
+ virtual bool kernel_available(DeviceKernel kernel) const override;
+
+ virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override;
+
+ virtual bool synchronize() override;
+
+ virtual void zero_to_device(device_memory &mem) override;
+ virtual void copy_to_device(device_memory &mem) override;
+ virtual void copy_from_device(device_memory &mem) override;
+
+ virtual CUstream stream()
+ {
+ return cuda_stream_;
+ }
+
+ virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() override;
+
+ protected:
+ CUDADevice *cuda_device_;
+ CUstream cuda_stream_;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/cuda/util.cpp b/intern/cycles/device/cuda/util.cpp
new file mode 100644
index 00000000000..8f657cc10fe
--- /dev/null
+++ b/intern/cycles/device/cuda/util.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/cuda/util.h"
+# include "device/cuda/device_impl.h"
+
+CCL_NAMESPACE_BEGIN
+
+CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device)
+{
+ cuda_device_assert(device, cuCtxPushCurrent(device->cuContext));
+}
+
+CUDAContextScope::~CUDAContextScope()
+{
+ cuda_device_assert(device, cuCtxPopCurrent(NULL));
+}
+
+# ifndef WITH_CUDA_DYNLOAD
+const char *cuewErrorString(CUresult result)
+{
+  /* We can only report the error code here without major code duplication. That
+   * should be enough, since dynamic loading is only disabled by folks who know
+   * what they're doing anyway.
+   *
+   * NOTE: Not thread-safe, avoid calling from several threads. */
+ static string error;
+ error = string_printf("%d", result);
+ return error.c_str();
+}
+
+const char *cuewCompilerPath()
+{
+ return CYCLES_CUDA_NVCC_EXECUTABLE;
+}
+
+int cuewCompilerVersion()
+{
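+  /* Encodes the version as major * 10 + minor (e.g. CUDA 11.4 gives 114), matching cuew. */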
+ return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10);
+}
+# endif
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/cuda/util.h b/intern/cycles/device/cuda/util.h
new file mode 100644
index 00000000000..a0898094c08
--- /dev/null
+++ b/intern/cycles/device/cuda/util.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_CUDA
+
+# ifdef WITH_CUDA_DYNLOAD
+# include "cuew.h"
+# else
+# include <cuda.h>
+# endif
+
+CCL_NAMESPACE_BEGIN
+
+class CUDADevice;
+
+/* Utility to push/pop CUDA context. */
+class CUDAContextScope {
+ public:
+ CUDAContextScope(CUDADevice *device);
+ ~CUDAContextScope();
+
+ private:
+ CUDADevice *device;
+};
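+
+/* Typical usage is as a stack-scoped guard around CUDA API calls, e.g.
+ *
+ *   {
+ *     CUDAContextScope scope(device);
+ *     ...   (calls that need the device's CUDA context to be current)
+ *   }
+ *
+ * The previous context is restored when the scope ends. */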
+
+/* Utility for checking return values of CUDA function calls. */
+# define cuda_device_assert(cuda_device, stmt) \
+ { \
+ CUresult result = stmt; \
+ if (result != CUDA_SUCCESS) { \
+ const char *name = cuewErrorString(result); \
+ cuda_device->set_error( \
+ string_printf("%s in %s (%s:%d)", name, #stmt, __FILE__, __LINE__)); \
+ } \
+ } \
+ (void)0
+
+# define cuda_assert(stmt) cuda_device_assert(this, stmt)
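+
+/* For example, within a CUDADevice method a call like
+ *
+ *   cuda_assert(cuCtxSynchronize());
+ *
+ * will report any failure through CUDADevice::set_error(). */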
+
+# ifndef WITH_CUDA_DYNLOAD
+/* Transparently implement some functions, so the majority of the file does not need
+ * to worry about the difference between dynamically loaded and directly linked CUDA at all. */
+const char *cuewErrorString(CUresult result);
+const char *cuewCompilerPath();
+int cuewCompilerVersion();
+# endif /* WITH_CUDA_DYNLOAD */
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index ed53fbb54ae..6ccedcf54ef 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -20,7 +20,13 @@
#include "bvh/bvh2.h"
#include "device/device.h"
-#include "device/device_intern.h"
+#include "device/device_queue.h"
+
+#include "device/cpu/device.h"
+#include "device/cuda/device.h"
+#include "device/dummy/device.h"
+#include "device/multi/device.h"
+#include "device/optix/device.h"
#include "util/util_foreach.h"
#include "util/util_half.h"
@@ -38,332 +44,15 @@ CCL_NAMESPACE_BEGIN
bool Device::need_types_update = true;
bool Device::need_devices_update = true;
thread_mutex Device::device_mutex;
-vector<DeviceInfo> Device::opencl_devices;
vector<DeviceInfo> Device::cuda_devices;
vector<DeviceInfo> Device::optix_devices;
vector<DeviceInfo> Device::cpu_devices;
-vector<DeviceInfo> Device::network_devices;
uint Device::devices_initialized_mask = 0;
-/* Device Requested Features */
-
-std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &requested_features)
-{
- os << "Experimental features: " << (requested_features.experimental ? "On" : "Off") << std::endl;
- os << "Max nodes group: " << requested_features.max_nodes_group << std::endl;
- /* TODO(sergey): Decode bitflag into list of names. */
- os << "Nodes features: " << requested_features.nodes_features << std::endl;
- os << "Use Hair: " << string_from_bool(requested_features.use_hair) << std::endl;
- os << "Use Object Motion: " << string_from_bool(requested_features.use_object_motion)
- << std::endl;
- os << "Use Camera Motion: " << string_from_bool(requested_features.use_camera_motion)
- << std::endl;
- os << "Use Baking: " << string_from_bool(requested_features.use_baking) << std::endl;
- os << "Use Subsurface: " << string_from_bool(requested_features.use_subsurface) << std::endl;
- os << "Use Volume: " << string_from_bool(requested_features.use_volume) << std::endl;
- os << "Use Branched Integrator: " << string_from_bool(requested_features.use_integrator_branched)
- << std::endl;
- os << "Use Patch Evaluation: " << string_from_bool(requested_features.use_patch_evaluation)
- << std::endl;
- os << "Use Transparent Shadows: " << string_from_bool(requested_features.use_transparent)
- << std::endl;
- os << "Use Principled BSDF: " << string_from_bool(requested_features.use_principled)
- << std::endl;
- os << "Use Denoising: " << string_from_bool(requested_features.use_denoising) << std::endl;
- os << "Use Displacement: " << string_from_bool(requested_features.use_true_displacement)
- << std::endl;
- os << "Use Background Light: " << string_from_bool(requested_features.use_background_light)
- << std::endl;
- return os;
-}
-
/* Device */
Device::~Device() noexcept(false)
{
- if (!background) {
- if (vertex_buffer != 0) {
- glDeleteBuffers(1, &vertex_buffer);
- }
- if (fallback_shader_program != 0) {
- glDeleteProgram(fallback_shader_program);
- }
- }
-}
-
-/* TODO move shaders to standalone .glsl file. */
-const char *FALLBACK_VERTEX_SHADER =
- "#version 330\n"
- "uniform vec2 fullscreen;\n"
- "in vec2 texCoord;\n"
- "in vec2 pos;\n"
- "out vec2 texCoord_interp;\n"
- "\n"
- "vec2 normalize_coordinates()\n"
- "{\n"
- " return (vec2(2.0) * (pos / fullscreen)) - vec2(1.0);\n"
- "}\n"
- "\n"
- "void main()\n"
- "{\n"
- " gl_Position = vec4(normalize_coordinates(), 0.0, 1.0);\n"
- " texCoord_interp = texCoord;\n"
- "}\n\0";
-
-const char *FALLBACK_FRAGMENT_SHADER =
- "#version 330\n"
- "uniform sampler2D image_texture;\n"
- "in vec2 texCoord_interp;\n"
- "out vec4 fragColor;\n"
- "\n"
- "void main()\n"
- "{\n"
- " fragColor = texture(image_texture, texCoord_interp);\n"
- "}\n\0";
-
-static void shader_print_errors(const char *task, const char *log, const char *code)
-{
- LOG(ERROR) << "Shader: " << task << " error:";
- LOG(ERROR) << "===== shader string ====";
-
- stringstream stream(code);
- string partial;
-
- int line = 1;
- while (getline(stream, partial, '\n')) {
- if (line < 10) {
- LOG(ERROR) << " " << line << " " << partial;
- }
- else {
- LOG(ERROR) << line << " " << partial;
- }
- line++;
- }
- LOG(ERROR) << log;
-}
-
-static int bind_fallback_shader(void)
-{
- GLint status;
- GLchar log[5000];
- GLsizei length = 0;
- GLuint program = 0;
-
- struct Shader {
- const char *source;
- GLenum type;
- } shaders[2] = {{FALLBACK_VERTEX_SHADER, GL_VERTEX_SHADER},
- {FALLBACK_FRAGMENT_SHADER, GL_FRAGMENT_SHADER}};
-
- program = glCreateProgram();
-
- for (int i = 0; i < 2; i++) {
- GLuint shader = glCreateShader(shaders[i].type);
-
- string source_str = shaders[i].source;
- const char *c_str = source_str.c_str();
-
- glShaderSource(shader, 1, &c_str, NULL);
- glCompileShader(shader);
-
- glGetShaderiv(shader, GL_COMPILE_STATUS, &status);
-
- if (!status) {
- glGetShaderInfoLog(shader, sizeof(log), &length, log);
- shader_print_errors("compile", log, c_str);
- return 0;
- }
-
- glAttachShader(program, shader);
- }
-
- /* Link output. */
- glBindFragDataLocation(program, 0, "fragColor");
-
- /* Link and error check. */
- glLinkProgram(program);
-
- glGetProgramiv(program, GL_LINK_STATUS, &status);
- if (!status) {
- glGetShaderInfoLog(program, sizeof(log), &length, log);
- shader_print_errors("linking", log, FALLBACK_VERTEX_SHADER);
- shader_print_errors("linking", log, FALLBACK_FRAGMENT_SHADER);
- return 0;
- }
-
- return program;
-}
-
-bool Device::bind_fallback_display_space_shader(const float width, const float height)
-{
- if (fallback_status == FALLBACK_SHADER_STATUS_ERROR) {
- return false;
- }
-
- if (fallback_status == FALLBACK_SHADER_STATUS_NONE) {
- fallback_shader_program = bind_fallback_shader();
- fallback_status = FALLBACK_SHADER_STATUS_ERROR;
-
- if (fallback_shader_program == 0) {
- return false;
- }
-
- glUseProgram(fallback_shader_program);
- image_texture_location = glGetUniformLocation(fallback_shader_program, "image_texture");
- if (image_texture_location < 0) {
- LOG(ERROR) << "Shader doesn't contain the 'image_texture' uniform.";
- return false;
- }
-
- fullscreen_location = glGetUniformLocation(fallback_shader_program, "fullscreen");
- if (fullscreen_location < 0) {
- LOG(ERROR) << "Shader doesn't contain the 'fullscreen' uniform.";
- return false;
- }
-
- fallback_status = FALLBACK_SHADER_STATUS_SUCCESS;
- }
-
- /* Run this every time. */
- glUseProgram(fallback_shader_program);
- glUniform1i(image_texture_location, 0);
- glUniform2f(fullscreen_location, width, height);
- return true;
-}
-
-void Device::draw_pixels(device_memory &rgba,
- int y,
- int w,
- int h,
- int width,
- int height,
- int dx,
- int dy,
- int dw,
- int dh,
- bool transparent,
- const DeviceDrawParams &draw_params)
-{
- const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL);
-
- assert(rgba.type == MEM_PIXELS);
- mem_copy_from(rgba, y, w, h, rgba.memory_elements_size(1));
-
- GLuint texid;
- glActiveTexture(GL_TEXTURE0);
- glGenTextures(1, &texid);
- glBindTexture(GL_TEXTURE_2D, texid);
-
- if (rgba.data_type == TYPE_HALF) {
- GLhalf *data_pointer = (GLhalf *)rgba.host_pointer;
- data_pointer += 4 * y * w;
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, w, h, 0, GL_RGBA, GL_HALF_FLOAT, data_pointer);
- }
- else {
- uint8_t *data_pointer = (uint8_t *)rgba.host_pointer;
- data_pointer += 4 * y * w;
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, w, h, 0, GL_RGBA, GL_UNSIGNED_BYTE, data_pointer);
- }
-
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-
- if (transparent) {
- glEnable(GL_BLEND);
- glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
- }
-
- GLint shader_program;
- if (use_fallback_shader) {
- if (!bind_fallback_display_space_shader(dw, dh)) {
- return;
- }
- shader_program = fallback_shader_program;
- }
- else {
- draw_params.bind_display_space_shader_cb();
- glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program);
- }
-
- if (!vertex_buffer) {
- glGenBuffers(1, &vertex_buffer);
- }
-
- glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
- /* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered
- */
- glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
-
- float *vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
-
- if (vpointer) {
- /* texture coordinate - vertex pair */
- vpointer[0] = 0.0f;
- vpointer[1] = 0.0f;
- vpointer[2] = dx;
- vpointer[3] = dy;
-
- vpointer[4] = 1.0f;
- vpointer[5] = 0.0f;
- vpointer[6] = (float)width + dx;
- vpointer[7] = dy;
-
- vpointer[8] = 1.0f;
- vpointer[9] = 1.0f;
- vpointer[10] = (float)width + dx;
- vpointer[11] = (float)height + dy;
-
- vpointer[12] = 0.0f;
- vpointer[13] = 1.0f;
- vpointer[14] = dx;
- vpointer[15] = (float)height + dy;
-
- if (vertex_buffer) {
- glUnmapBuffer(GL_ARRAY_BUFFER);
- }
- }
-
- GLuint vertex_array_object;
- GLuint position_attribute, texcoord_attribute;
-
- glGenVertexArrays(1, &vertex_array_object);
- glBindVertexArray(vertex_array_object);
-
- texcoord_attribute = glGetAttribLocation(shader_program, "texCoord");
- position_attribute = glGetAttribLocation(shader_program, "pos");
-
- glEnableVertexAttribArray(texcoord_attribute);
- glEnableVertexAttribArray(position_attribute);
-
- glVertexAttribPointer(
- texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
- glVertexAttribPointer(position_attribute,
- 2,
- GL_FLOAT,
- GL_FALSE,
- 4 * sizeof(float),
- (const GLvoid *)(sizeof(float) * 2));
-
- glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
-
- if (vertex_buffer) {
- glBindBuffer(GL_ARRAY_BUFFER, 0);
- }
-
- if (use_fallback_shader) {
- glUseProgram(0);
- }
- else {
- draw_params.unbind_display_space_shader_cb();
- }
-
- glDeleteVertexArrays(1, &vertex_array_object);
- glBindTexture(GL_TEXTURE_2D, 0);
- glDeleteTextures(1, &texid);
-
- if (transparent) {
- glDisable(GL_BLEND);
- }
}
void Device::build_bvh(BVH *bvh, Progress &progress, bool refit)
@@ -379,14 +68,14 @@ void Device::build_bvh(BVH *bvh, Progress &progress, bool refit)
}
}
-Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
+Device *Device::create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
{
#ifdef WITH_MULTI
if (!info.multi_devices.empty()) {
/* Always create a multi device when info contains multiple devices.
* This is done so that the type can still be e.g. DEVICE_CPU to indicate
* that it is a homogeneous collection of devices, which simplifies checks. */
- return device_multi_create(info, stats, profiler, background);
+ return device_multi_create(info, stats, profiler);
}
#endif
@@ -394,29 +83,18 @@ Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool
switch (info.type) {
case DEVICE_CPU:
- device = device_cpu_create(info, stats, profiler, background);
+ device = device_cpu_create(info, stats, profiler);
break;
#ifdef WITH_CUDA
case DEVICE_CUDA:
if (device_cuda_init())
- device = device_cuda_create(info, stats, profiler, background);
+ device = device_cuda_create(info, stats, profiler);
break;
#endif
#ifdef WITH_OPTIX
case DEVICE_OPTIX:
if (device_optix_init())
- device = device_optix_create(info, stats, profiler, background);
- break;
-#endif
-#ifdef WITH_NETWORK
- case DEVICE_NETWORK:
- device = device_network_create(info, stats, profiler, "127.0.0.1");
- break;
-#endif
-#ifdef WITH_OPENCL
- case DEVICE_OPENCL:
- if (device_opencl_init())
- device = device_opencl_create(info, stats, profiler, background);
+ device = device_optix_create(info, stats, profiler);
break;
#endif
default:
@@ -424,7 +102,7 @@ Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool
}
if (device == NULL) {
- device = device_dummy_create(info, stats, profiler, background);
+ device = device_dummy_create(info, stats, profiler);
}
return device;
@@ -438,10 +116,6 @@ DeviceType Device::type_from_string(const char *name)
return DEVICE_CUDA;
else if (strcmp(name, "OPTIX") == 0)
return DEVICE_OPTIX;
- else if (strcmp(name, "OPENCL") == 0)
- return DEVICE_OPENCL;
- else if (strcmp(name, "NETWORK") == 0)
- return DEVICE_NETWORK;
else if (strcmp(name, "MULTI") == 0)
return DEVICE_MULTI;
@@ -456,10 +130,6 @@ string Device::string_from_type(DeviceType type)
return "CUDA";
else if (type == DEVICE_OPTIX)
return "OPTIX";
- else if (type == DEVICE_OPENCL)
- return "OPENCL";
- else if (type == DEVICE_NETWORK)
- return "NETWORK";
else if (type == DEVICE_MULTI)
return "MULTI";
@@ -476,12 +146,6 @@ vector<DeviceType> Device::available_types()
#ifdef WITH_OPTIX
types.push_back(DEVICE_OPTIX);
#endif
-#ifdef WITH_OPENCL
- types.push_back(DEVICE_OPENCL);
-#endif
-#ifdef WITH_NETWORK
- types.push_back(DEVICE_NETWORK);
-#endif
return types;
}
@@ -493,20 +157,6 @@ vector<DeviceInfo> Device::available_devices(uint mask)
thread_scoped_lock lock(device_mutex);
vector<DeviceInfo> devices;
-#ifdef WITH_OPENCL
- if (mask & DEVICE_MASK_OPENCL) {
- if (!(devices_initialized_mask & DEVICE_MASK_OPENCL)) {
- if (device_opencl_init()) {
- device_opencl_info(opencl_devices);
- }
- devices_initialized_mask |= DEVICE_MASK_OPENCL;
- }
- foreach (DeviceInfo &info, opencl_devices) {
- devices.push_back(info);
- }
- }
-#endif
-
#if defined(WITH_CUDA) || defined(WITH_OPTIX)
if (mask & (DEVICE_MASK_CUDA | DEVICE_MASK_OPTIX)) {
if (!(devices_initialized_mask & DEVICE_MASK_CUDA)) {
@@ -547,18 +197,6 @@ vector<DeviceInfo> Device::available_devices(uint mask)
}
}
-#ifdef WITH_NETWORK
- if (mask & DEVICE_MASK_NETWORK) {
- if (!(devices_initialized_mask & DEVICE_MASK_NETWORK)) {
- device_network_info(network_devices);
- devices_initialized_mask |= DEVICE_MASK_NETWORK;
- }
- foreach (DeviceInfo &info, network_devices) {
- devices.push_back(info);
- }
- }
-#endif
-
return devices;
}
@@ -580,15 +218,6 @@ string Device::device_capabilities(uint mask)
capabilities += device_cpu_capabilities() + "\n";
}
-#ifdef WITH_OPENCL
- if (mask & DEVICE_MASK_OPENCL) {
- if (device_opencl_init()) {
- capabilities += "\nOpenCL device capabilities:\n";
- capabilities += device_opencl_capabilities();
- }
- }
-#endif
-
#ifdef WITH_CUDA
if (mask & DEVICE_MASK_CUDA) {
if (device_cuda_init()) {
@@ -613,16 +242,13 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
}
DeviceInfo info;
- info.type = subdevices.front().type;
+ info.type = DEVICE_NONE;
info.id = "MULTI";
info.description = "Multi Device";
info.num = 0;
info.has_half_images = true;
info.has_nanovdb = true;
- info.has_volume_decoupled = true;
- info.has_branched_path = true;
- info.has_adaptive_stop_per_sample = true;
info.has_osl = true;
info.has_profiling = true;
info.has_peer_memory = false;
@@ -660,16 +286,16 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
info.id += device.id;
/* Set device type to MULTI if subdevices are not of a common type. */
- if (device.type != info.type) {
+ if (info.type == DEVICE_NONE) {
+ info.type = device.type;
+ }
+ else if (device.type != info.type) {
info.type = DEVICE_MULTI;
}
/* Accumulate device info. */
info.has_half_images &= device.has_half_images;
info.has_nanovdb &= device.has_nanovdb;
- info.has_volume_decoupled &= device.has_volume_decoupled;
- info.has_branched_path &= device.has_branched_path;
- info.has_adaptive_stop_per_sample &= device.has_adaptive_stop_per_sample;
info.has_osl &= device.has_osl;
info.has_profiling &= device.has_profiling;
info.has_peer_memory |= device.has_peer_memory;
@@ -689,60 +315,32 @@ void Device::free_memory()
devices_initialized_mask = 0;
cuda_devices.free_memory();
optix_devices.free_memory();
- opencl_devices.free_memory();
cpu_devices.free_memory();
- network_devices.free_memory();
}
-/* DeviceInfo */
-
-void DeviceInfo::add_denoising_devices(DenoiserType denoiser_type)
+unique_ptr<DeviceQueue> Device::gpu_queue_create()
{
- assert(denoising_devices.empty());
-
- if (denoiser_type == DENOISER_OPTIX && type != DEVICE_OPTIX) {
- vector<DeviceInfo> optix_devices = Device::available_devices(DEVICE_MASK_OPTIX);
- if (!optix_devices.empty()) {
- /* Convert to a special multi device with separate denoising devices. */
- if (multi_devices.empty()) {
- multi_devices.push_back(*this);
- }
-
- /* Try to use the same physical devices for denoising. */
- for (const DeviceInfo &cuda_device : multi_devices) {
- if (cuda_device.type == DEVICE_CUDA) {
- for (const DeviceInfo &optix_device : optix_devices) {
- if (cuda_device.num == optix_device.num) {
- id += optix_device.id;
- denoising_devices.push_back(optix_device);
- break;
- }
- }
- }
- }
-
- if (denoising_devices.empty()) {
- /* Simply use the first available OptiX device. */
- const DeviceInfo optix_device = optix_devices.front();
- id += optix_device.id; /* Uniquely identify this special multi device. */
- denoising_devices.push_back(optix_device);
- }
+ LOG(FATAL) << "Device does not support queues.";
+ return nullptr;
+}
- denoisers = denoiser_type;
- }
- }
- else if (denoiser_type == DENOISER_OPENIMAGEDENOISE && type != DEVICE_CPU) {
- /* Convert to a special multi device with separate denoising devices. */
- if (multi_devices.empty()) {
- multi_devices.push_back(*this);
- }
+const CPUKernels *Device::get_cpu_kernels() const
+{
+ LOG(FATAL) << "Device does not support CPU kernels.";
+ return nullptr;
+}
- /* Add CPU denoising devices. */
- DeviceInfo cpu_device = Device::available_devices(DEVICE_MASK_CPU).front();
- denoising_devices.push_back(cpu_device);
+void Device::get_cpu_kernel_thread_globals(
+ vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/)
+{
+ LOG(FATAL) << "Device does not support CPU kernels.";
+}
- denoisers = denoiser_type;
- }
+void *Device::get_cpu_osl_memory()
+{
+ return nullptr;
}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index ecf79bcdfa6..02b6edb56d0 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -21,31 +21,34 @@
#include "bvh/bvh_params.h"
+#include "device/device_denoise.h"
#include "device/device_memory.h"
-#include "device/device_task.h"
+#include "util/util_function.h"
#include "util/util_list.h"
+#include "util/util_logging.h"
#include "util/util_stats.h"
#include "util/util_string.h"
#include "util/util_texture.h"
#include "util/util_thread.h"
#include "util/util_types.h"
+#include "util/util_unique_ptr.h"
#include "util/util_vector.h"
CCL_NAMESPACE_BEGIN
class BVH;
+class DeviceQueue;
class Progress;
-class RenderTile;
+class CPUKernels;
+class CPUKernelThreadGlobals;
/* Device Types */
enum DeviceType {
DEVICE_NONE = 0,
DEVICE_CPU,
- DEVICE_OPENCL,
DEVICE_CUDA,
- DEVICE_NETWORK,
DEVICE_MULTI,
DEVICE_OPTIX,
DEVICE_DUMMY,
@@ -53,20 +56,11 @@ enum DeviceType {
enum DeviceTypeMask {
DEVICE_MASK_CPU = (1 << DEVICE_CPU),
- DEVICE_MASK_OPENCL = (1 << DEVICE_OPENCL),
DEVICE_MASK_CUDA = (1 << DEVICE_CUDA),
DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX),
- DEVICE_MASK_NETWORK = (1 << DEVICE_NETWORK),
DEVICE_MASK_ALL = ~0
};
-enum DeviceKernelStatus {
- DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE,
- DEVICE_KERNEL_USING_FEATURE_KERNEL,
- DEVICE_KERNEL_FEATURE_KERNEL_INVALID,
- DEVICE_KERNEL_UNKNOWN,
-};
-
#define DEVICE_MASK(type) (DeviceTypeMask)(1 << type)
class DeviceInfo {
@@ -75,20 +69,16 @@ class DeviceInfo {
string description;
string id; /* used for user preferences, should stay fixed with changing hardware config */
int num;
- bool display_device; /* GPU is used as a display device. */
- bool has_half_images; /* Support half-float textures. */
- bool has_nanovdb; /* Support NanoVDB volumes. */
- bool has_volume_decoupled; /* Decoupled volume shading. */
- bool has_branched_path; /* Supports branched path tracing. */
- bool has_adaptive_stop_per_sample; /* Per-sample adaptive sampling stopping. */
- bool has_osl; /* Support Open Shading Language. */
- bool use_split_kernel; /* Use split or mega kernel. */
- bool has_profiling; /* Supports runtime collection of profiling info. */
- bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */
- DenoiserTypeMask denoisers; /* Supported denoiser types. */
+ bool display_device; /* GPU is used as a display device. */
+ bool has_nanovdb; /* Support NanoVDB volumes. */
+ bool has_half_images; /* Support half-float textures. */
+ bool has_osl; /* Support Open Shading Language. */
+ bool has_profiling; /* Supports runtime collection of profiling info. */
+ bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */
+ bool has_gpu_queue; /* Device supports GPU queue. */
+ DenoiserTypeMask denoisers; /* Supported denoiser types. */
int cpu_threads;
vector<DeviceInfo> multi_devices;
- vector<DeviceInfo> denoising_devices;
string error_msg;
DeviceInfo()
@@ -100,227 +90,35 @@ class DeviceInfo {
display_device = false;
has_half_images = false;
has_nanovdb = false;
- has_volume_decoupled = false;
- has_branched_path = true;
- has_adaptive_stop_per_sample = false;
has_osl = false;
- use_split_kernel = false;
has_profiling = false;
has_peer_memory = false;
+ has_gpu_queue = false;
denoisers = DENOISER_NONE;
}
- bool operator==(const DeviceInfo &info)
+ bool operator==(const DeviceInfo &info) const
{
/* Multiple Devices with the same ID would be very bad. */
assert(id != info.id ||
(type == info.type && num == info.num && description == info.description));
return id == info.id;
}
-
- /* Add additional devices needed for the specified denoiser. */
- void add_denoising_devices(DenoiserType denoiser_type);
-};
-
-class DeviceRequestedFeatures {
- public:
- /* Use experimental feature set. */
- bool experimental;
-
- /* Selective nodes compilation. */
-
- /* Identifier of a node group up to which all the nodes needs to be
- * compiled in. Nodes from higher group indices will be ignores.
- */
- int max_nodes_group;
-
- /* Features bitfield indicating which features from the requested group
- * will be compiled in. Nodes which corresponds to features which are not
- * in this bitfield will be ignored even if they're in the requested group.
- */
- int nodes_features;
-
- /* BVH/sampling kernel features. */
- bool use_hair;
- bool use_hair_thick;
- bool use_object_motion;
- bool use_camera_motion;
-
- /* Denotes whether baking functionality is needed. */
- bool use_baking;
-
- /* Use subsurface scattering materials. */
- bool use_subsurface;
-
- /* Use volume materials. */
- bool use_volume;
-
- /* Use branched integrator. */
- bool use_integrator_branched;
-
- /* Use OpenSubdiv patch evaluation */
- bool use_patch_evaluation;
-
- /* Use Transparent shadows */
- bool use_transparent;
-
- /* Use various shadow tricks, such as shadow catcher. */
- bool use_shadow_tricks;
-
- /* Per-uber shader usage flags. */
- bool use_principled;
-
- /* Denoising features. */
- bool use_denoising;
-
- /* Use raytracing in shaders. */
- bool use_shader_raytrace;
-
- /* Use true displacement */
- bool use_true_displacement;
-
- /* Use background lights */
- bool use_background_light;
-
- DeviceRequestedFeatures()
- {
- /* TODO(sergey): Find more meaningful defaults. */
- max_nodes_group = 0;
- nodes_features = 0;
- use_hair = false;
- use_hair_thick = false;
- use_object_motion = false;
- use_camera_motion = false;
- use_baking = false;
- use_subsurface = false;
- use_volume = false;
- use_integrator_branched = false;
- use_patch_evaluation = false;
- use_transparent = false;
- use_shadow_tricks = false;
- use_principled = false;
- use_denoising = false;
- use_shader_raytrace = false;
- use_true_displacement = false;
- use_background_light = false;
- }
-
- bool modified(const DeviceRequestedFeatures &requested_features)
- {
- return !(max_nodes_group == requested_features.max_nodes_group &&
- nodes_features == requested_features.nodes_features &&
- use_hair == requested_features.use_hair &&
- use_hair_thick == requested_features.use_hair_thick &&
- use_object_motion == requested_features.use_object_motion &&
- use_camera_motion == requested_features.use_camera_motion &&
- use_baking == requested_features.use_baking &&
- use_subsurface == requested_features.use_subsurface &&
- use_volume == requested_features.use_volume &&
- use_integrator_branched == requested_features.use_integrator_branched &&
- use_patch_evaluation == requested_features.use_patch_evaluation &&
- use_transparent == requested_features.use_transparent &&
- use_shadow_tricks == requested_features.use_shadow_tricks &&
- use_principled == requested_features.use_principled &&
- use_denoising == requested_features.use_denoising &&
- use_shader_raytrace == requested_features.use_shader_raytrace &&
- use_true_displacement == requested_features.use_true_displacement &&
- use_background_light == requested_features.use_background_light);
- }
-
- /* Convert the requested features structure to a build options,
- * which could then be passed to compilers.
- */
- string get_build_options() const
- {
- string build_options = "";
- if (experimental) {
- build_options += "-D__KERNEL_EXPERIMENTAL__ ";
- }
- build_options += "-D__NODES_MAX_GROUP__=" + string_printf("%d", max_nodes_group);
- build_options += " -D__NODES_FEATURES__=" + string_printf("%d", nodes_features);
- if (!use_hair) {
- build_options += " -D__NO_HAIR__";
- }
- if (!use_object_motion) {
- build_options += " -D__NO_OBJECT_MOTION__";
- }
- if (!use_camera_motion) {
- build_options += " -D__NO_CAMERA_MOTION__";
- }
- if (!use_baking) {
- build_options += " -D__NO_BAKING__";
- }
- if (!use_volume) {
- build_options += " -D__NO_VOLUME__";
- }
- if (!use_subsurface) {
- build_options += " -D__NO_SUBSURFACE__";
- }
- if (!use_integrator_branched) {
- build_options += " -D__NO_BRANCHED_PATH__";
- }
- if (!use_patch_evaluation) {
- build_options += " -D__NO_PATCH_EVAL__";
- }
- if (!use_transparent && !use_volume) {
- build_options += " -D__NO_TRANSPARENT__";
- }
- if (!use_shadow_tricks) {
- build_options += " -D__NO_SHADOW_TRICKS__";
- }
- if (!use_principled) {
- build_options += " -D__NO_PRINCIPLED__";
- }
- if (!use_denoising) {
- build_options += " -D__NO_DENOISING__";
- }
- if (!use_shader_raytrace) {
- build_options += " -D__NO_SHADER_RAYTRACE__";
- }
- return build_options;
- }
};
-std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &requested_features);
-
/* Device */
-struct DeviceDrawParams {
- function<void()> bind_display_space_shader_cb;
- function<void()> unbind_display_space_shader_cb;
-};
-
class Device {
friend class device_sub_ptr;
protected:
- enum {
- FALLBACK_SHADER_STATUS_NONE = 0,
- FALLBACK_SHADER_STATUS_ERROR,
- FALLBACK_SHADER_STATUS_SUCCESS,
- };
-
- Device(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background)
- : background(background),
- vertex_buffer(0),
- fallback_status(FALLBACK_SHADER_STATUS_NONE),
- fallback_shader_program(0),
- info(info_),
- stats(stats_),
- profiler(profiler_)
+ Device(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_)
+ : info(info_), stats(stats_), profiler(profiler_)
{
}
- bool background;
string error_msg;
- /* used for real time display */
- unsigned int vertex_buffer;
- int fallback_status, fallback_shader_program;
- int image_texture_location, fullscreen_location;
-
- bool bind_fallback_display_space_shader(const float width, const float height);
-
virtual device_ptr mem_alloc_sub_ptr(device_memory & /*mem*/, int /*offset*/, int /*size*/)
{
/* Only required for devices that implement denoising. */
@@ -361,67 +159,31 @@ class Device {
Stats &stats;
Profiler &profiler;
- /* memory alignment */
- virtual int mem_sub_ptr_alignment()
- {
- return MIN_ALIGNMENT_CPU_DATA_TYPES;
- }
-
/* constant memory */
virtual void const_copy_to(const char *name, void *host, size_t size) = 0;
- /* open shading language, only for CPU device */
- virtual void *osl_memory()
- {
- return NULL;
- }
-
/* load/compile kernels, must be called before adding tasks */
- virtual bool load_kernels(const DeviceRequestedFeatures & /*requested_features*/)
+ virtual bool load_kernels(uint /*kernel_features*/)
{
return true;
}
- /* Wait for device to become available to upload data and receive tasks
- * This method is used by the OpenCL device to load the
- * optimized kernels or when not (yet) available load the
- * generic kernels (only during foreground rendering) */
- virtual bool wait_for_availability(const DeviceRequestedFeatures & /*requested_features*/)
- {
- return true;
- }
- /* Check if there are 'better' kernels available to be used
- * We can switch over to these kernels
- * This method is used to determine if we can switch the preview kernels
- * to regular kernels */
- virtual DeviceKernelStatus get_active_kernel_switch_state()
- {
- return DEVICE_KERNEL_USING_FEATURE_KERNEL;
- }
+ /* GPU device only functions.
+ * These may not be used on CPU or multi-devices. */
- /* tasks */
- virtual int get_split_task_count(DeviceTask &)
- {
- return 1;
- }
+ /* Create new queue for executing kernels in. */
+ virtual unique_ptr<DeviceQueue> gpu_queue_create();
+
+ /* CPU device only functions.
+ * These may not be used on GPU or multi-devices. */
- virtual void task_add(DeviceTask &task) = 0;
- virtual void task_wait() = 0;
- virtual void task_cancel() = 0;
-
- /* opengl drawing */
- virtual void draw_pixels(device_memory &mem,
- int y,
- int w,
- int h,
- int width,
- int height,
- int dx,
- int dy,
- int dw,
- int dh,
- bool transparent,
- const DeviceDrawParams &draw_params);
+ /* Get CPU kernel functions for native instruction set. */
+ virtual const CPUKernels *get_cpu_kernels() const;
+ /* Get kernel globals to pass to kernels. */
+ virtual void get_cpu_kernel_thread_globals(
+ vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/);
+ /* Get OpenShadingLanguage memory buffer. */
+ virtual void *get_cpu_osl_memory();
/* acceleration structure building */
virtual void build_bvh(BVH *bvh, Progress &progress, bool refit);
@@ -429,25 +191,11 @@ class Device {
/* OptiX specific destructor. */
virtual void release_optix_bvh(BVH * /*bvh*/){};
-#ifdef WITH_NETWORK
- /* networking */
- void server_run();
-#endif
-
/* multi device */
- virtual void map_tile(Device * /*sub_device*/, RenderTile & /*tile*/)
- {
- }
virtual int device_number(Device * /*sub_device*/)
{
return 0;
}
- virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTileNeighbors & /*neighbors*/)
- {
- }
- virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTileNeighbors & /*neighbors*/)
- {
- }
virtual bool is_resident(device_ptr /*key*/, Device *sub_device)
{
@@ -460,11 +208,47 @@ class Device {
return false;
}
+  /* Graphics resources interoperability.
+   *
+   * Interoperability here means that the device is capable of computing its result directly
+   * into an OpenGL (or other graphics library) buffer. */
+
+  /* Check whether the display is to be updated using graphics interoperability.
+   * The interoperability can not be used if it is not supported by the device. But the device
+   * might also force-disable the interoperability if it detects that it will be slower than
+   * copying pixels from the render buffer. */
+ virtual bool should_use_graphics_interop()
+ {
+ return false;
+ }
+
+ /* Buffer denoising. */
+
+ /* Returns true if task is fully handled. */
+ virtual bool denoise_buffer(const DeviceDenoiseTask & /*task*/)
+ {
+ LOG(ERROR) << "Request buffer denoising from a device which does not support it.";
+ return false;
+ }
+
+ virtual DeviceQueue *get_denoise_queue()
+ {
+ LOG(ERROR) << "Request denoising queue from a device which does not support it.";
+ return nullptr;
+ }
+
+ /* Sub-devices */
+
+  /* Run the given callback for every individual device which will be handling rendering.
+   * For a single device the callback is called for the device itself. For a multi-device the
+   * callback is only called for the sub-devices. */
+ virtual void foreach_device(const function<void(Device *)> &callback)
+ {
+ callback(this);
+ }
+
/* static */
- static Device *create(DeviceInfo &info,
- Stats &stats,
- Profiler &profiler,
- bool background = true);
+ static Device *create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
static DeviceType type_from_string(const char *name);
static string string_from_type(DeviceType type);
@@ -499,9 +283,7 @@ class Device {
static thread_mutex device_mutex;
static vector<DeviceInfo> cuda_devices;
static vector<DeviceInfo> optix_devices;
- static vector<DeviceInfo> opencl_devices;
static vector<DeviceInfo> cpu_devices;
- static vector<DeviceInfo> network_devices;
static uint devices_initialized_mask;
};
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
deleted file mode 100644
index 4a6e77d6eaa..00000000000
--- a/intern/cycles/device/device_cpu.cpp
+++ /dev/null
@@ -1,1680 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdlib.h>
-#include <string.h>
-
-/* So ImathMath is included before our kernel_cpu_compat. */
-#ifdef WITH_OSL
-/* So no context pollution happens from indirectly included windows.h */
-# include "util/util_windows.h"
-# include <OSL/oslexec.h>
-#endif
-
-#ifdef WITH_EMBREE
-# include <embree3/rtcore.h>
-#endif
-
-#include "device/device.h"
-#include "device/device_denoising.h"
-#include "device/device_intern.h"
-#include "device/device_split_kernel.h"
-
-// clang-format off
-#include "kernel/kernel.h"
-#include "kernel/kernel_compat_cpu.h"
-#include "kernel/kernel_types.h"
-#include "kernel/split/kernel_split_data.h"
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_adaptive_sampling.h"
-
-#include "kernel/filter/filter.h"
-
-#include "kernel/osl/osl_shader.h"
-#include "kernel/osl/osl_globals.h"
-// clang-format on
-
-#include "bvh/bvh_embree.h"
-
-#include "render/buffers.h"
-#include "render/coverage.h"
-
-#include "util/util_debug.h"
-#include "util/util_foreach.h"
-#include "util/util_function.h"
-#include "util/util_logging.h"
-#include "util/util_map.h"
-#include "util/util_opengl.h"
-#include "util/util_openimagedenoise.h"
-#include "util/util_optimization.h"
-#include "util/util_progress.h"
-#include "util/util_system.h"
-#include "util/util_task.h"
-#include "util/util_thread.h"
-
-CCL_NAMESPACE_BEGIN
-
-class CPUDevice;
-
-/* Has to be outside of the class to be shared across template instantiations. */
-static const char *logged_architecture = "";
-
-template<typename F> class KernelFunctions {
- public:
- KernelFunctions()
- {
- kernel = (F)NULL;
- }
-
- KernelFunctions(
- F kernel_default, F kernel_sse2, F kernel_sse3, F kernel_sse41, F kernel_avx, F kernel_avx2)
- {
- const char *architecture_name = "default";
- kernel = kernel_default;
-
- /* Silence potential warnings about unused variables
- * when compiling without some architectures. */
- (void)kernel_sse2;
- (void)kernel_sse3;
- (void)kernel_sse41;
- (void)kernel_avx;
- (void)kernel_avx2;
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) {
- architecture_name = "AVX2";
- kernel = kernel_avx2;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if (DebugFlags().cpu.has_avx() && system_cpu_support_avx()) {
- architecture_name = "AVX";
- kernel = kernel_avx;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if (DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) {
- architecture_name = "SSE4.1";
- kernel = kernel_sse41;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if (DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) {
- architecture_name = "SSE3";
- kernel = kernel_sse3;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
- architecture_name = "SSE2";
- kernel = kernel_sse2;
- }
-#else
- {
-      /* Dummy branch to keep the preceding if/else chain well-formed when
-       * WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 is not defined. */
- }
-#endif
-
- if (strcmp(architecture_name, logged_architecture) != 0) {
- VLOG(1) << "Will be using " << architecture_name << " kernels.";
- logged_architecture = architecture_name;
- }
- }
-
- inline F operator()() const
- {
- assert(kernel);
- return kernel;
- }
-
- protected:
- F kernel;
-};
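Typical usage declares one KernelFunctions member per entry point and passes the per-architecture function pointers, normally via the KERNEL_FUNCTIONS macro further down. A sketch with the naming scheme spelled out for a single kernel:

    KernelFunctions<void (*)(KernelGlobals *, float *, int, int, int, int, int)>
        path_trace_kernel(kernel_cpu_path_trace,
                          kernel_cpu_sse2_path_trace,
                          kernel_cpu_sse3_path_trace,
                          kernel_cpu_sse41_path_trace,
                          kernel_cpu_avx_path_trace,
                          kernel_cpu_avx2_path_trace);

    /* operator() returns the pointer selected at construction time. */
    path_trace_kernel()(kg, render_buffer, sample, x, y, offset, stride);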
-
-class CPUSplitKernel : public DeviceSplitKernel {
- CPUDevice *device;
-
- public:
- explicit CPUSplitKernel(CPUDevice *device);
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory &kernel_globals,
- device_memory &kernel_data_,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs);
-
- virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &);
- virtual int2 split_kernel_local_size();
- virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task);
- virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads);
-};
-
-class CPUDevice : public Device {
- public:
- TaskPool task_pool;
- KernelGlobals kernel_globals;
-
- device_vector<TextureInfo> texture_info;
- bool need_texture_info;
-
-#ifdef WITH_OSL
- OSLGlobals osl_globals;
-#endif
-#ifdef WITH_OPENIMAGEDENOISE
- oidn::DeviceRef oidn_device;
- oidn::FilterRef oidn_filter;
-#endif
- thread_spin_lock oidn_task_lock;
-#ifdef WITH_EMBREE
- RTCScene embree_scene = NULL;
- RTCDevice embree_device;
-#endif
-
- bool use_split_kernel;
-
- DeviceRequestedFeatures requested_features;
-
- KernelFunctions<void (*)(KernelGlobals *, float *, int, int, int, int, int)> path_trace_kernel;
- KernelFunctions<void (*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)>
- convert_to_half_float_kernel;
- KernelFunctions<void (*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)>
- convert_to_byte_kernel;
- KernelFunctions<void (*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)>
- shader_kernel;
- KernelFunctions<void (*)(KernelGlobals *, float *, int, int, int, int, int)> bake_kernel;
-
- KernelFunctions<void (*)(
- int, TileInfo *, int, int, float *, float *, float *, float *, float *, int *, int, int)>
- filter_divide_shadow_kernel;
- KernelFunctions<void (*)(
- int, TileInfo *, int, int, int, int, float *, float *, float, int *, int, int)>
- filter_get_feature_kernel;
- KernelFunctions<void (*)(int, int, int, int *, float *, float *, int, int *)>
- filter_write_feature_kernel;
- KernelFunctions<void (*)(int, int, float *, float *, float *, float *, int *, int)>
- filter_detect_outliers_kernel;
- KernelFunctions<void (*)(int, int, float *, float *, float *, float *, int *, int)>
- filter_combine_halves_kernel;
-
- KernelFunctions<void (*)(
- int, int, float *, float *, float *, float *, int *, int, int, int, float, float)>
- filter_nlm_calc_difference_kernel;
- KernelFunctions<void (*)(float *, float *, int *, int, int)> filter_nlm_blur_kernel;
- KernelFunctions<void (*)(float *, float *, int *, int, int)> filter_nlm_calc_weight_kernel;
- KernelFunctions<void (*)(
- int, int, float *, float *, float *, float *, float *, int *, int, int, int)>
- filter_nlm_update_output_kernel;
- KernelFunctions<void (*)(float *, float *, int *, int)> filter_nlm_normalize_kernel;
-
- KernelFunctions<void (*)(
- float *, TileInfo *, int, int, int, float *, int *, int *, int, int, bool, int, float)>
- filter_construct_transform_kernel;
- KernelFunctions<void (*)(int,
- int,
- int,
- float *,
- float *,
- float *,
- int *,
- float *,
- float3 *,
- int *,
- int *,
- int,
- int,
- int,
- int,
- bool)>
- filter_nlm_construct_gramian_kernel;
- KernelFunctions<void (*)(int, int, int, float *, int *, float *, float3 *, int *, int)>
- filter_finalize_kernel;
-
- KernelFunctions<void (*)(KernelGlobals *,
- ccl_constant KernelData *,
- ccl_global void *,
- int,
- ccl_global char *,
- int,
- int,
- int,
- int,
- int,
- int,
- int,
- int,
- ccl_global int *,
- int,
- ccl_global char *,
- ccl_global unsigned int *,
- unsigned int,
- ccl_global float *)>
- data_init_kernel;
- unordered_map<string, KernelFunctions<void (*)(KernelGlobals *, KernelData *)>> split_kernels;
-
-#define KERNEL_FUNCTIONS(name) \
- KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \
- KERNEL_NAME_EVAL(cpu_sse3, name), KERNEL_NAME_EVAL(cpu_sse41, name), \
- KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name)
-
- CPUDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
- : Device(info_, stats_, profiler_, background_),
- texture_info(this, "__texture_info", MEM_GLOBAL),
-#define REGISTER_KERNEL(name) name##_kernel(KERNEL_FUNCTIONS(name))
- REGISTER_KERNEL(path_trace),
- REGISTER_KERNEL(convert_to_half_float),
- REGISTER_KERNEL(convert_to_byte),
- REGISTER_KERNEL(shader),
- REGISTER_KERNEL(bake),
- REGISTER_KERNEL(filter_divide_shadow),
- REGISTER_KERNEL(filter_get_feature),
- REGISTER_KERNEL(filter_write_feature),
- REGISTER_KERNEL(filter_detect_outliers),
- REGISTER_KERNEL(filter_combine_halves),
- REGISTER_KERNEL(filter_nlm_calc_difference),
- REGISTER_KERNEL(filter_nlm_blur),
- REGISTER_KERNEL(filter_nlm_calc_weight),
- REGISTER_KERNEL(filter_nlm_update_output),
- REGISTER_KERNEL(filter_nlm_normalize),
- REGISTER_KERNEL(filter_construct_transform),
- REGISTER_KERNEL(filter_nlm_construct_gramian),
- REGISTER_KERNEL(filter_finalize),
- REGISTER_KERNEL(data_init)
-#undef REGISTER_KERNEL
- {
- if (info.cpu_threads == 0) {
- info.cpu_threads = TaskScheduler::num_threads();
- }
-
-#ifdef WITH_OSL
- kernel_globals.osl = &osl_globals;
-#endif
-#ifdef WITH_EMBREE
- embree_device = rtcNewDevice("verbose=0");
-#endif
- use_split_kernel = DebugFlags().cpu.split_kernel;
- if (use_split_kernel) {
- VLOG(1) << "Will be using split kernel.";
- }
- need_texture_info = false;
-
-#define REGISTER_SPLIT_KERNEL(name) \
- split_kernels[#name] = KernelFunctions<void (*)(KernelGlobals *, KernelData *)>( \
- KERNEL_FUNCTIONS(name))
- REGISTER_SPLIT_KERNEL(path_init);
- REGISTER_SPLIT_KERNEL(scene_intersect);
- REGISTER_SPLIT_KERNEL(lamp_emission);
- REGISTER_SPLIT_KERNEL(do_volume);
- REGISTER_SPLIT_KERNEL(queue_enqueue);
- REGISTER_SPLIT_KERNEL(indirect_background);
- REGISTER_SPLIT_KERNEL(shader_setup);
- REGISTER_SPLIT_KERNEL(shader_sort);
- REGISTER_SPLIT_KERNEL(shader_eval);
- REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao);
- REGISTER_SPLIT_KERNEL(subsurface_scatter);
- REGISTER_SPLIT_KERNEL(direct_lighting);
- REGISTER_SPLIT_KERNEL(shadow_blocked_ao);
- REGISTER_SPLIT_KERNEL(shadow_blocked_dl);
- REGISTER_SPLIT_KERNEL(enqueue_inactive);
- REGISTER_SPLIT_KERNEL(next_iteration_setup);
- REGISTER_SPLIT_KERNEL(indirect_subsurface);
- REGISTER_SPLIT_KERNEL(buffer_update);
- REGISTER_SPLIT_KERNEL(adaptive_stopping);
- REGISTER_SPLIT_KERNEL(adaptive_filter_x);
- REGISTER_SPLIT_KERNEL(adaptive_filter_y);
- REGISTER_SPLIT_KERNEL(adaptive_adjust_samples);
-#undef REGISTER_SPLIT_KERNEL
-#undef KERNEL_FUNCTIONS
- }
-
- ~CPUDevice()
- {
-#ifdef WITH_EMBREE
- rtcReleaseDevice(embree_device);
-#endif
- task_pool.cancel();
- texture_info.free();
- }
-
- virtual bool show_samples() const override
- {
- return (info.cpu_threads == 1);
- }
-
- virtual BVHLayoutMask get_bvh_layout_mask() const override
- {
- BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2;
-#ifdef WITH_EMBREE
- bvh_layout_mask |= BVH_LAYOUT_EMBREE;
-#endif /* WITH_EMBREE */
- return bvh_layout_mask;
- }
-
- void load_texture_info()
- {
- if (need_texture_info) {
- texture_info.copy_to_device();
- need_texture_info = false;
- }
- }
-
- virtual void mem_alloc(device_memory &mem) override
- {
- if (mem.type == MEM_TEXTURE) {
- assert(!"mem_alloc not supported for textures.");
- }
- else if (mem.type == MEM_GLOBAL) {
- assert(!"mem_alloc not supported for global memory.");
- }
- else {
- if (mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
- }
-
- if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) {
- size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES;
- void *data = util_aligned_malloc(mem.memory_size(), alignment);
- mem.device_pointer = (device_ptr)data;
- }
- else {
- mem.device_pointer = (device_ptr)mem.host_pointer;
- }
-
- mem.device_size = mem.memory_size();
- stats.mem_alloc(mem.device_size);
- }
- }
-
- virtual void mem_copy_to(device_memory &mem) override
- {
- if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- global_alloc(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- tex_alloc((device_texture &)mem);
- }
- else if (mem.type == MEM_PIXELS) {
- assert(!"mem_copy_to not supported for pixels.");
- }
- else {
- if (!mem.device_pointer) {
- mem_alloc(mem);
- }
-
- /* copy is no-op */
- }
- }
-
- virtual void mem_copy_from(
- device_memory & /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/) override
- {
- /* no-op */
- }
-
- virtual void mem_zero(device_memory &mem) override
- {
- if (!mem.device_pointer) {
- mem_alloc(mem);
- }
-
- if (mem.device_pointer) {
- memset((void *)mem.device_pointer, 0, mem.memory_size());
- }
- }
-
- virtual void mem_free(device_memory &mem) override
- {
- if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- }
- else if (mem.device_pointer) {
- if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) {
- util_aligned_free((void *)mem.device_pointer);
- }
- mem.device_pointer = 0;
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- }
- }
-
- virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override
- {
- return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
- }
-
- virtual void const_copy_to(const char *name, void *host, size_t size) override
- {
-#ifdef WITH_EMBREE
- if (strcmp(name, "__data") == 0) {
- assert(size <= sizeof(KernelData));
-
- // Update scene handle (since it is different for each device on multi devices)
- KernelData *const data = (KernelData *)host;
- data->bvh.scene = embree_scene;
- }
-#endif
- kernel_const_copy(&kernel_globals, name, host, size);
- }
-
- void global_alloc(device_memory &mem)
- {
- VLOG(1) << "Global memory allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- kernel_global_memory_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size);
-
- mem.device_pointer = (device_ptr)mem.host_pointer;
- mem.device_size = mem.memory_size();
- stats.mem_alloc(mem.device_size);
- }
-
- void global_free(device_memory &mem)
- {
- if (mem.device_pointer) {
- mem.device_pointer = 0;
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- }
- }
-
- void tex_alloc(device_texture &mem)
- {
- VLOG(1) << "Texture allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- mem.device_pointer = (device_ptr)mem.host_pointer;
- mem.device_size = mem.memory_size();
- stats.mem_alloc(mem.device_size);
-
- const uint slot = mem.slot;
- if (slot >= texture_info.size()) {
-      /* Allocate some slots in advance, to reduce the number of re-allocations. */
- texture_info.resize(slot + 128);
- }
-
- texture_info[slot] = mem.info;
- texture_info[slot].data = (uint64_t)mem.host_pointer;
- need_texture_info = true;
- }
-
- void tex_free(device_texture &mem)
- {
- if (mem.device_pointer) {
- mem.device_pointer = 0;
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- need_texture_info = true;
- }
- }
-
- virtual void *osl_memory() override
- {
-#ifdef WITH_OSL
- return &osl_globals;
-#else
- return NULL;
-#endif
- }
-
- void build_bvh(BVH *bvh, Progress &progress, bool refit) override
- {
-#ifdef WITH_EMBREE
- if (bvh->params.bvh_layout == BVH_LAYOUT_EMBREE ||
- bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) {
- BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh);
- if (refit) {
- bvh_embree->refit(progress);
- }
- else {
- bvh_embree->build(progress, &stats, embree_device);
- }
-
- if (bvh->params.top_level) {
- embree_scene = bvh_embree->scene;
- }
- }
- else
-#endif
- Device::build_bvh(bvh, progress, refit);
- }
-
- void thread_run(DeviceTask &task)
- {
- if (task.type == DeviceTask::RENDER)
- thread_render(task);
- else if (task.type == DeviceTask::SHADER)
- thread_shader(task);
- else if (task.type == DeviceTask::FILM_CONVERT)
- thread_film_convert(task);
- else if (task.type == DeviceTask::DENOISE_BUFFER)
- thread_denoise(task);
- }
-
- bool denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_NON_LOCAL_MEANS);
-
- int4 rect = task->rect;
- int r = task->nlm_state.r;
- int f = task->nlm_state.f;
- float a = task->nlm_state.a;
- float k_2 = task->nlm_state.k_2;
-
- int w = align_up(rect.z - rect.x, 4);
- int h = rect.w - rect.y;
- int stride = task->buffer.stride;
- int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
-
- float *temporary_mem = (float *)task->buffer.temporary_mem.device_pointer;
- float *blurDifference = temporary_mem;
- float *difference = temporary_mem + task->buffer.pass_stride;
- float *weightAccum = temporary_mem + 2 * task->buffer.pass_stride;
-
- memset(weightAccum, 0, sizeof(float) * w * h);
- memset((float *)out_ptr, 0, sizeof(float) * w * h);
-
- for (int i = 0; i < (2 * r + 1) * (2 * r + 1); i++) {
- int dy = i / (2 * r + 1) - r;
- int dx = i % (2 * r + 1) - r;
-
- int local_rect[4] = {
- max(0, -dx), max(0, -dy), rect.z - rect.x - max(0, dx), rect.w - rect.y - max(0, dy)};
- filter_nlm_calc_difference_kernel()(dx,
- dy,
- (float *)guide_ptr,
- (float *)variance_ptr,
- NULL,
- difference,
- local_rect,
- w,
- channel_offset,
- 0,
- a,
- k_2);
-
- filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f);
- filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
- filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f);
-
- filter_nlm_update_output_kernel()(dx,
- dy,
- blurDifference,
- (float *)image_ptr,
- difference,
- (float *)out_ptr,
- weightAccum,
- local_rect,
- channel_offset,
- stride,
- f);
- }
-
- int local_rect[4] = {0, 0, rect.z - rect.x, rect.w - rect.y};
- filter_nlm_normalize_kernel()((float *)out_ptr, weightAccum, local_rect, w);
-
- return true;
- }
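The single loop above enumerates every (dx, dy) offset of the (2r + 1) x (2r + 1) search window. Checking the index arithmetic for r = 1 (nine shifts):

    i = 0  ->  dy = 0/3 - 1 = -1,  dx = 0%3 - 1 = -1   (top-left shift)
    i = 4  ->  dy = 4/3 - 1 =  0,  dx = 4%3 - 1 =  0   (center, no shift)
    i = 8  ->  dy = 8/3 - 1 =  1,  dx = 8%3 - 1 =  1   (bottom-right shift)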
-
- bool denoising_construct_transform(DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_CONSTRUCT_TRANSFORM);
-
- for (int y = 0; y < task->filter_area.w; y++) {
- for (int x = 0; x < task->filter_area.z; x++) {
- filter_construct_transform_kernel()((float *)task->buffer.mem.device_pointer,
- task->tile_info,
- x + task->filter_area.x,
- y + task->filter_area.y,
- y * task->filter_area.z + x,
- (float *)task->storage.transform.device_pointer,
- (int *)task->storage.rank.device_pointer,
- &task->rect.x,
- task->buffer.pass_stride,
- task->buffer.frame_stride,
- task->buffer.use_time,
- task->radius,
- task->pca_threshold);
- }
- }
- return true;
- }
-
- bool denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT);
-
- float *temporary_mem = (float *)task->buffer.temporary_mem.device_pointer;
- float *difference = temporary_mem;
- float *blurDifference = temporary_mem + task->buffer.pass_stride;
-
- int r = task->radius;
- int frame_offset = frame * task->buffer.frame_stride;
- for (int i = 0; i < (2 * r + 1) * (2 * r + 1); i++) {
- int dy = i / (2 * r + 1) - r;
- int dx = i % (2 * r + 1) - r;
-
- int local_rect[4] = {max(0, -dx),
- max(0, -dy),
- task->reconstruction_state.source_w - max(0, dx),
- task->reconstruction_state.source_h - max(0, dy)};
- filter_nlm_calc_difference_kernel()(dx,
- dy,
- (float *)color_ptr,
- (float *)color_variance_ptr,
- (float *)scale_ptr,
- difference,
- local_rect,
- task->buffer.stride,
- task->buffer.pass_stride,
- frame_offset,
- 1.0f,
- task->nlm_k_2);
- filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
- filter_nlm_calc_weight_kernel()(
- blurDifference, difference, local_rect, task->buffer.stride, 4);
- filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
- filter_nlm_construct_gramian_kernel()(dx,
- dy,
- task->tile_info->frames[frame],
- blurDifference,
- (float *)task->buffer.mem.device_pointer,
- (float *)task->storage.transform.device_pointer,
- (int *)task->storage.rank.device_pointer,
- (float *)task->storage.XtWX.device_pointer,
- (float3 *)task->storage.XtWY.device_pointer,
- local_rect,
- &task->reconstruction_state.filter_window.x,
- task->buffer.stride,
- 4,
- task->buffer.pass_stride,
- frame_offset,
- task->buffer.use_time);
- }
-
- return true;
- }
-
- bool denoising_solve(device_ptr output_ptr, DenoisingTask *task)
- {
- for (int y = 0; y < task->filter_area.w; y++) {
- for (int x = 0; x < task->filter_area.z; x++) {
- filter_finalize_kernel()(x,
- y,
- y * task->filter_area.z + x,
- (float *)output_ptr,
- (int *)task->storage.rank.device_pointer,
- (float *)task->storage.XtWX.device_pointer,
- (float3 *)task->storage.XtWY.device_pointer,
- &task->reconstruction_state.buffer_params.x,
- task->render_buffer.samples);
- }
- }
- return true;
- }
-
- bool denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_COMBINE_HALVES);
-
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = rect.x; x < rect.z; x++) {
- filter_combine_halves_kernel()(x,
- y,
- (float *)mean_ptr,
- (float *)variance_ptr,
- (float *)a_ptr,
- (float *)b_ptr,
- &rect.x,
- r);
- }
- }
- return true;
- }
-
- bool denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DIVIDE_SHADOW);
-
- for (int y = task->rect.y; y < task->rect.w; y++) {
- for (int x = task->rect.x; x < task->rect.z; x++) {
- filter_divide_shadow_kernel()(task->render_buffer.samples,
- task->tile_info,
- x,
- y,
- (float *)a_ptr,
- (float *)b_ptr,
- (float *)sample_variance_ptr,
- (float *)sv_variance_ptr,
- (float *)buffer_variance_ptr,
- &task->rect.x,
- task->render_buffer.pass_stride,
- task->render_buffer.offset);
- }
- }
- return true;
- }
-
- bool denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_GET_FEATURE);
-
- for (int y = task->rect.y; y < task->rect.w; y++) {
- for (int x = task->rect.x; x < task->rect.z; x++) {
- filter_get_feature_kernel()(task->render_buffer.samples,
- task->tile_info,
- mean_offset,
- variance_offset,
- x,
- y,
- (float *)mean_ptr,
- (float *)variance_ptr,
- scale,
- &task->rect.x,
- task->render_buffer.pass_stride,
- task->render_buffer.offset);
- }
- }
- return true;
- }
-
- bool denoising_write_feature(int out_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task)
- {
- for (int y = 0; y < task->filter_area.w; y++) {
- for (int x = 0; x < task->filter_area.z; x++) {
- filter_write_feature_kernel()(task->render_buffer.samples,
- x + task->filter_area.x,
- y + task->filter_area.y,
- &task->reconstruction_state.buffer_params.x,
- (float *)from_ptr,
- (float *)buffer_ptr,
- out_offset,
- &task->rect.x);
- }
- }
- return true;
- }
-
- bool denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DETECT_OUTLIERS);
-
- for (int y = task->rect.y; y < task->rect.w; y++) {
- for (int x = task->rect.x; x < task->rect.z; x++) {
- filter_detect_outliers_kernel()(x,
- y,
- (float *)image_ptr,
- (float *)variance_ptr,
- (float *)depth_ptr,
- (float *)output_ptr,
- &task->rect.x,
- task->buffer.pass_stride);
- }
- }
- return true;
- }
-
- bool adaptive_sampling_filter(KernelGlobals *kg, RenderTile &tile, int sample)
- {
- WorkTile wtile;
- wtile.x = tile.x;
- wtile.y = tile.y;
- wtile.w = tile.w;
- wtile.h = tile.h;
- wtile.offset = tile.offset;
- wtile.stride = tile.stride;
- wtile.buffer = (float *)tile.buffer;
-
- /* For CPU we do adaptive stopping per sample so we can stop earlier, but
- * for combined CPU + GPU rendering we match the GPU and do it per tile
- * after a given number of sample steps. */
- if (!kernel_data.integrator.adaptive_stop_per_sample) {
- for (int y = wtile.y; y < wtile.y + wtile.h; ++y) {
- for (int x = wtile.x; x < wtile.x + wtile.w; ++x) {
- const int index = wtile.offset + x + y * wtile.stride;
- float *buffer = wtile.buffer + index * kernel_data.film.pass_stride;
- kernel_do_adaptive_stopping(kg, buffer, sample);
- }
- }
- }
-
- bool any = false;
- for (int y = wtile.y; y < wtile.y + wtile.h; ++y) {
- any |= kernel_do_adaptive_filter_x(kg, y, &wtile);
- }
- for (int x = wtile.x; x < wtile.x + wtile.w; ++x) {
- any |= kernel_do_adaptive_filter_y(kg, x, &wtile);
- }
- return (!any);
- }
-
- void adaptive_sampling_post(const RenderTile &tile, KernelGlobals *kg)
- {
- float *render_buffer = (float *)tile.buffer;
- for (int y = tile.y; y < tile.y + tile.h; y++) {
- for (int x = tile.x; x < tile.x + tile.w; x++) {
- int index = tile.offset + x + y * tile.stride;
- ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
- if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
- buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
- float sample_multiplier = tile.sample / buffer[kernel_data.film.pass_sample_count];
- if (sample_multiplier != 1.0f) {
- kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
- }
- }
- else {
- kernel_adaptive_post_adjust(kg, buffer, tile.sample / (tile.sample - 1.0f));
- }
- }
- }
- }
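The sign of the sample-count pass encodes early stopping: the adaptive-stopping kernel negates the count when a pixel converges. As a worked example, a pixel that converged after 32 samples (stored as -32) in a tile that ran to 64 samples has its count restored to 32 and its accumulated passes scaled by 64 / 32 = 2.0, so it compares like a full 64-sample pixel.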
-
- void render(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
- {
- const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE;
-
- scoped_timer timer(&tile.buffers->render_time);
-
- Coverage coverage(kg, tile);
- if (use_coverage) {
- coverage.init_path_trace();
- }
-
- float *render_buffer = (float *)tile.buffer;
- int start_sample = tile.start_sample;
- int end_sample = tile.start_sample + tile.num_samples;
-
- /* Needed for Embree. */
- SIMD_SET_FLUSH_TO_ZERO;
-
- for (int sample = start_sample; sample < end_sample; sample++) {
- if (task.get_cancel() || TaskPool::canceled()) {
- if (task.need_finish_queue == false)
- break;
- }
-
- if (tile.stealing_state == RenderTile::CAN_BE_STOLEN && task.get_tile_stolen()) {
- tile.stealing_state = RenderTile::WAS_STOLEN;
- break;
- }
-
- if (tile.task == RenderTile::PATH_TRACE) {
- for (int y = tile.y; y < tile.y + tile.h; y++) {
- for (int x = tile.x; x < tile.x + tile.w; x++) {
- if (use_coverage) {
- coverage.init_pixel(x, y);
- }
- path_trace_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
- }
- }
- }
- else {
- for (int y = tile.y; y < tile.y + tile.h; y++) {
- for (int x = tile.x; x < tile.x + tile.w; x++) {
- bake_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
- }
- }
- }
- tile.sample = sample + 1;
-
- if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) {
- const bool stop = adaptive_sampling_filter(kg, tile, sample);
- if (stop) {
- const int num_progress_samples = end_sample - sample;
- tile.sample = end_sample;
- task.update_progress(&tile, tile.w * tile.h * num_progress_samples);
- break;
- }
- }
-
- task.update_progress(&tile, tile.w * tile.h);
- }
- if (use_coverage) {
- coverage.finalize();
- }
-
- if (task.adaptive_sampling.use && (tile.stealing_state != RenderTile::WAS_STOLEN)) {
- adaptive_sampling_post(tile, kg);
- }
- }
-
- void denoise_openimagedenoise_buffer(DeviceTask &task,
- float *buffer,
- const size_t offset,
- const size_t stride,
- const size_t x,
- const size_t y,
- const size_t w,
- const size_t h,
- const float scale)
- {
-#ifdef WITH_OPENIMAGEDENOISE
- assert(openimagedenoise_supported());
-
-    /* Only run one denoise at a time: for full buffers because OpenImageDenoise is
-     * itself multithreaded, and for tiled rendering because creating multiple
-     * devices and filters is slow and memory hungry as well.
-     *
-     * TODO: optimize the tiled rendering case, by batching together denoising of
-     * many tiles somehow? */
- static thread_mutex mutex;
- thread_scoped_lock lock(mutex);
-
- /* Create device and filter, cached for reuse. */
- if (!oidn_device) {
- oidn_device = oidn::newDevice();
- oidn_device.commit();
- }
- if (!oidn_filter) {
- oidn_filter = oidn_device.newFilter("RT");
- oidn_filter.set("hdr", true);
- oidn_filter.set("srgb", false);
- }
-
- /* Set images with appropriate stride for our interleaved pass storage. */
- struct {
- const char *name;
- const int offset;
- const bool scale;
- const bool use;
- array<float> scaled_buffer;
- } passes[] = {{"color", task.pass_denoising_data + DENOISING_PASS_COLOR, false, true},
- {"albedo",
- task.pass_denoising_data + DENOISING_PASS_ALBEDO,
- true,
- task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO},
- {"normal",
- task.pass_denoising_data + DENOISING_PASS_NORMAL,
- true,
- task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO_NORMAL},
- {"output", 0, false, true},
-                  {NULL, 0}};
-
- for (int i = 0; passes[i].name; i++) {
- if (!passes[i].use) {
- continue;
- }
-
- const int64_t pixel_offset = offset + x + y * stride;
- const int64_t buffer_offset = (pixel_offset * task.pass_stride + passes[i].offset);
- const int64_t pixel_stride = task.pass_stride;
- const int64_t row_stride = stride * pixel_stride;
-
- if (passes[i].scale && scale != 1.0f) {
- /* Normalize albedo and normal passes as they are scaled by the number of samples.
- * For the color passes OIDN will perform auto-exposure making it unnecessary. */
- array<float> &scaled_buffer = passes[i].scaled_buffer;
- scaled_buffer.resize(w * h * 3);
-
- for (int y = 0; y < h; y++) {
- const float *pass_row = buffer + buffer_offset + y * row_stride;
- float *scaled_row = scaled_buffer.data() + y * w * 3;
-
- for (int x = 0; x < w; x++) {
- scaled_row[x * 3 + 0] = pass_row[x * pixel_stride + 0] * scale;
- scaled_row[x * 3 + 1] = pass_row[x * pixel_stride + 1] * scale;
- scaled_row[x * 3 + 2] = pass_row[x * pixel_stride + 2] * scale;
- }
- }
-
- oidn_filter.setImage(
- passes[i].name, scaled_buffer.data(), oidn::Format::Float3, w, h, 0, 0, 0);
- }
- else {
- oidn_filter.setImage(passes[i].name,
- buffer + buffer_offset,
- oidn::Format::Float3,
- w,
- h,
- 0,
- pixel_stride * sizeof(float),
- row_stride * sizeof(float));
- }
- }
-
- /* Execute filter. */
- oidn_filter.commit();
- oidn_filter.execute();
-#else
- (void)task;
- (void)buffer;
- (void)offset;
- (void)stride;
- (void)x;
- (void)y;
- (void)w;
- (void)h;
- (void)scale;
-#endif
- }
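For comparison with the strided setup above, the same OIDN calls against a tightly packed float3 image reduce to the following sketch (color, output, w and h are assumed buffers and sizes; the byte offset and stride arguments default to a packed layout when omitted):

    oidn::DeviceRef oidn_device = oidn::newDevice();
    oidn_device.commit();

    oidn::FilterRef oidn_filter = oidn_device.newFilter("RT");
    oidn_filter.setImage("color", color, oidn::Format::Float3, w, h);
    oidn_filter.setImage("output", output, oidn::Format::Float3, w, h);
    oidn_filter.set("hdr", true);
    oidn_filter.commit();
    oidn_filter.execute();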
-
- void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile)
- {
- if (task.type == DeviceTask::DENOISE_BUFFER) {
- /* Copy pixels from compute device to CPU (no-op for CPU device). */
- rtile.buffers->buffer.copy_from_device();
-
- denoise_openimagedenoise_buffer(task,
- (float *)rtile.buffer,
- rtile.offset,
- rtile.stride,
- rtile.x,
- rtile.y,
- rtile.w,
- rtile.h,
- 1.0f / rtile.sample);
-
- /* todo: it may be possible to avoid this copy, but we have to ensure that
- * when other code copies data from the device it doesn't overwrite the
- * denoiser buffers. */
- rtile.buffers->buffer.copy_to_device();
- }
- else {
- /* Per-tile denoising. */
- rtile.sample = rtile.start_sample + rtile.num_samples;
- const float scale = 1.0f / rtile.sample;
- const float invscale = rtile.sample;
- const size_t pass_stride = task.pass_stride;
-
- /* Map neighboring tiles into one buffer for denoising. */
- RenderTileNeighbors neighbors(rtile);
- task.map_neighbor_tiles(neighbors, this);
- RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
- rtile = center_tile;
-
- /* Calculate size of the tile to denoise (including overlap). The overlap
- * size was chosen empirically. OpenImageDenoise specifies an overlap size
- * of 128 but this is significantly bigger than typical tile size. */
- const int4 rect = rect_clip(rect_expand(center_tile.bounds(), 64), neighbors.bounds());
- const int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y);
-
- /* Adjacent tiles are in separate memory regions, copy into single buffer. */
- array<float> merged(rect_size.x * rect_size.y * task.pass_stride);
-
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- RenderTile &ntile = neighbors.tiles[i];
- if (!ntile.buffer) {
- continue;
- }
-
- const int xmin = max(ntile.x, rect.x);
- const int ymin = max(ntile.y, rect.y);
- const int xmax = min(ntile.x + ntile.w, rect.z);
- const int ymax = min(ntile.y + ntile.h, rect.w);
-
- const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride;
- const float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride;
-
- const size_t merged_stride = rect_size.x;
- const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride;
- float *merged_buffer = merged.data() + merged_offset * pass_stride;
-
- for (int y = ymin; y < ymax; y++) {
- for (int x = 0; x < pass_stride * (xmax - xmin); x++) {
- merged_buffer[x] = tile_buffer[x] * scale;
- }
- tile_buffer += ntile.stride * pass_stride;
- merged_buffer += merged_stride * pass_stride;
- }
- }
-
- /* Denoise */
- denoise_openimagedenoise_buffer(
- task, merged.data(), 0, rect_size.x, 0, 0, rect_size.x, rect_size.y, 1.0f);
-
- /* Copy back result from merged buffer. */
- RenderTile &ntile = neighbors.target;
- if (ntile.buffer) {
- const int xmin = max(ntile.x, rect.x);
- const int ymin = max(ntile.y, rect.y);
- const int xmax = min(ntile.x + ntile.w, rect.z);
- const int ymax = min(ntile.y + ntile.h, rect.w);
-
- const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride;
- float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride;
-
- const size_t merged_stride = rect_size.x;
- const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride;
- const float *merged_buffer = merged.data() + merged_offset * pass_stride;
-
- for (int y = ymin; y < ymax; y++) {
- for (int x = 0; x < pass_stride * (xmax - xmin); x += pass_stride) {
- tile_buffer[x + 0] = merged_buffer[x + 0] * invscale;
- tile_buffer[x + 1] = merged_buffer[x + 1] * invscale;
- tile_buffer[x + 2] = merged_buffer[x + 2] * invscale;
- }
- tile_buffer += ntile.stride * pass_stride;
- merged_buffer += merged_stride * pass_stride;
- }
- }
-
- task.unmap_neighbor_tiles(neighbors, this);
- }
- }
-
- void denoise_nlm(DenoisingTask &denoising, RenderTile &tile)
- {
- ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING);
-
- tile.sample = tile.start_sample + tile.num_samples;
-
- denoising.functions.construct_transform = function_bind(
- &CPUDevice::denoising_construct_transform, this, &denoising);
- denoising.functions.accumulate = function_bind(
- &CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
- denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising);
- denoising.functions.divide_shadow = function_bind(
- &CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.non_local_means = function_bind(
- &CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
- denoising.functions.combine_halves = function_bind(
- &CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(
- &CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.write_feature = function_bind(
- &CPUDevice::denoising_write_feature, this, _1, _2, _3, &denoising);
- denoising.functions.detect_outliers = function_bind(
- &CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
-
- denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
- denoising.render_buffer.samples = tile.sample;
- denoising.buffer.gpu_temporary_mem = false;
-
- denoising.run_denoising(tile);
- }
-
- void thread_render(DeviceTask &task)
- {
- if (TaskPool::canceled()) {
- if (task.need_finish_queue == false)
- return;
- }
-
- /* allocate buffer for kernel globals */
- device_only_memory<KernelGlobals> kgbuffer(this, "kernel_globals");
- kgbuffer.alloc_to_device(1);
-
- KernelGlobals *kg = new ((void *)kgbuffer.device_pointer)
- KernelGlobals(thread_kernel_globals_init());
-
- profiler.add_state(&kg->profiler);
-
- CPUSplitKernel *split_kernel = NULL;
- if (use_split_kernel) {
- split_kernel = new CPUSplitKernel(this);
- if (!split_kernel->load_kernels(requested_features)) {
- thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer);
- kgbuffer.free();
- delete split_kernel;
- return;
- }
- }
-
- /* NLM denoiser. */
- DenoisingTask *denoising = NULL;
-
- /* OpenImageDenoise: we can only denoise with one thread at a time, so to
- * avoid waiting with mutex locks in the denoiser, we let only a single
- * thread acquire denoising tiles. */
- uint tile_types = task.tile_types;
- bool hold_denoise_lock = false;
- if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
- if (!oidn_task_lock.try_lock()) {
- tile_types &= ~RenderTile::DENOISE;
- hold_denoise_lock = true;
- }
- }
-
- RenderTile tile;
- while (task.acquire_tile(this, tile, tile_types)) {
- if (tile.task == RenderTile::PATH_TRACE) {
- if (use_split_kernel) {
- device_only_memory<uchar> void_buffer(this, "void_buffer");
- split_kernel->path_trace(task, tile, kgbuffer, void_buffer);
- }
- else {
- render(task, tile, kg);
- }
- }
- else if (tile.task == RenderTile::BAKE) {
- render(task, tile, kg);
- }
- else if (tile.task == RenderTile::DENOISE) {
- if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
- denoise_openimagedenoise(task, tile);
- }
- else if (task.denoising.type == DENOISER_NLM) {
- if (denoising == NULL) {
- denoising = new DenoisingTask(this, task);
- denoising->profiler = &kg->profiler;
- }
- denoise_nlm(*denoising, tile);
- }
- task.update_progress(&tile, tile.w * tile.h);
- }
-
- task.release_tile(tile);
-
- if (TaskPool::canceled()) {
- if (task.need_finish_queue == false)
- break;
- }
- }
-
- if (hold_denoise_lock) {
- oidn_task_lock.unlock();
- }
-
- profiler.remove_state(&kg->profiler);
-
- thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer);
- kg->~KernelGlobals();
- kgbuffer.free();
- delete split_kernel;
- delete denoising;
- }
-
- void thread_denoise(DeviceTask &task)
- {
- RenderTile tile;
- tile.x = task.x;
- tile.y = task.y;
- tile.w = task.w;
- tile.h = task.h;
- tile.buffer = task.buffer;
- tile.sample = task.sample + task.num_samples;
- tile.num_samples = task.num_samples;
- tile.start_sample = task.sample;
- tile.offset = task.offset;
- tile.stride = task.stride;
- tile.buffers = task.buffers;
-
- if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
- denoise_openimagedenoise(task, tile);
- }
- else {
- DenoisingTask denoising(this, task);
-
- ProfilingState denoising_profiler_state;
- profiler.add_state(&denoising_profiler_state);
- denoising.profiler = &denoising_profiler_state;
-
- denoise_nlm(denoising, tile);
-
- profiler.remove_state(&denoising_profiler_state);
- }
-
- task.update_progress(&tile, tile.w * tile.h);
- }
-
- void thread_film_convert(DeviceTask &task)
- {
- float sample_scale = 1.0f / (task.sample + 1);
-
- if (task.rgba_half) {
- for (int y = task.y; y < task.y + task.h; y++)
- for (int x = task.x; x < task.x + task.w; x++)
- convert_to_half_float_kernel()(&kernel_globals,
- (uchar4 *)task.rgba_half,
- (float *)task.buffer,
- sample_scale,
- x,
- y,
- task.offset,
- task.stride);
- }
- else {
- for (int y = task.y; y < task.y + task.h; y++)
- for (int x = task.x; x < task.x + task.w; x++)
- convert_to_byte_kernel()(&kernel_globals,
- (uchar4 *)task.rgba_byte,
- (float *)task.buffer,
- sample_scale,
- x,
- y,
- task.offset,
- task.stride);
- }
- }
-
- void thread_shader(DeviceTask &task)
- {
- KernelGlobals *kg = new KernelGlobals(thread_kernel_globals_init());
-
- for (int sample = 0; sample < task.num_samples; sample++) {
- for (int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
- shader_kernel()(kg,
- (uint4 *)task.shader_input,
- (float4 *)task.shader_output,
- task.shader_eval_type,
- task.shader_filter,
- x,
- task.offset,
- sample);
-
- if (task.get_cancel() || TaskPool::canceled())
- break;
-
- task.update_progress(NULL);
- }
-
- thread_kernel_globals_free(kg);
- delete kg;
- }
-
- virtual int get_split_task_count(DeviceTask &task) override
- {
- if (task.type == DeviceTask::SHADER)
- return task.get_subtask_count(info.cpu_threads, 256);
- else
- return task.get_subtask_count(info.cpu_threads);
- }
-
- virtual void task_add(DeviceTask &task) override
- {
- /* Load texture info. */
- load_texture_info();
-
- /* split task into smaller ones */
- list<DeviceTask> tasks;
-
- if (task.type == DeviceTask::DENOISE_BUFFER &&
- task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
- /* Denoise entire buffer at once with OIDN, it has own threading. */
- tasks.push_back(task);
- }
- else if (task.type == DeviceTask::SHADER) {
- task.split(tasks, info.cpu_threads, 256);
- }
- else {
- task.split(tasks, info.cpu_threads);
- }
-
- foreach (DeviceTask &task, tasks) {
- task_pool.push([=] {
- DeviceTask task_copy = task;
- thread_run(task_copy);
- });
- }
- }
-
- virtual void task_wait() override
- {
- task_pool.wait_work();
- }
-
- virtual void task_cancel() override
- {
- task_pool.cancel();
- }
-
- protected:
- inline KernelGlobals thread_kernel_globals_init()
- {
- KernelGlobals kg = kernel_globals;
- kg.transparent_shadow_intersections = NULL;
- const int decoupled_count = sizeof(kg.decoupled_volume_steps) /
- sizeof(*kg.decoupled_volume_steps);
- for (int i = 0; i < decoupled_count; ++i) {
- kg.decoupled_volume_steps[i] = NULL;
- }
- kg.decoupled_volume_steps_index = 0;
- kg.coverage_asset = kg.coverage_object = kg.coverage_material = NULL;
-#ifdef WITH_OSL
- OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
-#endif
- return kg;
- }
-
- inline void thread_kernel_globals_free(KernelGlobals *kg)
- {
- if (kg == NULL) {
- return;
- }
-
- if (kg->transparent_shadow_intersections != NULL) {
- free(kg->transparent_shadow_intersections);
- }
- const int decoupled_count = sizeof(kg->decoupled_volume_steps) /
- sizeof(*kg->decoupled_volume_steps);
- for (int i = 0; i < decoupled_count; ++i) {
- if (kg->decoupled_volume_steps[i] != NULL) {
- free(kg->decoupled_volume_steps[i]);
- }
- }
-#ifdef WITH_OSL
- OSLShader::thread_free(kg);
-#endif
- }
-
- virtual bool load_kernels(const DeviceRequestedFeatures &requested_features_) override
- {
- requested_features = requested_features_;
-
- return true;
- }
-};
-
-/* split kernel */
-
-class CPUSplitKernelFunction : public SplitKernelFunction {
- public:
- CPUDevice *device;
- void (*func)(KernelGlobals *kg, KernelData *data);
-
- CPUSplitKernelFunction(CPUDevice *device) : device(device), func(NULL)
- {
- }
- ~CPUSplitKernelFunction()
- {
- }
-
- virtual bool enqueue(const KernelDimensions &dim,
- device_memory &kernel_globals,
- device_memory &data)
- {
- if (!func) {
- return false;
- }
-
- KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer;
- kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
-
- for (int y = 0; y < dim.global_size[1]; y++) {
- for (int x = 0; x < dim.global_size[0]; x++) {
- kg->global_id = make_int2(x, y);
-
- func(kg, (KernelData *)data.device_pointer);
- }
- }
-
- return true;
- }
-};
-
-CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device)
-{
-}
-
-bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory &kernel_globals,
- device_memory &data,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flags,
- device_memory &work_pool_wgs)
-{
- KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer;
- kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
-
- for (int y = 0; y < dim.global_size[1]; y++) {
- for (int x = 0; x < dim.global_size[0]; x++) {
- kg->global_id = make_int2(x, y);
-
- device->data_init_kernel()((KernelGlobals *)kernel_globals.device_pointer,
- (KernelData *)data.device_pointer,
- (void *)split_data.device_pointer,
- num_global_elements,
- (char *)ray_state.device_pointer,
- rtile.start_sample,
- rtile.start_sample + rtile.num_samples,
- rtile.x,
- rtile.y,
- rtile.w,
- rtile.h,
- rtile.offset,
- rtile.stride,
- (int *)queue_index.device_pointer,
- dim.global_size[0] * dim.global_size[1],
- (char *)use_queues_flags.device_pointer,
- (uint *)work_pool_wgs.device_pointer,
- rtile.num_samples,
- (float *)rtile.buffer);
- }
- }
-
- return true;
-}
-
-SplitKernelFunction *CPUSplitKernel::get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &)
-{
- CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);
-
- kernel->func = device->split_kernels[kernel_name]();
- if (!kernel->func) {
- delete kernel;
- return NULL;
- }
-
- return kernel;
-}
-
-int2 CPUSplitKernel::split_kernel_local_size()
-{
- return make_int2(1, 1);
-}
-
-int2 CPUSplitKernel::split_kernel_global_size(device_memory & /*kg*/,
- device_memory & /*data*/,
- DeviceTask & /*task*/)
-{
- return make_int2(1, 1);
-}
-
-uint64_t CPUSplitKernel::state_buffer_size(device_memory &kernel_globals,
- device_memory & /*data*/,
- size_t num_threads)
-{
- KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer;
-
- return split_data_buffer_size(kg, num_threads);
-}
-
-Device *device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
-{
- return new CPUDevice(info, stats, profiler, background);
-}
-
-void device_cpu_info(vector<DeviceInfo> &devices)
-{
- DeviceInfo info;
-
- info.type = DEVICE_CPU;
- info.description = system_cpu_brand_string();
- info.id = "CPU";
- info.num = 0;
- info.has_volume_decoupled = true;
- info.has_adaptive_stop_per_sample = true;
- info.has_osl = true;
- info.has_half_images = true;
- info.has_nanovdb = true;
- info.has_profiling = true;
- info.denoisers = DENOISER_NLM;
- if (openimagedenoise_supported()) {
- info.denoisers |= DENOISER_OPENIMAGEDENOISE;
- }
-
- devices.insert(devices.begin(), info);
-}
-
-string device_cpu_capabilities()
-{
- string capabilities = "";
- capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
- capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
- capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
- capabilities += system_cpu_support_avx() ? "AVX " : "";
- capabilities += system_cpu_support_avx2() ? "AVX2" : "";
-  if (!capabilities.empty() && capabilities[capabilities.size() - 1] == ' ')
- capabilities.resize(capabilities.size() - 1);
- return capabilities;
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_denoise.cpp b/intern/cycles/device/device_denoise.cpp
new file mode 100644
index 00000000000..aea7868f65d
--- /dev/null
+++ b/intern/cycles/device/device_denoise.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device_denoise.h"
+
+CCL_NAMESPACE_BEGIN
+
+const char *denoiserTypeToHumanReadable(DenoiserType type)
+{
+ switch (type) {
+ case DENOISER_OPTIX:
+ return "OptiX";
+ case DENOISER_OPENIMAGEDENOISE:
+ return "OpenImageDenoise";
+
+ case DENOISER_NUM:
+ case DENOISER_NONE:
+ case DENOISER_ALL:
+ return "UNKNOWN";
+ }
+
+ return "UNKNOWN";
+}
+
+const NodeEnum *DenoiseParams::get_type_enum()
+{
+ static NodeEnum type_enum;
+
+ if (type_enum.empty()) {
+ type_enum.insert("optix", DENOISER_OPTIX);
+ type_enum.insert("openimageio", DENOISER_OPENIMAGEDENOISE);
+ }
+
+ return &type_enum;
+}
+
+const NodeEnum *DenoiseParams::get_prefilter_enum()
+{
+ static NodeEnum prefilter_enum;
+
+ if (prefilter_enum.empty()) {
+ prefilter_enum.insert("none", DENOISER_PREFILTER_NONE);
+ prefilter_enum.insert("fast", DENOISER_PREFILTER_FAST);
+ prefilter_enum.insert("accurate", DENOISER_PREFILTER_ACCURATE);
+ }
+
+ return &prefilter_enum;
+}
+
+NODE_DEFINE(DenoiseParams)
+{
+ NodeType *type = NodeType::add("denoise_params", create);
+
+ const NodeEnum *type_enum = get_type_enum();
+ const NodeEnum *prefilter_enum = get_prefilter_enum();
+
+ SOCKET_BOOLEAN(use, "Use", false);
+
+ SOCKET_ENUM(type, "Type", *type_enum, DENOISER_OPENIMAGEDENOISE);
+
+ SOCKET_INT(start_sample, "Start Sample", 0);
+
+ SOCKET_BOOLEAN(use_pass_albedo, "Use Pass Albedo", true);
+ SOCKET_BOOLEAN(use_pass_normal, "Use Pass Normal", false);
+
+ SOCKET_ENUM(prefilter, "Prefilter", *prefilter_enum, DENOISER_PREFILTER_FAST);
+
+ return type;
+}
+
+DenoiseParams::DenoiseParams() : Node(get_node_type())
+{
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_denoise.h b/intern/cycles/device/device_denoise.h
new file mode 100644
index 00000000000..02ee63fb0ad
--- /dev/null
+++ b/intern/cycles/device/device_denoise.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "device/device_memory.h"
+#include "graph/node.h"
+#include "render/buffers.h"
+
+CCL_NAMESPACE_BEGIN
+
+enum DenoiserType {
+ DENOISER_OPTIX = 2,
+ DENOISER_OPENIMAGEDENOISE = 4,
+ DENOISER_NUM,
+
+ DENOISER_NONE = 0,
+ DENOISER_ALL = ~0,
+};
+
+/* Construct a human-readable string which denotes the denoiser type. */
+const char *denoiserTypeToHumanReadable(DenoiserType type);
+
+typedef int DenoiserTypeMask;
+
+enum DenoiserPrefilter {
+ /* Best quality of the result without extra processing time, but requires guiding passes to be
+ * noise-free. */
+ DENOISER_PREFILTER_NONE = 1,
+
+  /* Denoise color and guiding passes together.
+   * Improves quality when guiding passes are noisy, using the least amount of extra processing
+   * time. */
+ DENOISER_PREFILTER_FAST = 2,
+
+ /* Prefilter noisy guiding passes before denoising color.
+ * Improves quality when guiding passes are noisy using extra processing time. */
+ DENOISER_PREFILTER_ACCURATE = 3,
+
+ DENOISER_PREFILTER_NUM,
+};
+
+/* NOTE: This is not a real scene node; the Node API is used for ease of (de)serialization.
+ * The default values here do not really matter as they are always initialized from the
+ * Integrator node. */
+class DenoiseParams : public Node {
+ public:
+ NODE_DECLARE
+
+ /* Apply denoiser to image. */
+ bool use = false;
+
+ /* Denoiser type. */
+ DenoiserType type = DENOISER_OPENIMAGEDENOISE;
+
+ /* Viewport start sample. */
+ int start_sample = 0;
+
+  /* Auxiliary passes. */
+ bool use_pass_albedo = true;
+ bool use_pass_normal = true;
+
+ DenoiserPrefilter prefilter = DENOISER_PREFILTER_FAST;
+
+ static const NodeEnum *get_type_enum();
+ static const NodeEnum *get_prefilter_enum();
+
+ DenoiseParams();
+
+ bool modified(const DenoiseParams &other) const
+ {
+ return !(use == other.use && type == other.type && start_sample == other.start_sample &&
+ use_pass_albedo == other.use_pass_albedo &&
+ use_pass_normal == other.use_pass_normal && prefilter == other.prefilter);
+ }
+};
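A sketch of how a host might drive these parameters; previous_params is an illustrative copy kept from the last synchronization:

    DenoiseParams params;
    params.use = true;
    params.type = DENOISER_OPENIMAGEDENOISE;
    params.prefilter = DENOISER_PREFILTER_ACCURATE;

    if (params.modified(previous_params)) {
      /* Settings changed in a way that requires restarting sampling. */
    }
    previous_params = params;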
+
+/* All the parameters needed to perform buffer denoising on a device.
+ * Not really a task in the canonical sense (it is not an asynchronously running task), but
+ * rather a wrapper for all the arguments and parameters needed to perform denoising, listed
+ * in a single place so that device methods do not all need modification when these
+ * parameters change. */
+class DeviceDenoiseTask {
+ public:
+ DenoiseParams params;
+
+ int num_samples;
+
+ RenderBuffers *render_buffers;
+ BufferParams buffer_params;
+
+  /* Allow in-place modification of the input passes (e.g. scaling them down). This will
+   * lower the memory footprint of the denoiser but will make the input passes "invalid"
+   * from the path tracer's point of view. */
+ bool allow_inplace_modification;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
deleted file mode 100644
index 38c42d15cab..00000000000
--- a/intern/cycles/device/device_denoising.cpp
+++ /dev/null
@@ -1,353 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "device/device_denoising.h"
-
-#include "kernel/filter/filter_defines.h"
-
-CCL_NAMESPACE_BEGIN
-
-DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task)
- : tile_info_mem(device, "denoising tile info mem", MEM_READ_WRITE),
- profiler(NULL),
- storage(device),
- buffer(device),
- device(device)
-{
- radius = task.denoising.radius;
- nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising.strength));
- if (task.denoising.relative_pca) {
- pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising.feature_strength));
- }
- else {
- pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising.feature_strength));
- }
-
- render_buffer.frame_stride = task.frame_stride;
- render_buffer.pass_stride = task.pass_stride;
- render_buffer.offset = task.pass_denoising_data;
-
- target_buffer.pass_stride = task.target_pass_stride;
- target_buffer.denoising_clean_offset = task.pass_denoising_clean;
- target_buffer.offset = 0;
-
- functions.map_neighbor_tiles = function_bind(task.map_neighbor_tiles, _1, device);
- functions.unmap_neighbor_tiles = function_bind(task.unmap_neighbor_tiles, _1, device);
-
- tile_info = (TileInfo *)tile_info_mem.alloc(sizeof(TileInfo) / sizeof(int));
- tile_info->from_render = task.denoising_from_render ? 1 : 0;
-
- tile_info->frames[0] = 0;
- tile_info->num_frames = min(task.denoising_frames.size() + 1, DENOISE_MAX_FRAMES);
- for (int i = 1; i < tile_info->num_frames; i++) {
- tile_info->frames[i] = task.denoising_frames[i - 1];
- }
-
- do_prefilter = task.denoising.store_passes && task.denoising.type == DENOISER_NLM;
- do_filter = task.denoising.use && task.denoising.type == DENOISER_NLM;
-}
-
-DenoisingTask::~DenoisingTask()
-{
- storage.XtWX.free();
- storage.XtWY.free();
- storage.transform.free();
- storage.rank.free();
- buffer.mem.free();
- buffer.temporary_mem.free();
- tile_info_mem.free();
-}
-
-void DenoisingTask::set_render_buffer(RenderTileNeighbors &neighbors)
-{
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- RenderTile &rtile = neighbors.tiles[i];
- tile_info->offsets[i] = rtile.offset;
- tile_info->strides[i] = rtile.stride;
- tile_info->buffers[i] = rtile.buffer;
- }
- tile_info->x[0] = neighbors.tiles[3].x;
- tile_info->x[1] = neighbors.tiles[4].x;
- tile_info->x[2] = neighbors.tiles[5].x;
- tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w;
- tile_info->y[0] = neighbors.tiles[1].y;
- tile_info->y[1] = neighbors.tiles[4].y;
- tile_info->y[2] = neighbors.tiles[7].y;
- tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h;
-
- target_buffer.offset = neighbors.target.offset;
- target_buffer.stride = neighbors.target.stride;
- target_buffer.ptr = neighbors.target.buffer;
-
- if (do_prefilter && neighbors.target.buffers) {
- target_buffer.denoising_output_offset =
- neighbors.target.buffers->params.get_denoising_prefiltered_offset();
- }
- else {
- target_buffer.denoising_output_offset = 0;
- }
-
- tile_info_mem.copy_to_device();
-}
-
-void DenoisingTask::setup_denoising_buffer()
-{
- /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring
- * tiles */
- rect = rect_from_shape(filter_area.x, filter_area.y, filter_area.z, filter_area.w);
- rect = rect_expand(rect, radius);
- rect = rect_clip(rect,
- make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3]));
-
- buffer.use_intensity = do_prefilter || (tile_info->num_frames > 1);
- buffer.passes = buffer.use_intensity ? 15 : 14;
- buffer.width = rect.z - rect.x;
- buffer.stride = align_up(buffer.width, 4);
- buffer.h = rect.w - rect.y;
- int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float));
- buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats);
- buffer.frame_stride = buffer.pass_stride * buffer.passes;
- /* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */
- int mem_size = align_up(tile_info->num_frames * buffer.frame_stride + 4, alignment_floats);
- buffer.mem.alloc_to_device(mem_size, false);
- buffer.use_time = (tile_info->num_frames > 1);
-
- /* CPUs process shifts sequentially while GPUs process them in parallel. */
- int num_layers;
- if (buffer.gpu_temporary_mem) {
- /* Shadowing prefiltering uses a radius of 6, so allocate at least that much. */
- int max_radius = max(radius, 6);
- int num_shifts = (2 * max_radius + 1) * (2 * max_radius + 1);
- num_layers = 2 * num_shifts + 1;
- }
- else {
- num_layers = 3;
- }
- /* Allocate two layers per shift as well as one for the weight accumulation. */
- buffer.temporary_mem.alloc_to_device(num_layers * buffer.pass_stride);
-}
-
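To make the stride arithmetic above concrete, a hedged sketch of the same computation for one example tile; the 64-byte sub-pointer alignment is an assumption standing in for device->mem_sub_ptr_alignment():

    #include <cstdio>

    static int align_up(int x, int a) { return ((x + a - 1) / a) * a; }
    static int divide_up(int x, int a) { return (x + a - 1) / a; }

    int main() {
      // Example: a 250x180 filter rect, 15 passes, one frame.
      const int width = 250, height = 180, passes = 15, frames = 1;
      const int alignment_floats = divide_up(64, (int)sizeof(float));
      const int stride = align_up(width, 4);  // rows padded for SIMD
      const int pass_stride = align_up(stride * height, alignment_floats);
      const int frame_stride = pass_stride * passes;
      // Four extra floats because the SIMD kernels may read past the end.
      const int mem_size = align_up(frames * frame_stride + 4, alignment_floats);
      printf("stride=%d pass_stride=%d mem_size=%d floats\n",
             stride, pass_stride, mem_size);
      return 0;
    }
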
-void DenoisingTask::prefilter_shadowing()
-{
- device_ptr null_ptr = (device_ptr)0;
-
- device_sub_ptr unfiltered_a(buffer.mem, 0, buffer.pass_stride);
- device_sub_ptr unfiltered_b(buffer.mem, 1 * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr sample_var(buffer.mem, 2 * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr sample_var_var(buffer.mem, 3 * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr buffer_var(buffer.mem, 5 * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr filtered_var(buffer.mem, 6 * buffer.pass_stride, buffer.pass_stride);
-
- /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the
- * sample variance and the buffer variance. */
- functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var);
-
- /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the
- * sample variance. */
- nlm_state.set_parameters(6, 3, 4.0f, 1.0f, false);
- functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var);
-
- /* Reuse memory, the previous data isn't needed anymore. */
- device_ptr filtered_a = *buffer_var, filtered_b = *sample_var;
- /* Use the smoothed variance to filter the two shadow half images using each other for weight
- * calculation. */
- nlm_state.set_parameters(5, 3, 1.0f, 0.25f, false);
- functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a);
- functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b);
-
- device_ptr residual_var = *sample_var_var;
- /* Estimate the residual variance between the two filtered halves. */
- functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect);
-
- device_ptr final_a = *unfiltered_a, final_b = *unfiltered_b;
- /* Use the residual variance for a second filter pass. */
- nlm_state.set_parameters(4, 2, 1.0f, 0.5f, false);
- functions.non_local_means(filtered_a, filtered_b, residual_var, final_a);
- functions.non_local_means(filtered_b, filtered_a, residual_var, final_b);
-
- /* Combine the two double-filtered halves to a final shadow feature. */
- device_sub_ptr shadow_pass(buffer.mem, 4 * buffer.pass_stride, buffer.pass_stride);
- functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect);
-}
-
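The cross-filtering pattern above generalizes: each half image is smoothed using the other half as the weight guide, so noise that is independent between the halves does not bias the weights, and the filtered halves are then combined. A data-flow sketch only, with a trivial placeholder where the real non-local-means kernel would run:

    #include <cstddef>
    #include <vector>

    using Image = std::vector<float>;

    // Placeholder: a real kernel would weight pixels by patch similarity
    // in 'guide', compensated by 'var'; here we just pass the image through.
    static Image filter(const Image &image, const Image &guide, const Image &var) {
      (void)guide;
      (void)var;
      return image;
    }

    static Image denoise_shadow(const Image &half_a, const Image &half_b, const Image &var) {
      Image filtered_a = filter(half_a, half_b, var);  // A guided by B
      Image filtered_b = filter(half_b, half_a, var);  // B guided by A
      Image result(filtered_a.size());
      for (size_t i = 0; i < result.size(); i++)
        result[i] = 0.5f * (filtered_a[i] + filtered_b[i]);  // combine_halves()
      return result;
    }
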
-void DenoisingTask::prefilter_features()
-{
- device_sub_ptr unfiltered(buffer.mem, 8 * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr variance(buffer.mem, 9 * buffer.pass_stride, buffer.pass_stride);
-
- int mean_from[] = {0, 1, 2, 12, 6, 7, 8};
- int variance_from[] = {3, 4, 5, 13, 9, 10, 11};
- int pass_to[] = {1, 2, 3, 0, 5, 6, 7};
- for (int pass = 0; pass < 7; pass++) {
- device_sub_ptr feature_pass(
- buffer.mem, pass_to[pass] * buffer.pass_stride, buffer.pass_stride);
- /* Get the unfiltered pass and its variance from the RenderBuffers. */
- functions.get_feature(mean_from[pass],
- variance_from[pass],
- *unfiltered,
- *variance,
- 1.0f / render_buffer.samples);
- /* Smooth the pass and store the result in the denoising buffers. */
- nlm_state.set_parameters(2, 2, 1.0f, 0.25f, false);
- functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass);
- }
-}
-
-void DenoisingTask::prefilter_color()
-{
- int mean_from[] = {20, 21, 22};
- int variance_from[] = {23, 24, 25};
- int mean_to[] = {8, 9, 10};
- int variance_to[] = {11, 12, 13};
- int num_color_passes = 3;
-
- device_only_memory<float> temporary_color(device, "denoising temporary color");
- temporary_color.alloc_to_device(6 * buffer.pass_stride, false);
-
- for (int pass = 0; pass < num_color_passes; pass++) {
- device_sub_ptr color_pass(temporary_color, pass * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr color_var_pass(
- temporary_color, (pass + 3) * buffer.pass_stride, buffer.pass_stride);
- functions.get_feature(mean_from[pass],
- variance_from[pass],
- *color_pass,
- *color_var_pass,
- 1.0f / render_buffer.samples);
- }
-
- device_sub_ptr depth_pass(buffer.mem, 0, buffer.pass_stride);
- device_sub_ptr color_var_pass(
- buffer.mem, variance_to[0] * buffer.pass_stride, 3 * buffer.pass_stride);
- device_sub_ptr output_pass(buffer.mem, mean_to[0] * buffer.pass_stride, 3 * buffer.pass_stride);
- functions.detect_outliers(
- temporary_color.device_pointer, *color_var_pass, *depth_pass, *output_pass);
-
- if (buffer.use_intensity) {
- device_sub_ptr intensity_pass(buffer.mem, 14 * buffer.pass_stride, buffer.pass_stride);
- nlm_state.set_parameters(radius, 4, 2.0f, nlm_k_2 * 4.0f, true);
- functions.non_local_means(*output_pass, *output_pass, *color_var_pass, *intensity_pass);
- }
-}
-
-void DenoisingTask::load_buffer()
-{
- device_ptr null_ptr = (device_ptr)0;
-
- int original_offset = render_buffer.offset;
-
- int num_passes = buffer.use_intensity ? 15 : 14;
- for (int i = 0; i < tile_info->num_frames; i++) {
- for (int pass = 0; pass < num_passes; pass++) {
- device_sub_ptr to_pass(
- buffer.mem, i * buffer.frame_stride + pass * buffer.pass_stride, buffer.pass_stride);
- bool is_variance = (pass >= 11) && (pass <= 13);
- functions.get_feature(
- pass, -1, *to_pass, null_ptr, is_variance ? (1.0f / render_buffer.samples) : 1.0f);
- }
- render_buffer.offset += render_buffer.frame_stride;
- }
-
- render_buffer.offset = original_offset;
-}
-
-void DenoisingTask::write_buffer()
-{
- reconstruction_state.buffer_params = make_int4(target_buffer.offset,
- target_buffer.stride,
- target_buffer.pass_stride,
- target_buffer.denoising_clean_offset);
- int num_passes = buffer.use_intensity ? 15 : 14;
- for (int pass = 0; pass < num_passes; pass++) {
- device_sub_ptr from_pass(buffer.mem, pass * buffer.pass_stride, buffer.pass_stride);
- int out_offset = pass + target_buffer.denoising_output_offset;
- functions.write_feature(out_offset, *from_pass, target_buffer.ptr);
- }
-}
-
-void DenoisingTask::construct_transform()
-{
- storage.w = filter_area.z;
- storage.h = filter_area.w;
-
- storage.transform.alloc_to_device(storage.w * storage.h * TRANSFORM_SIZE, false);
- storage.rank.alloc_to_device(storage.w * storage.h, false);
-
- functions.construct_transform();
-}
-
-void DenoisingTask::reconstruct()
-{
- storage.XtWX.alloc_to_device(storage.w * storage.h * XTWX_SIZE, false);
- storage.XtWY.alloc_to_device(storage.w * storage.h * XTWY_SIZE, false);
- storage.XtWX.zero_to_device();
- storage.XtWY.zero_to_device();
-
- reconstruction_state.filter_window = rect_from_shape(
- filter_area.x - rect.x, filter_area.y - rect.y, storage.w, storage.h);
- int tile_coordinate_offset = filter_area.y * target_buffer.stride + filter_area.x;
- reconstruction_state.buffer_params = make_int4(target_buffer.offset + tile_coordinate_offset,
- target_buffer.stride,
- target_buffer.pass_stride,
- target_buffer.denoising_clean_offset);
- reconstruction_state.source_w = rect.z - rect.x;
- reconstruction_state.source_h = rect.w - rect.y;
-
- device_sub_ptr color_ptr(buffer.mem, 8 * buffer.pass_stride, 3 * buffer.pass_stride);
- device_sub_ptr color_var_ptr(buffer.mem, 11 * buffer.pass_stride, 3 * buffer.pass_stride);
- for (int f = 0; f < tile_info->num_frames; f++) {
- device_ptr scale_ptr = 0;
- device_sub_ptr *scale_sub_ptr = NULL;
- if (tile_info->frames[f] != 0 && (tile_info->num_frames > 1)) {
- scale_sub_ptr = new device_sub_ptr(buffer.mem, 14 * buffer.pass_stride, buffer.pass_stride);
- scale_ptr = **scale_sub_ptr;
- }
-
- functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr, f);
- delete scale_sub_ptr;
- }
- functions.solve(target_buffer.ptr);
-}
-
-void DenoisingTask::run_denoising(RenderTile &tile)
-{
- RenderTileNeighbors neighbors(tile);
- functions.map_neighbor_tiles(neighbors);
- set_render_buffer(neighbors);
-
- setup_denoising_buffer();
-
- if (tile_info->from_render) {
- prefilter_shadowing();
- prefilter_features();
- prefilter_color();
- }
- else {
- load_buffer();
- }
-
- if (do_filter) {
- construct_transform();
- reconstruct();
- }
-
- if (do_prefilter) {
- write_buffer();
- }
-
- functions.unmap_neighbor_tiles(neighbors);
-}
-
-CCL_NAMESPACE_END
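For orientation, a hedged sketch of how a backend drives this class per tile; the real callers live in the device implementations and also wire up filter_area, samples and the functions.* callbacks, so the helper below is hypothetical:

    // Hypothetical per-tile driver (names mirror the class above).
    void denoise_tile(Device *device, const DeviceTask &task, RenderTile &tile) {
      DenoisingTask denoising(device, task);
      denoising.render_buffer.samples = tile.sample;
      denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
      denoising.run_denoising(tile);  // map -> prefilter/load -> reconstruct -> write -> unmap
    }
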
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
deleted file mode 100644
index bb8bdfdd225..00000000000
--- a/intern/cycles/device/device_denoising.h
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __DEVICE_DENOISING_H__
-#define __DEVICE_DENOISING_H__
-
-#include "device/device.h"
-
-#include "render/buffers.h"
-
-#include "kernel/filter/filter_defines.h"
-
-#include "util/util_profiling.h"
-
-CCL_NAMESPACE_BEGIN
-
-class DenoisingTask {
- public:
- /* Parameters of the denoising algorithm. */
- int radius;
- float nlm_k_2;
- float pca_threshold;
-
- /* Parameters of the RenderBuffers. */
- struct RenderBuffers {
- int offset;
- int pass_stride;
- int frame_stride;
- int samples;
- } render_buffer;
-
- /* Pointer and parameters of the target buffer. */
- struct TargetBuffer {
- int offset;
- int stride;
- int pass_stride;
- int denoising_clean_offset;
- int denoising_output_offset;
- device_ptr ptr;
- } target_buffer;
-
- TileInfo *tile_info;
- device_vector<int> tile_info_mem;
-
- ProfilingState *profiler;
-
- int4 rect;
- int4 filter_area;
-
- bool do_prefilter;
- bool do_filter;
-
- struct DeviceFunctions {
- function<bool(
- device_ptr image_ptr, /* Contains the values that are smoothed. */
- device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */
- device_ptr variance_ptr, /* Contains the variance of the guide image. */
- device_ptr out_ptr /* The filtered output is written into this image. */
- )>
- non_local_means;
- function<bool(
- device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, int frame)>
- accumulate;
- function<bool(device_ptr output_ptr)> solve;
- function<bool()> construct_transform;
-
- function<bool(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect)>
- combine_halves;
- function<bool(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr)>
- divide_shadow;
- function<bool(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale)>
- get_feature;
- function<bool(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr)>
- detect_outliers;
- function<bool(int out_offset, device_ptr from_ptr, device_ptr buffer_ptr)> write_feature;
- function<void(RenderTileNeighbors &neighbors)> map_neighbor_tiles;
- function<void(RenderTileNeighbors &neighbors)> unmap_neighbor_tiles;
- } functions;
-
- /* Stores state of the current Reconstruction operation,
- * which is accessed by the device in order to perform the operation. */
- struct ReconstructionState {
- int4 filter_window;
- int4 buffer_params;
-
- int source_w;
- int source_h;
- } reconstruction_state;
-
- /* Stores state of the current NLM operation,
- * which is accessed by the device in order to perform the operation. */
- struct NLMState {
- int r; /* Search radius of the filter. */
- int f; /* Patch size of the filter. */
- float a; /* Variance compensation factor in the MSE estimation. */
- float k_2; /* Squared value of the k parameter of the filter. */
- bool is_color;
-
- void set_parameters(int r_, int f_, float a_, float k_2_, bool is_color_)
- {
- r = r_;
- f = f_;
- a = a_; k_2 = k_2_;
- is_color = is_color_;
- }
- } nlm_state;
-
- struct Storage {
- device_only_memory<float> transform;
- device_only_memory<int> rank;
- device_only_memory<float> XtWX;
- device_only_memory<float3> XtWY;
- int w;
- int h;
-
- Storage(Device *device)
- : transform(device, "denoising transform"),
- rank(device, "denoising rank"),
- XtWX(device, "denoising XtWX"),
- XtWY(device, "denoising XtWY")
- {
- }
- } storage;
-
- DenoisingTask(Device *device, const DeviceTask &task);
- ~DenoisingTask();
-
- void run_denoising(RenderTile &tile);
-
- struct DenoiseBuffers {
- int pass_stride;
- int passes;
- int stride;
- int h;
- int width;
- int frame_stride;
- device_only_memory<float> mem;
- device_only_memory<float> temporary_mem;
- bool use_time;
- bool use_intensity;
-
- bool gpu_temporary_mem;
-
- DenoiseBuffers(Device *device)
- : mem(device, "denoising pixel buffer"),
- temporary_mem(device, "denoising temporary mem", true)
- {
- }
- } buffer;
-
- protected:
- Device *device;
-
- void set_render_buffer(RenderTileNeighbors &neighbors);
- void setup_denoising_buffer();
- void prefilter_shadowing();
- void prefilter_features();
- void prefilter_color();
- void construct_transform();
- void reconstruct();
-
- void load_buffer();
- void write_buffer();
-};
-
-CCL_NAMESPACE_END
-
-#endif /* __DEVICE_DENOISING_H__ */
diff --git a/intern/cycles/device/device_graphics_interop.cpp b/intern/cycles/device/device_graphics_interop.cpp
new file mode 100644
index 00000000000..a80a236759f
--- /dev/null
+++ b/intern/cycles/device/device_graphics_interop.cpp
@@ -0,0 +1,21 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device_graphics_interop.h"
+
+CCL_NAMESPACE_BEGIN
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_graphics_interop.h b/intern/cycles/device/device_graphics_interop.h
new file mode 100644
index 00000000000..671b1c189d7
--- /dev/null
+++ b/intern/cycles/device/device_graphics_interop.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Information about the interoperability destination.
+ * Provided by the GPUDisplay. */
+class DeviceGraphicsInteropDestination {
+ public:
+ /* Dimensions of the buffer, in pixels. */
+ int buffer_width = 0;
+ int buffer_height = 0;
+
+ /* OpenGL pixel buffer object. */
+ int opengl_pbo_id = 0;
+
+ /* Clear the entire destination before doing a partial write to it. */
+ bool need_clear = false;
+};
+
+/* Device-side graphics interoperability support.
+ *
+ * Takes care of holding all the handles needed by the device to implement interoperability with
+ * the graphics library. */
+class DeviceGraphicsInterop {
+ public:
+ DeviceGraphicsInterop() = default;
+ virtual ~DeviceGraphicsInterop() = default;
+
+ /* Update this device-side graphics interoperability object with the given destination resource
+ * information. */
+ virtual void set_destination(const DeviceGraphicsInteropDestination &destination) = 0;
+
+ virtual device_ptr map() = 0;
+ virtual void unmap() = 0;
+};
+
+CCL_NAMESPACE_END
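To illustrate what a concrete backend puts behind this interface, a trimmed sketch along the lines of the CUDA implementation; the member names and the omitted error handling are assumptions, not the actual cuda/graphics_interop.cpp:

    #include <cuda.h>
    #include <cudaGL.h>

    class CUDADeviceGraphicsInteropSketch : public DeviceGraphicsInterop {
     public:
      void set_destination(const DeviceGraphicsInteropDestination &dest) override {
        // Register the OpenGL PBO with CUDA (once per buffer change).
        cuGraphicsGLRegisterBuffer(
            &cu_resource_, dest.opengl_pbo_id, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
      }
      device_ptr map() override {
        CUdeviceptr ptr = 0;
        size_t bytes = 0;
        cuGraphicsMapResources(1, &cu_resource_, /*stream=*/0);
        cuGraphicsResourceGetMappedPointer(&ptr, &bytes, cu_resource_);
        return (device_ptr)ptr;
      }
      void unmap() override {
        cuGraphicsUnmapResources(1, &cu_resource_, /*stream=*/0);
      }
     private:
      CUgraphicsResource cu_resource_ = nullptr;
    };
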
diff --git a/intern/cycles/device/device_intern.h b/intern/cycles/device/device_intern.h
deleted file mode 100644
index ecc79c5d7ee..00000000000
--- a/intern/cycles/device/device_intern.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __DEVICE_INTERN_H__
-#define __DEVICE_INTERN_H__
-
-#include "util/util_string.h"
-#include "util/util_vector.h"
-
-CCL_NAMESPACE_BEGIN
-
-class Device;
-class DeviceInfo;
-class Profiler;
-class Stats;
-
-Device *device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-bool device_opencl_init();
-Device *device_opencl_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-bool device_opencl_compile_kernel(const vector<string> &parameters);
-bool device_cuda_init();
-Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-bool device_optix_init();
-Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-Device *device_dummy_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-
-Device *device_network_create(DeviceInfo &info,
- Stats &stats,
- Profiler &profiler,
- const char *address);
-Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-
-void device_cpu_info(vector<DeviceInfo> &devices);
-void device_opencl_info(vector<DeviceInfo> &devices);
-void device_cuda_info(vector<DeviceInfo> &devices);
-void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices);
-void device_network_info(vector<DeviceInfo> &devices);
-
-string device_cpu_capabilities();
-string device_opencl_capabilities();
-string device_cuda_capabilities();
-
-CCL_NAMESPACE_END
-
-#endif /* __DEVICE_INTERN_H__ */
diff --git a/intern/cycles/device/device_kernel.cpp b/intern/cycles/device/device_kernel.cpp
new file mode 100644
index 00000000000..ceaddee4756
--- /dev/null
+++ b/intern/cycles/device/device_kernel.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device_kernel.h"
+
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+const char *device_kernel_as_string(DeviceKernel kernel)
+{
+ switch (kernel) {
+ /* Integrator. */
+ case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA:
+ return "integrator_init_from_camera";
+ case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE:
+ return "integrator_init_from_bake";
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
+ return "integrator_intersect_closest";
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+ return "integrator_intersect_shadow";
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
+ return "integrator_intersect_subsurface";
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
+ return "integrator_intersect_volume_stack";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
+ return "integrator_shade_background";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
+ return "integrator_shade_light";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+ return "integrator_shade_shadow";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
+ return "integrator_shade_surface";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+ return "integrator_shade_surface_raytrace";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
+ return "integrator_shade_volume";
+ case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL:
+ return "integrator_megakernel";
+ case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
+ return "integrator_queued_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
+ return "integrator_queued_shadow_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
+ return "integrator_active_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
+ return "integrator_terminated_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
+ return "integrator_sorted_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+ return "integrator_compact_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:
+ return "integrator_compact_states";
+ case DEVICE_KERNEL_INTEGRATOR_RESET:
+ return "integrator_reset";
+ case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS:
+ return "integrator_shadow_catcher_count_possible_splits";
+
+ /* Shader evaluation. */
+ case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
+ return "shader_eval_displace";
+ case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
+ return "shader_eval_background";
+
+ /* Film. */
+
+#define FILM_CONVERT_KERNEL_AS_STRING(variant, variant_lowercase) \
+ case DEVICE_KERNEL_FILM_CONVERT_##variant: \
+ return "film_convert_" #variant_lowercase; \
+ case DEVICE_KERNEL_FILM_CONVERT_##variant##_HALF_RGBA: \
+ return "film_convert_" #variant_lowercase "_half_rgba";
+
+ FILM_CONVERT_KERNEL_AS_STRING(DEPTH, depth)
+ FILM_CONVERT_KERNEL_AS_STRING(MIST, mist)
+ FILM_CONVERT_KERNEL_AS_STRING(SAMPLE_COUNT, sample_count)
+ FILM_CONVERT_KERNEL_AS_STRING(FLOAT, float)
+ FILM_CONVERT_KERNEL_AS_STRING(LIGHT_PATH, light_path)
+ FILM_CONVERT_KERNEL_AS_STRING(FLOAT3, float3)
+ FILM_CONVERT_KERNEL_AS_STRING(MOTION, motion)
+ FILM_CONVERT_KERNEL_AS_STRING(CRYPTOMATTE, cryptomatte)
+ FILM_CONVERT_KERNEL_AS_STRING(SHADOW_CATCHER, shadow_catcher)
+ FILM_CONVERT_KERNEL_AS_STRING(SHADOW_CATCHER_MATTE_WITH_SHADOW,
+ shadow_catcher_matte_with_shadow)
+ FILM_CONVERT_KERNEL_AS_STRING(COMBINED, combined)
+ FILM_CONVERT_KERNEL_AS_STRING(FLOAT4, float4)
+
+#undef FILM_CONVERT_KERNEL_AS_STRING
+
+ /* Adaptive sampling. */
+ case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK:
+ return "adaptive_sampling_convergence_check";
+ case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X:
+ return "adaptive_sampling_filter_x";
+ case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y:
+ return "adaptive_sampling_filter_y";
+
+ /* Denoising. */
+ case DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS:
+ return "filter_guiding_preprocess";
+ case DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO:
+ return "filter_guiding_set_fake_albedo";
+ case DEVICE_KERNEL_FILTER_COLOR_PREPROCESS:
+ return "filter_color_preprocess";
+ case DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS:
+ return "filter_color_postprocess";
+
+ /* Cryptomatte. */
+ case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS:
+ return "cryptomatte_postprocess";
+
+ /* Generic */
+ case DEVICE_KERNEL_PREFIX_SUM:
+ return "prefix_sum";
+
+ case DEVICE_KERNEL_NUM:
+ break;
+ };
+ LOG(FATAL) << "Unhandled kernel " << static_cast<int>(kernel) << ", should never happen.";
+ return "UNKNOWN";
+}
+
+std::ostream &operator<<(std::ostream &os, DeviceKernel kernel)
+{
+ os << device_kernel_as_string(kernel);
+ return os;
+}
+
+string device_kernel_mask_as_string(DeviceKernelMask mask)
+{
+ string str;
+
+ for (uint64_t i = 0; i < sizeof(DeviceKernelMask) * 8; i++) {
+ if (mask & (uint64_t(1) << i)) {
+ if (!str.empty()) {
+ str += " ";
+ }
+ str += device_kernel_as_string((DeviceKernel)i);
+ }
+ }
+
+ return str;
+}
+
+CCL_NAMESPACE_END
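A quick usage example for the mask helper above; bit positions follow the DeviceKernel enum order:

    // Build a mask from individual kernels and print their names.
    DeviceKernelMask mask = (uint64_t(1) << DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA) |
                            (uint64_t(1) << DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE);
    LOG(INFO) << device_kernel_mask_as_string(mask);
    // -> "integrator_init_from_camera integrator_shade_surface"
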
diff --git a/intern/cycles/device/device_kernel.h b/intern/cycles/device/device_kernel.h
new file mode 100644
index 00000000000..83d959ca87b
--- /dev/null
+++ b/intern/cycles/device/device_kernel.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_types.h"
+
+#include "util/util_string.h"
+
+#include <ostream> // NOLINT
+
+CCL_NAMESPACE_BEGIN
+
+const char *device_kernel_as_string(DeviceKernel kernel);
+std::ostream &operator<<(std::ostream &os, DeviceKernel kernel);
+
+typedef uint64_t DeviceKernelMask;
+string device_kernel_mask_as_string(DeviceKernelMask mask);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp
index 80a05fc32fe..c4d45829b83 100644
--- a/intern/cycles/device/device_memory.cpp
+++ b/intern/cycles/device/device_memory.cpp
@@ -23,7 +23,7 @@ CCL_NAMESPACE_BEGIN
device_memory::device_memory(Device *device, const char *name, MemoryType type)
: data_type(device_type_traits<uchar>::data_type),
- data_elements(device_type_traits<uchar>::num_elements),
+ data_elements(device_type_traits<uchar>::num_elements_cpu),
data_size(0),
device_size(0),
data_width(0),
@@ -149,6 +149,11 @@ void device_memory::device_zero()
}
}
+bool device_memory::device_is_cpu()
+{
+ return (device->info.type == DEVICE_CPU);
+}
+
void device_memory::swap_device(Device *new_device,
size_t new_device_size,
device_ptr new_device_ptr)
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 80f4d7b0468..c51594b8580 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -38,7 +38,6 @@ enum MemoryType {
MEM_DEVICE_ONLY,
MEM_GLOBAL,
MEM_TEXTURE,
- MEM_PIXELS
};
/* Supported Data Types */
@@ -54,7 +53,7 @@ enum DataType {
TYPE_UINT64,
};
-static inline size_t datatype_size(DataType datatype)
+static constexpr size_t datatype_size(DataType datatype)
{
switch (datatype) {
case TYPE_UNKNOWN:
@@ -82,112 +81,155 @@ static inline size_t datatype_size(DataType datatype)
template<typename T> struct device_type_traits {
static const DataType data_type = TYPE_UNKNOWN;
- static const int num_elements = sizeof(T);
+ static const int num_elements_cpu = sizeof(T);
+ static const int num_elements_gpu = sizeof(T);
};
template<> struct device_type_traits<uchar> {
static const DataType data_type = TYPE_UCHAR;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(uchar) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uchar2> {
static const DataType data_type = TYPE_UCHAR;
- static const int num_elements = 2;
+ static const int num_elements_cpu = 2;
+ static const int num_elements_gpu = 2;
+ static_assert(sizeof(uchar2) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uchar3> {
static const DataType data_type = TYPE_UCHAR;
- static const int num_elements = 3;
+ static const int num_elements_cpu = 3;
+ static const int num_elements_gpu = 3;
+ static_assert(sizeof(uchar3) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uchar4> {
static const DataType data_type = TYPE_UCHAR;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(uchar4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint> {
static const DataType data_type = TYPE_UINT;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(uint) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint2> {
static const DataType data_type = TYPE_UINT;
- static const int num_elements = 2;
+ static const int num_elements_cpu = 2;
+ static const int num_elements_gpu = 2;
+ static_assert(sizeof(uint2) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint3> {
static const DataType data_type = TYPE_UINT;
- static const int num_elements = 3;
+ static const int num_elements_cpu = 3;
+ static const int num_elements_gpu = 3;
+ static_assert(sizeof(uint3) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint4> {
static const DataType data_type = TYPE_UINT;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(uint4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<int> {
static const DataType data_type = TYPE_INT;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(int) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<int2> {
static const DataType data_type = TYPE_INT;
- static const int num_elements = 2;
+ static const int num_elements_cpu = 2;
+ static const int num_elements_gpu = 2;
+ static_assert(sizeof(int2) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<int3> {
static const DataType data_type = TYPE_INT;
- static const int num_elements = 3;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 3;
+ static_assert(sizeof(int3) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<int4> {
static const DataType data_type = TYPE_INT;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(int4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<float> {
static const DataType data_type = TYPE_FLOAT;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(float) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<float2> {
static const DataType data_type = TYPE_FLOAT;
- static const int num_elements = 2;
+ static const int num_elements_cpu = 2;
+ static const int num_elements_gpu = 2;
+ static_assert(sizeof(float2) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<float3> {
static const DataType data_type = TYPE_FLOAT;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 3;
+ static_assert(sizeof(float3) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<float4> {
static const DataType data_type = TYPE_FLOAT;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(float4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<half> {
static const DataType data_type = TYPE_HALF;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(half) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<ushort4> {
static const DataType data_type = TYPE_UINT16;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(ushort4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint16_t> {
static const DataType data_type = TYPE_UINT16;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(uint16_t) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<half4> {
static const DataType data_type = TYPE_HALF;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(half4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint64_t> {
static const DataType data_type = TYPE_UINT64;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(uint64_t) == num_elements_cpu * datatype_size(data_type));
};
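The practical effect of the split above: three-component vector types stay padded to four elements on the CPU (for SSE alignment) but can be tightly packed on the GPU, so the byte size of an element now depends on the device. A small check of the float3 case, with the values copied from the specializations above:

    #include <cstdio>

    int main() {
      const int float_size = 4;                     // datatype_size(TYPE_FLOAT)
      const int float3_cpu_bytes = 4 * float_size;  // num_elements_cpu = 4 (padded)
      const int float3_gpu_bytes = 3 * float_size;  // num_elements_gpu = 3 (packed)
      printf("float3: %d bytes on CPU, %d bytes on GPU\n",
             float3_cpu_bytes, float3_gpu_bytes);
      return 0;
    }
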
/* Device Memory
@@ -257,6 +299,8 @@ class device_memory {
void device_copy_from(int y, int w, int h, int elem);
void device_zero();
+ bool device_is_cpu();
+
device_ptr original_device_ptr;
size_t original_device_size;
Device *original_device;
@@ -275,7 +319,9 @@ template<typename T> class device_only_memory : public device_memory {
: device_memory(device, name, allow_host_memory_fallback ? MEM_READ_WRITE : MEM_DEVICE_ONLY)
{
data_type = device_type_traits<T>::data_type;
- data_elements = max(device_type_traits<T>::num_elements, 1);
+ data_elements = max(device_is_cpu() ? device_type_traits<T>::num_elements_cpu :
+ device_type_traits<T>::num_elements_gpu,
+ 1);
}
device_only_memory(device_only_memory &&other) noexcept : device_memory(std::move(other))
@@ -331,11 +377,15 @@ template<typename T> class device_only_memory : public device_memory {
template<typename T> class device_vector : public device_memory {
public:
+ /* Can only use this for types that have the same size on CPU and GPU. */
+ static_assert(device_type_traits<T>::num_elements_cpu ==
+ device_type_traits<T>::num_elements_gpu);
+
device_vector(Device *device, const char *name, MemoryType type)
: device_memory(device, name, type)
{
data_type = device_type_traits<T>::data_type;
- data_elements = device_type_traits<T>::num_elements;
+ data_elements = device_type_traits<T>::num_elements_cpu;
modified = true;
need_realloc_ = true;
@@ -477,6 +527,11 @@ template<typename T> class device_vector : public device_memory {
return (T *)host_pointer;
}
+ const T *data() const
+ {
+ return (T *)host_pointer;
+ }
+
T &operator[](size_t i)
{
assert(i < data_size);
@@ -507,7 +562,7 @@ template<typename T> class device_vector : public device_memory {
void copy_from_device()
{
- device_copy_from(0, data_width, data_height, sizeof(T));
+ device_copy_from(0, data_width, (data_height == 0) ? 1 : data_height, sizeof(T));
}
void copy_from_device(int y, int w, int h)
@@ -535,33 +590,6 @@ template<typename T> class device_vector : public device_memory {
}
};
-/* Pixel Memory
- *
- * Device memory to efficiently draw as pixels to the screen in interactive
- * rendering. Only copying pixels from the device is supported, not copying to. */
-
-template<typename T> class device_pixels : public device_vector<T> {
- public:
- device_pixels(Device *device, const char *name) : device_vector<T>(device, name, MEM_PIXELS)
- {
- }
-
- void alloc_to_device(size_t width, size_t height, size_t depth = 0)
- {
- device_vector<T>::alloc(width, height, depth);
-
- if (!device_memory::device_pointer) {
- device_memory::device_alloc();
- }
- }
-
- T *copy_from_device(int y, int w, int h)
- {
- device_memory::device_copy_from(y, w, h, sizeof(T));
- return device_vector<T>::data();
- }
-};
-
/* Device Sub Memory
*
* Pointer into existing memory. It is not allocated separately, but created
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
deleted file mode 100644
index 85ffa5fcd52..00000000000
--- a/intern/cycles/device/device_multi.cpp
+++ /dev/null
@@ -1,826 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <sstream>
-#include <stdlib.h>
-
-#include "bvh/bvh_multi.h"
-
-#include "device/device.h"
-#include "device/device_intern.h"
-#include "device/device_network.h"
-
-#include "render/buffers.h"
-#include "render/geometry.h"
-
-#include "util/util_foreach.h"
-#include "util/util_list.h"
-#include "util/util_logging.h"
-#include "util/util_map.h"
-#include "util/util_time.h"
-
-CCL_NAMESPACE_BEGIN
-
-class MultiDevice : public Device {
- public:
- struct SubDevice {
- Stats stats;
- Device *device;
- map<device_ptr, device_ptr> ptr_map;
- int peer_island_index = -1;
- };
-
- list<SubDevice> devices, denoising_devices;
- device_ptr unique_key;
- vector<vector<SubDevice *>> peer_islands;
- bool use_denoising;
- bool matching_rendering_and_denoising_devices;
-
- MultiDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
- : Device(info, stats, profiler, background_),
- unique_key(1),
- use_denoising(!info.denoising_devices.empty())
- {
- foreach (DeviceInfo &subinfo, info.multi_devices) {
- /* Always add CPU devices at the back since GPU devices can change
- * host memory pointers, which the CPU uses as device pointers. */
- SubDevice *sub;
- if (subinfo.type == DEVICE_CPU) {
- devices.emplace_back();
- sub = &devices.back();
- }
- else {
- devices.emplace_front();
- sub = &devices.front();
- }
-
- /* The pointer to 'sub->stats' will stay valid even after new devices
- * are added, since 'devices' is a linked list. */
- sub->device = Device::create(subinfo, sub->stats, profiler, background);
- }
-
- foreach (DeviceInfo &subinfo, info.denoising_devices) {
- denoising_devices.emplace_front();
- SubDevice *sub = &denoising_devices.front();
-
- sub->device = Device::create(subinfo, sub->stats, profiler, background);
- }
-
- /* Build a list of peer islands for the available render devices */
- foreach (SubDevice &sub, devices) {
- /* First ensure that every device is in at least one peer island */
- if (sub.peer_island_index < 0) {
- peer_islands.emplace_back();
- sub.peer_island_index = (int)peer_islands.size() - 1;
- peer_islands[sub.peer_island_index].push_back(&sub);
- }
-
- if (!info.has_peer_memory) {
- continue;
- }
-
- /* Second check peer access between devices and fill up the islands accordingly */
- foreach (SubDevice &peer_sub, devices) {
- if (peer_sub.peer_island_index < 0 &&
- peer_sub.device->info.type == sub.device->info.type &&
- peer_sub.device->check_peer_access(sub.device)) {
- peer_sub.peer_island_index = sub.peer_island_index;
- peer_islands[sub.peer_island_index].push_back(&peer_sub);
- }
- }
- }
-
- /* Try to re-use memory when denoising and render devices use the same physical devices
- * (e.g. OptiX denoising and CUDA rendering device pointing to the same GPU).
- * Ordering has to match as well, so that 'DeviceTask::split' behaves consistently. */
- matching_rendering_and_denoising_devices = denoising_devices.empty() ||
- (devices.size() == denoising_devices.size());
- if (matching_rendering_and_denoising_devices) {
- for (list<SubDevice>::iterator device_it = devices.begin(),
- denoising_device_it = denoising_devices.begin();
- device_it != devices.end() && denoising_device_it != denoising_devices.end();
- ++device_it, ++denoising_device_it) {
- const DeviceInfo &info = device_it->device->info;
- const DeviceInfo &denoising_info = denoising_device_it->device->info;
- if ((info.type != DEVICE_CUDA && info.type != DEVICE_OPTIX) ||
- (denoising_info.type != DEVICE_CUDA && denoising_info.type != DEVICE_OPTIX) ||
- info.num != denoising_info.num) {
- matching_rendering_and_denoising_devices = false;
- break;
- }
- }
- }
-
-#ifdef WITH_NETWORK
- /* try to add network devices */
- ServerDiscovery discovery(true);
- time_sleep(1.0);
-
- vector<string> servers = discovery.get_server_list();
-
- foreach (string &server, servers) {
- Device *device = device_network_create(info, stats, profiler, server.c_str());
- if (device)
- devices.push_back(SubDevice(device));
- }
-#endif
- }
-
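To make the island construction above concrete: with three GPUs where only devices 0 and 1 report mutual check_peer_access(), the loop yields islands {0, 1} and {2}, and distributed allocations later place each buffer once per island rather than once per device. A standalone sketch of the same grouping, where peer[i][j] stands in for the peer-access check:

    #include <cstdio>
    #include <vector>

    int main() {
      const int n = 3;
      const bool peer[3][3] = {
          {true, true, false}, {true, true, false}, {false, false, true}};
      std::vector<int> island_of(n, -1);
      int num_islands = 0;
      for (int i = 0; i < n; i++) {
        if (island_of[i] < 0)
          island_of[i] = num_islands++;  // every device lands in some island
        for (int j = 0; j < n; j++)
          if (island_of[j] < 0 && peer[i][j])
            island_of[j] = island_of[i];  // fold peers into the same island
      }
      for (int i = 0; i < n; i++)
        printf("device %d -> island %d\n", i, island_of[i]);
      return 0;  // prints islands {0, 1} and {2}
    }
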
- ~MultiDevice()
- {
- foreach (SubDevice &sub, devices)
- delete sub.device;
- foreach (SubDevice &sub, denoising_devices)
- delete sub.device;
- }
-
- const string &error_message() override
- {
- error_msg.clear();
-
- foreach (SubDevice &sub, devices)
- error_msg += sub.device->error_message();
- foreach (SubDevice &sub, denoising_devices)
- error_msg += sub.device->error_message();
-
- return error_msg;
- }
-
- virtual bool show_samples() const override
- {
- if (devices.size() > 1) {
- return false;
- }
- return devices.front().device->show_samples();
- }
-
- virtual BVHLayoutMask get_bvh_layout_mask() const override
- {
- BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL;
- BVHLayoutMask bvh_layout_mask_all = BVH_LAYOUT_NONE;
- foreach (const SubDevice &sub_device, devices) {
- BVHLayoutMask device_bvh_layout_mask = sub_device.device->get_bvh_layout_mask();
- bvh_layout_mask &= device_bvh_layout_mask;
- bvh_layout_mask_all |= device_bvh_layout_mask;
- }
-
- /* With multiple OptiX devices, every device needs its own acceleration structure */
- if (bvh_layout_mask == BVH_LAYOUT_OPTIX) {
- return BVH_LAYOUT_MULTI_OPTIX;
- }
-
- /* When devices do not share a common BVH layout, fall back to creating one for each */
- const BVHLayoutMask BVH_LAYOUT_OPTIX_EMBREE = (BVH_LAYOUT_OPTIX | BVH_LAYOUT_EMBREE);
- if ((bvh_layout_mask_all & BVH_LAYOUT_OPTIX_EMBREE) == BVH_LAYOUT_OPTIX_EMBREE) {
- return BVH_LAYOUT_MULTI_OPTIX_EMBREE;
- }
-
- return bvh_layout_mask;
- }
-
- bool load_kernels(const DeviceRequestedFeatures &requested_features) override
- {
- foreach (SubDevice &sub, devices)
- if (!sub.device->load_kernels(requested_features))
- return false;
-
- use_denoising = requested_features.use_denoising;
- if (requested_features.use_denoising) {
- /* Only need denoising feature, everything else is unused. */
- DeviceRequestedFeatures denoising_features;
- denoising_features.use_denoising = true;
- foreach (SubDevice &sub, denoising_devices)
- if (!sub.device->load_kernels(denoising_features))
- return false;
- }
-
- return true;
- }
-
- bool wait_for_availability(const DeviceRequestedFeatures &requested_features) override
- {
- foreach (SubDevice &sub, devices)
- if (!sub.device->wait_for_availability(requested_features))
- return false;
-
- if (requested_features.use_denoising) {
- foreach (SubDevice &sub, denoising_devices)
- if (!sub.device->wait_for_availability(requested_features))
- return false;
- }
-
- return true;
- }
-
- DeviceKernelStatus get_active_kernel_switch_state() override
- {
- DeviceKernelStatus result = DEVICE_KERNEL_USING_FEATURE_KERNEL;
-
- foreach (SubDevice &sub, devices) {
- DeviceKernelStatus subresult = sub.device->get_active_kernel_switch_state();
- switch (subresult) {
- case DEVICE_KERNEL_FEATURE_KERNEL_INVALID:
- case DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE:
- return subresult;
-
- case DEVICE_KERNEL_USING_FEATURE_KERNEL:
- case DEVICE_KERNEL_UNKNOWN:
- break;
- }
- }
-
- return result;
- }
-
- void build_bvh(BVH *bvh, Progress &progress, bool refit) override
- {
- /* Try to build and share a single acceleration structure, if possible */
- if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2 || bvh->params.bvh_layout == BVH_LAYOUT_EMBREE) {
- devices.back().device->build_bvh(bvh, progress, refit);
- return;
- }
-
- assert(bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX ||
- bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE);
-
- BVHMulti *const bvh_multi = static_cast<BVHMulti *>(bvh);
- bvh_multi->sub_bvhs.resize(devices.size());
-
- vector<BVHMulti *> geom_bvhs;
- geom_bvhs.reserve(bvh->geometry.size());
- foreach (Geometry *geom, bvh->geometry) {
- geom_bvhs.push_back(static_cast<BVHMulti *>(geom->bvh));
- }
-
- /* Broadcast acceleration structure build to all render devices */
- size_t i = 0;
- foreach (SubDevice &sub, devices) {
- /* Change geometry BVH pointers to the sub BVH */
- for (size_t k = 0; k < bvh->geometry.size(); ++k) {
- bvh->geometry[k]->bvh = geom_bvhs[k]->sub_bvhs[i];
- }
-
- if (!bvh_multi->sub_bvhs[i]) {
- BVHParams params = bvh->params;
- if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX)
- params.bvh_layout = BVH_LAYOUT_OPTIX;
- else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE)
- params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? BVH_LAYOUT_OPTIX :
- BVH_LAYOUT_EMBREE;
-
- /* Skip building a bottom level acceleration structure for non-instanced geometry on Embree
- * (since it is put into the top level directly, see bvh_embree.cpp) */
- if (!params.top_level && params.bvh_layout == BVH_LAYOUT_EMBREE &&
- !bvh->geometry[0]->is_instanced()) {
- i++;
- continue;
- }
-
- bvh_multi->sub_bvhs[i] = BVH::create(params, bvh->geometry, bvh->objects, sub.device);
- }
-
- sub.device->build_bvh(bvh_multi->sub_bvhs[i], progress, refit);
- i++;
- }
-
- /* Change geometry BVH pointers back to the multi BVH. */
- for (size_t k = 0; k < bvh->geometry.size(); ++k) {
- bvh->geometry[k]->bvh = geom_bvhs[k];
- }
- }
-
- virtual void *osl_memory() override
- {
- if (devices.size() > 1) {
- return NULL;
- }
- return devices.front().device->osl_memory();
- }
-
- bool is_resident(device_ptr key, Device *sub_device) override
- {
- foreach (SubDevice &sub, devices) {
- if (sub.device == sub_device) {
- return find_matching_mem_device(key, sub)->device == sub_device;
- }
- }
- return false;
- }
-
- SubDevice *find_matching_mem_device(device_ptr key, SubDevice &sub)
- {
- assert(key != 0 && (sub.peer_island_index >= 0 || sub.ptr_map.find(key) != sub.ptr_map.end()));
-
- /* Get the memory owner of this key (first try current device, then peer devices) */
- SubDevice *owner_sub = &sub;
- if (owner_sub->ptr_map.find(key) == owner_sub->ptr_map.end()) {
- foreach (SubDevice *island_sub, peer_islands[sub.peer_island_index]) {
- if (island_sub != owner_sub &&
- island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) {
- owner_sub = island_sub;
- }
- }
- }
- return owner_sub;
- }
-
- SubDevice *find_suitable_mem_device(device_ptr key, const vector<SubDevice *> &island)
- {
- assert(!island.empty());
-
- /* Get the memory owner of this key or the device with the lowest memory usage when new */
- SubDevice *owner_sub = island.front();
- foreach (SubDevice *island_sub, island) {
- if (key ? (island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) :
- (island_sub->device->stats.mem_used < owner_sub->device->stats.mem_used)) {
- owner_sub = island_sub;
- }
- }
- return owner_sub;
- }
-
- inline device_ptr find_matching_mem(device_ptr key, SubDevice &sub)
- {
- return find_matching_mem_device(key, sub)->ptr_map[key];
- }
-
- void mem_alloc(device_memory &mem) override
- {
- device_ptr key = unique_key++;
-
- if (mem.type == MEM_PIXELS) {
- /* Always allocate pixels memory on all devices.
- * This is necessary to ensure PBOs are registered everywhere, which FILM_CONVERT uses. */
- foreach (SubDevice &sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- sub.device->mem_alloc(mem);
- sub.ptr_map[key] = mem.device_pointer;
- }
- }
- else {
- assert(mem.type == MEM_READ_ONLY || mem.type == MEM_READ_WRITE ||
- mem.type == MEM_DEVICE_ONLY);
- /* The remaining memory types can be distributed across devices */
- foreach (const vector<SubDevice *> &island, peer_islands) {
- SubDevice *owner_sub = find_suitable_mem_device(key, island);
- mem.device = owner_sub->device;
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- owner_sub->device->mem_alloc(mem);
- owner_sub->ptr_map[key] = mem.device_pointer;
- }
- }
-
- mem.device = this;
- mem.device_pointer = key;
- stats.mem_alloc(mem.device_size);
- }
-
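The virtual-pointer scheme above, stated in isolation: the multi-device never hands real device pointers to callers; it returns a monotonically increasing key, and every sub-device keeps its own key-to-pointer map that later operations translate through. A hedged sketch:

    #include <cstdint>
    #include <cstdio>
    #include <map>

    using virtual_key = uint64_t;  // stand-in for Cycles' device_ptr

    int main() {
      virtual_key unique_key = 1;
      std::map<virtual_key, virtual_key> gpu0_ptr_map, gpu1_ptr_map;

      // "Allocation": each sub-device stores its real pointer under one shared key.
      virtual_key key = unique_key++;
      gpu0_ptr_map[key] = 0xA000;  // pretend allocation on GPU 0
      gpu1_ptr_map[key] = 0xB000;  // pretend allocation on GPU 1

      // Later operations translate the key per device.
      printf("key %llu -> gpu0 %#llx, gpu1 %#llx\n",
             (unsigned long long)key,
             (unsigned long long)gpu0_ptr_map[key],
             (unsigned long long)gpu1_ptr_map[key]);
      return 0;
    }
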
- void mem_copy_to(device_memory &mem) override
- {
- device_ptr existing_key = mem.device_pointer;
- device_ptr key = (existing_key) ? existing_key : unique_key++;
- size_t existing_size = mem.device_size;
-
- /* The tile buffers are allocated on each device (see below), so copy to all of them */
- if (strcmp(mem.name, "RenderBuffers") == 0 && use_denoising) {
- foreach (SubDevice &sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
-
- sub.device->mem_copy_to(mem);
- sub.ptr_map[key] = mem.device_pointer;
- }
- }
- else {
- foreach (const vector<SubDevice *> &island, peer_islands) {
- SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
- mem.device = owner_sub->device;
- mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
-
- owner_sub->device->mem_copy_to(mem);
- owner_sub->ptr_map[key] = mem.device_pointer;
-
- if (mem.type == MEM_GLOBAL || mem.type == MEM_TEXTURE) {
- /* Need to create texture objects and update pointer in kernel globals on all devices */
- foreach (SubDevice *island_sub, island) {
- if (island_sub != owner_sub) {
- island_sub->device->mem_copy_to(mem);
- }
- }
- }
- }
- }
-
- mem.device = this;
- mem.device_pointer = key;
- stats.mem_alloc(mem.device_size - existing_size);
- }
-
- void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override
- {
- device_ptr key = mem.device_pointer;
- int i = 0, sub_h = h / devices.size();
-
- foreach (SubDevice &sub, devices) {
- int sy = y + i * sub_h;
- int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h;
-
- SubDevice *owner_sub = find_matching_mem_device(key, sub);
- mem.device = owner_sub->device;
- mem.device_pointer = owner_sub->ptr_map[key];
-
- owner_sub->device->mem_copy_from(mem, sy, w, sh, elem);
- i++;
- }
-
- mem.device = this;
- mem.device_pointer = key;
- }
-
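The row split above gives each device an equal slab and the remainder to the last one; e.g. h = 10 rows across 3 devices copy rows [0, 3), [3, 6) and [6, 10). A quick check of that arithmetic:

    #include <cstdio>

    int main() {
      const int y = 0, h = 10, num_devices = 3;
      const int sub_h = h / num_devices;
      for (int i = 0; i < num_devices; i++) {
        const int sy = y + i * sub_h;
        const int sh = (i == num_devices - 1) ? h - sub_h * i : sub_h;
        printf("device %d copies rows [%d, %d)\n", i, sy, sy + sh);
      }
      return 0;
    }
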
- void mem_zero(device_memory &mem) override
- {
- device_ptr existing_key = mem.device_pointer;
- device_ptr key = (existing_key) ? existing_key : unique_key++;
- size_t existing_size = mem.device_size;
-
- /* This is a hack to allocate the tile buffers only on denoising devices.
- * The tile buffers also need to be allocated separately on each device, so that
- * overlap rendered for denoising on one device does not interfere with another. */
- if (strcmp(mem.name, "RenderBuffers") == 0 && use_denoising) {
- vector<device_ptr> device_pointers;
- device_pointers.reserve(devices.size());
-
- foreach (SubDevice &sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
-
- sub.device->mem_zero(mem);
- sub.ptr_map[key] = mem.device_pointer;
-
- device_pointers.push_back(mem.device_pointer);
- }
- foreach (SubDevice &sub, denoising_devices) {
- if (matching_rendering_and_denoising_devices) {
- sub.ptr_map[key] = device_pointers.front();
- device_pointers.erase(device_pointers.begin());
- }
- else {
- mem.device = sub.device;
- mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
-
- sub.device->mem_zero(mem);
- sub.ptr_map[key] = mem.device_pointer;
- }
- }
- }
- else {
- foreach (const vector<SubDevice *> &island, peer_islands) {
- SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
- mem.device = owner_sub->device;
- mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
-
- owner_sub->device->mem_zero(mem);
- owner_sub->ptr_map[key] = mem.device_pointer;
- }
- }
-
- mem.device = this;
- mem.device_pointer = key;
- stats.mem_alloc(mem.device_size - existing_size);
- }
-
- void mem_free(device_memory &mem) override
- {
- device_ptr key = mem.device_pointer;
- size_t existing_size = mem.device_size;
-
- /* Free memory that was allocated for all devices (see above) on each device */
- if (mem.type == MEM_PIXELS || (strcmp(mem.name, "RenderBuffers") == 0 && use_denoising)) {
- foreach (SubDevice &sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = sub.ptr_map[key];
- mem.device_size = existing_size;
-
- sub.device->mem_free(mem);
- sub.ptr_map.erase(sub.ptr_map.find(key));
- }
- foreach (SubDevice &sub, denoising_devices) {
- if (matching_rendering_and_denoising_devices) {
- sub.ptr_map.erase(key);
- }
- else {
- mem.device = sub.device;
- mem.device_pointer = sub.ptr_map[key];
- mem.device_size = existing_size;
-
- sub.device->mem_free(mem);
- sub.ptr_map.erase(sub.ptr_map.find(key));
- }
- }
- }
- else {
- foreach (const vector<SubDevice *> &island, peer_islands) {
- SubDevice *owner_sub = find_matching_mem_device(key, *island.front());
- mem.device = owner_sub->device;
- mem.device_pointer = owner_sub->ptr_map[key];
- mem.device_size = existing_size;
-
- owner_sub->device->mem_free(mem);
- owner_sub->ptr_map.erase(owner_sub->ptr_map.find(key));
-
- if (mem.type == MEM_TEXTURE) {
- /* Free texture objects on all devices */
- foreach (SubDevice *island_sub, island) {
- if (island_sub != owner_sub) {
- island_sub->device->mem_free(mem);
- }
- }
- }
- }
- }
-
- mem.device = this;
- mem.device_pointer = 0;
- mem.device_size = 0;
- stats.mem_free(existing_size);
- }
-
- void const_copy_to(const char *name, void *host, size_t size) override
- {
- foreach (SubDevice &sub, devices)
- sub.device->const_copy_to(name, host, size);
- }
-
- void draw_pixels(device_memory &rgba,
- int y,
- int w,
- int h,
- int width,
- int height,
- int dx,
- int dy,
- int dw,
- int dh,
- bool transparent,
- const DeviceDrawParams &draw_params) override
- {
- assert(rgba.type == MEM_PIXELS);
-
- device_ptr key = rgba.device_pointer;
- int i = 0, sub_h = h / devices.size();
- int sub_height = height / devices.size();
-
- foreach (SubDevice &sub, devices) {
- int sy = y + i * sub_h;
- int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h;
- int sheight = (i == (int)devices.size() - 1) ? height - sub_height * i : sub_height;
- int sdy = dy + i * sub_height;
- /* adjust math for w/width */
-
- rgba.device_pointer = sub.ptr_map[key];
- sub.device->draw_pixels(
- rgba, sy, w, sh, width, sheight, dx, sdy, dw, dh, transparent, draw_params);
- i++;
- }
-
- rgba.device_pointer = key;
- }
-
- void map_tile(Device *sub_device, RenderTile &tile) override
- {
- if (!tile.buffer) {
- return;
- }
-
- foreach (SubDevice &sub, devices) {
- if (sub.device == sub_device) {
- tile.buffer = find_matching_mem(tile.buffer, sub);
- return;
- }
- }
-
- foreach (SubDevice &sub, denoising_devices) {
- if (sub.device == sub_device) {
- tile.buffer = sub.ptr_map[tile.buffer];
- return;
- }
- }
- }
-
- int device_number(Device *sub_device) override
- {
- int i = 0;
-
- foreach (SubDevice &sub, devices) {
- if (sub.device == sub_device)
- return i;
- i++;
- }
-
- foreach (SubDevice &sub, denoising_devices) {
- if (sub.device == sub_device)
- return i;
- i++;
- }
-
- return -1;
- }
-
- void map_neighbor_tiles(Device *sub_device, RenderTileNeighbors &neighbors) override
- {
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- RenderTile &tile = neighbors.tiles[i];
-
- if (!tile.buffers) {
- continue;
- }
-
- device_vector<float> &mem = tile.buffers->buffer;
- tile.buffer = mem.device_pointer;
-
- if (mem.device == this && matching_rendering_and_denoising_devices) {
-      /* Skip unnecessary copies in viewport mode (the buffer covers the
-       * whole image), but we still need to fix up the tile device pointer. */
- map_tile(sub_device, tile);
- continue;
- }
-
-    /* If the tile was rendered on another device, copy its memory to
-     * the current device now, for the duration of the denoising task.
- * Note that this temporarily modifies the RenderBuffers and calls
- * the device, so this function is not thread safe. */
- if (mem.device != sub_device) {
- /* Only copy from device to host once. This is faster, but
- * also required for the case where a CPU thread is denoising
- * a tile rendered on the GPU. In that case we have to avoid
-       * overwriting the buffer being denoised by the CPU thread. */
- if (!tile.buffers->map_neighbor_copied) {
- tile.buffers->map_neighbor_copied = true;
- mem.copy_from_device();
- }
-
- if (mem.device == this) {
- /* Can re-use memory if tile is already allocated on the sub device. */
- map_tile(sub_device, tile);
- mem.swap_device(sub_device, mem.device_size, tile.buffer);
- }
- else {
- mem.swap_device(sub_device, 0, 0);
- }
-
- mem.copy_to_device();
-
- tile.buffer = mem.device_pointer;
- tile.device_size = mem.device_size;
-
- mem.restore_device();
- }
- }
- }
-
- void unmap_neighbor_tiles(Device *sub_device, RenderTileNeighbors &neighbors) override
- {
- RenderTile &target_tile = neighbors.target;
- device_vector<float> &mem = target_tile.buffers->buffer;
-
- if (mem.device == this && matching_rendering_and_denoising_devices) {
- return;
- }
-
- /* Copy denoised result back to the host. */
- mem.swap_device(sub_device, target_tile.device_size, target_tile.buffer);
- mem.copy_from_device();
- mem.restore_device();
-
- /* Copy denoised result to the original device. */
- mem.copy_to_device();
-
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- RenderTile &tile = neighbors.tiles[i];
- if (!tile.buffers) {
- continue;
- }
-
- device_vector<float> &mem = tile.buffers->buffer;
-
- if (mem.device != sub_device && mem.device != this) {
- /* Free up memory again if it was allocated for the copy above. */
- mem.swap_device(sub_device, tile.device_size, tile.buffer);
- sub_device->mem_free(mem);
- mem.restore_device();
- }
- }
- }
-
- int get_split_task_count(DeviceTask &task) override
- {
- int total_tasks = 0;
- list<DeviceTask> tasks;
- task.split(tasks, devices.size());
- foreach (SubDevice &sub, devices) {
- if (!tasks.empty()) {
- DeviceTask subtask = tasks.front();
- tasks.pop_front();
-
- total_tasks += sub.device->get_split_task_count(subtask);
- }
- }
- return total_tasks;
- }
-
- void task_add(DeviceTask &task) override
- {
- list<SubDevice> task_devices = devices;
- if (!denoising_devices.empty()) {
- if (task.type == DeviceTask::DENOISE_BUFFER) {
- /* Denoising tasks should be redirected to the denoising devices entirely. */
- task_devices = denoising_devices;
- }
- else if (task.type == DeviceTask::RENDER && (task.tile_types & RenderTile::DENOISE)) {
- const uint tile_types = task.tile_types;
-      /* For normal rendering tasks, only redirect the denoising part to the denoising devices.
-       * There is no need to split the task here, since all tiles pass through 'acquire_tile'. */
- task.tile_types = RenderTile::DENOISE;
- foreach (SubDevice &sub, denoising_devices) {
- sub.device->task_add(task);
- }
- /* Rendering itself should still be executed on the rendering devices. */
- task.tile_types = tile_types ^ RenderTile::DENOISE;
- }
- }
-
- list<DeviceTask> tasks;
- task.split(tasks, task_devices.size());
-
- foreach (SubDevice &sub, task_devices) {
- if (!tasks.empty()) {
- DeviceTask subtask = tasks.front();
- tasks.pop_front();
-
- if (task.buffer)
- subtask.buffer = find_matching_mem(task.buffer, sub);
- if (task.rgba_byte)
- subtask.rgba_byte = sub.ptr_map[task.rgba_byte];
- if (task.rgba_half)
- subtask.rgba_half = sub.ptr_map[task.rgba_half];
- if (task.shader_input)
- subtask.shader_input = find_matching_mem(task.shader_input, sub);
- if (task.shader_output)
- subtask.shader_output = find_matching_mem(task.shader_output, sub);
-
- sub.device->task_add(subtask);
-
- if (task.buffers && task.buffers->buffer.device == this) {
- /* Synchronize access to RenderBuffers, since 'map_neighbor_tiles' is not thread-safe. */
- sub.device->task_wait();
- }
- }
- }
- }
-
- void task_wait() override
- {
- foreach (SubDevice &sub, devices)
- sub.device->task_wait();
- foreach (SubDevice &sub, denoising_devices)
- sub.device->task_wait();
- }
-
- void task_cancel() override
- {
- foreach (SubDevice &sub, devices)
- sub.device->task_cancel();
- foreach (SubDevice &sub, denoising_devices)
- sub.device->task_cancel();
- }
-};
-
-Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
-{
- return new MultiDevice(info, stats, profiler, background);
-}
-
-CCL_NAMESPACE_END
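The MultiDevice::draw_pixels() override above splits the vertical pixel range evenly across the sub-devices, with the last device absorbing the remainder rows. A minimal stand-alone sketch of that split arithmetic, with hypothetical names (Span, split_rows) that are not part of the Cycles sources:

#include <cassert>
#include <vector>

struct Span {
  int offset; /* first row handled by this sub-device */
  int size;   /* number of rows handled by this sub-device */
};

/* Divide `total` rows over `num_devices`; the last device absorbs the
 * remainder, mirroring the sub_h / sub_height computation in draw_pixels(). */
static std::vector<Span> split_rows(int total, int num_devices)
{
  assert(num_devices > 0);
  std::vector<Span> spans;
  const int chunk = total / num_devices;
  for (int i = 0; i < num_devices; i++) {
    const int size = (i == num_devices - 1) ? total - chunk * i : chunk;
    spans.push_back({i * chunk, size});
  }
  return spans;
}

/* Example: split_rows(10, 3) yields {0, 3}, {3, 3}, {6, 4}. */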
diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
deleted file mode 100644
index 8904b517e92..00000000000
--- a/intern/cycles/device/device_network.cpp
+++ /dev/null
@@ -1,812 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "device/device_network.h"
-#include "device/device.h"
-#include "device/device_intern.h"
-
-#include "util/util_foreach.h"
-#include "util/util_logging.h"
-
-#if defined(WITH_NETWORK)
-
-CCL_NAMESPACE_BEGIN
-
-typedef map<device_ptr, device_ptr> PtrMap;
-typedef vector<uint8_t> DataVector;
-typedef map<device_ptr, DataVector> DataMap;
-
-/* tile list */
-typedef vector<RenderTile> TileList;
-
-/* search a list of tiles and find the one that matches the passed render tile */
-static TileList::iterator tile_list_find(TileList &tile_list, RenderTile &tile)
-{
- for (TileList::iterator it = tile_list.begin(); it != tile_list.end(); ++it)
- if (tile.x == it->x && tile.y == it->y && tile.start_sample == it->start_sample)
- return it;
- return tile_list.end();
-}
-
-class NetworkDevice : public Device {
- public:
- boost::asio::io_service io_service;
- tcp::socket socket;
- device_ptr mem_counter;
- DeviceTask the_task; /* todo: handle multiple tasks */
-
- thread_mutex rpc_lock;
-
- virtual bool show_samples() const
- {
- return false;
- }
-
- NetworkDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, const char *address)
- : Device(info, stats, profiler, true), socket(io_service)
- {
- error_func = NetworkError();
- stringstream portstr;
- portstr << SERVER_PORT;
-
- tcp::resolver resolver(io_service);
- tcp::resolver::query query(address, portstr.str());
- tcp::resolver::iterator endpoint_iterator = resolver.resolve(query);
- tcp::resolver::iterator end;
-
- boost::system::error_code error = boost::asio::error::host_not_found;
- while (error && endpoint_iterator != end) {
- socket.close();
- socket.connect(*endpoint_iterator++, error);
- }
-
- if (error)
- error_func.network_error(error.message());
-
- mem_counter = 0;
- }
-
- ~NetworkDevice()
- {
- RPCSend snd(socket, &error_func, "stop");
- snd.write();
- }
-
- virtual BVHLayoutMask get_bvh_layout_mask() const
- {
- return BVH_LAYOUT_BVH2;
- }
-
- void mem_alloc(device_memory &mem)
- {
- if (mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
- }
-
- thread_scoped_lock lock(rpc_lock);
-
- mem.device_pointer = ++mem_counter;
-
- RPCSend snd(socket, &error_func, "mem_alloc");
- snd.add(mem);
- snd.write();
- }
-
- void mem_copy_to(device_memory &mem)
- {
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "mem_copy_to");
-
- snd.add(mem);
- snd.write();
- snd.write_buffer(mem.host_pointer, mem.memory_size());
- }
-
- void mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
- {
- thread_scoped_lock lock(rpc_lock);
-
- size_t data_size = mem.memory_size();
-
- RPCSend snd(socket, &error_func, "mem_copy_from");
-
- snd.add(mem);
- snd.add(y);
- snd.add(w);
- snd.add(h);
- snd.add(elem);
- snd.write();
-
- RPCReceive rcv(socket, &error_func);
- rcv.read_buffer(mem.host_pointer, data_size);
- }
-
- void mem_zero(device_memory &mem)
- {
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "mem_zero");
-
- snd.add(mem);
- snd.write();
- }
-
- void mem_free(device_memory &mem)
- {
- if (mem.device_pointer) {
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "mem_free");
-
- snd.add(mem);
- snd.write();
-
- mem.device_pointer = 0;
- }
- }
-
- void const_copy_to(const char *name, void *host, size_t size)
- {
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "const_copy_to");
-
- string name_string(name);
-
- snd.add(name_string);
- snd.add(size);
- snd.write();
- snd.write_buffer(host, size);
- }
-
- bool load_kernels(const DeviceRequestedFeatures &requested_features)
- {
- if (error_func.have_error())
- return false;
-
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "load_kernels");
- snd.add(requested_features.experimental);
- snd.add(requested_features.max_closure);
- snd.add(requested_features.max_nodes_group);
- snd.add(requested_features.nodes_features);
- snd.write();
-
- bool result;
- RPCReceive rcv(socket, &error_func);
- rcv.read(result);
-
- return result;
- }
-
- void task_add(DeviceTask &task)
- {
- thread_scoped_lock lock(rpc_lock);
-
- the_task = task;
-
- RPCSend snd(socket, &error_func, "task_add");
- snd.add(task);
- snd.write();
- }
-
- void task_wait()
- {
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "task_wait");
- snd.write();
-
- lock.unlock();
-
- TileList the_tiles;
-
- /* todo: run this threaded for connecting to multiple clients */
- for (;;) {
- if (error_func.have_error())
- break;
-
- RenderTile tile;
-
- lock.lock();
- RPCReceive rcv(socket, &error_func);
-
- if (rcv.name == "acquire_tile") {
- lock.unlock();
-
- /* todo: watch out for recursive calls! */
- if (the_task.acquire_tile(this, tile)) { /* write return as bool */
- the_tiles.push_back(tile);
-
- lock.lock();
- RPCSend snd(socket, &error_func, "acquire_tile");
- snd.add(tile);
- snd.write();
- lock.unlock();
- }
- else {
- lock.lock();
- RPCSend snd(socket, &error_func, "acquire_tile_none");
- snd.write();
- lock.unlock();
- }
- }
- else if (rcv.name == "release_tile") {
- rcv.read(tile);
- lock.unlock();
-
- TileList::iterator it = tile_list_find(the_tiles, tile);
- if (it != the_tiles.end()) {
- tile.buffers = it->buffers;
- the_tiles.erase(it);
- }
-
- assert(tile.buffers != NULL);
-
- the_task.release_tile(tile);
-
- lock.lock();
- RPCSend snd(socket, &error_func, "release_tile");
- snd.write();
- lock.unlock();
- }
- else if (rcv.name == "task_wait_done") {
- lock.unlock();
- break;
- }
- else
- lock.unlock();
- }
- }
-
- void task_cancel()
- {
- thread_scoped_lock lock(rpc_lock);
- RPCSend snd(socket, &error_func, "task_cancel");
- snd.write();
- }
-
- int get_split_task_count(DeviceTask &)
- {
- return 1;
- }
-
- private:
- NetworkError error_func;
-};
-
-Device *device_network_create(DeviceInfo &info,
- Stats &stats,
- Profiler &profiler,
- const char *address)
-{
- return new NetworkDevice(info, stats, profiler, address);
-}
-
-void device_network_info(vector<DeviceInfo> &devices)
-{
- DeviceInfo info;
-
- info.type = DEVICE_NETWORK;
- info.description = "Network Device";
- info.id = "NETWORK";
- info.num = 0;
-
- /* todo: get this info from device */
- info.has_volume_decoupled = false;
- info.has_adaptive_stop_per_sample = false;
- info.has_osl = false;
- info.denoisers = DENOISER_NONE;
-
- devices.push_back(info);
-}
-
-class DeviceServer {
- public:
- thread_mutex rpc_lock;
-
- void network_error(const string &message)
- {
- error_func.network_error(message);
- }
-
- bool have_error()
- {
- return error_func.have_error();
- }
-
- DeviceServer(Device *device_, tcp::socket &socket_)
- : device(device_), socket(socket_), stop(false), blocked_waiting(false)
- {
- error_func = NetworkError();
- }
-
- void listen()
- {
- /* receive remote function calls */
- for (;;) {
- listen_step();
-
- if (stop)
- break;
- }
- }
-
- protected:
- void listen_step()
- {
- thread_scoped_lock lock(rpc_lock);
- RPCReceive rcv(socket, &error_func);
-
- if (rcv.name == "stop")
- stop = true;
- else
- process(rcv, lock);
- }
-
- /* create a memory buffer for a device buffer and insert it into mem_data */
- DataVector &data_vector_insert(device_ptr client_pointer, size_t data_size)
- {
- /* create a new DataVector and insert it into mem_data */
- pair<DataMap::iterator, bool> data_ins = mem_data.insert(
- DataMap::value_type(client_pointer, DataVector()));
-
- /* make sure it was a unique insertion */
- assert(data_ins.second);
-
- /* get a reference to the inserted vector */
- DataVector &data_v = data_ins.first->second;
-
- /* size the vector */
- data_v.resize(data_size);
-
- return data_v;
- }
-
- DataVector &data_vector_find(device_ptr client_pointer)
- {
- DataMap::iterator i = mem_data.find(client_pointer);
- assert(i != mem_data.end());
- return i->second;
- }
-
- /* setup mapping and reverse mapping of client_pointer<->real_pointer */
- void pointer_mapping_insert(device_ptr client_pointer, device_ptr real_pointer)
- {
- pair<PtrMap::iterator, bool> mapins;
-
- /* insert mapping from client pointer to our real device pointer */
- mapins = ptr_map.insert(PtrMap::value_type(client_pointer, real_pointer));
- assert(mapins.second);
-
-    /* insert reverse mapping from our real device pointer to client pointer */
- mapins = ptr_imap.insert(PtrMap::value_type(real_pointer, client_pointer));
- assert(mapins.second);
- }
-
- device_ptr device_ptr_from_client_pointer(device_ptr client_pointer)
- {
- PtrMap::iterator i = ptr_map.find(client_pointer);
- assert(i != ptr_map.end());
- return i->second;
- }
-
- device_ptr device_ptr_from_client_pointer_erase(device_ptr client_pointer)
- {
- PtrMap::iterator i = ptr_map.find(client_pointer);
- assert(i != ptr_map.end());
-
- device_ptr result = i->second;
-
- /* erase the mapping */
- ptr_map.erase(i);
-
- /* erase the reverse mapping */
- PtrMap::iterator irev = ptr_imap.find(result);
- assert(irev != ptr_imap.end());
- ptr_imap.erase(irev);
-
- /* erase the data vector */
- DataMap::iterator idata = mem_data.find(client_pointer);
- assert(idata != mem_data.end());
- mem_data.erase(idata);
-
- return result;
- }
-
-  /* Note that the lock must already be acquired upon entry.
-   * This is necessary because the caller often peeks at
-   * the header and delegates control here when it doesn't
-   * specifically handle the current RPC.
-   * The lock must be unlocked before returning. */
- void process(RPCReceive &rcv, thread_scoped_lock &lock)
- {
- if (rcv.name == "mem_alloc") {
- string name;
- network_device_memory mem(device);
- rcv.read(mem, name);
- lock.unlock();
-
- /* Allocate host side data buffer. */
- size_t data_size = mem.memory_size();
- device_ptr client_pointer = mem.device_pointer;
-
- DataVector &data_v = data_vector_insert(client_pointer, data_size);
- mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0;
-
- /* Perform the allocation on the actual device. */
- device->mem_alloc(mem);
-
- /* Store a mapping to/from client_pointer and real device pointer. */
- pointer_mapping_insert(client_pointer, mem.device_pointer);
- }
- else if (rcv.name == "mem_copy_to") {
- string name;
- network_device_memory mem(device);
- rcv.read(mem, name);
- lock.unlock();
-
- size_t data_size = mem.memory_size();
- device_ptr client_pointer = mem.device_pointer;
-
- if (client_pointer) {
- /* Lookup existing host side data buffer. */
- DataVector &data_v = data_vector_find(client_pointer);
- mem.host_pointer = (void *)&data_v[0];
-
- /* Translate the client pointer to a real device pointer. */
- mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
- }
- else {
- /* Allocate host side data buffer. */
- DataVector &data_v = data_vector_insert(client_pointer, data_size);
- mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0;
- }
-
- /* Copy data from network into memory buffer. */
- rcv.read_buffer((uint8_t *)mem.host_pointer, data_size);
-
- /* Copy the data from the memory buffer to the device buffer. */
- device->mem_copy_to(mem);
-
- if (!client_pointer) {
- /* Store a mapping to/from client_pointer and real device pointer. */
- pointer_mapping_insert(client_pointer, mem.device_pointer);
- }
- }
- else if (rcv.name == "mem_copy_from") {
- string name;
- network_device_memory mem(device);
- int y, w, h, elem;
-
- rcv.read(mem, name);
- rcv.read(y);
- rcv.read(w);
- rcv.read(h);
- rcv.read(elem);
-
- device_ptr client_pointer = mem.device_pointer;
- mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
-
- DataVector &data_v = data_vector_find(client_pointer);
-
-      mem.host_pointer = (void *)&(data_v[0]);
-
- device->mem_copy_from(mem, y, w, h, elem);
-
- size_t data_size = mem.memory_size();
-
- RPCSend snd(socket, &error_func, "mem_copy_from");
- snd.write();
- snd.write_buffer((uint8_t *)mem.host_pointer, data_size);
- lock.unlock();
- }
- else if (rcv.name == "mem_zero") {
- string name;
- network_device_memory mem(device);
- rcv.read(mem, name);
- lock.unlock();
-
- size_t data_size = mem.memory_size();
- device_ptr client_pointer = mem.device_pointer;
-
- if (client_pointer) {
- /* Lookup existing host side data buffer. */
- DataVector &data_v = data_vector_find(client_pointer);
- mem.host_pointer = (void *)&data_v[0];
-
- /* Translate the client pointer to a real device pointer. */
- mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
- }
- else {
- /* Allocate host side data buffer. */
- DataVector &data_v = data_vector_insert(client_pointer, data_size);
-        mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0;
- }
-
- /* Zero memory. */
- device->mem_zero(mem);
-
- if (!client_pointer) {
- /* Store a mapping to/from client_pointer and real device pointer. */
- pointer_mapping_insert(client_pointer, mem.device_pointer);
- }
- }
- else if (rcv.name == "mem_free") {
- string name;
- network_device_memory mem(device);
-
- rcv.read(mem, name);
- lock.unlock();
-
- device_ptr client_pointer = mem.device_pointer;
-
- mem.device_pointer = device_ptr_from_client_pointer_erase(client_pointer);
-
- device->mem_free(mem);
- }
- else if (rcv.name == "const_copy_to") {
- string name_string;
- size_t size;
-
- rcv.read(name_string);
- rcv.read(size);
-
- vector<char> host_vector(size);
- rcv.read_buffer(&host_vector[0], size);
- lock.unlock();
-
- device->const_copy_to(name_string.c_str(), &host_vector[0], size);
- }
- else if (rcv.name == "load_kernels") {
- DeviceRequestedFeatures requested_features;
- rcv.read(requested_features.experimental);
- rcv.read(requested_features.max_closure);
- rcv.read(requested_features.max_nodes_group);
- rcv.read(requested_features.nodes_features);
-
- bool result;
- result = device->load_kernels(requested_features);
- RPCSend snd(socket, &error_func, "load_kernels");
- snd.add(result);
- snd.write();
- lock.unlock();
- }
- else if (rcv.name == "task_add") {
- DeviceTask task;
-
- rcv.read(task);
- lock.unlock();
-
- if (task.buffer)
- task.buffer = device_ptr_from_client_pointer(task.buffer);
-
- if (task.rgba_half)
- task.rgba_half = device_ptr_from_client_pointer(task.rgba_half);
-
- if (task.rgba_byte)
- task.rgba_byte = device_ptr_from_client_pointer(task.rgba_byte);
-
- if (task.shader_input)
- task.shader_input = device_ptr_from_client_pointer(task.shader_input);
-
- if (task.shader_output)
- task.shader_output = device_ptr_from_client_pointer(task.shader_output);
-
- task.acquire_tile = function_bind(&DeviceServer::task_acquire_tile, this, _1, _2);
- task.release_tile = function_bind(&DeviceServer::task_release_tile, this, _1);
- task.update_progress_sample = function_bind(&DeviceServer::task_update_progress_sample,
- this);
- task.update_tile_sample = function_bind(&DeviceServer::task_update_tile_sample, this, _1);
- task.get_cancel = function_bind(&DeviceServer::task_get_cancel, this);
-
- device->task_add(task);
- }
- else if (rcv.name == "task_wait") {
- lock.unlock();
-
- blocked_waiting = true;
- device->task_wait();
- blocked_waiting = false;
-
- lock.lock();
- RPCSend snd(socket, &error_func, "task_wait_done");
- snd.write();
- lock.unlock();
- }
- else if (rcv.name == "task_cancel") {
- lock.unlock();
- device->task_cancel();
- }
- else if (rcv.name == "acquire_tile") {
- AcquireEntry entry;
- entry.name = rcv.name;
- rcv.read(entry.tile);
- acquire_queue.push_back(entry);
- lock.unlock();
- }
- else if (rcv.name == "acquire_tile_none") {
- AcquireEntry entry;
- entry.name = rcv.name;
- acquire_queue.push_back(entry);
- lock.unlock();
- }
- else if (rcv.name == "release_tile") {
- AcquireEntry entry;
- entry.name = rcv.name;
- acquire_queue.push_back(entry);
- lock.unlock();
- }
- else {
- cout << "Error: unexpected RPC receive call \"" + rcv.name + "\"\n";
- lock.unlock();
- }
- }
-
- bool task_acquire_tile(Device *, RenderTile &tile)
- {
- thread_scoped_lock acquire_lock(acquire_mutex);
-
- bool result = false;
-
- RPCSend snd(socket, &error_func, "acquire_tile");
- snd.write();
-
- do {
- if (blocked_waiting)
- listen_step();
-
- /* todo: avoid busy wait loop */
- thread_scoped_lock lock(rpc_lock);
-
- if (!acquire_queue.empty()) {
- AcquireEntry entry = acquire_queue.front();
- acquire_queue.pop_front();
-
- if (entry.name == "acquire_tile") {
- tile = entry.tile;
-
- if (tile.buffer)
- tile.buffer = ptr_map[tile.buffer];
-
- result = true;
- break;
- }
- else if (entry.name == "acquire_tile_none") {
- break;
- }
- else {
- cout << "Error: unexpected acquire RPC receive call \"" + entry.name + "\"\n";
- }
- }
- } while (acquire_queue.empty() && !stop && !have_error());
-
- return result;
- }
-
- void task_update_progress_sample()
- {
-    /* skip */
- }
-
- void task_update_tile_sample(RenderTile &)
- {
-    /* skip */
- }
-
- void task_release_tile(RenderTile &tile)
- {
- thread_scoped_lock acquire_lock(acquire_mutex);
-
- if (tile.buffer)
- tile.buffer = ptr_imap[tile.buffer];
-
- {
- thread_scoped_lock lock(rpc_lock);
- RPCSend snd(socket, &error_func, "release_tile");
- snd.add(tile);
- snd.write();
- lock.unlock();
- }
-
- do {
- if (blocked_waiting)
- listen_step();
-
- /* todo: avoid busy wait loop */
- thread_scoped_lock lock(rpc_lock);
-
- if (!acquire_queue.empty()) {
- AcquireEntry entry = acquire_queue.front();
- acquire_queue.pop_front();
-
- if (entry.name == "release_tile") {
- lock.unlock();
- break;
- }
- else {
- cout << "Error: unexpected release RPC receive call \"" + entry.name + "\"\n";
- }
- }
- } while (acquire_queue.empty() && !stop);
- }
-
- bool task_get_cancel()
- {
- return false;
- }
-
- /* properties */
- Device *device;
- tcp::socket &socket;
-
- /* mapping of remote to local pointer */
- PtrMap ptr_map;
- PtrMap ptr_imap;
- DataMap mem_data;
-
- struct AcquireEntry {
- string name;
- RenderTile tile;
- };
-
- thread_mutex acquire_mutex;
- list<AcquireEntry> acquire_queue;
-
- bool stop;
- bool blocked_waiting;
-
- private:
- NetworkError error_func;
-
- /* todo: free memory and device (osl) on network error */
-};
-
-void Device::server_run()
-{
- try {
- /* starts thread that responds to discovery requests */
- ServerDiscovery discovery;
-
- for (;;) {
- /* accept connection */
- boost::asio::io_service io_service;
- tcp::acceptor acceptor(io_service, tcp::endpoint(tcp::v4(), SERVER_PORT));
-
- tcp::socket socket(io_service);
- acceptor.accept(socket);
-
- string remote_address = socket.remote_endpoint().address().to_string();
- printf("Connected to remote client at: %s\n", remote_address.c_str());
-
- DeviceServer server(this, socket);
- server.listen();
-
- printf("Disconnected.\n");
- }
- }
- catch (exception &e) {
- fprintf(stderr, "Network server exception: %s\n", e.what());
- }
-}
-
-CCL_NAMESPACE_END
-
-#endif
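The NetworkDevice and DeviceServer above exchange messages through the RPCSend/RPCReceive helpers declared in device_network.h below: every message is framed as an 8-character hexadecimal length header followed by the serialized payload. A minimal sketch of that framing with plain iostreams instead of Boost.Asio (the function names are hypothetical):

#include <iomanip>
#include <sstream>
#include <string>

/* Encode: an 8-character hex length header, then the payload bytes. */
static std::string frame_message(const std::string &payload)
{
  std::ostringstream header;
  header << std::setw(8) << std::hex << payload.size();
  return header.str() + payload;
}

/* Decode: parse the fixed-size header to learn how many payload bytes follow. */
static bool parse_header(const std::string &header, size_t &data_size)
{
  std::istringstream stream(header);
  return static_cast<bool>(stream >> std::hex >> data_size);
}

The fixed-width, space-padded header lets the receiver read exactly eight bytes from the socket before it knows how large the payload is.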
diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h
deleted file mode 100644
index b3a0f6daa57..00000000000
--- a/intern/cycles/device/device_network.h
+++ /dev/null
@@ -1,490 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __DEVICE_NETWORK_H__
-#define __DEVICE_NETWORK_H__
-
-#ifdef WITH_NETWORK
-
-# include <boost/archive/binary_iarchive.hpp>
-# include <boost/archive/binary_oarchive.hpp>
-# include <boost/archive/text_iarchive.hpp>
-# include <boost/archive/text_oarchive.hpp>
-# include <boost/array.hpp>
-# include <boost/asio.hpp>
-# include <boost/bind.hpp>
-# include <boost/serialization/vector.hpp>
-# include <boost/thread.hpp>
-
-# include <deque>
-# include <iostream>
-# include <sstream>
-
-# include "render/buffers.h"
-
-# include "util/util_foreach.h"
-# include "util/util_list.h"
-# include "util/util_map.h"
-# include "util/util_param.h"
-# include "util/util_string.h"
-
-CCL_NAMESPACE_BEGIN
-
-using std::cerr;
-using std::cout;
-using std::exception;
-using std::hex;
-using std::setw;
-
-using boost::asio::ip::tcp;
-
-static const int SERVER_PORT = 5120;
-static const int DISCOVER_PORT = 5121;
-static const string DISCOVER_REQUEST_MSG = "REQUEST_RENDER_SERVER_IP";
-static const string DISCOVER_REPLY_MSG = "REPLY_RENDER_SERVER_IP";
-
-# if 0
-typedef boost::archive::text_oarchive o_archive;
-typedef boost::archive::text_iarchive i_archive;
-# else
-typedef boost::archive::binary_oarchive o_archive;
-typedef boost::archive::binary_iarchive i_archive;
-# endif
-
-/* Serialization of device memory */
-
-class network_device_memory : public device_memory {
- public:
- network_device_memory(Device *device) : device_memory(device, "", MEM_READ_ONLY)
- {
- }
-
- ~network_device_memory()
- {
- device_pointer = 0;
-  }
-
- vector<char> local_data;
-};
-
-/* Common network error function / object for both DeviceNetwork and DeviceServer. */
-class NetworkError {
- public:
- NetworkError()
- {
- error = "";
- error_count = 0;
- }
-
- ~NetworkError()
- {
- }
-
- void network_error(const string &message)
- {
- error = message;
- error_count += 1;
- }
-
- bool have_error()
- {
-    return error_count > 0;
- }
-
- private:
- string error;
- int error_count;
-};
-
-/* Remote procedure call Send */
-
-class RPCSend {
- public:
- RPCSend(tcp::socket &socket_, NetworkError *e, const string &name_ = "")
- : name(name_), socket(socket_), archive(archive_stream), sent(false)
- {
- archive &name_;
- error_func = e;
- fprintf(stderr, "rpc send %s\n", name.c_str());
- }
-
- ~RPCSend()
- {
- }
-
- void add(const device_memory &mem)
- {
- archive &mem.data_type &mem.data_elements &mem.data_size;
- archive &mem.data_width &mem.data_height &mem.data_depth &mem.device_pointer;
- archive &mem.type &string(mem.name);
- archive &mem.interpolation &mem.extension;
- archive &mem.device_pointer;
- }
-
- template<typename T> void add(const T &data)
- {
- archive &data;
- }
-
- void add(const DeviceTask &task)
- {
- int type = (int)task.type;
- archive &type &task.x &task.y &task.w &task.h;
- archive &task.rgba_byte &task.rgba_half &task.buffer &task.sample &task.num_samples;
- archive &task.offset &task.stride;
- archive &task.shader_input &task.shader_output &task.shader_eval_type;
- archive &task.shader_x &task.shader_w;
- archive &task.need_finish_queue;
- }
-
- void add(const RenderTile &tile)
- {
- archive &tile.x &tile.y &tile.w &tile.h;
- archive &tile.start_sample &tile.num_samples &tile.sample;
- archive &tile.resolution &tile.offset &tile.stride;
- archive &tile.buffer;
- }
-
- void write()
- {
- boost::system::error_code error;
-
- /* get string from stream */
- string archive_str = archive_stream.str();
-
- /* first send fixed size header with size of following data */
- ostringstream header_stream;
- header_stream << setw(8) << hex << archive_str.size();
- string header_str = header_stream.str();
-
- boost::asio::write(
- socket, boost::asio::buffer(header_str), boost::asio::transfer_all(), error);
-
- if (error.value())
- error_func->network_error(error.message());
-
- /* then send actual data */
- boost::asio::write(
- socket, boost::asio::buffer(archive_str), boost::asio::transfer_all(), error);
-
- if (error.value())
- error_func->network_error(error.message());
-
- sent = true;
- }
-
- void write_buffer(void *buffer, size_t size)
- {
- boost::system::error_code error;
-
- boost::asio::write(
- socket, boost::asio::buffer(buffer, size), boost::asio::transfer_all(), error);
-
- if (error.value())
- error_func->network_error(error.message());
- }
-
- protected:
- string name;
- tcp::socket &socket;
- ostringstream archive_stream;
- o_archive archive;
- bool sent;
- NetworkError *error_func;
-};
-
-/* Remote procedure call Receive */
-
-class RPCReceive {
- public:
- RPCReceive(tcp::socket &socket_, NetworkError *e)
- : socket(socket_), archive_stream(NULL), archive(NULL)
- {
- error_func = e;
-    /* read header with fixed size */
- vector<char> header(8);
- boost::system::error_code error;
- size_t len = boost::asio::read(socket, boost::asio::buffer(header), error);
-
- if (error.value()) {
- error_func->network_error(error.message());
- }
-
- /* verify if we got something */
- if (len == header.size()) {
- /* decode header */
- string header_str(&header[0], header.size());
- istringstream header_stream(header_str);
-
- size_t data_size;
-
- if ((header_stream >> hex >> data_size)) {
-
- vector<char> data(data_size);
- size_t len = boost::asio::read(socket, boost::asio::buffer(data), error);
-
- if (error.value())
- error_func->network_error(error.message());
-
- if (len == data_size) {
- archive_str = (data.size()) ? string(&data[0], data.size()) : string("");
-
- archive_stream = new istringstream(archive_str);
- archive = new i_archive(*archive_stream);
-
- *archive &name;
- fprintf(stderr, "rpc receive %s\n", name.c_str());
- }
- else {
- error_func->network_error("Network receive error: data size doesn't match header");
- }
- }
- else {
- error_func->network_error("Network receive error: can't decode data size from header");
- }
- }
- else {
- error_func->network_error("Network receive error: invalid header size");
- }
- }
-
- ~RPCReceive()
- {
- delete archive;
- delete archive_stream;
- }
-
- void read(network_device_memory &mem, string &name)
- {
- *archive &mem.data_type &mem.data_elements &mem.data_size;
- *archive &mem.data_width &mem.data_height &mem.data_depth &mem.device_pointer;
- *archive &mem.type &name;
- *archive &mem.interpolation &mem.extension;
- *archive &mem.device_pointer;
-
- mem.name = name.c_str();
- mem.host_pointer = 0;
-
- /* Can't transfer OpenGL texture over network. */
- if (mem.type == MEM_PIXELS) {
- mem.type = MEM_READ_WRITE;
- }
- }
-
- template<typename T> void read(T &data)
- {
- *archive &data;
- }
-
- void read_buffer(void *buffer, size_t size)
- {
- boost::system::error_code error;
- size_t len = boost::asio::read(socket, boost::asio::buffer(buffer, size), error);
-
- if (error.value()) {
- error_func->network_error(error.message());
- }
-
- if (len != size)
- cout << "Network receive error: buffer size doesn't match expected size\n";
- }
-
- void read(DeviceTask &task)
- {
- int type;
-
- *archive &type &task.x &task.y &task.w &task.h;
- *archive &task.rgba_byte &task.rgba_half &task.buffer &task.sample &task.num_samples;
- *archive &task.offset &task.stride;
- *archive &task.shader_input &task.shader_output &task.shader_eval_type;
- *archive &task.shader_x &task.shader_w;
- *archive &task.need_finish_queue;
-
- task.type = (DeviceTask::Type)type;
- }
-
- void read(RenderTile &tile)
- {
- *archive &tile.x &tile.y &tile.w &tile.h;
- *archive &tile.start_sample &tile.num_samples &tile.sample;
- *archive &tile.resolution &tile.offset &tile.stride;
- *archive &tile.buffer;
-
- tile.buffers = NULL;
- }
-
- string name;
-
- protected:
- tcp::socket &socket;
- string archive_str;
- istringstream *archive_stream;
- i_archive *archive;
- NetworkError *error_func;
-};
-
-/* Server auto discovery */
-
-class ServerDiscovery {
- public:
- explicit ServerDiscovery(bool discover = false)
- : listen_socket(io_service), collect_servers(false)
- {
- /* setup listen socket */
- listen_endpoint.address(boost::asio::ip::address_v4::any());
- listen_endpoint.port(DISCOVER_PORT);
-
- listen_socket.open(listen_endpoint.protocol());
-
- boost::asio::socket_base::reuse_address option(true);
- listen_socket.set_option(option);
-
- listen_socket.bind(listen_endpoint);
-
- /* setup receive callback */
- async_receive();
-
- /* start server discovery */
- if (discover) {
- collect_servers = true;
- servers.clear();
-
- broadcast_message(DISCOVER_REQUEST_MSG);
- }
-
- /* start thread */
- work = new boost::asio::io_service::work(io_service);
- thread = new boost::thread(boost::bind(&boost::asio::io_service::run, &io_service));
- }
-
- ~ServerDiscovery()
- {
- io_service.stop();
- thread->join();
- delete thread;
- delete work;
- }
-
- vector<string> get_server_list()
- {
- vector<string> result;
-
- mutex.lock();
- result = vector<string>(servers.begin(), servers.end());
- mutex.unlock();
-
- return result;
- }
-
- private:
- void handle_receive_from(const boost::system::error_code &error, size_t size)
- {
- if (error) {
- cout << "Server discovery receive error: " << error.message() << "\n";
- return;
- }
-
- if (size > 0) {
- string msg = string(receive_buffer, size);
-
- /* handle incoming message */
- if (collect_servers) {
- if (msg == DISCOVER_REPLY_MSG) {
- string address = receive_endpoint.address().to_string();
-
- mutex.lock();
-
- /* add address if it's not already in the list */
- bool found = std::find(servers.begin(), servers.end(), address) != servers.end();
-
- if (!found)
- servers.push_back(address);
-
- mutex.unlock();
- }
- }
- else {
- /* reply to request */
- if (msg == DISCOVER_REQUEST_MSG)
- broadcast_message(DISCOVER_REPLY_MSG);
- }
- }
-
- async_receive();
- }
-
- void async_receive()
- {
- listen_socket.async_receive_from(boost::asio::buffer(receive_buffer),
- receive_endpoint,
- boost::bind(&ServerDiscovery::handle_receive_from,
- this,
- boost::asio::placeholders::error,
- boost::asio::placeholders::bytes_transferred));
- }
-
- void broadcast_message(const string &msg)
- {
- /* setup broadcast socket */
- boost::asio::ip::udp::socket socket(io_service);
-
- socket.open(boost::asio::ip::udp::v4());
-
- boost::asio::socket_base::broadcast option(true);
- socket.set_option(option);
-
- boost::asio::ip::udp::endpoint broadcast_endpoint(
- boost::asio::ip::address::from_string("255.255.255.255"), DISCOVER_PORT);
-
- /* broadcast message */
- socket.send_to(boost::asio::buffer(msg), broadcast_endpoint);
- }
-
- /* network service and socket */
- boost::asio::io_service io_service;
- boost::asio::ip::udp::endpoint listen_endpoint;
- boost::asio::ip::udp::socket listen_socket;
-
- /* threading */
- boost::thread *thread;
- boost::asio::io_service::work *work;
- boost::mutex mutex;
-
- /* buffer and endpoint for receiving messages */
- char receive_buffer[256];
- boost::asio::ip::udp::endpoint receive_endpoint;
-
-  // Fields: OS, Cycles version, device count, status, host name, group name, host address.
- struct ServerInfo {
- string cycles_version;
- string os;
- int device_count;
- string status;
- string host_name;
- string group_name;
- string host_addr;
- };
-
- /* collection of server addresses in list */
- bool collect_servers;
- vector<string> servers;
-};
-
-CCL_NAMESPACE_END
-
-#endif
-
-#endif /* __DEVICE_NETWORK_H__ */
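DeviceServer translates between client-side pointers and real device pointers with a forward map (ptr_map) and a reverse map (ptr_imap), so that both tile acquisition and tile release can look up the other side. A minimal sketch of that two-way bookkeeping, using hypothetical free functions in place of the member functions above:

#include <cassert>
#include <cstdint>
#include <map>

typedef std::uint64_t device_ptr;

static std::map<device_ptr, device_ptr> ptr_map;  /* client -> real */
static std::map<device_ptr, device_ptr> ptr_imap; /* real -> client */

static void mapping_insert(device_ptr client, device_ptr real)
{
  const bool fwd = ptr_map.insert(std::make_pair(client, real)).second;
  const bool rev = ptr_imap.insert(std::make_pair(real, client)).second;
  assert(fwd && rev); /* both insertions must be unique */
  (void)fwd;
  (void)rev;
}

static device_ptr mapping_erase(device_ptr client)
{
  std::map<device_ptr, device_ptr>::iterator it = ptr_map.find(client);
  assert(it != ptr_map.end());
  const device_ptr real = it->second;
  ptr_map.erase(it);
  ptr_imap.erase(real);
  return real;
}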
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
deleted file mode 100644
index 9abb7cfb7fe..00000000000
--- a/intern/cycles/device/device_opencl.cpp
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPENCL
-
-# include "device/opencl/device_opencl.h"
-# include "device/device.h"
-# include "device/device_intern.h"
-
-# include "util/util_foreach.h"
-# include "util/util_logging.h"
-# include "util/util_set.h"
-# include "util/util_string.h"
-
-CCL_NAMESPACE_BEGIN
-
-Device *device_opencl_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
-{
- return opencl_create_split_device(info, stats, profiler, background);
-}
-
-bool device_opencl_init()
-{
- static bool initialized = false;
- static bool result = false;
-
- if (initialized)
- return result;
-
- initialized = true;
-
- if (OpenCLInfo::device_type() != 0) {
- int clew_result = clewInit();
- if (clew_result == CLEW_SUCCESS) {
- VLOG(1) << "CLEW initialization succeeded.";
- result = true;
- }
- else {
- VLOG(1) << "CLEW initialization failed: "
- << ((clew_result == CLEW_ERROR_ATEXIT_FAILED) ? "Error setting up atexit() handler" :
- "Error opening the library");
- }
- }
- else {
- VLOG(1) << "Skip initializing CLEW, platform is force disabled.";
- result = false;
- }
-
- return result;
-}
-
-static cl_int device_opencl_get_num_platforms_safe(cl_uint *num_platforms)
-{
-# ifdef _WIN32
- __try {
- return clGetPlatformIDs(0, NULL, num_platforms);
- }
- __except (EXCEPTION_EXECUTE_HANDLER) {
- /* Ignore crashes inside the OpenCL driver and hope we can
- * survive even with corrupted OpenCL installs. */
- fprintf(stderr, "Cycles OpenCL: driver crashed, continuing without OpenCL.\n");
- }
-
- *num_platforms = 0;
- return CL_DEVICE_NOT_FOUND;
-# else
- return clGetPlatformIDs(0, NULL, num_platforms);
-# endif
-}
-
-void device_opencl_info(vector<DeviceInfo> &devices)
-{
- cl_uint num_platforms = 0;
- device_opencl_get_num_platforms_safe(&num_platforms);
- if (num_platforms == 0) {
- return;
- }
-
- vector<OpenCLPlatformDevice> usable_devices;
- OpenCLInfo::get_usable_devices(&usable_devices);
- /* Devices are numbered consecutively across platforms. */
- int num_devices = 0;
- set<string> unique_ids;
- foreach (OpenCLPlatformDevice &platform_device, usable_devices) {
- /* Compute unique ID for persistent user preferences. */
- const string &platform_name = platform_device.platform_name;
- const string &device_name = platform_device.device_name;
- string hardware_id = platform_device.hardware_id;
- if (hardware_id == "") {
- hardware_id = string_printf("ID_%d", num_devices);
- }
- string id = string("OPENCL_") + platform_name + "_" + device_name + "_" + hardware_id;
-
- /* Hardware ID might not be unique, add device number in that case. */
- if (unique_ids.find(id) != unique_ids.end()) {
- id += string_printf("_ID_%d", num_devices);
- }
- unique_ids.insert(id);
-
- /* Create DeviceInfo. */
- DeviceInfo info;
- info.type = DEVICE_OPENCL;
- info.description = string_remove_trademark(string(device_name));
- info.num = num_devices;
- /* We don't know if it's used for display, but assume it is. */
- info.display_device = true;
- info.use_split_kernel = true;
- info.has_volume_decoupled = false;
- info.has_adaptive_stop_per_sample = false;
- info.denoisers = DENOISER_NLM;
- info.id = id;
-
- /* Check OpenCL extensions */
- info.has_half_images = platform_device.device_extensions.find("cl_khr_fp16") != string::npos;
-
- /* Disabled for now due to apparent AMD driver bug. */
- info.has_nanovdb = platform_name != "AMD Accelerated Parallel Processing";
-
- devices.push_back(info);
- num_devices++;
- }
-}
-
-string device_opencl_capabilities()
-{
- if (OpenCLInfo::device_type() == 0) {
- return "All OpenCL devices are forced to be OFF";
- }
- string result = "";
- string error_msg = ""; /* Only used by opencl_assert(), but in the future
- * it could also be nicely reported to the console.
- */
- cl_uint num_platforms = 0;
- opencl_assert(device_opencl_get_num_platforms_safe(&num_platforms));
- if (num_platforms == 0) {
- return "No OpenCL platforms found\n";
- }
- result += string_printf("Number of platforms: %u\n", num_platforms);
-
- vector<cl_platform_id> platform_ids;
- platform_ids.resize(num_platforms);
- opencl_assert(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL));
-
-# define APPEND_INFO(func, id, name, what, type) \
- do { \
- type data; \
- memset(&data, 0, sizeof(data)); \
- opencl_assert(func(id, what, sizeof(data), &data, NULL)); \
- result += string_printf("%s: %s\n", name, to_string(data).c_str()); \
- } while (false)
-# define APPEND_STRING_INFO_IMPL(func, id, name, what, is_optional) \
- do { \
- string value; \
- size_t length = 0; \
- if (func(id, what, 0, NULL, &length) == CL_SUCCESS) { \
- vector<char> buffer(length + 1); \
- if (func(id, what, buffer.size(), buffer.data(), NULL) == CL_SUCCESS) { \
- value = string(buffer.data()); \
- } \
- } \
- if (is_optional && !(length != 0 && value[0] != '\0')) { \
- break; \
- } \
- result += string_printf("%s: %s\n", name, value.c_str()); \
- } while (false)
-# define APPEND_PLATFORM_STRING_INFO(id, name, what) \
- APPEND_STRING_INFO_IMPL(clGetPlatformInfo, id, "\tPlatform " name, what, false)
-# define APPEND_STRING_EXTENSION_INFO(func, id, name, what) \
- APPEND_STRING_INFO_IMPL(clGetPlatformInfo, id, "\tPlatform " name, what, true)
-# define APPEND_PLATFORM_INFO(id, name, what, type) \
- APPEND_INFO(clGetPlatformInfo, id, "\tPlatform " name, what, type)
-# define APPEND_DEVICE_INFO(id, name, what, type) \
- APPEND_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what, type)
-# define APPEND_DEVICE_STRING_INFO(id, name, what) \
- APPEND_STRING_INFO_IMPL(clGetDeviceInfo, id, "\t\t\tDevice " name, what, false)
-# define APPEND_DEVICE_STRING_EXTENSION_INFO(id, name, what) \
- APPEND_STRING_INFO_IMPL(clGetDeviceInfo, id, "\t\t\tDevice " name, what, true)
-
- vector<cl_device_id> device_ids;
- for (cl_uint platform = 0; platform < num_platforms; ++platform) {
- cl_platform_id platform_id = platform_ids[platform];
-
- result += string_printf("Platform #%u\n", platform);
-
- APPEND_PLATFORM_STRING_INFO(platform_id, "Name", CL_PLATFORM_NAME);
- APPEND_PLATFORM_STRING_INFO(platform_id, "Vendor", CL_PLATFORM_VENDOR);
- APPEND_PLATFORM_STRING_INFO(platform_id, "Version", CL_PLATFORM_VERSION);
- APPEND_PLATFORM_STRING_INFO(platform_id, "Profile", CL_PLATFORM_PROFILE);
- APPEND_PLATFORM_STRING_INFO(platform_id, "Extensions", CL_PLATFORM_EXTENSIONS);
-
- cl_uint num_devices = 0;
- opencl_assert(
- clGetDeviceIDs(platform_ids[platform], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices));
- result += string_printf("\tNumber of devices: %u\n", num_devices);
-
- device_ids.resize(num_devices);
- opencl_assert(clGetDeviceIDs(
- platform_ids[platform], CL_DEVICE_TYPE_ALL, num_devices, &device_ids[0], NULL));
- for (cl_uint device = 0; device < num_devices; ++device) {
- cl_device_id device_id = device_ids[device];
-
- result += string_printf("\t\tDevice: #%u\n", device);
-
- APPEND_DEVICE_STRING_INFO(device_id, "Name", CL_DEVICE_NAME);
- APPEND_DEVICE_STRING_EXTENSION_INFO(device_id, "Board Name", CL_DEVICE_BOARD_NAME_AMD);
- APPEND_DEVICE_STRING_INFO(device_id, "Vendor", CL_DEVICE_VENDOR);
- APPEND_DEVICE_STRING_INFO(device_id, "OpenCL C Version", CL_DEVICE_OPENCL_C_VERSION);
- APPEND_DEVICE_STRING_INFO(device_id, "Profile", CL_DEVICE_PROFILE);
- APPEND_DEVICE_STRING_INFO(device_id, "Version", CL_DEVICE_VERSION);
- APPEND_DEVICE_STRING_INFO(device_id, "Extensions", CL_DEVICE_EXTENSIONS);
- APPEND_DEVICE_INFO(
- device_id, "Max clock frequency (MHz)", CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint);
- APPEND_DEVICE_INFO(device_id, "Max compute units", CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint);
- APPEND_DEVICE_INFO(device_id, "Max work group size", CL_DEVICE_MAX_WORK_GROUP_SIZE, size_t);
- }
- }
-
-# undef APPEND_INFO
-# undef APPEND_STRING_INFO_IMPL
-# undef APPEND_PLATFORM_STRING_INFO
-# undef APPEND_STRING_EXTENSION_INFO
-# undef APPEND_PLATFORM_INFO
-# undef APPEND_DEVICE_INFO
-# undef APPEND_DEVICE_STRING_INFO
-# undef APPEND_DEVICE_STRING_EXTENSION_INFO
-
- return result;
-}
-
-CCL_NAMESPACE_END
-
-#endif /* WITH_OPENCL */
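device_opencl_info() above builds a persistent per-device ID for user preferences out of the platform, device, and hardware names, falling back to the device number when no hardware ID exists and appending it again to disambiguate duplicates. A compact sketch of that scheme (make_unique_id is a hypothetical name):

#include <set>
#include <string>

/* Combine platform + device + hardware names into a stable ID; `seen`
 * tracks IDs already handed out so duplicates get a numeric suffix. */
static std::string make_unique_id(std::set<std::string> &seen,
                                  const std::string &platform_name,
                                  const std::string &device_name,
                                  std::string hardware_id,
                                  int device_num)
{
  if (hardware_id.empty()) {
    hardware_id = "ID_" + std::to_string(device_num);
  }
  std::string id = "OPENCL_" + platform_name + "_" + device_name + "_" + hardware_id;
  if (seen.count(id)) {
    /* Hardware ID might not be unique, add the device number in that case. */
    id += "_ID_" + std::to_string(device_num);
  }
  seen.insert(id);
  return id;
}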
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
deleted file mode 100644
index 6f9a7943722..00000000000
--- a/intern/cycles/device/device_optix.cpp
+++ /dev/null
@@ -1,1936 +0,0 @@
-/*
- * Copyright 2019, NVIDIA Corporation.
- * Copyright 2019, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPTIX
-
-# include "bvh/bvh.h"
-# include "bvh/bvh_optix.h"
-# include "device/cuda/device_cuda.h"
-# include "device/device_denoising.h"
-# include "device/device_intern.h"
-# include "render/buffers.h"
-# include "render/hair.h"
-# include "render/mesh.h"
-# include "render/object.h"
-# include "render/scene.h"
-# include "util/util_debug.h"
-# include "util/util_logging.h"
-# include "util/util_md5.h"
-# include "util/util_path.h"
-# include "util/util_progress.h"
-# include "util/util_time.h"
-
-# ifdef WITH_CUDA_DYNLOAD
-# include <cuew.h>
-// Do not use CUDA SDK headers when using CUEW
-# define OPTIX_DONT_INCLUDE_CUDA
-# endif
-# include <optix_function_table_definition.h>
-# include <optix_stubs.h>
-
-// TODO(pmours): Disable this once drivers have native support
-# define OPTIX_DENOISER_NO_PIXEL_STRIDE 1
-
-CCL_NAMESPACE_BEGIN
-
-/* Make sure this stays in sync with kernel_globals.h */
-struct ShaderParams {
- uint4 *input;
- float4 *output;
- int type;
- int filter;
- int sx;
- int offset;
- int sample;
-};
-struct KernelParams {
- WorkTile tile;
- KernelData data;
- ShaderParams shader;
-# define KERNEL_TEX(type, name) const type *name;
-# include "kernel/kernel_textures.h"
-# undef KERNEL_TEX
-};
-
-# define check_result_cuda(stmt) \
- { \
- CUresult res = stmt; \
- if (res != CUDA_SUCCESS) { \
- const char *name; \
- cuGetErrorName(res, &name); \
- set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
- return; \
- } \
- } \
- (void)0
-# define check_result_cuda_ret(stmt) \
- { \
- CUresult res = stmt; \
- if (res != CUDA_SUCCESS) { \
- const char *name; \
- cuGetErrorName(res, &name); \
- set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
- return false; \
- } \
- } \
- (void)0
-
-# define check_result_optix(stmt) \
- { \
- enum OptixResult res = stmt; \
- if (res != OPTIX_SUCCESS) { \
- const char *name = optixGetErrorName(res); \
- set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
- return; \
- } \
- } \
- (void)0
-# define check_result_optix_ret(stmt) \
- { \
- enum OptixResult res = stmt; \
- if (res != OPTIX_SUCCESS) { \
- const char *name = optixGetErrorName(res); \
- set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
- return false; \
- } \
- } \
- (void)0
-
-# define launch_filter_kernel(func_name, w, h, args) \
- { \
- CUfunction func; \
- check_result_cuda_ret(cuModuleGetFunction(&func, cuFilterModule, func_name)); \
- check_result_cuda_ret(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); \
- int threads; \
- check_result_cuda_ret( \
- cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
- threads = (int)sqrt((float)threads); \
- int xblocks = ((w) + threads - 1) / threads; \
- int yblocks = ((h) + threads - 1) / threads; \
- check_result_cuda_ret( \
- cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0)); \
- } \
- (void)0
-
-class OptiXDevice : public CUDADevice {
-
- // List of OptiX program groups
- enum {
- PG_RGEN,
- PG_MISS,
- PG_HITD, // Default hit group
- PG_HITS, // __SHADOW_RECORD_ALL__ hit group
- PG_HITL, // __BVH_LOCAL__ hit group (only used for triangles)
-# if OPTIX_ABI_VERSION >= 36
- PG_HITD_MOTION,
- PG_HITS_MOTION,
-# endif
- PG_BAKE, // kernel_bake_evaluate
- PG_DISP, // kernel_displace_evaluate
- PG_BACK, // kernel_background_evaluate
- PG_CALL,
- NUM_PROGRAM_GROUPS = PG_CALL + 3
- };
-
- // List of OptiX pipelines
- enum { PIP_PATH_TRACE, PIP_SHADER_EVAL, NUM_PIPELINES };
-
- // A single shader binding table entry
- struct SbtRecord {
- char header[OPTIX_SBT_RECORD_HEADER_SIZE];
- };
-
- // Information stored about CUDA memory allocations
- struct CUDAMem {
- bool free_map_host = false;
- CUarray array = NULL;
- CUtexObject texobject = 0;
- bool use_mapped_host = false;
- };
-
- // Helper class to manage current CUDA context
- struct CUDAContextScope {
- CUDAContextScope(CUcontext ctx)
- {
- cuCtxPushCurrent(ctx);
- }
- ~CUDAContextScope()
- {
- cuCtxPopCurrent(NULL);
- }
- };
-
- // Use a pool with multiple threads to support launches with multiple CUDA streams
- TaskPool task_pool;
-
- vector<CUstream> cuda_stream;
- OptixDeviceContext context = NULL;
-
- OptixModule optix_module = NULL; // All necessary OptiX kernels are in one module
- OptixModule builtin_modules[2] = {};
- OptixPipeline pipelines[NUM_PIPELINES] = {};
-
- bool motion_blur = false;
- device_vector<SbtRecord> sbt_data;
- device_only_memory<KernelParams> launch_params;
- OptixTraversableHandle tlas_handle = 0;
-
- OptixDenoiser denoiser = NULL;
- device_only_memory<unsigned char> denoiser_state;
- int denoiser_input_passes = 0;
-
- vector<device_only_memory<char>> delayed_free_bvh_memory;
- thread_mutex delayed_free_bvh_mutex;
-
- public:
- OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
- : CUDADevice(info_, stats_, profiler_, background_),
- sbt_data(this, "__sbt", MEM_READ_ONLY),
- launch_params(this, "__params", false),
- denoiser_state(this, "__denoiser_state", true)
- {
- // Store number of CUDA streams in device info
- info.cpu_threads = DebugFlags().optix.cuda_streams;
-
- // Make the CUDA context current
- if (!cuContext) {
- return; // Do not initialize if CUDA context creation failed already
- }
- const CUDAContextScope scope(cuContext);
-
- // Create OptiX context for this device
- OptixDeviceContextOptions options = {};
-# ifdef WITH_CYCLES_LOGGING
- options.logCallbackLevel = 4; // Fatal = 1, Error = 2, Warning = 3, Print = 4
- options.logCallbackFunction =
- [](unsigned int level, const char *, const char *message, void *) {
- switch (level) {
- case 1:
- LOG_IF(FATAL, VLOG_IS_ON(1)) << message;
- break;
- case 2:
- LOG_IF(ERROR, VLOG_IS_ON(1)) << message;
- break;
- case 3:
- LOG_IF(WARNING, VLOG_IS_ON(1)) << message;
- break;
- case 4:
- LOG_IF(INFO, VLOG_IS_ON(1)) << message;
- break;
- }
- };
-# endif
- check_result_optix(optixDeviceContextCreate(cuContext, &options, &context));
-# ifdef WITH_CYCLES_LOGGING
- check_result_optix(optixDeviceContextSetLogCallback(
- context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel));
-# endif
-
- // Create launch streams
- cuda_stream.resize(info.cpu_threads);
- for (int i = 0; i < info.cpu_threads; ++i)
- check_result_cuda(cuStreamCreate(&cuda_stream[i], CU_STREAM_NON_BLOCKING));
-
- // Fix weird compiler bug that assigns wrong size
- launch_params.data_elements = sizeof(KernelParams);
- // Allocate launch parameter buffer memory on device
- launch_params.alloc_to_device(info.cpu_threads);
- }
- ~OptiXDevice()
- {
- // Stop processing any more tasks
- task_pool.cancel();
-
- // Make CUDA context current
- const CUDAContextScope scope(cuContext);
-
- free_bvh_memory_delayed();
-
- sbt_data.free();
- texture_info.free();
- launch_params.free();
- denoiser_state.free();
-
- // Unload modules
- if (optix_module != NULL)
- optixModuleDestroy(optix_module);
- for (unsigned int i = 0; i < 2; ++i)
- if (builtin_modules[i] != NULL)
- optixModuleDestroy(builtin_modules[i]);
- for (unsigned int i = 0; i < NUM_PIPELINES; ++i)
- if (pipelines[i] != NULL)
- optixPipelineDestroy(pipelines[i]);
-
- // Destroy launch streams
- for (CUstream stream : cuda_stream)
- cuStreamDestroy(stream);
-
- if (denoiser != NULL)
- optixDenoiserDestroy(denoiser);
-
- optixDeviceContextDestroy(context);
- }
-
- private:
- bool show_samples() const override
- {
- // Only show samples if not rendering multiple tiles in parallel
- return info.cpu_threads == 1;
- }
-
- BVHLayoutMask get_bvh_layout_mask() const override
- {
-    // CUDA kernels are used when baking, so we need to build a BVH that they can understand too.
- if (optix_module == NULL)
- return CUDADevice::get_bvh_layout_mask();
-
- // OptiX has its own internal acceleration structure format
- return BVH_LAYOUT_OPTIX;
- }
-
- string compile_kernel_get_common_cflags(const DeviceRequestedFeatures &requested_features,
- bool filter,
- bool /*split*/) override
- {
- // Split kernel is not supported in OptiX
- string common_cflags = CUDADevice::compile_kernel_get_common_cflags(
- requested_features, filter, false);
-
- // Add OptiX SDK include directory to include paths
- const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR");
- if (optix_sdk_path) {
- common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path);
- }
-
- // Specialization for shader raytracing
- if (requested_features.use_shader_raytrace) {
- common_cflags += " --keep-device-functions";
- }
- else {
- common_cflags += " -D __NO_SHADER_RAYTRACE__";
- }
-
- return common_cflags;
- }
-
- bool load_kernels(const DeviceRequestedFeatures &requested_features) override
- {
- if (have_error()) {
- // Abort early if context creation failed already
- return false;
- }
-
- // Load CUDA modules because we need some of the utility kernels
- if (!CUDADevice::load_kernels(requested_features)) {
- return false;
- }
-
- // Baking is currently performed using CUDA, so no need to load OptiX kernels
- if (requested_features.use_baking) {
- return true;
- }
-
- const CUDAContextScope scope(cuContext);
-
- // Unload existing OptiX module and pipelines first
- if (optix_module != NULL) {
- optixModuleDestroy(optix_module);
- optix_module = NULL;
- }
- for (unsigned int i = 0; i < 2; ++i) {
- if (builtin_modules[i] != NULL) {
- optixModuleDestroy(builtin_modules[i]);
- builtin_modules[i] = NULL;
- }
- }
- for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
- if (pipelines[i] != NULL) {
- optixPipelineDestroy(pipelines[i]);
- pipelines[i] = NULL;
- }
- }
-
- OptixModuleCompileOptions module_options = {};
- module_options.maxRegisterCount = 0; // Do not set an explicit register limit
- module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
- module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
-
-# if OPTIX_ABI_VERSION >= 41
- module_options.boundValues = nullptr;
- module_options.numBoundValues = 0;
-# endif
-
- OptixPipelineCompileOptions pipeline_options = {};
- // Default to no motion blur and two-level graph, since it is the fastest option
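- // (Single-level instancing restricts the scene graph to one IAS directly over
- // GASes, which is the fastest traversal configuration OptiX offers.)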
- pipeline_options.usesMotionBlur = false;
- pipeline_options.traversableGraphFlags =
- OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING;
- pipeline_options.numPayloadValues = 6;
- pipeline_options.numAttributeValues = 2; // u, v
- pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE;
- pipeline_options.pipelineLaunchParamsVariableName = "__params"; // See kernel_globals.h
-
-# if OPTIX_ABI_VERSION >= 36
- pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE;
- if (requested_features.use_hair) {
- if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
- pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE;
- }
- else {
- pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
- }
- }
-# endif
-
- // Keep track of whether motion blur is enabled, so motion can be enabled or disabled in BVH builds
- // This is necessary since objects may be reported to have motion if the Vector pass is
- // active, yet may still need to be rendered without motion blur if motion blur itself is disabled
- motion_blur = requested_features.use_object_motion;
-
- if (motion_blur) {
- pipeline_options.usesMotionBlur = true;
- // Motion blur can insert motion transforms into the traversal graph
- // The graph is then no longer two-level, so flags need to allow any configuration
- pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY;
- }
-
- { // Load and compile PTX module with OptiX kernels
- string ptx_data, ptx_filename = path_get(requested_features.use_shader_raytrace ?
- "lib/kernel_optix_shader_raytrace.ptx" :
- "lib/kernel_optix.ptx");
- if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
- if (!getenv("OPTIX_ROOT_DIR")) {
- set_error(
- "Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to "
- "the Optix SDK to be able to compile Optix kernels on demand).");
- return false;
- }
- ptx_filename = compile_kernel(requested_features, "kernel_optix", "optix", true);
- }
- if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) {
- set_error("Failed to load OptiX kernel from '" + ptx_filename + "'");
- return false;
- }
-
- check_result_optix_ret(optixModuleCreateFromPTX(context,
- &module_options,
- &pipeline_options,
- ptx_data.data(),
- ptx_data.size(),
- nullptr,
- 0,
- &optix_module));
- }
-
- // Create program groups
- OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
- OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
- OptixProgramGroupOptions group_options = {}; // There are no options currently
- group_descs[PG_RGEN].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
- group_descs[PG_RGEN].raygen.module = optix_module;
- // Ignore branched integrator for now (see "requested_features.use_integrator_branched")
- group_descs[PG_RGEN].raygen.entryFunctionName = "__raygen__kernel_optix_path_trace";
- group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
- group_descs[PG_MISS].miss.module = optix_module;
- group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss";
- group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
- group_descs[PG_HITD].hitgroup.moduleCH = optix_module;
- group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
- group_descs[PG_HITD].hitgroup.moduleAH = optix_module;
- group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test";
- group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
- group_descs[PG_HITS].hitgroup.moduleAH = optix_module;
- group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit";
-
- if (requested_features.use_hair) {
- group_descs[PG_HITD].hitgroup.moduleIS = optix_module;
- group_descs[PG_HITS].hitgroup.moduleIS = optix_module;
-
- // Add curve intersection programs
- if (requested_features.use_hair_thick) {
- // Use the slower all-curve intersection programs for thick hair, even though this also slows down ribbons.
- // Ideally this should not be needed.
- group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_all";
- group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_all";
- }
- else {
- group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
- group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
- }
-
-# if OPTIX_ABI_VERSION >= 36
- if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
- OptixBuiltinISOptions builtin_options = {};
- builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
- builtin_options.usesMotionBlur = false;
-
- check_result_optix_ret(optixBuiltinISModuleGet(
- context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0]));
-
- group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0];
- group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr;
- group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
- group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;
-
- if (motion_blur) {
- builtin_options.usesMotionBlur = true;
-
- check_result_optix_ret(optixBuiltinISModuleGet(
- context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1]));
-
- group_descs[PG_HITD_MOTION] = group_descs[PG_HITD];
- group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1];
- group_descs[PG_HITS_MOTION] = group_descs[PG_HITS];
- group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1];
- }
- }
-# endif
- }
-
- if (requested_features.use_subsurface || requested_features.use_shader_raytrace) {
- // Add hit group for local intersections
- group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
- group_descs[PG_HITL].hitgroup.moduleAH = optix_module;
- group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit";
- }
-
- if (requested_features.use_baking) {
- group_descs[PG_BAKE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
- group_descs[PG_BAKE].raygen.module = optix_module;
- group_descs[PG_BAKE].raygen.entryFunctionName = "__raygen__kernel_optix_bake";
- }
-
- if (requested_features.use_true_displacement) {
- group_descs[PG_DISP].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
- group_descs[PG_DISP].raygen.module = optix_module;
- group_descs[PG_DISP].raygen.entryFunctionName = "__raygen__kernel_optix_displace";
- }
-
- if (requested_features.use_background_light) {
- group_descs[PG_BACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
- group_descs[PG_BACK].raygen.module = optix_module;
- group_descs[PG_BACK].raygen.entryFunctionName = "__raygen__kernel_optix_background";
- }
-
- // Shader raytracing replaces some functions with direct callables
- if (requested_features.use_shader_raytrace) {
- group_descs[PG_CALL + 0].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
- group_descs[PG_CALL + 0].callables.moduleDC = optix_module;
- group_descs[PG_CALL + 0].callables.entryFunctionNameDC = "__direct_callable__svm_eval_nodes";
- group_descs[PG_CALL + 1].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
- group_descs[PG_CALL + 1].callables.moduleDC = optix_module;
- group_descs[PG_CALL + 1].callables.entryFunctionNameDC =
- "__direct_callable__kernel_volume_shadow";
- group_descs[PG_CALL + 2].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
- group_descs[PG_CALL + 2].callables.moduleDC = optix_module;
- group_descs[PG_CALL + 2].callables.entryFunctionNameDC =
- "__direct_callable__subsurface_scatter_multi_setup";
- }
-
- check_result_optix_ret(optixProgramGroupCreate(
- context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));
-
- // Get program stack sizes
- OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
- // Set up SBT, which in this case is used only to select between different programs
- sbt_data.alloc(NUM_PROGRAM_GROUPS);
- memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS);
- for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
- check_result_optix_ret(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
- check_result_optix_ret(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
- }
- sbt_data.copy_to_device(); // Upload SBT to device
-
- // Calculate maximum trace continuation stack size
- unsigned int trace_css = stack_size[PG_HITD].cssCH;
- // This is based on the maximum of closest-hit and any-hit/intersection programs
- trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
- trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
- trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
-# if OPTIX_ABI_VERSION >= 36
- trace_css = std::max(trace_css,
- stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
- trace_css = std::max(trace_css,
- stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH);
-# endif
-
- OptixPipelineLinkOptions link_options = {};
- link_options.maxTraceDepth = 1;
- link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
-# if OPTIX_ABI_VERSION < 24
- link_options.overrideUsesMotionBlur = motion_blur;
-# endif
-
- { // Create path tracing pipeline
- vector<OptixProgramGroup> pipeline_groups;
- pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
- pipeline_groups.push_back(groups[PG_RGEN]);
- pipeline_groups.push_back(groups[PG_MISS]);
- pipeline_groups.push_back(groups[PG_HITD]);
- pipeline_groups.push_back(groups[PG_HITS]);
- pipeline_groups.push_back(groups[PG_HITL]);
-# if OPTIX_ABI_VERSION >= 36
- if (motion_blur) {
- pipeline_groups.push_back(groups[PG_HITD_MOTION]);
- pipeline_groups.push_back(groups[PG_HITS_MOTION]);
- }
-# endif
- if (requested_features.use_shader_raytrace) {
- pipeline_groups.push_back(groups[PG_CALL + 0]);
- pipeline_groups.push_back(groups[PG_CALL + 1]);
- pipeline_groups.push_back(groups[PG_CALL + 2]);
- }
-
- check_result_optix_ret(optixPipelineCreate(context,
- &pipeline_options,
- &link_options,
- pipeline_groups.data(),
- pipeline_groups.size(),
- nullptr,
- 0,
- &pipelines[PIP_PATH_TRACE]));
-
- // Combine ray generation and trace continuation stack size
- const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css;
- // Max direct callable depth is one of the following, so combine accordingly
- // - __raygen__ -> svm_eval_nodes
- // - __raygen__ -> kernel_volume_shadow -> svm_eval_nodes
- // - __raygen__ -> subsurface_scatter_multi_setup -> svm_eval_nodes
- const unsigned int dss = stack_size[PG_CALL + 0].dssDC +
- std::max(stack_size[PG_CALL + 1].dssDC,
- stack_size[PG_CALL + 2].dssDC);
-
- // Set stack size depending on pipeline options
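- // The arguments are: direct callable stack size for calls made from
- // intersection/any-hit programs (none are made here), direct callable stack
- // size for calls made from ray-generation/miss/closest-hit programs, the
- // continuation stack size, and the maximum traversable graph depth
- // (3 when motion transforms may appear in the graph, 2 otherwise)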
- check_result_optix_ret(
- optixPipelineSetStackSize(pipelines[PIP_PATH_TRACE],
- 0,
- requested_features.use_shader_raytrace ? dss : 0,
- css,
- motion_blur ? 3 : 2));
- }
-
- // Only need to create shader evaluation pipeline if one of these features is used:
- const bool use_shader_eval_pipeline = requested_features.use_baking ||
- requested_features.use_background_light ||
- requested_features.use_true_displacement;
-
- if (use_shader_eval_pipeline) { // Create shader evaluation pipeline
- vector<OptixProgramGroup> pipeline_groups;
- pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
- pipeline_groups.push_back(groups[PG_BAKE]);
- pipeline_groups.push_back(groups[PG_DISP]);
- pipeline_groups.push_back(groups[PG_BACK]);
- pipeline_groups.push_back(groups[PG_MISS]);
- pipeline_groups.push_back(groups[PG_HITD]);
- pipeline_groups.push_back(groups[PG_HITS]);
- pipeline_groups.push_back(groups[PG_HITL]);
-# if OPTIX_ABI_VERSION >= 36
- if (motion_blur) {
- pipeline_groups.push_back(groups[PG_HITD_MOTION]);
- pipeline_groups.push_back(groups[PG_HITS_MOTION]);
- }
-# endif
- if (requested_features.use_shader_raytrace) {
- pipeline_groups.push_back(groups[PG_CALL + 0]);
- pipeline_groups.push_back(groups[PG_CALL + 1]);
- pipeline_groups.push_back(groups[PG_CALL + 2]);
- }
-
- check_result_optix_ret(optixPipelineCreate(context,
- &pipeline_options,
- &link_options,
- pipeline_groups.data(),
- pipeline_groups.size(),
- nullptr,
- 0,
- &pipelines[PIP_SHADER_EVAL]));
-
- // Calculate continuation stack size based on the maximum of all ray generation stack sizes
- const unsigned int css = std::max(stack_size[PG_BAKE].cssRG,
- std::max(stack_size[PG_DISP].cssRG,
- stack_size[PG_BACK].cssRG)) +
- link_options.maxTraceDepth * trace_css;
- const unsigned int dss = stack_size[PG_CALL + 0].dssDC +
- std::max(stack_size[PG_CALL + 1].dssDC,
- stack_size[PG_CALL + 2].dssDC);
-
- check_result_optix_ret(
- optixPipelineSetStackSize(pipelines[PIP_SHADER_EVAL],
- 0,
- requested_features.use_shader_raytrace ? dss : 0,
- css,
- motion_blur ? 3 : 2));
- }
-
- // Clean up program group objects
- for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
- optixProgramGroupDestroy(groups[i]);
- }
-
- return true;
- }
-
- void thread_run(DeviceTask &task, int thread_index) // Main task entry point
- {
- if (have_error())
- return; // Abort early if there was an error previously
-
- if (task.type == DeviceTask::RENDER) {
- if (thread_index != 0) {
- // Only execute denoising in a single thread (see also 'task_add')
- task.tile_types &= ~RenderTile::DENOISE;
- }
-
- RenderTile tile;
- while (task.acquire_tile(this, tile, task.tile_types)) {
- if (tile.task == RenderTile::PATH_TRACE)
- launch_render(task, tile, thread_index);
- else if (tile.task == RenderTile::BAKE) {
- // Perform baking using CUDA, since it is not currently implemented in OptiX
- device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
- CUDADevice::render(task, tile, work_tiles);
- }
- else if (tile.task == RenderTile::DENOISE)
- launch_denoise(task, tile);
- task.release_tile(tile);
- if (task.get_cancel() && !task.need_finish_queue)
- break; // User requested cancellation
- else if (have_error())
- break; // Abort rendering when encountering an error
- }
- }
- else if (task.type == DeviceTask::SHADER) {
- // CUDA kernels are used when doing baking
- if (optix_module == NULL)
- CUDADevice::shader(task);
- else
- launch_shader_eval(task, thread_index);
- }
- else if (task.type == DeviceTask::DENOISE_BUFFER) {
- // Set up a single tile that covers the whole task and denoise it
- RenderTile tile;
- tile.x = task.x;
- tile.y = task.y;
- tile.w = task.w;
- tile.h = task.h;
- tile.buffer = task.buffer;
- tile.num_samples = task.num_samples;
- tile.start_sample = task.sample;
- tile.offset = task.offset;
- tile.stride = task.stride;
- tile.buffers = task.buffers;
-
- launch_denoise(task, tile);
- }
- }
-
- void launch_render(DeviceTask &task, RenderTile &rtile, int thread_index)
- {
- assert(thread_index < launch_params.data_size);
-
- // Keep track of total render time of this tile
- const scoped_timer timer(&rtile.buffers->render_time);
-
- WorkTile wtile;
- wtile.x = rtile.x;
- wtile.y = rtile.y;
- wtile.w = rtile.w;
- wtile.h = rtile.h;
- wtile.offset = rtile.offset;
- wtile.stride = rtile.stride;
- wtile.buffer = (float *)rtile.buffer;
-
- const int end_sample = rtile.start_sample + rtile.num_samples;
- // Keep this number reasonable to avoid running into TDRs
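- // (TDR refers to the OS GPU watchdog, e.g. Timeout Detection and Recovery on
- // Windows, which resets the device when a kernel runs for too long.)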
- int step_samples = (info.display_device ? 8 : 32);
-
- // Offset into launch params buffer so that streams use separate data
- device_ptr launch_params_ptr = launch_params.device_pointer +
- thread_index * launch_params.data_elements;
-
- const CUDAContextScope scope(cuContext);
-
- for (int sample = rtile.start_sample; sample < end_sample;) {
- // Copy work tile information to device
- wtile.start_sample = sample;
- wtile.num_samples = step_samples;
- if (task.adaptive_sampling.use) {
- wtile.num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
- }
- wtile.num_samples = min(wtile.num_samples, end_sample - sample);
- device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
- check_result_cuda(
- cuMemcpyHtoDAsync(d_wtile_ptr, &wtile, sizeof(wtile), cuda_stream[thread_index]));
-
- OptixShaderBindingTable sbt_params = {};
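- // The SBT is a flat array with one record per program group, so a record's
- // device address is the base pointer plus the group index times the record size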
- sbt_params.raygenRecord = sbt_data.device_pointer + PG_RGEN * sizeof(SbtRecord);
- sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord);
- sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
- sbt_params.missRecordCount = 1;
- sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
- sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
-# if OPTIX_ABI_VERSION >= 36
- sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
-# else
- sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL
-# endif
- sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord);
- sbt_params.callablesRecordCount = 3;
- sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
-
- // Launch the ray generation program
- check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE],
- cuda_stream[thread_index],
- launch_params_ptr,
- launch_params.data_elements,
- &sbt_params,
- // Launch with samples close to each other for better locality
- wtile.w * wtile.num_samples,
- wtile.h,
- 1));
-
- // Run the adaptive sampling kernels at selected samples aligned to step samples.
- uint filter_sample = wtile.start_sample + wtile.num_samples - 1;
- if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
- adaptive_sampling_filter(filter_sample, &wtile, d_wtile_ptr, cuda_stream[thread_index]);
- }
-
- // Wait for launch to finish
- check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
-
- // Update current sample, so it is displayed correctly
- sample += wtile.num_samples;
- rtile.sample = sample;
- // Update task progress after the kernel completed rendering
- task.update_progress(&rtile, wtile.w * wtile.h * wtile.num_samples);
-
- if (task.get_cancel() && !task.need_finish_queue)
- return; // Cancel rendering
- }
-
- // Finalize adaptive sampling
- if (task.adaptive_sampling.use) {
- device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
- adaptive_sampling_post(rtile, &wtile, d_wtile_ptr, cuda_stream[thread_index]);
- check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
- task.update_progress(&rtile, rtile.w * rtile.h * wtile.num_samples);
- }
- }
-
- bool launch_denoise(DeviceTask &task, RenderTile &rtile)
- {
- // Update current sample (for display and NLM denoising task)
- rtile.sample = rtile.start_sample + rtile.num_samples;
-
- // Make CUDA context current now, since it is used for both denoising tasks
- const CUDAContextScope scope(cuContext);
-
- // Choose between OptiX and NLM denoising
- if (task.denoising.type == DENOISER_OPTIX) {
- // Map neighboring tiles onto this device, with the following indices,
- // where index 4 is the center tile and index 9 is the target for the result:
- // 0 1 2
- // 3 4 5
- // 6 7 8 9
- RenderTileNeighbors neighbors(rtile);
- task.map_neighbor_tiles(neighbors, this);
- RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
- RenderTile &target_tile = neighbors.target;
- rtile = center_tile; // Tile may have been modified by mapping code
-
- // Calculate size of the tile to denoise (including overlap)
- int4 rect = center_tile.bounds();
- // Overlap between tiles has to be at least 64 pixels
- // TODO(pmours): Query this value from OptiX
- rect = rect_expand(rect, 64);
- int4 clip_rect = neighbors.bounds();
- rect = rect_clip(rect, clip_rect);
- int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y);
- int2 overlap_offset = make_int2(rtile.x - rect.x, rtile.y - rect.y);
-
- // Calculate byte offsets and strides
- int pixel_stride = task.pass_stride * (int)sizeof(float);
- int pixel_offset = (rtile.offset + rtile.x + rtile.y * rtile.stride) * pixel_stride;
- const int pass_offset[3] = {
- (task.pass_denoising_data + DENOISING_PASS_COLOR) * (int)sizeof(float),
- (task.pass_denoising_data + DENOISING_PASS_ALBEDO) * (int)sizeof(float),
- (task.pass_denoising_data + DENOISING_PASS_NORMAL) * (int)sizeof(float)};
-
- // Start with the current tile pointer offset
- int input_stride = pixel_stride;
- device_ptr input_ptr = rtile.buffer + pixel_offset;
-
- // Copy tile data into a common buffer if necessary
- device_only_memory<float> input(this, "denoiser input", true);
- device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_ONLY);
-
- bool contiguous_memory = true;
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- if (neighbors.tiles[i].buffer && neighbors.tiles[i].buffer != rtile.buffer) {
- contiguous_memory = false;
- }
- }
-
- if (contiguous_memory) {
- // Tiles are in contiguous memory, so the overlap offset can simply be subtracted
- input_ptr -= (overlap_offset.x + overlap_offset.y * rtile.stride) * pixel_stride;
- // Stride covers the whole width of the image and not just a single tile
- input_stride *= rtile.stride;
- }
- else {
- // Adjacent tiles are in separate memory regions, so they need to be copied into a single one
- input.alloc_to_device(rect_size.x * rect_size.y * task.pass_stride);
- // Start with the new input buffer
- input_ptr = input.device_pointer;
- // Stride covers the width of the new input buffer, which includes tile width and overlap
- input_stride *= rect_size.x;
-
- TileInfo *tile_info = tile_info_mem.alloc(1);
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- tile_info->offsets[i] = neighbors.tiles[i].offset;
- tile_info->strides[i] = neighbors.tiles[i].stride;
- tile_info->buffers[i] = neighbors.tiles[i].buffer;
- }
- tile_info->x[0] = neighbors.tiles[3].x;
- tile_info->x[1] = neighbors.tiles[4].x;
- tile_info->x[2] = neighbors.tiles[5].x;
- tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w;
- tile_info->y[0] = neighbors.tiles[1].y;
- tile_info->y[1] = neighbors.tiles[4].y;
- tile_info->y[2] = neighbors.tiles[7].y;
- tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h;
- tile_info_mem.copy_to_device();
-
- void *args[] = {
- &input.device_pointer, &tile_info_mem.device_pointer, &rect.x, &task.pass_stride};
- launch_filter_kernel("kernel_cuda_filter_copy_input", rect_size.x, rect_size.y, args);
- }
-
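- // When OPTIX_DENOISER_NO_PIXEL_STRIDE is defined, the interleaved render passes
- // are first repacked into a tightly packed RGB buffer, presumably because some
- // denoiser versions do not handle arbitrary pixel strides correctly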
-# if OPTIX_DENOISER_NO_PIXEL_STRIDE
- device_only_memory<float> input_rgb(this, "denoiser input rgb", true);
- input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.input_passes);
-
- void *input_args[] = {&input_rgb.device_pointer,
- &input_ptr,
- &rect_size.x,
- &rect_size.y,
- &input_stride,
- &task.pass_stride,
- const_cast<int *>(pass_offset),
- &task.denoising.input_passes,
- &rtile.sample};
- launch_filter_kernel(
- "kernel_cuda_filter_convert_to_rgb", rect_size.x, rect_size.y, input_args);
-
- input_ptr = input_rgb.device_pointer;
- pixel_stride = 3 * sizeof(float);
- input_stride = rect_size.x * pixel_stride;
-# endif
-
- const bool recreate_denoiser = (denoiser == NULL) ||
- (task.denoising.input_passes != denoiser_input_passes);
- if (recreate_denoiser) {
- // Destroy existing handle before creating new one
- if (denoiser != NULL) {
- optixDenoiserDestroy(denoiser);
- }
-
- // Create OptiX denoiser handle on demand when it is first used
- OptixDenoiserOptions denoiser_options = {};
- assert(task.denoising.input_passes >= 1 && task.denoising.input_passes <= 3);
-# if OPTIX_ABI_VERSION >= 47
- denoiser_options.guideAlbedo = task.denoising.input_passes >= 2;
- denoiser_options.guideNormal = task.denoising.input_passes >= 3;
- check_result_optix_ret(optixDenoiserCreate(
- context, OPTIX_DENOISER_MODEL_KIND_HDR, &denoiser_options, &denoiser));
-# else
- denoiser_options.inputKind = static_cast<OptixDenoiserInputKind>(
- OPTIX_DENOISER_INPUT_RGB + (task.denoising.input_passes - 1));
-# if OPTIX_ABI_VERSION < 28
- denoiser_options.pixelFormat = OPTIX_PIXEL_FORMAT_FLOAT3;
-# endif
- check_result_optix_ret(optixDenoiserCreate(context, &denoiser_options, &denoiser));
- check_result_optix_ret(
- optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, NULL, 0));
-# endif
-
- // OptiX denoiser handle was created with the requested number of input passes
- denoiser_input_passes = task.denoising.input_passes;
- }
-
- OptixDenoiserSizes sizes = {};
- check_result_optix_ret(
- optixDenoiserComputeMemoryResources(denoiser, rect_size.x, rect_size.y, &sizes));
-
-# if OPTIX_ABI_VERSION < 28
- const size_t scratch_size = sizes.recommendedScratchSizeInBytes;
-# else
- const size_t scratch_size = sizes.withOverlapScratchSizeInBytes;
-# endif
- const size_t scratch_offset = sizes.stateSizeInBytes;
-
- // Allocate denoiser state if tile size has changed since last setup
- if (recreate_denoiser || (denoiser_state.data_width != rect_size.x ||
- denoiser_state.data_height != rect_size.y)) {
- denoiser_state.alloc_to_device(scratch_offset + scratch_size);
-
- // Initialize denoiser state for the current tile size
- check_result_optix_ret(optixDenoiserSetup(denoiser,
- 0,
- rect_size.x,
- rect_size.y,
- denoiser_state.device_pointer,
- scratch_offset,
- denoiser_state.device_pointer + scratch_offset,
- scratch_size));
-
- denoiser_state.data_width = rect_size.x;
- denoiser_state.data_height = rect_size.y;
- }
-
- // Set up input and output layer information
- OptixImage2D input_layers[3] = {};
- OptixImage2D output_layers[1] = {};
-
- for (int i = 0; i < 3; ++i) {
-# if OPTIX_DENOISER_NO_PIXEL_STRIDE
- input_layers[i].data = input_ptr + (rect_size.x * rect_size.y * pixel_stride * i);
-# else
- input_layers[i].data = input_ptr + pass_offset[i];
-# endif
- input_layers[i].width = rect_size.x;
- input_layers[i].height = rect_size.y;
- input_layers[i].rowStrideInBytes = input_stride;
- input_layers[i].pixelStrideInBytes = pixel_stride;
- input_layers[i].format = OPTIX_PIXEL_FORMAT_FLOAT3;
- }
-
-# if OPTIX_DENOISER_NO_PIXEL_STRIDE
- output_layers[0].data = input_ptr;
- output_layers[0].width = rect_size.x;
- output_layers[0].height = rect_size.y;
- output_layers[0].rowStrideInBytes = input_stride;
- output_layers[0].pixelStrideInBytes = pixel_stride;
- int2 output_offset = overlap_offset;
- overlap_offset = make_int2(0, 0); // Not supported by denoiser API, so apply manually
-# else
- output_layers[0].data = target_tile.buffer + pixel_offset;
- output_layers[0].width = target_tile.w;
- output_layers[0].height = target_tile.h;
- output_layers[0].rowStrideInBytes = target_tile.stride * pixel_stride;
- output_layers[0].pixelStrideInBytes = pixel_stride;
-# endif
- output_layers[0].format = OPTIX_PIXEL_FORMAT_FLOAT3;
-
-# if OPTIX_ABI_VERSION >= 47
- OptixDenoiserLayer image_layers = {};
- image_layers.input = input_layers[0];
- image_layers.output = output_layers[0];
-
- OptixDenoiserGuideLayer guide_layers = {};
- guide_layers.albedo = input_layers[1];
- guide_layers.normal = input_layers[2];
-# endif
-
- // Finally, run denoising
- OptixDenoiserParams params = {}; // All parameters are disabled/zero
-# if OPTIX_ABI_VERSION >= 47
- check_result_optix_ret(optixDenoiserInvoke(denoiser,
- NULL,
- &params,
- denoiser_state.device_pointer,
- scratch_offset,
- &guide_layers,
- &image_layers,
- 1,
- overlap_offset.x,
- overlap_offset.y,
- denoiser_state.device_pointer + scratch_offset,
- scratch_size));
-# else
- check_result_optix_ret(optixDenoiserInvoke(denoiser,
- NULL,
- &params,
- denoiser_state.device_pointer,
- scratch_offset,
- input_layers,
- task.denoising.input_passes,
- overlap_offset.x,
- overlap_offset.y,
- output_layers,
- denoiser_state.device_pointer + scratch_offset,
- scratch_size));
-# endif
-
-# if OPTIX_DENOISER_NO_PIXEL_STRIDE
- void *output_args[] = {&input_ptr,
- &target_tile.buffer,
- &output_offset.x,
- &output_offset.y,
- &rect_size.x,
- &rect_size.y,
- &target_tile.x,
- &target_tile.y,
- &target_tile.w,
- &target_tile.h,
- &target_tile.offset,
- &target_tile.stride,
- &task.pass_stride,
- &rtile.sample};
- launch_filter_kernel(
- "kernel_cuda_filter_convert_from_rgb", target_tile.w, target_tile.h, output_args);
-# endif
-
- check_result_cuda_ret(cuStreamSynchronize(0));
-
- task.unmap_neighbor_tiles(neighbors, this);
- }
- else {
- // Run CUDA denoising kernels
- DenoisingTask denoising(this, task);
- CUDADevice::denoise(rtile, denoising);
- }
-
- // Update task progress after the denoiser completed processing
- task.update_progress(&rtile, rtile.w * rtile.h);
-
- return true;
- }
-
- void launch_shader_eval(DeviceTask &task, int thread_index)
- {
- unsigned int rgen_index = PG_BACK;
- if (task.shader_eval_type >= SHADER_EVAL_BAKE)
- rgen_index = PG_BAKE;
- if (task.shader_eval_type == SHADER_EVAL_DISPLACE)
- rgen_index = PG_DISP;
-
- const CUDAContextScope scope(cuContext);
-
- device_ptr launch_params_ptr = launch_params.device_pointer +
- thread_index * launch_params.data_elements;
-
- for (int sample = 0; sample < task.num_samples; ++sample) {
- ShaderParams params;
- params.input = (uint4 *)task.shader_input;
- params.output = (float4 *)task.shader_output;
- params.type = task.shader_eval_type;
- params.filter = task.shader_filter;
- params.sx = task.shader_x;
- params.offset = task.offset;
- params.sample = sample;
-
- check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, shader),
- &params,
- sizeof(params),
- cuda_stream[thread_index]));
-
- OptixShaderBindingTable sbt_params = {};
- sbt_params.raygenRecord = sbt_data.device_pointer + rgen_index * sizeof(SbtRecord);
- sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord);
- sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
- sbt_params.missRecordCount = 1;
- sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
- sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
-# if OPTIX_ABI_VERSION >= 36
- sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
-# else
- sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL
-# endif
- sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord);
- sbt_params.callablesRecordCount = 3;
- sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
-
- check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL],
- cuda_stream[thread_index],
- launch_params_ptr,
- launch_params.data_elements,
- &sbt_params,
- task.shader_w,
- 1,
- 1));
-
- check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
-
- task.update_progress(NULL);
- }
- }
-
- bool build_optix_bvh(BVHOptiX *bvh,
- OptixBuildOperation operation,
- const OptixBuildInput &build_input,
- uint16_t num_motion_steps)
- {
- /* Allocate and build acceleration structures only one at a time, to prevent parallel builds
- * from running out of memory (since both original and compacted acceleration structure memory
- * may be allocated at the same time for the duration of this function). The builds would
- * otherwise happen on the same CUDA stream anyway. */
- static thread_mutex mutex;
- thread_scoped_lock lock(mutex);
-
- const CUDAContextScope scope(cuContext);
-
- const bool use_fast_trace_bvh = (bvh->params.bvh_type == SceneParams::BVH_STATIC);
-
- // Compute memory usage
- OptixAccelBufferSizes sizes = {};
- OptixAccelBuildOptions options = {};
- options.operation = operation;
- if (use_fast_trace_bvh) {
- VLOG(2) << "Using fast to trace OptiX BVH";
- options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
- }
- else {
- VLOG(2) << "Using fast to update OptiX BVH";
- options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE;
- }
-
- options.motionOptions.numKeys = num_motion_steps;
- options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
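- // With the VANISH flags, primitives simply disappear outside the [0, 1] shutter
- // interval instead of being clamped to the first or last motion key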
- options.motionOptions.timeBegin = 0.0f;
- options.motionOptions.timeEnd = 1.0f;
-
- check_result_optix_ret(
- optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
-
- // Allocate required output buffers
- device_only_memory<char> temp_mem(this, "optix temp as build mem", true);
- temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
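- // (The extra 8 bytes past the aligned temp size hold the emitted compacted-size
- // property written by the build, see below.)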
- if (!temp_mem.device_pointer)
- return false; // Make sure temporary memory allocation succeeded
-
- // Acceleration structure memory has to be allocated on the device (not allowed to be on host)
- device_only_memory<char> &out_data = bvh->as_data;
- if (operation == OPTIX_BUILD_OPERATION_BUILD) {
- assert(out_data.device == this);
- out_data.alloc_to_device(sizes.outputSizeInBytes);
- if (!out_data.device_pointer)
- return false;
- }
- else {
- assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes);
- }
-
- // Finally build the acceleration structure
- OptixAccelEmitDesc compacted_size_prop = {};
- compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
- // A tiny space was allocated for this property at the end of the temporary buffer above
- // Make sure this pointer is 8-byte aligned
- compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);
-
- OptixTraversableHandle out_handle = 0;
- check_result_optix_ret(optixAccelBuild(context,
- NULL,
- &options,
- &build_input,
- 1,
- temp_mem.device_pointer,
- sizes.tempSizeInBytes,
- out_data.device_pointer,
- sizes.outputSizeInBytes,
- &out_handle,
- use_fast_trace_bvh ? &compacted_size_prop : NULL,
- use_fast_trace_bvh ? 1 : 0));
- bvh->traversable_handle = static_cast<uint64_t>(out_handle);
-
- // Wait for all operations to finish
- check_result_cuda_ret(cuStreamSynchronize(NULL));
-
- // Compact the acceleration structure to save memory (only when using a fast-trace
- // BVH, since the OPTIX_BUILD_FLAG_ALLOW_COMPACTION flag is only set in that case).
- if (use_fast_trace_bvh) {
- uint64_t compacted_size = sizes.outputSizeInBytes;
- check_result_cuda_ret(
- cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size)));
-
- // Temporary memory is no longer needed, so free it now to make space
- temp_mem.free();
-
- // There is no point in compacting if the size does not shrink
- if (compacted_size < sizes.outputSizeInBytes) {
- device_only_memory<char> compacted_data(this, "optix compacted as", false);
- compacted_data.alloc_to_device(compacted_size);
- if (!compacted_data.device_pointer)
- // Do not compact if memory allocation for compacted acceleration structure fails
- // Can just use the uncompacted one then, so succeed here regardless
- return true;
-
- check_result_optix_ret(optixAccelCompact(context,
- NULL,
- out_handle,
- compacted_data.device_pointer,
- compacted_size,
- &out_handle));
- bvh->traversable_handle = static_cast<uint64_t>(out_handle);
-
- // Wait for compaction to finish
- check_result_cuda_ret(cuStreamSynchronize(NULL));
-
- std::swap(out_data.device_size, compacted_data.device_size);
- std::swap(out_data.device_pointer, compacted_data.device_pointer);
- // Original acceleration structure memory is freed when 'compacted_data' goes out of scope
- }
- }
-
- return true;
- }
-
- void build_bvh(BVH *bvh, Progress &progress, bool refit) override
- {
- if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2) {
- /* CUDA is used for baking, so build a BVH appropriate for that. */
- Device::build_bvh(bvh, progress, refit);
- return;
- }
-
- const bool use_fast_trace_bvh = (bvh->params.bvh_type == SceneParams::BVH_STATIC);
-
- free_bvh_memory_delayed();
-
- BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
-
- progress.set_substatus("Building OptiX acceleration structure");
-
- if (!bvh->params.top_level) {
- assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1);
-
- OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD;
- /* Refit is only possible when not using a fast-to-trace BVH (because the AS is
- * built with OPTIX_BUILD_FLAG_ALLOW_UPDATE only in that case, see above). */
- if (refit && !use_fast_trace_bvh) {
- assert(bvh_optix->traversable_handle != 0);
- operation = OPTIX_BUILD_OPERATION_UPDATE;
- }
- else {
- bvh_optix->as_data.free();
- bvh_optix->traversable_handle = 0;
- }
-
- // Build bottom level acceleration structures (BLAS)
- Geometry *const geom = bvh->geometry[0];
- if (geom->geometry_type == Geometry::HAIR) {
- // Build BLAS for curve primitives
- Hair *const hair = static_cast<Hair *const>(geom);
- if (hair->num_curves() == 0) {
- return;
- }
-
- const size_t num_segments = hair->num_segments();
-
- size_t num_motion_steps = 1;
- Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
- if (motion_blur && hair->get_use_motion_blur() && motion_keys) {
- num_motion_steps = hair->get_motion_steps();
- }
-
- device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY);
-# if OPTIX_ABI_VERSION >= 36
- device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
- device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
- // Four control points for each curve segment
- const size_t num_vertices = num_segments * 4;
- if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
- index_data.alloc(num_segments);
- vertex_data.alloc(num_vertices * num_motion_steps);
- }
- else
-# endif
- aabb_data.alloc(num_segments * num_motion_steps);
-
- // Get AABBs for each motion step
- for (size_t step = 0; step < num_motion_steps; ++step) {
- // The center step for motion vertices is not stored in the attribute
- const float3 *keys = hair->get_curve_keys().data();
- size_t center_step = (num_motion_steps - 1) / 2;
- if (step != center_step) {
- size_t attr_offset = (step > center_step) ? step - 1 : step;
- // Technically this is a float4 array, but sizeof(float3) == sizeof(float4)
- keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size();
- }
-
- for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) {
- const Hair::Curve curve = hair->get_curve(j);
-# if OPTIX_ABI_VERSION >= 36
- const array<float> &curve_radius = hair->get_curve_radius();
-# endif
-
- for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) {
-# if OPTIX_ABI_VERSION >= 36
- if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
- int k0 = curve.first_key + segment;
- int k1 = k0 + 1;
- int ka = max(k0 - 1, curve.first_key);
- int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1);
-
- const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x);
- const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y);
- const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z);
- const float4 pw = make_float4(
- curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]);
-
- // Convert Catmull-Rom data to a cubic B-spline
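- // Each cr2bsp row maps the four Catmull-Rom control points to the B-spline
- // control points that reproduce the same cubic segment (effectively the inverse
- // B-spline basis applied to the Catmull-Rom basis, hence the common 1/6 factor)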
- static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f;
- static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f;
- static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f;
- static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f;
-
- index_data[i] = i * 4;
- float4 *const v = vertex_data.data() + step * num_vertices + index_data[i];
- v[0] = make_float4(
- dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw));
- v[1] = make_float4(
- dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw));
- v[2] = make_float4(
- dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw));
- v[3] = make_float4(
- dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw));
- }
- else
-# endif
- {
- BoundBox bounds = BoundBox::empty;
- curve.bounds_grow(segment, keys, hair->get_curve_radius().data(), bounds);
-
- const size_t index = step * num_segments + i;
- aabb_data[index].minX = bounds.min.x;
- aabb_data[index].minY = bounds.min.y;
- aabb_data[index].minZ = bounds.min.z;
- aabb_data[index].maxX = bounds.max.x;
- aabb_data[index].maxY = bounds.max.y;
- aabb_data[index].maxZ = bounds.max.z;
- }
- }
- }
- }
-
- // Upload AABB data to GPU
- aabb_data.copy_to_device();
-# if OPTIX_ABI_VERSION >= 36
- index_data.copy_to_device();
- vertex_data.copy_to_device();
-# endif
-
- vector<device_ptr> aabb_ptrs;
- aabb_ptrs.reserve(num_motion_steps);
-# if OPTIX_ABI_VERSION >= 36
- vector<device_ptr> width_ptrs;
- vector<device_ptr> vertex_ptrs;
- width_ptrs.reserve(num_motion_steps);
- vertex_ptrs.reserve(num_motion_steps);
-# endif
- for (size_t step = 0; step < num_motion_steps; ++step) {
- aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb));
-# if OPTIX_ABI_VERSION >= 36
- const device_ptr base_ptr = vertex_data.device_pointer +
- step * num_vertices * sizeof(float4);
- width_ptrs.push_back(base_ptr + 3 * sizeof(float)); // Offset by vertex size
- vertex_ptrs.push_back(base_ptr);
-# endif
- }
-
- // Force a single any-hit call, so shadow record-all behavior works correctly
- unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
- OptixBuildInput build_input = {};
-# if OPTIX_ABI_VERSION >= 36
- if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
- build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES;
- build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
- build_input.curveArray.numPrimitives = num_segments;
- build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
- build_input.curveArray.numVertices = num_vertices;
- build_input.curveArray.vertexStrideInBytes = sizeof(float4);
- build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data();
- build_input.curveArray.widthStrideInBytes = sizeof(float4);
- build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer;
- build_input.curveArray.indexStrideInBytes = sizeof(int);
- build_input.curveArray.flag = build_flags;
- build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset;
- }
- else
-# endif
- {
- // Disable the visibility-test any-hit program, since visibility is already checked
- // during intersection. Trace calls that require any-hit can force it with a ray flag.
- build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
-
- build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
-# if OPTIX_ABI_VERSION < 23
- build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
- build_input.aabbArray.numPrimitives = num_segments;
- build_input.aabbArray.strideInBytes = sizeof(OptixAabb);
- build_input.aabbArray.flags = &build_flags;
- build_input.aabbArray.numSbtRecords = 1;
- build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset;
-# else
- build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
- build_input.customPrimitiveArray.numPrimitives = num_segments;
- build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
- build_input.customPrimitiveArray.flags = &build_flags;
- build_input.customPrimitiveArray.numSbtRecords = 1;
- build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset;
-# endif
- }
-
- if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
- progress.set_error("Failed to build OptiX acceleration structure");
- }
- }
- else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) {
- // Build BLAS for triangle primitives
- Mesh *const mesh = static_cast<Mesh *const>(geom);
- if (mesh->num_triangles() == 0) {
- return;
- }
-
- const size_t num_verts = mesh->get_verts().size();
-
- size_t num_motion_steps = 1;
- Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
- if (motion_blur && mesh->get_use_motion_blur() && motion_keys) {
- num_motion_steps = mesh->get_motion_steps();
- }
-
- device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
- index_data.alloc(mesh->get_triangles().size());
- memcpy(index_data.data(),
- mesh->get_triangles().data(),
- mesh->get_triangles().size() * sizeof(int));
- device_vector<float3> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
- vertex_data.alloc(num_verts * num_motion_steps);
-
- for (size_t step = 0; step < num_motion_steps; ++step) {
- const float3 *verts = mesh->get_verts().data();
-
- size_t center_step = (num_motion_steps - 1) / 2;
- // The center step for motion vertices is not stored in the attribute
- if (step != center_step) {
- verts = motion_keys->data_float3() +
- (step > center_step ? step - 1 : step) * num_verts;
- }
-
- memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3));
- }
-
- // Upload triangle data to GPU
- index_data.copy_to_device();
- vertex_data.copy_to_device();
-
- vector<device_ptr> vertex_ptrs;
- vertex_ptrs.reserve(num_motion_steps);
- for (size_t step = 0; step < num_motion_steps; ++step) {
- vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3));
- }
-
- // Force a single any-hit call, so shadow record-all behavior works correctly
- unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
- OptixBuildInput build_input = {};
- build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
- build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
- build_input.triangleArray.numVertices = num_verts;
- build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
- build_input.triangleArray.vertexStrideInBytes = sizeof(float3);
- build_input.triangleArray.indexBuffer = index_data.device_pointer;
- build_input.triangleArray.numIndexTriplets = mesh->num_triangles();
- build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3;
- build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int);
- build_input.triangleArray.flags = &build_flags;
- // The SBT does not store per primitive data since Cycles already allocates separate
- // buffers for that purpose. OptiX does not allow this to be zero though, so just pass in
- // one and rely on that having the same meaning in this case.
- build_input.triangleArray.numSbtRecords = 1;
- build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset;
-
- if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
- progress.set_error("Failed to build OptiX acceleration structure");
- }
- }
- }
- else {
- unsigned int num_instances = 0;
- unsigned int max_num_instances = 0xFFFFFFFF;
-
- bvh_optix->as_data.free();
- bvh_optix->traversable_handle = 0;
- bvh_optix->motion_transform_data.free();
-
- optixDeviceContextGetProperty(context,
- OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID,
- &max_num_instances,
- sizeof(max_num_instances));
- // Do not count first bit, which is used to distinguish instanced and non-instanced objects
- max_num_instances >>= 1;
- if (bvh->objects.size() > max_num_instances) {
- progress.set_error(
- "Failed to build OptiX acceleration structure because there are too many instances");
- return;
- }
-
- // Fill instance descriptions
-# if OPTIX_ABI_VERSION < 41
- device_vector<OptixAabb> aabbs(this, "optix tlas aabbs", MEM_READ_ONLY);
- aabbs.alloc(bvh->objects.size());
-# endif
- device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY);
- instances.alloc(bvh->objects.size());
-
- // Calculate the total size of all motion transforms and allocate memory for them
- size_t motion_transform_offset = 0;
- if (motion_blur) {
- size_t total_motion_transform_size = 0;
- for (Object *const ob : bvh->objects) {
- if (ob->is_traceable() && ob->use_motion()) {
- total_motion_transform_size = align_up(total_motion_transform_size,
- OPTIX_TRANSFORM_BYTE_ALIGNMENT);
- const size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
- total_motion_transform_size = total_motion_transform_size +
- sizeof(OptixSRTMotionTransform) +
- motion_keys * sizeof(OptixSRTData);
- }
- }
-
- assert(bvh_optix->motion_transform_data.device == this);
- bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size);
- }
-
- for (Object *ob : bvh->objects) {
- // Skip non-traceable objects
- if (!ob->is_traceable())
- continue;
-
- BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh);
- OptixTraversableHandle handle = blas->traversable_handle;
-
-# if OPTIX_ABI_VERSION < 41
- OptixAabb &aabb = aabbs[num_instances];
- aabb.minX = ob->bounds.min.x;
- aabb.minY = ob->bounds.min.y;
- aabb.minZ = ob->bounds.min.z;
- aabb.maxX = ob->bounds.max.x;
- aabb.maxY = ob->bounds.max.y;
- aabb.maxZ = ob->bounds.max.z;
-# endif
-
- OptixInstance &instance = instances[num_instances++];
- memset(&instance, 0, sizeof(instance));
-
- // Clear transform to identity matrix
- instance.transform[0] = 1.0f;
- instance.transform[5] = 1.0f;
- instance.transform[10] = 1.0f;
-
- // Set user instance ID to object index (but leave low bit blank)
- instance.instanceId = ob->get_device_index() << 1;
-
- // At least one bit must be set in the mask, or else the instance would always be culled
- instance.visibilityMask = 1;
-
- if (ob->get_geometry()->has_volume) {
- // Volumes have a special bit set in the visibility mask so a trace can mask only volumes
- instance.visibilityMask |= 2;
- }
-
- if (ob->get_geometry()->geometry_type == Geometry::HAIR) {
- // Same applies to curves (so they can be skipped in local trace calls)
- instance.visibilityMask |= 4;
-
-# if OPTIX_ABI_VERSION >= 36
- if (motion_blur && ob->get_geometry()->has_motion_blur() &&
- DebugFlags().optix.curves_api &&
- static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
- // Select between motion blur and non-motion blur built-in intersection module
- instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
- }
-# endif
- }
-
- // Insert motion traversable if object has motion
- if (motion_blur && ob->use_motion()) {
- size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
- size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
- motion_keys * sizeof(OptixSRTData);
-
- const CUDAContextScope scope(cuContext);
-
- motion_transform_offset = align_up(motion_transform_offset,
- OPTIX_TRANSFORM_BYTE_ALIGNMENT);
- CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer +
- motion_transform_offset;
- motion_transform_offset += motion_transform_size;
-
- // Allocate host side memory for motion transform and fill it with transform data
- OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
- new uint8_t[motion_transform_size]);
- motion_transform.child = handle;
- motion_transform.motionOptions.numKeys = ob->get_motion().size();
- motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
- motion_transform.motionOptions.timeBegin = 0.0f;
- motion_transform.motionOptions.timeEnd = 1.0f;
-
- OptixSRTData *const srt_data = motion_transform.srtData;
- array<DecomposedTransform> decomp(ob->get_motion().size());
- transform_motion_decompose(
- decomp.data(), ob->get_motion().data(), ob->get_motion().size());
-
- for (size_t i = 0; i < ob->get_motion().size(); ++i) {
- // Scale
- srt_data[i].sx = decomp[i].y.w; // scale.x.x
- srt_data[i].sy = decomp[i].z.w; // scale.y.y
- srt_data[i].sz = decomp[i].w.w; // scale.z.z
-
- // Shear
- srt_data[i].a = decomp[i].z.x; // scale.x.y
- srt_data[i].b = decomp[i].z.y; // scale.x.z
- srt_data[i].c = decomp[i].w.x; // scale.y.z
- assert(decomp[i].z.z == 0.0f); // scale.y.x
- assert(decomp[i].w.y == 0.0f); // scale.z.x
- assert(decomp[i].w.z == 0.0f); // scale.z.y
-
- // Pivot point
- srt_data[i].pvx = 0.0f;
- srt_data[i].pvy = 0.0f;
- srt_data[i].pvz = 0.0f;
-
- // Rotation
- srt_data[i].qx = decomp[i].x.x;
- srt_data[i].qy = decomp[i].x.y;
- srt_data[i].qz = decomp[i].x.z;
- srt_data[i].qw = decomp[i].x.w;
-
- // Translation
- srt_data[i].tx = decomp[i].y.x;
- srt_data[i].ty = decomp[i].y.y;
- srt_data[i].tz = decomp[i].y.z;
- }
-
- // Upload motion transform to GPU
- cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
- delete[] reinterpret_cast<uint8_t *>(&motion_transform);
-
- // Disable instance transform if object uses motion transform already
- instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
-
- // Get traversable handle to motion transform
- optixConvertPointerToTraversableHandle(context,
- motion_transform_gpu,
- OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
- &instance.traversableHandle);
- }
- else {
- instance.traversableHandle = handle;
-
- if (ob->get_geometry()->is_instanced()) {
- // Set transform matrix
- memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform));
- }
- else {
- // Disable instance transform if geometry already has it applied to vertex data
- instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
- // Non-instanced objects read ID from 'prim_object', so distinguish
- // them from instanced objects with the low bit set
- instance.instanceId |= 1;
- }
- }
- }
-
- // Upload instance descriptions
-# if OPTIX_ABI_VERSION < 41
- aabbs.resize(num_instances);
- aabbs.copy_to_device();
-# endif
- instances.resize(num_instances);
- instances.copy_to_device();
-
- // Build top-level acceleration structure (TLAS)
- OptixBuildInput build_input = {};
- build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
-# if OPTIX_ABI_VERSION < 41 // Instance AABBs no longer need to be set since OptiX 7.2
- build_input.instanceArray.aabbs = aabbs.device_pointer;
- build_input.instanceArray.numAabbs = num_instances;
-# endif
- build_input.instanceArray.instances = instances.device_pointer;
- build_input.instanceArray.numInstances = num_instances;
-
- if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) {
- progress.set_error("Failed to build OptiX acceleration structure");
- }
- tlas_handle = bvh_optix->traversable_handle;
- }
- }
-
- void release_optix_bvh(BVH *bvh) override
- {
- thread_scoped_lock lock(delayed_free_bvh_mutex);
- /* Delay freeing the BVH memory, since the geometry holding the BVH might be deleted
- * while the GPU is still rendering. */
- BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
-
- delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->as_data));
- delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->motion_transform_data));
- bvh_optix->traversable_handle = 0;
- }
-
- void free_bvh_memory_delayed()
- {
- thread_scoped_lock lock(delayed_free_bvh_mutex);
- delayed_free_bvh_memory.free_memory();
- }
-
- void const_copy_to(const char *name, void *host, size_t size) override
- {
- // Set constant memory for CUDA module
- // TODO(pmours): This is only used for tonemapping (see 'film_convert').
- // Could be removed by moving those functions to the filter CUDA module.
- CUDADevice::const_copy_to(name, host, size);
-
- if (strcmp(name, "__data") == 0) {
- assert(size <= sizeof(KernelData));
-
- // Update the traversable handle (it differs per device in multi-device setups)
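- // (The cast below is needed because 'bvh.scene' is presumably declared with a
- // plain integer type in the shared kernel headers rather than as an OptiX handle.)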
- KernelData *const data = (KernelData *)host;
- *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
-
- update_launch_params(offsetof(KernelParams, data), host, size);
- return;
- }
-
- // Update data storage pointers in launch parameters
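- // 'kernel/kernel_textures.h' is an X-macro list: including it below expands
- // KERNEL_TEX once per texture, generating one name comparison per entry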
-# define KERNEL_TEX(data_type, tex_name) \
- if (strcmp(name, #tex_name) == 0) { \
- update_launch_params(offsetof(KernelParams, tex_name), host, size); \
- return; \
- }
-# include "kernel/kernel_textures.h"
-# undef KERNEL_TEX
- }
-
- void update_launch_params(size_t offset, void *data, size_t data_size)
- {
- const CUDAContextScope scope(cuContext);
-
- for (int i = 0; i < info.cpu_threads; ++i)
- check_result_cuda(
- cuMemcpyHtoD(launch_params.device_pointer + i * launch_params.data_elements + offset,
- data,
- data_size));
- }
-
- void task_add(DeviceTask &task) override
- {
- // Upload texture information to device if it has changed since last launch
- load_texture_info();
-
- if (task.type == DeviceTask::FILM_CONVERT) {
- // Execute in main thread because of OpenGL access
- film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
- return;
- }
-
- if (task.type == DeviceTask::DENOISE_BUFFER) {
- // Execute denoising in a single thread (e.g. to avoid race conditions during creation)
- task_pool.push([=] {
- DeviceTask task_copy = task;
- thread_run(task_copy, 0);
- });
- return;
- }
-
- // Split task into smaller ones
- list<DeviceTask> tasks;
- task.split(tasks, info.cpu_threads);
-
- // Queue tasks in internal task pool
- int task_index = 0;
- for (DeviceTask &task : tasks) {
- task_pool.push([=] {
- // Pass the task index instead of the thread index, since the number of CUDA
- // streams may differ from the number of threads
- DeviceTask task_copy = task;
- thread_run(task_copy, task_index);
- });
- task_index++;
- }
- }
-
- void task_wait() override
- {
- // Wait for all queued tasks to finish
- task_pool.wait_work();
- }
-
- void task_cancel() override
- {
- // Cancel any remaining tasks in the internal pool
- task_pool.cancel();
- }
-};
-
-bool device_optix_init()
-{
- if (g_optixFunctionTable.optixDeviceContextCreate != NULL)
- return true; // Already initialized function table
-
- // Need to initialize CUDA as well
- if (!device_cuda_init())
- return false;
-
- const OptixResult result = optixInit();
-
- if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) {
- VLOG(1) << "OptiX initialization failed because the installed NVIDIA driver is too old. "
- "Please update to the latest driver first!";
- return false;
- }
- else if (result != OPTIX_SUCCESS) {
- VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result;
- return false;
- }
-
- // Loaded OptiX successfully!
- return true;
-}
-
-void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices)
-{
- devices.reserve(cuda_devices.size());
-
- // Simply add all supported CUDA devices as OptiX devices again
- for (DeviceInfo info : cuda_devices) {
- assert(info.type == DEVICE_CUDA);
-
- int major;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num);
- if (major < 5) {
- continue; // Only Maxwell and up are supported by OptiX
- }
-
- info.type = DEVICE_OPTIX;
- info.id += "_OptiX";
- info.denoisers |= DENOISER_OPTIX;
- info.has_branched_path = false;
-
- devices.push_back(info);
- }
-}
-
-Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
-{
- return new OptiXDevice(info, stats, profiler, background);
-}
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/device/device_queue.cpp b/intern/cycles/device/device_queue.cpp
new file mode 100644
index 00000000000..a89ba68d62c
--- /dev/null
+++ b/intern/cycles/device/device_queue.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device_queue.h"
+
+#include "util/util_algorithm.h"
+#include "util/util_logging.h"
+#include "util/util_time.h"
+
+#include <iomanip>
+
+CCL_NAMESPACE_BEGIN
+
+DeviceQueue::DeviceQueue(Device *device)
+ : device(device), last_kernels_enqueued_(0), last_sync_time_(0.0)
+{
+ DCHECK_NE(device, nullptr);
+}
+
+DeviceQueue::~DeviceQueue()
+{
+ if (VLOG_IS_ON(3)) {
+ /* Print kernel execution times sorted by time. */
+ vector<pair<DeviceKernelMask, double>> stats_sorted;
+ for (const auto &stat : stats_kernel_time_) {
+ stats_sorted.push_back(stat);
+ }
+
+ sort(stats_sorted.begin(),
+ stats_sorted.end(),
+ [](const pair<DeviceKernelMask, double> &a, const pair<DeviceKernelMask, double> &b) {
+ return a.second > b.second;
+ });
+
+ VLOG(3) << "GPU queue stats:";
+ for (const auto &[mask, time] : stats_sorted) {
+ VLOG(3) << " " << std::setfill(' ') << std::setw(10) << std::fixed << std::setprecision(5)
+ << std::right << time << "s: " << device_kernel_mask_as_string(mask);
+ }
+ }
+}
+
+void DeviceQueue::debug_init_execution()
+{
+ if (VLOG_IS_ON(3)) {
+ last_sync_time_ = time_dt();
+ last_kernels_enqueued_ = 0;
+ }
+}
+
+void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size)
+{
+ if (VLOG_IS_ON(3)) {
+ VLOG(4) << "GPU queue launch " << device_kernel_as_string(kernel) << ", work_size "
+ << work_size;
+ last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel);
+ }
+}
+
+void DeviceQueue::debug_synchronize()
+{
+ if (VLOG_IS_ON(3)) {
+ const double new_time = time_dt();
+ const double elapsed_time = new_time - last_sync_time_;
+ VLOG(4) << "GPU queue synchronize, elapsed " << std::setw(10) << elapsed_time << "s";
+
+ stats_kernel_time_[last_kernels_enqueued_] += elapsed_time;
+
+ last_sync_time_ = new_time;
+ last_kernels_enqueued_ = 0;
+ }
+}
+
+CCL_NAMESPACE_END
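
The debug helpers above attribute wall-clock time between synchronize calls to the combination of kernels enqueued in that window: each enqueued kernel sets one bit in a mask, and elapsed time is accumulated per mask value. A minimal standalone sketch of that bookkeeping (not part of the patch; KernelMask and QueueStats are illustrative stand-ins for the real DeviceKernelMask typedef and DeviceQueue members):

#include <chrono>
#include <cstdint>
#include <map>

using KernelMask = uint64_t;

struct QueueStats {
  KernelMask enqueued = 0;
  std::chrono::steady_clock::time_point last_sync = std::chrono::steady_clock::now();
  std::map<KernelMask, double> kernel_time;

  void debug_enqueue(int kernel_index)
  {
    /* Every kernel enqueued since the last synchronize sets one bit. */
    enqueued |= (KernelMask(1) << KernelMask(kernel_index));
  }

  void debug_synchronize()
  {
    /* Attribute the elapsed wall time to the combination of kernels that
     * ran together, then reset for the next batch. */
    const auto now = std::chrono::steady_clock::now();
    kernel_time[enqueued] += std::chrono::duration<double>(now - last_sync).count();
    last_sync = now;
    enqueued = 0;
  }
};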
diff --git a/intern/cycles/device/device_queue.h b/intern/cycles/device/device_queue.h
new file mode 100644
index 00000000000..edda3e61d51
--- /dev/null
+++ b/intern/cycles/device/device_queue.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "device/device_kernel.h"
+
+#include "device/device_graphics_interop.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class device_memory;
+
+struct KernelWorkTile;
+
+/* Abstraction of a command queue for a device.
+ * Provides an API to schedule kernel execution in a specific queue with the minimal
+ * possible overhead from the driver side.
+ *
+ * This class encapsulates all properties needed for command execution. */
+class DeviceQueue {
+ public:
+ virtual ~DeviceQueue();
+
+ /* Number of concurrent states to process for the integrator,
+ * based on the number of cores and/or available memory. */
+ virtual int num_concurrent_states(const size_t state_size) const = 0;
+
+ /* Number of states which keep the device occupied with work without losing performance.
+ * The renderer will add more work (when available) when the number of active paths falls
+ * below this value. */
+ virtual int num_concurrent_busy_states() const = 0;
+
+ /* Initialize execution of kernels on this queue.
+ *
+ * Will, for example, load all data required by the kernels from Device to global or path state.
+ *
+ * Call this after device synchronization has finished and before enqueueing any kernels. */
+ virtual void init_execution() = 0;
+
+ /* Test if an optional device kernel is available. */
+ virtual bool kernel_available(DeviceKernel kernel) const = 0;
+
+ /* Enqueue kernel execution.
+ *
+ * Execute the kernel work_size times on the device.
+ * Supported argument types:
+ * - int: pass pointer to the int
+ * - device memory: pass pointer to device_memory.device_pointer
+ * Return false if there was an error executing this or a previous kernel. */
+ virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) = 0;
+
+ /* Wait until all enqueued kernels have finished execution.
+ * Return false if there was an error executing any of the enqueued kernels. */
+ virtual bool synchronize() = 0;
+
+ /* Copy memory to/from device as part of the command queue, to ensure
+ * operations are done in order without having to synchronize. */
+ virtual void zero_to_device(device_memory &mem) = 0;
+ virtual void copy_to_device(device_memory &mem) = 0;
+ virtual void copy_from_device(device_memory &mem) = 0;
+
+ /* Graphics resources interoperability.
+ *
+ * Interoperability here means that the device is capable of computing the result
+ * directly into an OpenGL (or other graphics library) buffer. */
+
+ /* Create a graphics interoperability context which takes care of mapping a graphics
+ * resource as a buffer writable by kernels of this device. */
+ virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create()
+ {
+ LOG(FATAL) << "Request of GPU interop of a device which does not support it.";
+ return nullptr;
+ }
+
+ /* Device this queue has been created for. */
+ Device *device;
+
+ protected:
+ /* Hide construction so that allocation via `Device` API is enforced. */
+ explicit DeviceQueue(Device *device);
+
+ /* Implementations call these from the corresponding methods to generate debugging logs. */
+ void debug_init_execution();
+ void debug_enqueue(DeviceKernel kernel, const int work_size);
+ void debug_synchronize();
+
+ /* Combination of kernels enqueued together since the last synchronize. */
+ DeviceKernelMask last_kernels_enqueued_;
+ /* Time of the last synchronize call. */
+ double last_sync_time_;
+ /* Accumulated execution time for combinations of kernels launched together. */
+ map<DeviceKernelMask, double> stats_kernel_time_;
+};
+
+CCL_NAMESPACE_END
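
The enqueue() contract above can be made concrete with a short usage sketch (not part of the patch). Integer arguments are passed as a pointer to the int, and device memory as a pointer to its device_pointer member; DEVICE_KERNEL_EXAMPLE is a hypothetical enum value standing in for a real DeviceKernel:

static bool enqueue_example(DeviceQueue *queue, device_memory &buffer)
{
  const int work_size = 1024;
  int offset = 0;

  /* One pointer per argument: ints as pointer-to-int, device memory as a
   * pointer to its device_pointer member. */
  void *args[] = {&offset, &buffer.device_pointer};

  if (!queue->enqueue(DEVICE_KERNEL_EXAMPLE, work_size, args)) {
    return false; /* This or a previously enqueued kernel failed. */
  }

  /* Returns false if any of the enqueued kernels failed. */
  return queue->synchronize();
}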
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
deleted file mode 100644
index 9889f688aaa..00000000000
--- a/intern/cycles/device/device_split_kernel.cpp
+++ /dev/null
@@ -1,389 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "device/device_split_kernel.h"
-
-#include "kernel/kernel_types.h"
-#include "kernel/split/kernel_split_data_types.h"
-
-#include "util/util_logging.h"
-#include "util/util_time.h"
-
-CCL_NAMESPACE_BEGIN
-
-static const double alpha = 0.1; /* alpha for rolling average */
-
-DeviceSplitKernel::DeviceSplitKernel(Device *device)
- : device(device),
- split_data(device, "split_data"),
- ray_state(device, "ray_state", MEM_READ_WRITE),
- queue_index(device, "queue_index"),
- use_queues_flag(device, "use_queues_flag"),
- work_pool_wgs(device, "work_pool_wgs"),
- kernel_data_initialized(false)
-{
- avg_time_per_sample = 0.0;
-
- kernel_path_init = NULL;
- kernel_scene_intersect = NULL;
- kernel_lamp_emission = NULL;
- kernel_do_volume = NULL;
- kernel_queue_enqueue = NULL;
- kernel_indirect_background = NULL;
- kernel_shader_setup = NULL;
- kernel_shader_sort = NULL;
- kernel_shader_eval = NULL;
- kernel_holdout_emission_blurring_pathtermination_ao = NULL;
- kernel_subsurface_scatter = NULL;
- kernel_direct_lighting = NULL;
- kernel_shadow_blocked_ao = NULL;
- kernel_shadow_blocked_dl = NULL;
- kernel_enqueue_inactive = NULL;
- kernel_next_iteration_setup = NULL;
- kernel_indirect_subsurface = NULL;
- kernel_buffer_update = NULL;
- kernel_adaptive_stopping = NULL;
- kernel_adaptive_filter_x = NULL;
- kernel_adaptive_filter_y = NULL;
- kernel_adaptive_adjust_samples = NULL;
-}
-
-DeviceSplitKernel::~DeviceSplitKernel()
-{
- split_data.free();
- ray_state.free();
- use_queues_flag.free();
- queue_index.free();
- work_pool_wgs.free();
-
- delete kernel_path_init;
- delete kernel_scene_intersect;
- delete kernel_lamp_emission;
- delete kernel_do_volume;
- delete kernel_queue_enqueue;
- delete kernel_indirect_background;
- delete kernel_shader_setup;
- delete kernel_shader_sort;
- delete kernel_shader_eval;
- delete kernel_holdout_emission_blurring_pathtermination_ao;
- delete kernel_subsurface_scatter;
- delete kernel_direct_lighting;
- delete kernel_shadow_blocked_ao;
- delete kernel_shadow_blocked_dl;
- delete kernel_enqueue_inactive;
- delete kernel_next_iteration_setup;
- delete kernel_indirect_subsurface;
- delete kernel_buffer_update;
- delete kernel_adaptive_stopping;
- delete kernel_adaptive_filter_x;
- delete kernel_adaptive_filter_y;
- delete kernel_adaptive_adjust_samples;
-}
-
-bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures &requested_features)
-{
-#define LOAD_KERNEL(name) \
- kernel_##name = get_split_kernel_function(#name, requested_features); \
- if (!kernel_##name) { \
- device->set_error(string("Split kernel error: failed to load kernel_") + #name); \
- return false; \
- }
-
- LOAD_KERNEL(path_init);
- LOAD_KERNEL(scene_intersect);
- LOAD_KERNEL(lamp_emission);
- if (requested_features.use_volume) {
- LOAD_KERNEL(do_volume);
- }
- LOAD_KERNEL(queue_enqueue);
- LOAD_KERNEL(indirect_background);
- LOAD_KERNEL(shader_setup);
- LOAD_KERNEL(shader_sort);
- LOAD_KERNEL(shader_eval);
- LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
- LOAD_KERNEL(subsurface_scatter);
- LOAD_KERNEL(direct_lighting);
- LOAD_KERNEL(shadow_blocked_ao);
- LOAD_KERNEL(shadow_blocked_dl);
- LOAD_KERNEL(enqueue_inactive);
- LOAD_KERNEL(next_iteration_setup);
- LOAD_KERNEL(indirect_subsurface);
- LOAD_KERNEL(buffer_update);
- LOAD_KERNEL(adaptive_stopping);
- LOAD_KERNEL(adaptive_filter_x);
- LOAD_KERNEL(adaptive_filter_y);
- LOAD_KERNEL(adaptive_adjust_samples);
-
-#undef LOAD_KERNEL
-
- /* Re-initialize kernel-dependent data when kernels change. */
- kernel_data_initialized = false;
-
- return true;
-}
-
-size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory &kg,
- device_memory &data,
- uint64_t max_buffer_size)
-{
- uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024;
- VLOG(1) << "Split state element size: " << string_human_readable_number(size_per_element)
- << " bytes. (" << string_human_readable_size(size_per_element) << ").";
- return max_buffer_size / size_per_element;
-}
-
-bool DeviceSplitKernel::path_trace(DeviceTask &task,
- RenderTile &tile,
- device_memory &kgbuffer,
- device_memory &kernel_data)
-{
- if (device->have_error()) {
- return false;
- }
-
- /* Allocate all required global memory once. */
- if (!kernel_data_initialized) {
- kernel_data_initialized = true;
-
- /* Set local size */
- int2 lsize = split_kernel_local_size();
- local_size[0] = lsize[0];
- local_size[1] = lsize[1];
-
- /* Set global size */
- int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task);
-
- /* Make sure that the configured work size is a multiple of the local
- * work size dimensions.
- */
- global_size[0] = round_up(gsize[0], local_size[0]);
- global_size[1] = round_up(gsize[1], local_size[1]);
-
- int num_global_elements = global_size[0] * global_size[1];
- assert(num_global_elements % WORK_POOL_SIZE == 0);
-
- /* Calculate max groups */
-
- /* Denotes the maximum work groups possible w.r.t. current requested tile size. */
- unsigned int work_pool_size = (device->info.type == DEVICE_CPU) ? WORK_POOL_SIZE_CPU :
- WORK_POOL_SIZE_GPU;
- unsigned int max_work_groups = num_global_elements / work_pool_size + 1;
-
- /* Allocate work_pool_wgs memory. */
- work_pool_wgs.alloc_to_device(max_work_groups);
- queue_index.alloc_to_device(NUM_QUEUES);
- use_queues_flag.alloc_to_device(1);
- split_data.alloc_to_device(state_buffer_size(kgbuffer, kernel_data, num_global_elements));
- ray_state.alloc(num_global_elements);
- }
-
- /* Number of elements in the global state buffer */
- int num_global_elements = global_size[0] * global_size[1];
-
-#define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \
- if (device->have_error()) { \
- return false; \
- } \
- if (!kernel_##name->enqueue( \
- KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \
- return false; \
- }
-
- tile.sample = tile.start_sample;
-
- /* for exponential increase between tile updates */
- int time_multiplier = 1;
-
- while (tile.sample < tile.start_sample + tile.num_samples) {
- /* to keep track of how long it takes to run a number of samples */
- double start_time = time_dt();
-
- /* initial guess to start rolling average */
- const int initial_num_samples = 1;
- /* approx number of samples per second */
- const int samples_per_second = (avg_time_per_sample > 0.0) ?
- int(double(time_multiplier) / avg_time_per_sample) + 1 :
- initial_num_samples;
-
- RenderTile subtile = tile;
- subtile.start_sample = tile.sample;
- subtile.num_samples = samples_per_second;
-
- if (task.adaptive_sampling.use) {
- subtile.num_samples = task.adaptive_sampling.align_samples(subtile.start_sample,
- subtile.num_samples);
- }
-
- /* Don't go beyond requested number of samples. */
- subtile.num_samples = min(subtile.num_samples,
- tile.start_sample + tile.num_samples - tile.sample);
-
- if (device->have_error()) {
- return false;
- }
-
- /* Reset state memory here, as the global size for the data_init
- * kernel might not be large enough to do it in the kernel.
- */
- work_pool_wgs.zero_to_device();
- split_data.zero_to_device();
- ray_state.zero_to_device();
-
- if (!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
- subtile,
- num_global_elements,
- kgbuffer,
- kernel_data,
- split_data,
- ray_state,
- queue_index,
- use_queues_flag,
- work_pool_wgs)) {
- return false;
- }
-
- ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size);
-
- bool activeRaysAvailable = true;
- double cancel_time = DBL_MAX;
-
- while (activeRaysAvailable) {
- /* Do path-iteration on the host (enqueue path-iteration kernels). */
- for (int PathIter = 0; PathIter < 16; PathIter++) {
- ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
- if (kernel_do_volume) {
- ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size);
- }
- ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shader_setup, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shader_sort, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(
- holdout_emission_blurring_pathtermination_ao, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(enqueue_inactive, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size);
-
- if (task.get_cancel() && cancel_time == DBL_MAX) {
- /* Wait up to twice as many seconds for current samples to finish
- * to avoid artifacts in render result from ending too soon.
- */
- cancel_time = time_dt() + 2.0 * time_multiplier;
- }
-
- if (time_dt() > cancel_time) {
- return true;
- }
- }
-
- /* Decide if we should exit path-iteration in host. */
- ray_state.copy_from_device(0, global_size[0] * global_size[1], 1);
-
- activeRaysAvailable = false;
-
- for (int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) {
- if (!IS_STATE(ray_state.data(), rayStateIter, RAY_INACTIVE)) {
- if (IS_STATE(ray_state.data(), rayStateIter, RAY_INVALID)) {
- /* Something went wrong, abort to avoid looping endlessly. */
- device->set_error("Split kernel error: invalid ray state");
- return false;
- }
-
- /* Not all rays are RAY_INACTIVE. */
- activeRaysAvailable = true;
- break;
- }
- }
-
- if (time_dt() > cancel_time) {
- return true;
- }
- }
-
- int filter_sample = tile.sample + subtile.num_samples - 1;
- if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
- size_t buffer_size[2];
- buffer_size[0] = round_up(tile.w, local_size[0]);
- buffer_size[1] = round_up(tile.h, local_size[1]);
- kernel_adaptive_stopping->enqueue(
- KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
- buffer_size[0] = round_up(tile.h, local_size[0]);
- buffer_size[1] = round_up(1, local_size[1]);
- kernel_adaptive_filter_x->enqueue(
- KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
- buffer_size[0] = round_up(tile.w, local_size[0]);
- buffer_size[1] = round_up(1, local_size[1]);
- kernel_adaptive_filter_y->enqueue(
- KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
- }
-
- double time_per_sample = ((time_dt() - start_time) / subtile.num_samples);
-
- if (avg_time_per_sample == 0.0) {
- /* start rolling average */
- avg_time_per_sample = time_per_sample;
- }
- else {
- avg_time_per_sample = alpha * time_per_sample + (1.0 - alpha) * avg_time_per_sample;
- }
-
-#undef ENQUEUE_SPLIT_KERNEL
-
- tile.sample += subtile.num_samples;
- task.update_progress(&tile, tile.w * tile.h * subtile.num_samples);
-
- time_multiplier = min(time_multiplier << 1, 10);
-
- if (task.get_cancel()) {
- return true;
- }
- }
-
- if (task.adaptive_sampling.use) {
- /* Reset the start samples. */
- RenderTile subtile = tile;
- subtile.start_sample = tile.start_sample;
- subtile.num_samples = tile.sample - tile.start_sample;
- enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
- subtile,
- num_global_elements,
- kgbuffer,
- kernel_data,
- split_data,
- ray_state,
- queue_index,
- use_queues_flag,
- work_pool_wgs);
- size_t buffer_size[2];
- buffer_size[0] = round_up(tile.w, local_size[0]);
- buffer_size[1] = round_up(tile.h, local_size[1]);
- kernel_adaptive_adjust_samples->enqueue(
- KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
- }
-
- return true;
-}
-
-CCL_NAMESPACE_END
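
The removed path_trace() paced its work with a rolling average: render roughly time_multiplier seconds worth of samples per tile update, estimate that count from an exponential moving average of per-sample cost (alpha = 0.1), and double the interval between updates after each batch, capped at 10 seconds. A standalone sketch of just the pacing (not part of the patch; SamplePacer is an illustrative name):

#include <algorithm>

struct SamplePacer {
  static constexpr double alpha = 0.1; /* alpha for the rolling average */

  double avg_time_per_sample = 0.0;
  int time_multiplier = 1;

  /* How many samples to render before the next tile update. */
  int samples_for_next_batch() const
  {
    const int initial_num_samples = 1; /* initial guess to start the rolling average */
    return (avg_time_per_sample > 0.0) ?
               int(double(time_multiplier) / avg_time_per_sample) + 1 :
               initial_num_samples;
  }

  void batch_finished(double batch_seconds, int num_samples)
  {
    const double time_per_sample = batch_seconds / num_samples;
    /* Start, then maintain, the exponential moving average. */
    avg_time_per_sample = (avg_time_per_sample == 0.0) ?
                              time_per_sample :
                              alpha * time_per_sample + (1.0 - alpha) * avg_time_per_sample;
    /* Exponentially increase the interval between tile updates, capped at 10. */
    time_multiplier = std::min(time_multiplier << 1, 10);
  }
};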
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
deleted file mode 100644
index 07a21b10299..00000000000
--- a/intern/cycles/device/device_split_kernel.h
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __DEVICE_SPLIT_KERNEL_H__
-#define __DEVICE_SPLIT_KERNEL_H__
-
-#include "device/device.h"
-#include "render/buffers.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* When allocating global memory in chunks, we may not be able to allocate
- * exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes, since some bytes may be
- * needed for aligning the chunks of memory. This is the amount of memory
- * that we dedicate for that purpose.
- */
-#define DATA_ALLOCATION_MEM_FACTOR 5000000 // 5MB
-
-/* Types used for split kernel */
-
-class KernelDimensions {
- public:
- size_t global_size[2];
- size_t local_size[2];
-
- KernelDimensions(size_t global_size_[2], size_t local_size_[2])
- {
- memcpy(global_size, global_size_, sizeof(global_size));
- memcpy(local_size, local_size_, sizeof(local_size));
- }
-};
-
-class SplitKernelFunction {
- public:
- virtual ~SplitKernelFunction()
- {
- }
-
- /* enqueue the kernel, returns false if there is an error */
- virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data) = 0;
-};
-
-class DeviceSplitKernel {
- private:
- Device *device;
-
- SplitKernelFunction *kernel_path_init;
- SplitKernelFunction *kernel_scene_intersect;
- SplitKernelFunction *kernel_lamp_emission;
- SplitKernelFunction *kernel_do_volume;
- SplitKernelFunction *kernel_queue_enqueue;
- SplitKernelFunction *kernel_indirect_background;
- SplitKernelFunction *kernel_shader_setup;
- SplitKernelFunction *kernel_shader_sort;
- SplitKernelFunction *kernel_shader_eval;
- SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao;
- SplitKernelFunction *kernel_subsurface_scatter;
- SplitKernelFunction *kernel_direct_lighting;
- SplitKernelFunction *kernel_shadow_blocked_ao;
- SplitKernelFunction *kernel_shadow_blocked_dl;
- SplitKernelFunction *kernel_enqueue_inactive;
- SplitKernelFunction *kernel_next_iteration_setup;
- SplitKernelFunction *kernel_indirect_subsurface;
- SplitKernelFunction *kernel_buffer_update;
- SplitKernelFunction *kernel_adaptive_stopping;
- SplitKernelFunction *kernel_adaptive_filter_x;
- SplitKernelFunction *kernel_adaptive_filter_y;
- SplitKernelFunction *kernel_adaptive_adjust_samples;
-
- /* Global memory variables [porting]. This memory is used for
- * cooperation between different kernels: data written by one
- * kernel is available to another kernel via this global
- * memory.
- */
- device_only_memory<uchar> split_data;
- device_vector<uchar> ray_state;
- device_only_memory<int>
- queue_index; /* Array of size num_queues that tracks the size of each queue. */
-
- /* Flag to make the scene_intersect and lamp_emission kernels use queues. */
- device_only_memory<char> use_queues_flag;
-
- /* Approximate time it takes to complete one sample */
- double avg_time_per_sample;
-
- /* Work pool with respect to each work group. */
- device_only_memory<unsigned int> work_pool_wgs;
-
- /* Cached kernel-dependent data, initialized once. */
- bool kernel_data_initialized;
- size_t local_size[2];
- size_t global_size[2];
-
- public:
- explicit DeviceSplitKernel(Device *device);
- virtual ~DeviceSplitKernel();
-
- bool load_kernels(const DeviceRequestedFeatures &requested_features);
- bool path_trace(DeviceTask &task,
- RenderTile &rtile,
- device_memory &kgbuffer,
- device_memory &kernel_data);
-
- virtual uint64_t state_buffer_size(device_memory &kg,
- device_memory &data,
- size_t num_threads) = 0;
- size_t max_elements_for_max_buffer_size(device_memory &kg,
- device_memory &data,
- uint64_t max_buffer_size);
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory &kernel_globals,
- device_memory &kernel_data_,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs) = 0;
-
- virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &) = 0;
- virtual int2 split_kernel_local_size() = 0;
- virtual int2 split_kernel_global_size(device_memory &kg,
- device_memory &data,
- DeviceTask &task) = 0;
-};
-
-CCL_NAMESPACE_END
-
-#endif /* __DEVICE_SPLIT_KERNEL_H__ */
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
deleted file mode 100644
index 55fbaa31e42..00000000000
--- a/intern/cycles/device/device_task.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdlib.h>
-#include <string.h>
-
-#include "device/device_task.h"
-
-#include "render/buffers.h"
-
-#include "util/util_algorithm.h"
-#include "util/util_time.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* Device Task */
-
-DeviceTask::DeviceTask(Type type_)
- : type(type_),
- x(0),
- y(0),
- w(0),
- h(0),
- rgba_byte(0),
- rgba_half(0),
- buffer(0),
- sample(0),
- num_samples(1),
- shader_input(0),
- shader_output(0),
- shader_eval_type(0),
- shader_filter(0),
- shader_x(0),
- shader_w(0),
- buffers(nullptr),
- tile_types(0),
- denoising_from_render(false),
- pass_stride(0),
- frame_stride(0),
- target_pass_stride(0),
- pass_denoising_data(0),
- pass_denoising_clean(0),
- need_finish_queue(false),
- integrator_branched(false)
-{
- last_update_time = time_dt();
-}
-
-int DeviceTask::get_subtask_count(int num, int max_size) const
-{
- if (max_size != 0) {
- int max_size_num;
-
- if (type == SHADER) {
- max_size_num = (shader_w + max_size - 1) / max_size;
- }
- else {
- max_size = max(1, max_size / w);
- max_size_num = (h + max_size - 1) / max_size;
- }
-
- num = max(max_size_num, num);
- }
-
- if (type == SHADER) {
- num = min(shader_w, num);
- }
- else if (type == RENDER) {
- }
- else {
- num = min(h, num);
- }
-
- return num;
-}
-
-void DeviceTask::split(list<DeviceTask> &tasks, int num, int max_size) const
-{
- num = get_subtask_count(num, max_size);
-
- if (type == SHADER) {
- for (int i = 0; i < num; i++) {
- int tx = shader_x + (shader_w / num) * i;
- int tw = (i == num - 1) ? shader_w - i * (shader_w / num) : shader_w / num;
-
- DeviceTask task = *this;
-
- task.shader_x = tx;
- task.shader_w = tw;
-
- tasks.push_back(task);
- }
- }
- else if (type == RENDER) {
- for (int i = 0; i < num; i++)
- tasks.push_back(*this);
- }
- else {
- for (int i = 0; i < num; i++) {
- int ty = y + (h / num) * i;
- int th = (i == num - 1) ? h - i * (h / num) : h / num;
-
- DeviceTask task = *this;
-
- task.y = ty;
- task.h = th;
-
- tasks.push_back(task);
- }
- }
-}
-
-void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples)
-{
- if (type == FILM_CONVERT)
- return;
-
- if (update_progress_sample) {
- if (pixel_samples == -1) {
- pixel_samples = shader_w;
- }
- update_progress_sample(pixel_samples, rtile ? rtile->sample : 0);
- }
-
- if (update_tile_sample) {
- double current_time = time_dt();
-
- if (current_time - last_update_time >= 1.0) {
- update_tile_sample(*rtile);
-
- last_update_time = current_time;
- }
- }
-}
-
-/* Adaptive Sampling */
-
-AdaptiveSampling::AdaptiveSampling() : use(true), adaptive_step(0), min_samples(0)
-{
-}
-
-/* Render samples in steps that align with the adaptive filtering. */
-int AdaptiveSampling::align_samples(int sample, int num_samples) const
-{
- int end_sample = sample + num_samples;
-
- /* Round down end sample to the nearest sample that needs filtering. */
- end_sample &= ~(adaptive_step - 1);
-
- if (end_sample <= sample) {
- /* In order to reach the next sample that needs filtering, we'd need
- * to increase num_samples. We don't do that in this function, so
- * just keep it as is and don't filter this time around. */
- return num_samples;
- }
- return end_sample - sample;
-}
-
-bool AdaptiveSampling::need_filter(int sample) const
-{
- if (sample > min_samples) {
- return (sample & (adaptive_step - 1)) == (adaptive_step - 1);
- }
- else {
- return false;
- }
-}
-
-CCL_NAMESPACE_END
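
The removed align_samples() depends on adaptive_step being a power of two: the mask end_sample &= ~(adaptive_step - 1) rounds the end sample down to the nearest multiple of the step, so a batch stops exactly on a sample that needs filtering. A standalone sketch with a worked example (not part of the patch): with adaptive_step = 4, sample = 5 and num_samples = 6, end_sample = 11 rounds down to 8 and the function returns 3 samples.

#include <cassert>

static int align_samples(int sample, int num_samples, int adaptive_step)
{
  /* The mask trick below only works when adaptive_step is a power of two. */
  assert((adaptive_step & (adaptive_step - 1)) == 0);

  int end_sample = sample + num_samples;
  /* Round down to the nearest multiple of adaptive_step. */
  end_sample &= ~(adaptive_step - 1);

  if (end_sample <= sample) {
    /* No filtering boundary inside this range; keep the count unchanged. */
    return num_samples;
  }
  return end_sample - sample;
}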
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
deleted file mode 100644
index 3f7cf47b692..00000000000
--- a/intern/cycles/device/device_task.h
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __DEVICE_TASK_H__
-#define __DEVICE_TASK_H__
-
-#include "device/device_memory.h"
-
-#include "util/util_function.h"
-#include "util/util_list.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* Device Task */
-
-class Device;
-class RenderBuffers;
-class RenderTile;
-class RenderTileNeighbors;
-class Tile;
-
-enum DenoiserType {
- DENOISER_NLM = 1,
- DENOISER_OPTIX = 2,
- DENOISER_OPENIMAGEDENOISE = 4,
- DENOISER_NUM,
-
- DENOISER_NONE = 0,
- DENOISER_ALL = ~0,
-};
-
-enum DenoiserInput {
- DENOISER_INPUT_RGB = 1,
- DENOISER_INPUT_RGB_ALBEDO = 2,
- DENOISER_INPUT_RGB_ALBEDO_NORMAL = 3,
-
- DENOISER_INPUT_NUM,
-};
-
-typedef int DenoiserTypeMask;
-
-class DenoiseParams {
- public:
- /* Apply denoiser to image. */
- bool use;
- /* Output denoising data passes (possibly without applying the denoiser). */
- bool store_passes;
-
- /* Denoiser type. */
- DenoiserType type;
-
- /* Viewport start sample. */
- int start_sample;
-
- /** Native Denoiser. */
-
- /* Pixel radius for neighboring pixels to take into account. */
- int radius;
- /* Controls neighbor pixel weighting for the denoising filter. */
- float strength;
- /* Preserve more or less detail based on feature passes. */
- float feature_strength;
- /* When removing pixels that don't carry information,
- * use a relative threshold instead of an absolute one. */
- bool relative_pca;
- /* How many frames before and after the current center frame are included. */
- int neighbor_frames;
- /* Clamp the input to the range of +-1e8. Should be enough for any legitimate data. */
- bool clamp_input;
-
- /** OIDN/Optix Denoiser. */
-
- /* Passes handed over to the OIDN/OptiX denoiser (default to color + albedo). */
- DenoiserInput input_passes;
-
- DenoiseParams()
- {
- use = false;
- store_passes = false;
-
- type = DENOISER_NLM;
-
- radius = 8;
- strength = 0.5f;
- feature_strength = 0.5f;
- relative_pca = false;
- neighbor_frames = 2;
- clamp_input = true;
-
- /* Default to color + albedo only, since normal input does not always have the desired effect
- * when denoising with OptiX. */
- input_passes = DENOISER_INPUT_RGB_ALBEDO;
-
- start_sample = 0;
- }
-
- /* Test if a denoising task needs to run, also to prefilter passes for the native
- * denoiser when we are not applying denoising to the combined image. */
- bool need_denoising_task() const
- {
- return (use || (store_passes && type == DENOISER_NLM));
- }
-};
-
-class AdaptiveSampling {
- public:
- AdaptiveSampling();
-
- int align_samples(int sample, int num_samples) const;
- bool need_filter(int sample) const;
-
- bool use;
- int adaptive_step;
- int min_samples;
-};
-
-class DeviceTask {
- public:
- typedef enum { RENDER, FILM_CONVERT, SHADER, DENOISE_BUFFER } Type;
- Type type;
-
- int x, y, w, h;
- device_ptr rgba_byte;
- device_ptr rgba_half;
- device_ptr buffer;
- int sample;
- int num_samples;
- int offset, stride;
-
- device_ptr shader_input;
- device_ptr shader_output;
- int shader_eval_type;
- int shader_filter;
- int shader_x, shader_w;
-
- RenderBuffers *buffers;
-
- explicit DeviceTask(Type type = RENDER);
-
- int get_subtask_count(int num, int max_size = 0) const;
- void split(list<DeviceTask> &tasks, int num, int max_size = 0) const;
-
- void update_progress(RenderTile *rtile, int pixel_samples = -1);
-
- function<bool(Device *device, RenderTile &, uint)> acquire_tile;
- function<void(long, int)> update_progress_sample;
- function<void(RenderTile &)> update_tile_sample;
- function<void(RenderTile &)> release_tile;
- function<bool()> get_cancel;
- function<bool()> get_tile_stolen;
- function<void(RenderTileNeighbors &, Device *)> map_neighbor_tiles;
- function<void(RenderTileNeighbors &, Device *)> unmap_neighbor_tiles;
-
- uint tile_types;
- DenoiseParams denoising;
- bool denoising_from_render;
- vector<int> denoising_frames;
-
- int pass_stride;
- int frame_stride;
- int target_pass_stride;
- int pass_denoising_data;
- int pass_denoising_clean;
-
- bool need_finish_queue;
- bool integrator_branched;
- AdaptiveSampling adaptive_sampling;
-
- protected:
- double last_update_time;
-};
-
-CCL_NAMESPACE_END
-
-#endif /* __DEVICE_TASK_H__ */
diff --git a/intern/cycles/device/device_dummy.cpp b/intern/cycles/device/dummy/device.cpp
index 5112fc152e5..678276ed025 100644
--- a/intern/cycles/device/device_dummy.cpp
+++ b/intern/cycles/device/dummy/device.cpp
@@ -14,8 +14,10 @@
* limitations under the License.
*/
+#include "device/dummy/device.h"
+
#include "device/device.h"
-#include "device/device_intern.h"
+#include "device/device_queue.h"
CCL_NAMESPACE_BEGIN
@@ -23,8 +25,8 @@ CCL_NAMESPACE_BEGIN
class DummyDevice : public Device {
public:
- DummyDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
- : Device(info_, stats_, profiler_, background_)
+ DummyDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_)
+ : Device(info_, stats_, profiler_)
{
error_msg = info.error_msg;
}
@@ -61,23 +63,11 @@ class DummyDevice : public Device {
virtual void const_copy_to(const char *, void *, size_t) override
{
}
-
- virtual void task_add(DeviceTask &) override
- {
- }
-
- virtual void task_wait() override
- {
- }
-
- virtual void task_cancel() override
- {
- }
};
-Device *device_dummy_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
+Device *device_dummy_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
{
- return new DummyDevice(info, stats, profiler, background);
+ return new DummyDevice(info, stats, profiler);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/dummy/device.h b/intern/cycles/device/dummy/device.h
new file mode 100644
index 00000000000..832a9568129
--- /dev/null
+++ b/intern/cycles/device/dummy/device.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_string.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+Device *device_dummy_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/multi/device.cpp b/intern/cycles/device/multi/device.cpp
new file mode 100644
index 00000000000..6dbcce2d9a5
--- /dev/null
+++ b/intern/cycles/device/multi/device.cpp
@@ -0,0 +1,423 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/multi/device.h"
+
+#include <sstream>
+#include <stdlib.h>
+
+#include "bvh/bvh_multi.h"
+
+#include "device/device.h"
+#include "device/device_queue.h"
+
+#include "render/buffers.h"
+#include "render/geometry.h"
+
+#include "util/util_foreach.h"
+#include "util/util_list.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_time.h"
+
+CCL_NAMESPACE_BEGIN
+
+class MultiDevice : public Device {
+ public:
+ struct SubDevice {
+ Stats stats;
+ Device *device;
+ map<device_ptr, device_ptr> ptr_map;
+ int peer_island_index = -1;
+ };
+
+ list<SubDevice> devices;
+ device_ptr unique_key;
+ vector<vector<SubDevice *>> peer_islands;
+
+ MultiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+ : Device(info, stats, profiler), unique_key(1)
+ {
+ foreach (const DeviceInfo &subinfo, info.multi_devices) {
+ /* Always add CPU devices at the back since GPU devices can change
+ * host memory pointers, which the CPU uses as device pointers. */
+ SubDevice *sub;
+ if (subinfo.type == DEVICE_CPU) {
+ devices.emplace_back();
+ sub = &devices.back();
+ }
+ else {
+ devices.emplace_front();
+ sub = &devices.front();
+ }
+
+ /* The pointer to 'sub->stats' will stay valid even after new devices
+ * are added, since 'devices' is a linked list. */
+ sub->device = Device::create(subinfo, sub->stats, profiler);
+ }
+
+ /* Build a list of peer islands for the available render devices */
+ foreach (SubDevice &sub, devices) {
+ /* First ensure that every device is in at least one peer island */
+ if (sub.peer_island_index < 0) {
+ peer_islands.emplace_back();
+ sub.peer_island_index = (int)peer_islands.size() - 1;
+ peer_islands[sub.peer_island_index].push_back(&sub);
+ }
+
+ if (!info.has_peer_memory) {
+ continue;
+ }
+
+ /* Second, check peer access between devices and fill the islands accordingly */
+ foreach (SubDevice &peer_sub, devices) {
+ if (peer_sub.peer_island_index < 0 &&
+ peer_sub.device->info.type == sub.device->info.type &&
+ peer_sub.device->check_peer_access(sub.device)) {
+ peer_sub.peer_island_index = sub.peer_island_index;
+ peer_islands[sub.peer_island_index].push_back(&peer_sub);
+ }
+ }
+ }
+ }
+
+ ~MultiDevice()
+ {
+ foreach (SubDevice &sub, devices)
+ delete sub.device;
+ }
+
+ const string &error_message() override
+ {
+ error_msg.clear();
+
+ foreach (SubDevice &sub, devices)
+ error_msg += sub.device->error_message();
+
+ return error_msg;
+ }
+
+ virtual bool show_samples() const override
+ {
+ if (devices.size() > 1) {
+ return false;
+ }
+ return devices.front().device->show_samples();
+ }
+
+ virtual BVHLayoutMask get_bvh_layout_mask() const override
+ {
+ BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL;
+ BVHLayoutMask bvh_layout_mask_all = BVH_LAYOUT_NONE;
+ foreach (const SubDevice &sub_device, devices) {
+ BVHLayoutMask device_bvh_layout_mask = sub_device.device->get_bvh_layout_mask();
+ bvh_layout_mask &= device_bvh_layout_mask;
+ bvh_layout_mask_all |= device_bvh_layout_mask;
+ }
+
+ /* With multiple OptiX devices, every device needs its own acceleration structure */
+ if (bvh_layout_mask == BVH_LAYOUT_OPTIX) {
+ return BVH_LAYOUT_MULTI_OPTIX;
+ }
+
+ /* When devices do not share a common BVH layout, fall back to creating one for each */
+ const BVHLayoutMask BVH_LAYOUT_OPTIX_EMBREE = (BVH_LAYOUT_OPTIX | BVH_LAYOUT_EMBREE);
+ if ((bvh_layout_mask_all & BVH_LAYOUT_OPTIX_EMBREE) == BVH_LAYOUT_OPTIX_EMBREE) {
+ return BVH_LAYOUT_MULTI_OPTIX_EMBREE;
+ }
+
+ return bvh_layout_mask;
+ }
+
+ bool load_kernels(const uint kernel_features) override
+ {
+ foreach (SubDevice &sub, devices)
+ if (!sub.device->load_kernels(kernel_features))
+ return false;
+
+ return true;
+ }
+
+ void build_bvh(BVH *bvh, Progress &progress, bool refit) override
+ {
+ /* Try to build and share a single acceleration structure, if possible */
+ if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2 || bvh->params.bvh_layout == BVH_LAYOUT_EMBREE) {
+ devices.back().device->build_bvh(bvh, progress, refit);
+ return;
+ }
+
+ assert(bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX ||
+ bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE);
+
+ BVHMulti *const bvh_multi = static_cast<BVHMulti *>(bvh);
+ bvh_multi->sub_bvhs.resize(devices.size());
+
+ vector<BVHMulti *> geom_bvhs;
+ geom_bvhs.reserve(bvh->geometry.size());
+ foreach (Geometry *geom, bvh->geometry) {
+ geom_bvhs.push_back(static_cast<BVHMulti *>(geom->bvh));
+ }
+
+ /* Broadcast acceleration structure build to all render devices */
+ size_t i = 0;
+ foreach (SubDevice &sub, devices) {
+ /* Change geometry BVH pointers to the sub BVH */
+ for (size_t k = 0; k < bvh->geometry.size(); ++k) {
+ bvh->geometry[k]->bvh = geom_bvhs[k]->sub_bvhs[i];
+ }
+
+ if (!bvh_multi->sub_bvhs[i]) {
+ BVHParams params = bvh->params;
+ if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX)
+ params.bvh_layout = BVH_LAYOUT_OPTIX;
+ else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE)
+ params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? BVH_LAYOUT_OPTIX :
+ BVH_LAYOUT_EMBREE;
+
+ /* Skip building a bottom level acceleration structure for non-instanced geometry on Embree
+ * (since it is put into the top level directly, see bvh_embree.cpp). */
+ if (!params.top_level && params.bvh_layout == BVH_LAYOUT_EMBREE &&
+ !bvh->geometry[0]->is_instanced()) {
+ i++;
+ continue;
+ }
+
+ bvh_multi->sub_bvhs[i] = BVH::create(params, bvh->geometry, bvh->objects, sub.device);
+ }
+
+ sub.device->build_bvh(bvh_multi->sub_bvhs[i], progress, refit);
+ i++;
+ }
+
+ /* Change geometry BVH pointers back to the multi BVH. */
+ for (size_t k = 0; k < bvh->geometry.size(); ++k) {
+ bvh->geometry[k]->bvh = geom_bvhs[k];
+ }
+ }
+
+ virtual void *get_cpu_osl_memory() override
+ {
+ if (devices.size() > 1) {
+ return NULL;
+ }
+ return devices.front().device->get_cpu_osl_memory();
+ }
+
+ bool is_resident(device_ptr key, Device *sub_device) override
+ {
+ foreach (SubDevice &sub, devices) {
+ if (sub.device == sub_device) {
+ return find_matching_mem_device(key, sub)->device == sub_device;
+ }
+ }
+ return false;
+ }
+
+ SubDevice *find_matching_mem_device(device_ptr key, SubDevice &sub)
+ {
+ assert(key != 0 && (sub.peer_island_index >= 0 || sub.ptr_map.find(key) != sub.ptr_map.end()));
+
+ /* Get the memory owner of this key (first try current device, then peer devices) */
+ SubDevice *owner_sub = &sub;
+ if (owner_sub->ptr_map.find(key) == owner_sub->ptr_map.end()) {
+ foreach (SubDevice *island_sub, peer_islands[sub.peer_island_index]) {
+ if (island_sub != owner_sub &&
+ island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) {
+ owner_sub = island_sub;
+ }
+ }
+ }
+ return owner_sub;
+ }
+
+ SubDevice *find_suitable_mem_device(device_ptr key, const vector<SubDevice *> &island)
+ {
+ assert(!island.empty());
+
+ /* Get the memory owner of this key, or the device with the lowest memory usage for a new allocation */
+ SubDevice *owner_sub = island.front();
+ foreach (SubDevice *island_sub, island) {
+ if (key ? (island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) :
+ (island_sub->device->stats.mem_used < owner_sub->device->stats.mem_used)) {
+ owner_sub = island_sub;
+ }
+ }
+ return owner_sub;
+ }
+
+ inline device_ptr find_matching_mem(device_ptr key, SubDevice &sub)
+ {
+ return find_matching_mem_device(key, sub)->ptr_map[key];
+ }
+
+ void mem_alloc(device_memory &mem) override
+ {
+ device_ptr key = unique_key++;
+
+ assert(mem.type == MEM_READ_ONLY || mem.type == MEM_READ_WRITE || mem.type == MEM_DEVICE_ONLY);
+ /* The remaining memory types can be distributed across devices */
+ foreach (const vector<SubDevice *> &island, peer_islands) {
+ SubDevice *owner_sub = find_suitable_mem_device(key, island);
+ mem.device = owner_sub->device;
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+
+ owner_sub->device->mem_alloc(mem);
+ owner_sub->ptr_map[key] = mem.device_pointer;
+ }
+
+ mem.device = this;
+ mem.device_pointer = key;
+ stats.mem_alloc(mem.device_size);
+ }
+
+ void mem_copy_to(device_memory &mem) override
+ {
+ device_ptr existing_key = mem.device_pointer;
+ device_ptr key = (existing_key) ? existing_key : unique_key++;
+ size_t existing_size = mem.device_size;
+
+ /* The tile buffers are allocated on each device (see below), so copy to all of them */
+ foreach (const vector<SubDevice *> &island, peer_islands) {
+ SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
+ mem.device = owner_sub->device;
+ mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
+ mem.device_size = existing_size;
+
+ owner_sub->device->mem_copy_to(mem);
+ owner_sub->ptr_map[key] = mem.device_pointer;
+
+ if (mem.type == MEM_GLOBAL || mem.type == MEM_TEXTURE) {
+ /* Need to create texture objects and update pointer in kernel globals on all devices */
+ foreach (SubDevice *island_sub, island) {
+ if (island_sub != owner_sub) {
+ island_sub->device->mem_copy_to(mem);
+ }
+ }
+ }
+ }
+
+ mem.device = this;
+ mem.device_pointer = key;
+ stats.mem_alloc(mem.device_size - existing_size);
+ }
+
+ void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override
+ {
+ device_ptr key = mem.device_pointer;
+ int i = 0, sub_h = h / devices.size();
+
+ foreach (SubDevice &sub, devices) {
+ int sy = y + i * sub_h;
+ int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h;
+
+ SubDevice *owner_sub = find_matching_mem_device(key, sub);
+ mem.device = owner_sub->device;
+ mem.device_pointer = owner_sub->ptr_map[key];
+
+ owner_sub->device->mem_copy_from(mem, sy, w, sh, elem);
+ i++;
+ }
+
+ mem.device = this;
+ mem.device_pointer = key;
+ }
+
+ void mem_zero(device_memory &mem) override
+ {
+ device_ptr existing_key = mem.device_pointer;
+ device_ptr key = (existing_key) ? existing_key : unique_key++;
+ size_t existing_size = mem.device_size;
+
+ foreach (const vector<SubDevice *> &island, peer_islands) {
+ SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
+ mem.device = owner_sub->device;
+ mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
+ mem.device_size = existing_size;
+
+ owner_sub->device->mem_zero(mem);
+ owner_sub->ptr_map[key] = mem.device_pointer;
+ }
+
+ mem.device = this;
+ mem.device_pointer = key;
+ stats.mem_alloc(mem.device_size - existing_size);
+ }
+
+ void mem_free(device_memory &mem) override
+ {
+ device_ptr key = mem.device_pointer;
+ size_t existing_size = mem.device_size;
+
+ /* Free memory that was allocated for all devices (see above) on each device */
+ foreach (const vector<SubDevice *> &island, peer_islands) {
+ SubDevice *owner_sub = find_matching_mem_device(key, *island.front());
+ mem.device = owner_sub->device;
+ mem.device_pointer = owner_sub->ptr_map[key];
+ mem.device_size = existing_size;
+
+ owner_sub->device->mem_free(mem);
+ owner_sub->ptr_map.erase(owner_sub->ptr_map.find(key));
+
+ if (mem.type == MEM_TEXTURE) {
+ /* Free texture objects on all devices */
+ foreach (SubDevice *island_sub, island) {
+ if (island_sub != owner_sub) {
+ island_sub->device->mem_free(mem);
+ }
+ }
+ }
+ }
+
+ mem.device = this;
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+ stats.mem_free(existing_size);
+ }
+
+ void const_copy_to(const char *name, void *host, size_t size) override
+ {
+ foreach (SubDevice &sub, devices)
+ sub.device->const_copy_to(name, host, size);
+ }
+
+ int device_number(Device *sub_device) override
+ {
+ int i = 0;
+
+ foreach (SubDevice &sub, devices) {
+ if (sub.device == sub_device)
+ return i;
+ i++;
+ }
+
+ return -1;
+ }
+
+ virtual void foreach_device(const function<void(Device *)> &callback) override
+ {
+ foreach (SubDevice &sub, devices) {
+ sub.device->foreach_device(callback);
+ }
+ }
+};
+
+Device *device_multi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+{
+ return new MultiDevice(info, stats, profiler);
+}
+
+CCL_NAMESPACE_END
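
The memory distribution above boils down to one owner-selection rule per peer island: a key that is already mapped stays with the device that maps it, and a zero key (memory not yet allocated anywhere) goes to the island member with the lowest memory usage. A reduced standalone sketch of find_suitable_mem_device() (not part of the patch; Sub is an illustrative stand-in for MultiDevice::SubDevice):

#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>

struct Sub {
  std::map<uint64_t, uint64_t> ptr_map; /* multi-device key -> per-device pointer */
  size_t mem_used = 0;
};

static Sub *find_suitable_mem_device(uint64_t key, const std::vector<Sub *> &island)
{
  Sub *owner = island.front();
  for (Sub *sub : island) {
    /* Existing allocations stay with their owner; new ones (key == 0) go to
     * the device with the lowest memory usage. */
    if (key ? (sub->ptr_map.count(key) != 0) : (sub->mem_used < owner->mem_used)) {
      owner = sub;
    }
  }
  return owner;
}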
diff --git a/intern/cycles/device/multi/device.h b/intern/cycles/device/multi/device.h
new file mode 100644
index 00000000000..6e121014a1f
--- /dev/null
+++ b/intern/cycles/device/multi/device.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_string.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+Device *device_multi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/opencl/device_opencl.h b/intern/cycles/device/opencl/device_opencl.h
deleted file mode 100644
index a65e764b0d4..00000000000
--- a/intern/cycles/device/opencl/device_opencl.h
+++ /dev/null
@@ -1,658 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPENCL
-
-# include "device/device.h"
-# include "device/device_denoising.h"
-# include "device/device_split_kernel.h"
-
-# include "util/util_map.h"
-# include "util/util_param.h"
-# include "util/util_string.h"
-# include "util/util_task.h"
-
-# include "clew.h"
-
-# include "device/opencl/memory_manager.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* Disable workarounds, seems to be working fine on latest drivers. */
-# define CYCLES_DISABLE_DRIVER_WORKAROUNDS
-
-/* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workarounds for testing. */
-# ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS
-/* Work around AMD driver hangs by ensuring each command is finished before doing anything else. */
-# undef clEnqueueNDRangeKernel
-# define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \
- CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \
- clFinish(a);
-
-# undef clEnqueueWriteBuffer
-# define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \
- CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \
- clFinish(a);
-
-# undef clEnqueueReadBuffer
-# define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \
- CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \
- clFinish(a);
-# endif /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */
-
-# define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
-
-struct OpenCLPlatformDevice {
- OpenCLPlatformDevice(cl_platform_id platform_id,
- const string &platform_name,
- cl_device_id device_id,
- cl_device_type device_type,
- const string &device_name,
- const string &hardware_id,
- const string &device_extensions)
- : platform_id(platform_id),
- platform_name(platform_name),
- device_id(device_id),
- device_type(device_type),
- device_name(device_name),
- hardware_id(hardware_id),
- device_extensions(device_extensions)
- {
- }
- cl_platform_id platform_id;
- string platform_name;
- cl_device_id device_id;
- cl_device_type device_type;
- string device_name;
- string hardware_id;
- string device_extensions;
-};
-
-/* Contains all static OpenCL helper functions. */
-class OpenCLInfo {
- public:
- static cl_device_type device_type();
- static bool use_debug();
- static bool device_supported(const string &platform_name, const cl_device_id device_id);
- static bool platform_version_check(cl_platform_id platform, string *error = NULL);
- static bool device_version_check(cl_device_id device, string *error = NULL);
- static bool get_device_version(cl_device_id device,
- int *r_major,
- int *r_minor,
- string *error = NULL);
- static string get_hardware_id(const string &platform_name, cl_device_id device_id);
- static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices);
-
- /* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */
-
- /* Platform information. */
- static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL);
- static cl_uint get_num_platforms();
-
- static bool get_platforms(vector<cl_platform_id> *platform_ids, cl_int *error = NULL);
- static vector<cl_platform_id> get_platforms();
-
- static bool get_platform_name(cl_platform_id platform_id, string *platform_name);
- static string get_platform_name(cl_platform_id platform_id);
-
- static bool get_num_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type,
- cl_uint *num_devices,
- cl_int *error = NULL);
- static cl_uint get_num_platform_devices(cl_platform_id platform_id, cl_device_type device_type);
-
- static bool get_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type,
- vector<cl_device_id> *device_ids,
- cl_int *error = NULL);
- static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type);
-
- /* Device information. */
- static bool get_device_name(cl_device_id device_id, string *device_name, cl_int *error = NULL);
-
- static string get_device_name(cl_device_id device_id);
-
- static bool get_device_extensions(cl_device_id device_id,
- string *device_extensions,
- cl_int *error = NULL);
-
- static string get_device_extensions(cl_device_id device_id);
-
- static bool get_device_type(cl_device_id device_id,
- cl_device_type *device_type,
- cl_int *error = NULL);
- static cl_device_type get_device_type(cl_device_id device_id);
-
- static bool get_driver_version(cl_device_id device_id,
- int *major,
- int *minor,
- cl_int *error = NULL);
-
- static int mem_sub_ptr_alignment(cl_device_id device_id);
-
-  /* Get a somewhat more readable device name.
-   * The main difference is with AMD OpenCL, where the regular device name is
-   * only a code name; this returns a saner device name using some extensions.
-   */
- static string get_readable_device_name(cl_device_id device_id);
-};
-
-/* Thread safe cache for contexts and programs.
- */
-class OpenCLCache {
- struct Slot {
- struct ProgramEntry {
- ProgramEntry();
- ProgramEntry(const ProgramEntry &rhs);
- ~ProgramEntry();
- cl_program program;
- thread_mutex *mutex;
- };
-
- Slot();
- Slot(const Slot &rhs);
- ~Slot();
-
- thread_mutex *context_mutex;
- cl_context context;
- typedef map<ustring, ProgramEntry> EntryMap;
- EntryMap programs;
- };
-
- /* key is combination of platform ID and device ID */
- typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair;
-
- /* map of Slot objects */
- typedef map<PlatformDevicePair, Slot> CacheMap;
- CacheMap cache;
-
- /* MD5 hash of the kernel source. */
- string kernel_md5;
-
- thread_mutex cache_lock;
- thread_mutex kernel_md5_lock;
-
- /* lazy instantiate */
- static OpenCLCache &global_instance();
-
- public:
- enum ProgramName {
- OCL_DEV_BASE_PROGRAM,
- OCL_DEV_MEGAKERNEL_PROGRAM,
- };
-
- /* Lookup context in the cache. If this returns NULL, slot_locker
- * will be holding a lock for the cache. slot_locker should refer to a
- * default constructed thread_scoped_lock. */
- static cl_context get_context(cl_platform_id platform,
- cl_device_id device,
- thread_scoped_lock &slot_locker);
- /* Same as above. */
- static cl_program get_program(cl_platform_id platform,
- cl_device_id device,
- ustring key,
- thread_scoped_lock &slot_locker);
-
- /* Store context in the cache. You MUST have tried to get the item before storing to it. */
- static void store_context(cl_platform_id platform,
- cl_device_id device,
- cl_context context,
- thread_scoped_lock &slot_locker);
- /* Same as above. */
- static void store_program(cl_platform_id platform,
- cl_device_id device,
- cl_program program,
- ustring key,
- thread_scoped_lock &slot_locker);
-
- static string get_kernel_md5();
-};
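The get/store pairs above implement a double-checked pattern: a failed lookup leaves `slot_locker` holding the slot lock, so the caller can create and publish the object race-free. A minimal usage sketch, mirroring what the OpenCLDevice constructor does further down in this file (context_props and ciErr stand for the caller's own setup):

  thread_scoped_lock cache_locker;
  cl_context context = OpenCLCache::get_context(platform, device, cache_locker);
  if (context == NULL) {
    /* Cache miss: cache_locker now holds the slot lock, so creating and
     * storing the context cannot race with other threads. */
    context = clCreateContext(context_props, 1, &device, NULL, NULL, &ciErr);
    OpenCLCache::store_context(platform, device, context, cache_locker);
  }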
-
-# define opencl_device_assert(device, stmt) \
- { \
- cl_int err = stmt; \
-\
- if (err != CL_SUCCESS) { \
- string message = string_printf( \
- "OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
- if ((device)->error_message() == "") { \
- (device)->set_error(message); \
- } \
- fprintf(stderr, "%s\n", message.c_str()); \
- } \
- } \
- (void)0
-
-# define opencl_assert(stmt) \
- { \
- cl_int err = stmt; \
-\
- if (err != CL_SUCCESS) { \
- string message = string_printf( \
- "OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
- if (error_msg == "") { \
- error_msg = message; \
- } \
- fprintf(stderr, "%s\n", message.c_str()); \
- } \
- } \
- (void)0
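Both macros end in `(void)0` so that an invocation behaves like a single statement and requires the usual trailing semicolon, matching how plain OpenCL calls read elsewhere in this file, for example:

  opencl_assert(clFlush(cqCommandQueue));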
-
-class OpenCLDevice : public Device {
- public:
- DedicatedTaskPool task_pool;
-
- /* Task pool for required kernels (base, AO kernels during foreground rendering) */
- TaskPool load_required_kernel_task_pool;
- /* Task pool for optional kernels (feature kernels during foreground rendering) */
- TaskPool load_kernel_task_pool;
- std::atomic<int> load_kernel_num_compiling;
-
- cl_context cxContext;
- cl_command_queue cqCommandQueue;
- cl_platform_id cpPlatform;
- cl_device_id cdDevice;
- cl_int ciErr;
- int device_num;
-
- class OpenCLProgram {
- public:
- OpenCLProgram() : loaded(false), needs_compiling(true), program(NULL), device(NULL)
- {
- }
- OpenCLProgram(OpenCLDevice *device,
- const string &program_name,
- const string &kernel_name,
- const string &kernel_build_options,
- bool use_stdout = true);
- ~OpenCLProgram();
-
- void add_kernel(ustring name);
-
-    /* Try to load the program from the device cache or disk. */
-    bool load();
-    /* Compile the kernel (first in a separate process, falling back to in-process compilation). */
-    void compile();
-    /* Create the OpenCL kernels after loading or compiling. */
-    void create_kernels();
-
- bool is_loaded() const
- {
- return loaded;
- }
- const string &get_log() const
- {
- return log;
- }
- void report_error();
-
-    /* Wait until this kernel is available to be used.
-     * Returns true when the kernel is available, and false when it is not
-     * available or could not be loaded. */
- bool wait_for_availability();
-
- cl_kernel operator()();
- cl_kernel operator()(ustring name);
-
- void release();
-
- private:
- bool build_kernel(const string *debug_src);
-    /* Build the program by launching a separate process of our own executable.
-     * This is required for multithreaded OpenCL compilation, since most
-     * frameworks serialize build calls internally if they come from the same
-     * process. If that is not supported, this function just returns false.
-     */
- bool compile_separate(const string &clbin);
- /* Build the program by calling OpenCL directly. */
- bool compile_kernel(const string *debug_src);
- /* Loading and saving the program from/to disk. */
- bool load_binary(const string &clbin, const string *debug_src = NULL);
- bool save_binary(const string &clbin);
-
- void add_log(const string &msg, bool is_debug);
- void add_error(const string &msg);
-
- bool loaded;
- bool needs_compiling;
-
- cl_program program;
- OpenCLDevice *device;
-
- /* Used for the OpenCLCache key. */
- string program_name;
-
- string kernel_file, kernel_build_options, device_md5;
-
- bool use_stdout;
- string log, error_msg;
- string compile_output;
-
- map<ustring, cl_kernel> kernels;
- };
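The typical lifecycle of an OpenCLProgram, as used by load_required_kernels() later in this file: construct it with program/file names and build options, register kernel names, then load from cache or compile, and finally fetch kernels by name. A condensed sketch (build_options stands for a string produced by get_build_options()):

  OpenCLProgram program(device, "base", "kernel_base.cl", build_options);
  program.add_kernel(ustring("zero_buffer"));
  if (!program.load()) {
    program.compile(); /* Or push the compile to a task pool, as the device does. */
  }
  cl_kernel kernel = program(ustring("zero_buffer"));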
-
- /* Container for all types of split programs. */
- class OpenCLSplitPrograms {
- public:
- OpenCLDevice *device;
- OpenCLProgram program_split;
- OpenCLProgram program_lamp_emission;
- OpenCLProgram program_do_volume;
- OpenCLProgram program_indirect_background;
- OpenCLProgram program_shader_eval;
- OpenCLProgram program_holdout_emission_blurring_pathtermination_ao;
- OpenCLProgram program_subsurface_scatter;
- OpenCLProgram program_direct_lighting;
- OpenCLProgram program_shadow_blocked_ao;
- OpenCLProgram program_shadow_blocked_dl;
-
- OpenCLSplitPrograms(OpenCLDevice *device);
- ~OpenCLSplitPrograms();
-
- /* Load the kernels and put the created kernels in the given
- * `programs` parameter. */
- void load_kernels(vector<OpenCLProgram *> &programs,
- const DeviceRequestedFeatures &requested_features);
- };
-
- DeviceSplitKernel *split_kernel;
-
- OpenCLProgram base_program;
- OpenCLProgram bake_program;
- OpenCLProgram displace_program;
- OpenCLProgram background_program;
- OpenCLProgram denoising_program;
-
- OpenCLSplitPrograms kernel_programs;
-
- typedef map<string, device_vector<uchar> *> ConstMemMap;
- typedef map<string, device_ptr> MemMap;
-
- ConstMemMap const_mem_map;
- MemMap mem_map;
-
- bool device_initialized;
- string platform_name;
- string device_name;
-
- bool opencl_error(cl_int err);
- void opencl_error(const string &message);
- void opencl_assert_err(cl_int err, const char *where);
-
- OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
- ~OpenCLDevice();
-
- static void CL_CALLBACK context_notify_callback(const char *err_info,
- const void * /*private_info*/,
- size_t /*cb*/,
- void *user_data);
-
- bool opencl_version_check();
- OpenCLSplitPrograms *get_split_programs();
-
- string device_md5_hash(string kernel_custom_build_options = "");
- bool load_kernels(const DeviceRequestedFeatures &requested_features);
- void load_required_kernels(const DeviceRequestedFeatures &requested_features);
-
- bool wait_for_availability(const DeviceRequestedFeatures &requested_features);
- DeviceKernelStatus get_active_kernel_switch_state();
-
-  /* Get the name of the OpenCL program for the given kernel. */
-  const string get_opencl_program_name(const string &kernel_name);
-  /* Get the program file name to compile (*.cl) for the given kernel. */
-  const string get_opencl_program_filename(const string &kernel_name);
- string get_build_options(const DeviceRequestedFeatures &requested_features,
- const string &opencl_program_name);
- /* Enable the default features to reduce recompilation events */
- void enable_default_features(DeviceRequestedFeatures &features);
-
- void mem_alloc(device_memory &mem);
- void mem_copy_to(device_memory &mem);
- void mem_copy_from(device_memory &mem, int y, int w, int h, int elem);
- void mem_zero(device_memory &mem);
- void mem_free(device_memory &mem);
-
- int mem_sub_ptr_alignment();
-
- void const_copy_to(const char *name, void *host, size_t size);
- void global_alloc(device_memory &mem);
- void global_free(device_memory &mem);
- void tex_alloc(device_texture &mem);
- void tex_free(device_texture &mem);
-
- size_t global_size_round_up(int group_size, int global_size);
- void enqueue_kernel(cl_kernel kernel,
- size_t w,
- size_t h,
- bool x_workgroups = false,
- size_t max_workgroup_size = -1);
- void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
- void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
-
- void film_convert(DeviceTask &task,
- device_ptr buffer,
- device_ptr rgba_byte,
- device_ptr rgba_half);
- void shader(DeviceTask &task);
- void update_adaptive(DeviceTask &task, RenderTile &tile, int sample);
- void bake(DeviceTask &task, RenderTile &tile);
-
- void denoise(RenderTile &tile, DenoisingTask &denoising);
-
- int get_split_task_count(DeviceTask & /*task*/)
- {
- return 1;
- }
-
- void task_add(DeviceTask &task)
- {
- task_pool.push([=] {
- DeviceTask task_copy = task;
- thread_run(task_copy);
- });
- }
-
- void task_wait()
- {
- task_pool.wait();
- }
-
- void task_cancel()
- {
- task_pool.cancel();
- }
-
- void thread_run(DeviceTask &task);
-
- virtual BVHLayoutMask get_bvh_layout_mask() const
- {
- return BVH_LAYOUT_BVH2;
- }
-
- virtual bool show_samples() const
- {
- return true;
- }
-
- protected:
- string kernel_build_options(const string *debug_src = NULL);
-
- void mem_zero_kernel(device_ptr ptr, size_t size);
-
- bool denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task);
- bool denoising_construct_transform(DenoisingTask *task);
- bool denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task);
- bool denoising_solve(device_ptr output_ptr, DenoisingTask *task);
- bool denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect,
- DenoisingTask *task);
- bool denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task);
- bool denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task);
- bool denoising_write_feature(int to_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task);
- bool denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task);
-
- device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int size);
- void mem_free_sub_ptr(device_ptr ptr);
-
- class ArgumentWrapper {
- public:
- ArgumentWrapper() : size(0), pointer(NULL)
- {
- }
-
- ArgumentWrapper(device_memory &argument)
- : size(sizeof(void *)), pointer((void *)(&argument.device_pointer))
- {
- }
-
- template<typename T>
- ArgumentWrapper(device_vector<T> &argument)
- : size(sizeof(void *)), pointer((void *)(&argument.device_pointer))
- {
- }
-
- template<typename T>
- ArgumentWrapper(device_only_memory<T> &argument)
- : size(sizeof(void *)), pointer((void *)(&argument.device_pointer))
- {
- }
- template<typename T> ArgumentWrapper(T &argument) : size(sizeof(argument)), pointer(&argument)
- {
- }
-
- ArgumentWrapper(int argument) : size(sizeof(int)), int_value(argument), pointer(&int_value)
- {
- }
-
- ArgumentWrapper(float argument)
- : size(sizeof(float)), float_value(argument), pointer(&float_value)
- {
- }
-
- size_t size;
- int int_value;
- float float_value;
- void *pointer;
- };
-
-  /* TODO(sergey): In the future we can use variadic templates, once
-   * C++0x is allowed. That should allow cleaning this up a bit.
-   */
- int kernel_set_args(cl_kernel kernel,
- int start_argument_index,
- const ArgumentWrapper &arg1 = ArgumentWrapper(),
- const ArgumentWrapper &arg2 = ArgumentWrapper(),
- const ArgumentWrapper &arg3 = ArgumentWrapper(),
- const ArgumentWrapper &arg4 = ArgumentWrapper(),
- const ArgumentWrapper &arg5 = ArgumentWrapper(),
- const ArgumentWrapper &arg6 = ArgumentWrapper(),
- const ArgumentWrapper &arg7 = ArgumentWrapper(),
- const ArgumentWrapper &arg8 = ArgumentWrapper(),
- const ArgumentWrapper &arg9 = ArgumentWrapper(),
- const ArgumentWrapper &arg10 = ArgumentWrapper(),
- const ArgumentWrapper &arg11 = ArgumentWrapper(),
- const ArgumentWrapper &arg12 = ArgumentWrapper(),
- const ArgumentWrapper &arg13 = ArgumentWrapper(),
- const ArgumentWrapper &arg14 = ArgumentWrapper(),
- const ArgumentWrapper &arg15 = ArgumentWrapper(),
- const ArgumentWrapper &arg16 = ArgumentWrapper(),
- const ArgumentWrapper &arg17 = ArgumentWrapper(),
- const ArgumentWrapper &arg18 = ArgumentWrapper(),
- const ArgumentWrapper &arg19 = ArgumentWrapper(),
- const ArgumentWrapper &arg20 = ArgumentWrapper(),
- const ArgumentWrapper &arg21 = ArgumentWrapper(),
- const ArgumentWrapper &arg22 = ArgumentWrapper(),
- const ArgumentWrapper &arg23 = ArgumentWrapper(),
- const ArgumentWrapper &arg24 = ArgumentWrapper(),
- const ArgumentWrapper &arg25 = ArgumentWrapper(),
- const ArgumentWrapper &arg26 = ArgumentWrapper(),
- const ArgumentWrapper &arg27 = ArgumentWrapper(),
- const ArgumentWrapper &arg28 = ArgumentWrapper(),
- const ArgumentWrapper &arg29 = ArgumentWrapper(),
- const ArgumentWrapper &arg30 = ArgumentWrapper(),
- const ArgumentWrapper &arg31 = ArgumentWrapper(),
- const ArgumentWrapper &arg32 = ArgumentWrapper(),
- const ArgumentWrapper &arg33 = ArgumentWrapper());
-
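For reference, the variadic-template replacement that the TODO above alludes to could look roughly as follows. This is only a sketch, assuming C++17 fold expressions are available (they were not when this code was written); it wraps each argument in an ArgumentWrapper and issues one clSetKernelArg() per parameter:

  template<typename... Args>
  int kernel_set_args_variadic(cl_kernel kernel, int start_index, Args &...args)
  {
    int index = start_index;
    /* Fold over the comma operator: one clSetKernelArg() per argument. */
    (
        [&](const ArgumentWrapper &wrapped) {
          opencl_assert(clSetKernelArg(kernel, index++, wrapped.size, wrapped.pointer));
        }(ArgumentWrapper(args)),
        ...);
    return index - start_index;
  }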
- void release_kernel_safe(cl_kernel kernel);
- void release_mem_object_safe(cl_mem mem);
- void release_program_safe(cl_program program);
-
-  /* ** These helpers work around some compiler-specific bugs. ** */
-
- cl_program load_cached_kernel(ustring key, thread_scoped_lock &cache_locker);
-
- void store_cached_kernel(cl_program program, ustring key, thread_scoped_lock &cache_locker);
-
- private:
- MemoryManager memory_manager;
- friend class MemoryManager;
-
- static_assert_align(TextureInfo, 16);
- device_vector<TextureInfo> texture_info;
-
- typedef map<string, device_memory *> TexturesMap;
- TexturesMap textures;
-
- bool textures_need_update;
-
- protected:
- void flush_texture_buffers();
-
- friend class OpenCLSplitKernel;
- friend class OpenCLSplitKernelFunction;
-};
-
-Device *opencl_create_split_device(DeviceInfo &info,
- Stats &stats,
- Profiler &profiler,
- bool background);
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/device/opencl/device_opencl_impl.cpp b/intern/cycles/device/opencl/device_opencl_impl.cpp
deleted file mode 100644
index 31a2265700c..00000000000
--- a/intern/cycles/device/opencl/device_opencl_impl.cpp
+++ /dev/null
@@ -1,2113 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPENCL
-
-# include "device/opencl/device_opencl.h"
-
-# include "kernel/kernel_types.h"
-# include "kernel/split/kernel_split_data_types.h"
-
-# include "util/util_algorithm.h"
-# include "util/util_debug.h"
-# include "util/util_foreach.h"
-# include "util/util_logging.h"
-# include "util/util_md5.h"
-# include "util/util_path.h"
-# include "util/util_time.h"
-
-CCL_NAMESPACE_BEGIN
-
-struct texture_slot_t {
- texture_slot_t(const string &name, int slot) : name(name), slot(slot)
- {
- }
- string name;
- int slot;
-};
-
-static const string NON_SPLIT_KERNELS =
- "denoising "
- "base "
- "background "
- "displace ";
-
-static const string SPLIT_BUNDLE_KERNELS =
- "data_init "
- "path_init "
- "state_buffer_size "
- "scene_intersect "
- "queue_enqueue "
- "shader_setup "
- "shader_sort "
- "enqueue_inactive "
- "next_iteration_setup "
- "indirect_subsurface "
- "buffer_update "
- "adaptive_stopping "
- "adaptive_filter_x "
- "adaptive_filter_y "
- "adaptive_adjust_samples";
-
-const string OpenCLDevice::get_opencl_program_name(const string &kernel_name)
-{
- if (NON_SPLIT_KERNELS.find(kernel_name) != std::string::npos) {
- return kernel_name;
- }
- else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) {
- return "split_bundle";
- }
- else {
- return "split_" + kernel_name;
- }
-}
-
-const string OpenCLDevice::get_opencl_program_filename(const string &kernel_name)
-{
- if (kernel_name == "denoising") {
- return "filter.cl";
- }
- else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) {
- return "kernel_split_bundle.cl";
- }
- else {
- return "kernel_" + kernel_name + ".cl";
- }
-}
-
-/* Enable features that we always want to compile in, to reduce recompilation events. */
-void OpenCLDevice::enable_default_features(DeviceRequestedFeatures &features)
-{
- features.use_transparent = true;
- features.use_shadow_tricks = true;
- features.use_principled = true;
- features.use_denoising = true;
-
- if (!background) {
- features.max_nodes_group = NODE_GROUP_LEVEL_MAX;
- features.nodes_features = NODE_FEATURE_ALL;
- features.use_hair = true;
- features.use_subsurface = true;
- features.use_camera_motion = false;
- features.use_object_motion = false;
- }
-}
-
-string OpenCLDevice::get_build_options(const DeviceRequestedFeatures &requested_features,
- const string &opencl_program_name)
-{
- /* first check for non-split kernel programs */
- if (opencl_program_name == "base" || opencl_program_name == "denoising") {
- return "";
- }
- else if (opencl_program_name == "bake") {
-    /* NOTE: get_build_options() for bake is only requested when baking is
-     * enabled; displace and background are always requested.
-     * `__SPLIT_KERNEL__` must not be present in the compile directives for bake. */
- DeviceRequestedFeatures features(requested_features);
- enable_default_features(features);
- features.use_denoising = false;
- features.use_object_motion = false;
- features.use_camera_motion = false;
- features.use_hair = true;
- features.use_subsurface = true;
- features.max_nodes_group = NODE_GROUP_LEVEL_MAX;
- features.nodes_features = NODE_FEATURE_ALL;
- features.use_integrator_branched = false;
- return features.get_build_options();
- }
- else if (opencl_program_name == "displace") {
-    /* Displacement does not use any nodes from the Shading group (e.g. BSDF),
-     * so we disable all features that are related to shading. */
- DeviceRequestedFeatures features(requested_features);
- enable_default_features(features);
- features.use_denoising = false;
- features.use_object_motion = false;
- features.use_camera_motion = false;
- features.use_baking = false;
- features.use_transparent = false;
- features.use_shadow_tricks = false;
- features.use_subsurface = false;
- features.use_volume = false;
- features.nodes_features &= ~NODE_FEATURE_VOLUME;
- features.use_denoising = false;
- features.use_principled = false;
- features.use_integrator_branched = false;
- return features.get_build_options();
- }
- else if (opencl_program_name == "background") {
-    /* Background rendering only uses Background shading, so it is safe to
-     * disable shadow features, subsurface and volumetrics. */
- DeviceRequestedFeatures features(requested_features);
- enable_default_features(features);
- features.use_baking = false;
- features.use_object_motion = false;
- features.use_camera_motion = false;
- features.use_transparent = false;
- features.use_shadow_tricks = false;
- features.use_denoising = false;
-    /* NOTE: it is currently possible to use surface nodes like `Hair Info` and
-     * `Bump` in background shaders. Perhaps we should hide them in the UI, as
-     * they do not make sense when rendering the background. */
- features.nodes_features &= ~NODE_FEATURE_VOLUME;
- features.use_subsurface = false;
- features.use_volume = false;
- features.use_shader_raytrace = false;
- features.use_patch_evaluation = false;
- features.use_integrator_branched = false;
- return features.get_build_options();
- }
-
- string build_options = "-D__SPLIT_KERNEL__ ";
- /* Set compute device build option. */
- cl_device_type device_type;
- OpenCLInfo::get_device_type(this->cdDevice, &device_type, &this->ciErr);
- assert(this->ciErr == CL_SUCCESS);
- if (device_type == CL_DEVICE_TYPE_GPU) {
- build_options += "-D__COMPUTE_DEVICE_GPU__ ";
- }
-
- DeviceRequestedFeatures nofeatures;
- enable_default_features(nofeatures);
-
-  /* Add program-specific optimized compile directives. */
- if (opencl_program_name == "split_do_volume" && !requested_features.use_volume) {
- build_options += nofeatures.get_build_options();
- }
- else {
- DeviceRequestedFeatures features(requested_features);
- enable_default_features(features);
-
-    /* Always turn off baking at this point. Baking is only useful when building
-     * the bake kernel. This also makes sure that the kernels built during baking
-     * can be reused when not doing any baking. */
- features.use_baking = false;
-
-    /* Do not specialize on shader features when the program doesn't do any
-     * shading; these kernels have been bundled into a single program. */
- if (opencl_program_name == "split_bundle") {
- features.max_nodes_group = 0;
- features.nodes_features = 0;
- features.use_shader_raytrace = false;
- }
-
- /* No specific settings, just add the regular ones */
- build_options += features.get_build_options();
- }
-
- return build_options;
-}
-
-OpenCLDevice::OpenCLSplitPrograms::OpenCLSplitPrograms(OpenCLDevice *device_)
-{
- device = device_;
-}
-
-OpenCLDevice::OpenCLSplitPrograms::~OpenCLSplitPrograms()
-{
- program_split.release();
- program_lamp_emission.release();
- program_do_volume.release();
- program_indirect_background.release();
- program_shader_eval.release();
- program_holdout_emission_blurring_pathtermination_ao.release();
- program_subsurface_scatter.release();
- program_direct_lighting.release();
- program_shadow_blocked_ao.release();
- program_shadow_blocked_dl.release();
-}
-
-void OpenCLDevice::OpenCLSplitPrograms::load_kernels(
- vector<OpenCLProgram *> &programs, const DeviceRequestedFeatures &requested_features)
-{
- if (!requested_features.use_baking) {
-# define ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(kernel_name) \
- program_split.add_kernel(ustring("path_trace_" #kernel_name));
-# define ADD_SPLIT_KERNEL_PROGRAM(kernel_name) \
- const string program_name_##kernel_name = "split_" #kernel_name; \
- program_##kernel_name = OpenCLDevice::OpenCLProgram( \
- device, \
- program_name_##kernel_name, \
- "kernel_" #kernel_name ".cl", \
- device->get_build_options(requested_features, program_name_##kernel_name)); \
- program_##kernel_name.add_kernel(ustring("path_trace_" #kernel_name)); \
- programs.push_back(&program_##kernel_name);
-
- /* Ordered with most complex kernels first, to reduce overall compile time. */
- ADD_SPLIT_KERNEL_PROGRAM(subsurface_scatter);
- ADD_SPLIT_KERNEL_PROGRAM(direct_lighting);
- ADD_SPLIT_KERNEL_PROGRAM(indirect_background);
- if (requested_features.use_volume) {
- ADD_SPLIT_KERNEL_PROGRAM(do_volume);
- }
- ADD_SPLIT_KERNEL_PROGRAM(shader_eval);
- ADD_SPLIT_KERNEL_PROGRAM(lamp_emission);
- ADD_SPLIT_KERNEL_PROGRAM(holdout_emission_blurring_pathtermination_ao);
- ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_dl);
- ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_ao);
-
- /* Quick kernels bundled in a single program to reduce overhead of starting
- * Blender processes. */
- program_split = OpenCLDevice::OpenCLProgram(
- device,
- "split_bundle",
- "kernel_split_bundle.cl",
- device->get_build_options(requested_features, "split_bundle"));
-
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(data_init);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(state_buffer_size);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(path_init);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(scene_intersect);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(queue_enqueue);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_setup);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_sort);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(enqueue_inactive);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(next_iteration_setup);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(indirect_subsurface);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(buffer_update);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_stopping);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_x);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_y);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_adjust_samples);
- programs.push_back(&program_split);
-
-# undef ADD_SPLIT_KERNEL_PROGRAM
-# undef ADD_SPLIT_KERNEL_BUNDLE_PROGRAM
- }
-}
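For clarity, a single expansion of the macro above: ADD_SPLIT_KERNEL_PROGRAM(do_volume) expands to roughly

  const string program_name_do_volume = "split_do_volume";
  program_do_volume = OpenCLDevice::OpenCLProgram(
      device,
      program_name_do_volume,
      "kernel_do_volume.cl",
      device->get_build_options(requested_features, program_name_do_volume));
  program_do_volume.add_kernel(ustring("path_trace_do_volume"));
  programs.push_back(&program_do_volume);

while ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(data_init) only registers path_trace_data_init on the shared program_split.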
-
-namespace {
-
-/* Dummy copy of the OpenCL KernelGlobals from kernel_globals.h, used only to
- * query its size.
- */
-typedef struct KernelGlobalsDummy {
- ccl_constant KernelData *data;
- ccl_global char *buffers[8];
-
-# define KERNEL_TEX(type, name) TextureInfo name;
-# include "kernel/kernel_textures.h"
-# undef KERNEL_TEX
- SplitData split_data;
- SplitParams split_param_data;
-} KernelGlobalsDummy;
-
-} // namespace
-
-struct CachedSplitMemory {
- int id;
- device_memory *split_data;
- device_memory *ray_state;
- device_memory *queue_index;
- device_memory *use_queues_flag;
- device_memory *work_pools;
- device_ptr *buffer;
-};
-
-class OpenCLSplitKernelFunction : public SplitKernelFunction {
- public:
- OpenCLDevice *device;
- OpenCLDevice::OpenCLProgram program;
- CachedSplitMemory &cached_memory;
- int cached_id;
-
- OpenCLSplitKernelFunction(OpenCLDevice *device, CachedSplitMemory &cached_memory)
- : device(device), cached_memory(cached_memory), cached_id(cached_memory.id - 1)
- {
- }
-
- ~OpenCLSplitKernelFunction()
- {
- program.release();
- }
-
- virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data)
- {
- if (cached_id != cached_memory.id) {
- cl_uint start_arg_index = device->kernel_set_args(
- program(), 0, kg, data, *cached_memory.split_data, *cached_memory.ray_state);
-
- device->set_kernel_arg_buffers(program(), &start_arg_index);
-
- start_arg_index += device->kernel_set_args(program(),
- start_arg_index,
- *cached_memory.queue_index,
- *cached_memory.use_queues_flag,
- *cached_memory.work_pools,
- *cached_memory.buffer);
-
- cached_id = cached_memory.id;
- }
-
- device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
- program(),
- 2,
- NULL,
- dim.global_size,
- dim.local_size,
- 0,
- NULL,
- NULL);
-
- device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
-
- if (device->ciErr != CL_SUCCESS) {
- string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
- clewErrorString(device->ciErr));
- device->opencl_error(message);
- return false;
- }
-
- return true;
- }
-};
-
-class OpenCLSplitKernel : public DeviceSplitKernel {
- OpenCLDevice *device;
- CachedSplitMemory cached_memory;
-
- public:
- explicit OpenCLSplitKernel(OpenCLDevice *device) : DeviceSplitKernel(device), device(device)
- {
- }
-
- virtual SplitKernelFunction *get_split_kernel_function(
- const string &kernel_name, const DeviceRequestedFeatures &requested_features)
- {
- OpenCLSplitKernelFunction *kernel = new OpenCLSplitKernelFunction(device, cached_memory);
-
- const string program_name = device->get_opencl_program_name(kernel_name);
- kernel->program = OpenCLDevice::OpenCLProgram(
- device,
- program_name,
- device->get_opencl_program_filename(kernel_name),
- device->get_build_options(requested_features, program_name));
-
- kernel->program.add_kernel(ustring("path_trace_" + kernel_name));
- kernel->program.load();
-
- if (!kernel->program.is_loaded()) {
- delete kernel;
- return NULL;
- }
-
- return kernel;
- }
-
- virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads)
- {
- device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
- size_buffer.alloc(1);
- size_buffer.zero_to_device();
-
- uint threads = num_threads;
- OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs();
- cl_kernel kernel_state_buffer_size = programs->program_split(
- ustring("path_trace_state_buffer_size"));
- device->kernel_set_args(kernel_state_buffer_size, 0, kg, data, threads, size_buffer);
-
- size_t global_size = 64;
- device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
- kernel_state_buffer_size,
- 1,
- NULL,
- &global_size,
- NULL,
- 0,
- NULL,
- NULL);
-
- device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
-
- size_buffer.copy_from_device(0, 1, 1);
- size_t size = size_buffer[0];
- size_buffer.free();
-
- if (device->ciErr != CL_SUCCESS) {
- string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
- clewErrorString(device->ciErr));
- device->opencl_error(message);
- return 0;
- }
-
- return size;
- }
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory &kernel_globals,
- device_memory &kernel_data,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs)
- {
- cl_int dQueue_size = dim.global_size[0] * dim.global_size[1];
-
- /* Set the range of samples to be processed for every ray in
- * path-regeneration logic.
- */
- cl_int start_sample = rtile.start_sample;
- cl_int end_sample = rtile.start_sample + rtile.num_samples;
-
- OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs();
- cl_kernel kernel_data_init = programs->program_split(ustring("path_trace_data_init"));
-
- cl_uint start_arg_index = device->kernel_set_args(kernel_data_init,
- 0,
- kernel_globals,
- kernel_data,
- split_data,
- num_global_elements,
- ray_state);
-
- device->set_kernel_arg_buffers(kernel_data_init, &start_arg_index);
-
- start_arg_index += device->kernel_set_args(kernel_data_init,
- start_arg_index,
- start_sample,
- end_sample,
- rtile.x,
- rtile.y,
- rtile.w,
- rtile.h,
- rtile.offset,
- rtile.stride,
- queue_index,
- dQueue_size,
- use_queues_flag,
- work_pool_wgs,
- rtile.num_samples,
- rtile.buffer);
-
- /* Enqueue ckPathTraceKernel_data_init kernel. */
- device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
- kernel_data_init,
- 2,
- NULL,
- dim.global_size,
- dim.local_size,
- 0,
- NULL,
- NULL);
-
- device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
-
- if (device->ciErr != CL_SUCCESS) {
- string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
- clewErrorString(device->ciErr));
- device->opencl_error(message);
- return false;
- }
-
- cached_memory.split_data = &split_data;
- cached_memory.ray_state = &ray_state;
- cached_memory.queue_index = &queue_index;
- cached_memory.use_queues_flag = &use_queues_flag;
- cached_memory.work_pools = &work_pool_wgs;
- cached_memory.buffer = &rtile.buffer;
- cached_memory.id++;
-
- return true;
- }
-
- virtual int2 split_kernel_local_size()
- {
- return make_int2(64, 1);
- }
-
- virtual int2 split_kernel_global_size(device_memory &kg,
- device_memory &data,
- DeviceTask & /*task*/)
- {
- cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
- /* Use small global size on CPU devices as it seems to be much faster. */
- if (type == CL_DEVICE_TYPE_CPU) {
- VLOG(1) << "Global size: (64, 64).";
- return make_int2(64, 64);
- }
-
- cl_ulong max_buffer_size;
- clGetDeviceInfo(
- device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
-
- if (DebugFlags().opencl.mem_limit) {
- max_buffer_size = min(max_buffer_size,
- cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used));
- }
-
- VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(max_buffer_size)
- << " bytes. (" << string_human_readable_size(max_buffer_size) << ").";
-
-    /* Use at most half of the reported maximum, capped at 2 GB: we shouldn't
-     * need more than that, and some devices may support much more. */
- max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l * 1024 * 1024 * 1024);
-
- size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size);
- int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64),
- (int)sqrt(num_elements));
-
- if (device->info.description.find("Intel") != string::npos) {
- global_size = make_int2(min(512, global_size.x), min(512, global_size.y));
- }
-
- VLOG(1) << "Global size: " << global_size << ".";
- return global_size;
- }
-};
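A worked example of the sizing logic above: if the per-thread state reported by the state_buffer_size kernel allows num_elements = 1,048,576 within the buffer cap, then sqrt(num_elements) = 1024 and the global size becomes (round_down(1024, 64), 1024) = (1024, 1024) threads; on an Intel device this would then be clamped to (512, 512). The num_elements value is illustrative; the real one depends on the reported state size.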
-
-bool OpenCLDevice::opencl_error(cl_int err)
-{
- if (err != CL_SUCCESS) {
- string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err));
- if (error_msg == "")
- error_msg = message;
- fprintf(stderr, "%s\n", message.c_str());
- return true;
- }
-
- return false;
-}
-
-void OpenCLDevice::opencl_error(const string &message)
-{
- if (error_msg == "")
- error_msg = message;
- fprintf(stderr, "%s\n", message.c_str());
-}
-
-void OpenCLDevice::opencl_assert_err(cl_int err, const char *where)
-{
- if (err != CL_SUCCESS) {
- string message = string_printf(
- "OpenCL error (%d): %s in %s", err, clewErrorString(err), where);
- if (error_msg == "")
- error_msg = message;
- fprintf(stderr, "%s\n", message.c_str());
-# ifndef NDEBUG
- abort();
-# endif
- }
-}
-
-OpenCLDevice::OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
- : Device(info, stats, profiler, background),
- load_kernel_num_compiling(0),
- kernel_programs(this),
- memory_manager(this),
- texture_info(this, "__texture_info", MEM_GLOBAL)
-{
- cpPlatform = NULL;
- cdDevice = NULL;
- cxContext = NULL;
- cqCommandQueue = NULL;
- device_initialized = false;
- textures_need_update = true;
-
- vector<OpenCLPlatformDevice> usable_devices;
- OpenCLInfo::get_usable_devices(&usable_devices);
- if (usable_devices.size() == 0) {
- opencl_error("OpenCL: no devices found.");
- return;
- }
- assert(info.num < usable_devices.size());
- OpenCLPlatformDevice &platform_device = usable_devices[info.num];
- device_num = info.num;
- cpPlatform = platform_device.platform_id;
- cdDevice = platform_device.device_id;
- platform_name = platform_device.platform_name;
- device_name = platform_device.device_name;
- VLOG(2) << "Creating new Cycles device for OpenCL platform " << platform_name << ", device "
- << device_name << ".";
-
- {
- /* try to use cached context */
- thread_scoped_lock cache_locker;
- cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
-
- if (cxContext == NULL) {
- /* create context properties array to specify platform */
- const cl_context_properties context_props[] = {
- CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 0, 0};
-
- /* create context */
- cxContext = clCreateContext(
- context_props, 1, &cdDevice, context_notify_callback, cdDevice, &ciErr);
-
- if (opencl_error(ciErr)) {
- opencl_error("OpenCL: clCreateContext failed");
- return;
- }
-
- /* cache it */
- OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
- }
- }
-
- cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
- if (opencl_error(ciErr)) {
- opencl_error("OpenCL: Error creating command queue");
- return;
- }
-
- /* Allocate this right away so that texture_info
- * is placed at offset 0 in the device memory buffers. */
- texture_info.resize(1);
- memory_manager.alloc("texture_info", texture_info);
-
- device_initialized = true;
-
- split_kernel = new OpenCLSplitKernel(this);
-}
-
-OpenCLDevice::~OpenCLDevice()
-{
- task_pool.cancel();
- load_required_kernel_task_pool.cancel();
- load_kernel_task_pool.cancel();
-
- memory_manager.free();
-
- ConstMemMap::iterator mt;
- for (mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
- delete mt->second;
- }
-
- base_program.release();
- bake_program.release();
- displace_program.release();
- background_program.release();
- denoising_program.release();
-
- if (cqCommandQueue)
- clReleaseCommandQueue(cqCommandQueue);
- if (cxContext)
- clReleaseContext(cxContext);
-
- delete split_kernel;
-}
-
-void CL_CALLBACK OpenCLDevice::context_notify_callback(const char *err_info,
- const void * /*private_info*/,
- size_t /*cb*/,
- void *user_data)
-{
- string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data);
- fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info);
-}
-
-bool OpenCLDevice::opencl_version_check()
-{
- string error;
- if (!OpenCLInfo::platform_version_check(cpPlatform, &error)) {
- opencl_error(error);
- return false;
- }
- if (!OpenCLInfo::device_version_check(cdDevice, &error)) {
- opencl_error(error);
- return false;
- }
- return true;
-}
-
-string OpenCLDevice::device_md5_hash(string kernel_custom_build_options)
-{
- MD5Hash md5;
- char version[256], driver[256], name[256], vendor[256];
-
- clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
- clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
- clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
- clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
-
- md5.append((uint8_t *)vendor, strlen(vendor));
- md5.append((uint8_t *)version, strlen(version));
- md5.append((uint8_t *)name, strlen(name));
- md5.append((uint8_t *)driver, strlen(driver));
-
- string options = kernel_build_options();
- options += kernel_custom_build_options;
- md5.append((uint8_t *)options.c_str(), options.size());
-
- return md5.get_hex();
-}
-
-bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures &requested_features)
-{
- VLOG(2) << "Loading kernels for platform " << platform_name << ", device " << device_name << ".";
- /* Verify if device was initialized. */
- if (!device_initialized) {
- fprintf(stderr, "OpenCL: failed to initialize device.\n");
- return false;
- }
-
- /* Verify we have right opencl version. */
- if (!opencl_version_check())
- return false;
-
- load_required_kernels(requested_features);
-
- vector<OpenCLProgram *> programs;
- kernel_programs.load_kernels(programs, requested_features);
-
- if (!requested_features.use_baking && requested_features.use_denoising) {
- denoising_program = OpenCLProgram(
- this, "denoising", "filter.cl", get_build_options(requested_features, "denoising"));
- denoising_program.add_kernel(ustring("filter_divide_shadow"));
- denoising_program.add_kernel(ustring("filter_get_feature"));
- denoising_program.add_kernel(ustring("filter_write_feature"));
- denoising_program.add_kernel(ustring("filter_detect_outliers"));
- denoising_program.add_kernel(ustring("filter_combine_halves"));
- denoising_program.add_kernel(ustring("filter_construct_transform"));
- denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
- denoising_program.add_kernel(ustring("filter_nlm_blur"));
- denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
- denoising_program.add_kernel(ustring("filter_nlm_update_output"));
- denoising_program.add_kernel(ustring("filter_nlm_normalize"));
- denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
- denoising_program.add_kernel(ustring("filter_finalize"));
- programs.push_back(&denoising_program);
- }
-
- load_required_kernel_task_pool.wait_work();
-
- /* Parallel compilation of Cycles kernels, this launches multiple
- * processes to workaround OpenCL frameworks serializing the calls
- * internally within a single process. */
- foreach (OpenCLProgram *program, programs) {
- if (!program->load()) {
- load_kernel_num_compiling++;
- load_kernel_task_pool.push([=] {
- program->compile();
- load_kernel_num_compiling--;
- });
- }
- }
- return true;
-}
-
-void OpenCLDevice::load_required_kernels(const DeviceRequestedFeatures &requested_features)
-{
- vector<OpenCLProgram *> programs;
- base_program = OpenCLProgram(
- this, "base", "kernel_base.cl", get_build_options(requested_features, "base"));
- base_program.add_kernel(ustring("convert_to_byte"));
- base_program.add_kernel(ustring("convert_to_half_float"));
- base_program.add_kernel(ustring("zero_buffer"));
- programs.push_back(&base_program);
-
- if (requested_features.use_true_displacement) {
- displace_program = OpenCLProgram(
- this, "displace", "kernel_displace.cl", get_build_options(requested_features, "displace"));
- displace_program.add_kernel(ustring("displace"));
- programs.push_back(&displace_program);
- }
-
- if (requested_features.use_background_light) {
- background_program = OpenCLProgram(this,
- "background",
- "kernel_background.cl",
- get_build_options(requested_features, "background"));
- background_program.add_kernel(ustring("background"));
- programs.push_back(&background_program);
- }
-
- if (requested_features.use_baking) {
- bake_program = OpenCLProgram(
- this, "bake", "kernel_bake.cl", get_build_options(requested_features, "bake"));
- bake_program.add_kernel(ustring("bake"));
- programs.push_back(&bake_program);
- }
-
- foreach (OpenCLProgram *program, programs) {
- if (!program->load()) {
- load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program));
- }
- }
-}
-
-bool OpenCLDevice::wait_for_availability(const DeviceRequestedFeatures &requested_features)
-{
- if (requested_features.use_baking) {
- /* For baking, kernels have already been loaded in load_required_kernels(). */
- return true;
- }
-
- load_kernel_task_pool.wait_work();
- return split_kernel->load_kernels(requested_features);
-}
-
-OpenCLDevice::OpenCLSplitPrograms *OpenCLDevice::get_split_programs()
-{
- return &kernel_programs;
-}
-
-DeviceKernelStatus OpenCLDevice::get_active_kernel_switch_state()
-{
- return DEVICE_KERNEL_USING_FEATURE_KERNEL;
-}
-
-void OpenCLDevice::mem_alloc(device_memory &mem)
-{
- if (mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
- }
-
- size_t size = mem.memory_size();
-
- /* check there is enough memory available for the allocation */
- cl_ulong max_alloc_size = 0;
- clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL);
-
- if (DebugFlags().opencl.mem_limit) {
- max_alloc_size = min(max_alloc_size, cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used));
- }
-
- if (size > max_alloc_size) {
- string error = "Scene too complex to fit in available memory.";
- if (mem.name != NULL) {
- error += string_printf(" (allocating buffer %s failed.)", mem.name);
- }
- set_error(error);
-
- return;
- }
-
- cl_mem_flags mem_flag;
- void *mem_ptr = NULL;
-
- if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL)
- mem_flag = CL_MEM_READ_ONLY;
- else
- mem_flag = CL_MEM_READ_WRITE;
-
-  /* Zero-size allocations might be requested by the render, but are not really
-   * supported by OpenCL. Using NULL as the device pointer also doesn't work for
-   * some reason, so for the time being we handle this as a special case with a
-   * null buffer.
-   */
- if (size != 0) {
- mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, mem_flag, size, mem_ptr, &ciErr);
- opencl_assert_err(ciErr, "clCreateBuffer");
- }
- else {
- mem.device_pointer = 0;
- }
-
- stats.mem_alloc(size);
- mem.device_size = size;
-}
-
-void OpenCLDevice::mem_copy_to(device_memory &mem)
-{
- if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- global_alloc(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- tex_alloc((device_texture &)mem);
- }
- else {
- if (!mem.device_pointer) {
- mem_alloc(mem);
- }
-
- /* this is blocking */
- size_t size = mem.memory_size();
- if (size != 0) {
- opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
- CL_MEM_PTR(mem.device_pointer),
- CL_TRUE,
- 0,
- size,
- mem.host_pointer,
- 0,
- NULL,
- NULL));
- }
- }
-}
-
-void OpenCLDevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
-{
- size_t offset = elem * y * w;
- size_t size = elem * w * h;
- assert(size != 0);
- opencl_assert(clEnqueueReadBuffer(cqCommandQueue,
- CL_MEM_PTR(mem.device_pointer),
- CL_TRUE,
- offset,
- size,
- (uchar *)mem.host_pointer + offset,
- 0,
- NULL,
- NULL));
-}
-
-void OpenCLDevice::mem_zero_kernel(device_ptr mem, size_t size)
-{
- base_program.wait_for_availability();
- cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
-
- size_t global_size[] = {1024, 1024};
- size_t num_threads = global_size[0] * global_size[1];
-
- cl_mem d_buffer = CL_MEM_PTR(mem);
- cl_ulong d_offset = 0;
- cl_ulong d_size = 0;
-
- while (d_offset < size) {
- d_size = std::min<cl_ulong>(num_threads * sizeof(float4), size - d_offset);
-
- kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
-
- ciErr = clEnqueueNDRangeKernel(
- cqCommandQueue, ckZeroBuffer, 2, NULL, global_size, NULL, 0, NULL, NULL);
- opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
-
- d_offset += d_size;
- }
-}
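To make the chunking above concrete: each dispatch clears at most num_threads * sizeof(float4) = 1024 * 1024 * 16 bytes = 16 MiB, so zeroing a 100 MiB buffer takes seven enqueues: six full 16 MiB chunks plus one 4 MiB remainder.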
-
-void OpenCLDevice::mem_zero(device_memory &mem)
-{
- if (!mem.device_pointer) {
- mem_alloc(mem);
- }
-
- if (mem.device_pointer) {
- if (base_program.is_loaded()) {
- mem_zero_kernel(mem.device_pointer, mem.memory_size());
- }
-
- if (mem.host_pointer) {
- memset(mem.host_pointer, 0, mem.memory_size());
- }
-
- if (!base_program.is_loaded()) {
- void *zero = mem.host_pointer;
-
- if (!mem.host_pointer) {
- zero = util_aligned_malloc(mem.memory_size(), 16);
- memset(zero, 0, mem.memory_size());
- }
-
- opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
- CL_MEM_PTR(mem.device_pointer),
- CL_TRUE,
- 0,
- mem.memory_size(),
- zero,
- 0,
- NULL,
- NULL));
-
- if (!mem.host_pointer) {
- util_aligned_free(zero);
- }
- }
- }
-}
-
-void OpenCLDevice::mem_free(device_memory &mem)
-{
- if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- }
- else {
-    if (mem.device_pointer) {
-      /* The enclosing check already guarantees a non-null pointer. */
-      opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
- mem.device_pointer = 0;
-
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- }
- }
-}
-
-int OpenCLDevice::mem_sub_ptr_alignment()
-{
- return OpenCLInfo::mem_sub_ptr_alignment(cdDevice);
-}
-
-device_ptr OpenCLDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int size)
-{
- cl_mem_flags mem_flag;
- if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL)
- mem_flag = CL_MEM_READ_ONLY;
- else
- mem_flag = CL_MEM_READ_WRITE;
-
- cl_buffer_region info;
- info.origin = mem.memory_elements_size(offset);
- info.size = mem.memory_elements_size(size);
-
- device_ptr sub_buf = (device_ptr)clCreateSubBuffer(
- CL_MEM_PTR(mem.device_pointer), mem_flag, CL_BUFFER_CREATE_TYPE_REGION, &info, &ciErr);
- opencl_assert_err(ciErr, "clCreateSubBuffer");
- return sub_buf;
-}
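A note on the sub-buffer creation above: the OpenCL specification requires the region origin passed to clCreateSubBuffer() to be aligned to the device's CL_DEVICE_MEM_BASE_ADDR_ALIGN value, which is presumably what OpenCLInfo::mem_sub_ptr_alignment() reports so callers can pick valid offsets. For example, with a 1024-bit (128-byte) base alignment, offset must be chosen so that mem.memory_elements_size(offset) is a multiple of 128 bytes.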
-
-void OpenCLDevice::mem_free_sub_ptr(device_ptr device_pointer)
-{
- if (device_pointer != 0) {
- opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer)));
- }
-}
-
-void OpenCLDevice::const_copy_to(const char *name, void *host, size_t size)
-{
- ConstMemMap::iterator i = const_mem_map.find(name);
- device_vector<uchar> *data;
-
- if (i == const_mem_map.end()) {
- data = new device_vector<uchar>(this, name, MEM_READ_ONLY);
- data->alloc(size);
- const_mem_map.insert(ConstMemMap::value_type(name, data));
- }
- else {
- data = i->second;
- }
-
- memcpy(data->data(), host, size);
- data->copy_to_device();
-}
-
-void OpenCLDevice::global_alloc(device_memory &mem)
-{
- VLOG(1) << "Global memory allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- memory_manager.alloc(mem.name, mem);
-  /* Set the pointer to non-null to keep code that inspects its value from
-   * thinking it's unallocated. */
- mem.device_pointer = 1;
- textures[mem.name] = &mem;
- textures_need_update = true;
-}
-
-void OpenCLDevice::global_free(device_memory &mem)
-{
- if (mem.device_pointer) {
- mem.device_pointer = 0;
-
- if (memory_manager.free(mem)) {
- textures_need_update = true;
- }
-
- foreach (TexturesMap::value_type &value, textures) {
- if (value.second == &mem) {
- textures.erase(value.first);
- break;
- }
- }
- }
-}
-
-void OpenCLDevice::tex_alloc(device_texture &mem)
-{
- VLOG(1) << "Texture allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- memory_manager.alloc(mem.name, mem);
-  /* Set the pointer to non-null to keep code that inspects its value from
-   * thinking it's unallocated. */
- mem.device_pointer = 1;
- textures[mem.name] = &mem;
- textures_need_update = true;
-}
-
-void OpenCLDevice::tex_free(device_texture &mem)
-{
- global_free(mem);
-}
-
-size_t OpenCLDevice::global_size_round_up(int group_size, int global_size)
-{
- int r = global_size % group_size;
- return global_size + ((r == 0) ? 0 : group_size - r);
-}
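For example, global_size_round_up(64, 1000) computes r = 1000 % 64 = 40 and returns 1000 + (64 - 40) = 1024, the next multiple of the work-group size; exact multiples pass through unchanged since r == 0.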
-
-void OpenCLDevice::enqueue_kernel(
- cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size)
-{
- size_t workgroup_size, max_work_items[3];
-
- clGetKernelWorkGroupInfo(
- kernel, cdDevice, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
- clGetDeviceInfo(
- cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, max_work_items, NULL);
-
- if (max_workgroup_size > 0 && workgroup_size > max_workgroup_size) {
- workgroup_size = max_workgroup_size;
- }
-
- /* Try to divide evenly over 2 dimensions. */
- size_t local_size[2];
- if (x_workgroups) {
- local_size[0] = workgroup_size;
- local_size[1] = 1;
- }
- else {
- size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
- local_size[0] = local_size[1] = sqrt_workgroup_size;
- }
-
- /* Some implementations have max size 1 on 2nd dimension. */
- if (local_size[1] > max_work_items[1]) {
- local_size[0] = workgroup_size / max_work_items[1];
- local_size[1] = max_work_items[1];
- }
-
- size_t global_size[2] = {global_size_round_up(local_size[0], w),
- global_size_round_up(local_size[1], h)};
-
-  /* A vertical size of 1 comes from the bake/shade kernels, where we should not
-   * round anything up: otherwise we would either do too much work per pixel (if
-   * the global ID on the Y axis is not checked) or check the global ID only to
-   * find that Y is always 0.
-   */
-  if (h == 1) {
-    global_size[1] = 1;
-  }
-
- /* run kernel */
- opencl_assert(
- clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
- opencl_assert(clFlush(cqCommandQueue));
-}
-
-void OpenCLDevice::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
-{
- cl_mem ptr;
-
- MemMap::iterator i = mem_map.find(name);
- if (i != mem_map.end()) {
- ptr = CL_MEM_PTR(i->second);
- }
- else {
- ptr = 0;
- }
-
- opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void *)&ptr));
-}
-
-void OpenCLDevice::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
-{
- flush_texture_buffers();
-
- memory_manager.set_kernel_arg_buffers(kernel, narg);
-}
-
-void OpenCLDevice::flush_texture_buffers()
-{
- if (!textures_need_update) {
- return;
- }
- textures_need_update = false;
-
-  /* Set up slots for textures. */
- int num_slots = 0;
-
- vector<texture_slot_t> texture_slots;
-
-# define KERNEL_TEX(type, name) \
- if (textures.find(#name) != textures.end()) { \
- texture_slots.push_back(texture_slot_t(#name, num_slots)); \
- } \
- num_slots++;
-# include "kernel/kernel_textures.h"
-
- int num_data_slots = num_slots;
-
- foreach (TexturesMap::value_type &tex, textures) {
- string name = tex.first;
- device_memory *mem = tex.second;
-
- if (mem->type == MEM_TEXTURE) {
- const uint id = ((device_texture *)mem)->slot;
- texture_slots.push_back(texture_slot_t(name, num_data_slots + id));
- num_slots = max(num_slots, num_data_slots + id + 1);
- }
- }
-
- /* Realloc texture descriptors buffer. */
- memory_manager.free(texture_info);
- texture_info.resize(num_slots);
- memory_manager.alloc("texture_info", texture_info);
-
- /* Fill in descriptors */
- foreach (texture_slot_t &slot, texture_slots) {
- device_memory *mem = textures[slot.name];
- TextureInfo &info = texture_info[slot.slot];
-
- MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name);
-
- if (mem->type == MEM_TEXTURE) {
- info = ((device_texture *)mem)->info;
- }
- else {
- memset(&info, 0, sizeof(TextureInfo));
- }
-
- info.data = desc.offset;
- info.cl_buffer = desc.device_buffer;
- }
-
- /* Force write of descriptors. */
- memory_manager.free(texture_info);
- memory_manager.alloc("texture_info", texture_info);
-}
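The resulting slot layout: every KERNEL_TEX entry owns a fixed slot in 0 .. num_data_slots - 1 (whether or not it is currently allocated), and image textures follow at num_data_slots + slot. With, say, 8 KERNEL_TEX entries (a hypothetical count) and an image texture whose device_texture slot is 2, the image's descriptor lands in texture_info[10] and num_slots grows to at least 11.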
-
-void OpenCLDevice::thread_run(DeviceTask &task)
-{
- flush_texture_buffers();
-
- if (task.type == DeviceTask::RENDER) {
- RenderTile tile;
- DenoisingTask denoising(this, task);
-
- /* Allocate buffer for kernel globals */
- device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals");
- kgbuffer.alloc_to_device(1);
-
- /* Keep rendering tiles until done. */
- while (task.acquire_tile(this, tile, task.tile_types)) {
- if (tile.task == RenderTile::PATH_TRACE) {
- assert(tile.task == RenderTile::PATH_TRACE);
- scoped_timer timer(&tile.buffers->render_time);
-
- split_kernel->path_trace(task, tile, kgbuffer, *const_mem_map["__data"]);
-
-        /* Complete kernel execution before releasing the tile. This helps in
-         * multi-device rendering: the device that reaches the critical-section
-         * function release_tile waits (stalling other devices from entering
-         * release_tile) for all kernels to complete. If device1 (a slow-render
-         * device) reached release_tile first, it would stall device2 (a
-         * fast-render device) from proceeding to render the next tile.
-         */
- clFinish(cqCommandQueue);
- }
- else if (tile.task == RenderTile::BAKE) {
- bake(task, tile);
- }
- else if (tile.task == RenderTile::DENOISE) {
- tile.sample = tile.start_sample + tile.num_samples;
- denoise(tile, denoising);
- task.update_progress(&tile, tile.w * tile.h);
- }
-
- task.release_tile(tile);
- }
-
- kgbuffer.free();
- }
- else if (task.type == DeviceTask::SHADER) {
- shader(task);
- }
- else if (task.type == DeviceTask::FILM_CONVERT) {
- film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
- }
- else if (task.type == DeviceTask::DENOISE_BUFFER) {
- RenderTile tile;
- tile.x = task.x;
- tile.y = task.y;
- tile.w = task.w;
- tile.h = task.h;
- tile.buffer = task.buffer;
- tile.sample = task.sample + task.num_samples;
- tile.num_samples = task.num_samples;
- tile.start_sample = task.sample;
- tile.offset = task.offset;
- tile.stride = task.stride;
- tile.buffers = task.buffers;
-
- DenoisingTask denoising(this, task);
- denoise(tile, denoising);
- task.update_progress(&tile, tile.w * tile.h);
- }
-}
-
-void OpenCLDevice::film_convert(DeviceTask &task,
- device_ptr buffer,
- device_ptr rgba_byte,
- device_ptr rgba_half)
-{
- /* cast arguments to cl types */
- cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
- cl_mem d_rgba = (rgba_byte) ? CL_MEM_PTR(rgba_byte) : CL_MEM_PTR(rgba_half);
- cl_mem d_buffer = CL_MEM_PTR(buffer);
- cl_int d_x = task.x;
- cl_int d_y = task.y;
- cl_int d_w = task.w;
- cl_int d_h = task.h;
- cl_float d_sample_scale = 1.0f / (task.sample + 1);
- cl_int d_offset = task.offset;
- cl_int d_stride = task.stride;
-
- cl_kernel ckFilmConvertKernel = (rgba_byte) ? base_program(ustring("convert_to_byte")) :
- base_program(ustring("convert_to_half_float"));
-
- cl_uint start_arg_index = kernel_set_args(ckFilmConvertKernel, 0, d_data, d_rgba, d_buffer);
-
- set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index);
-
- start_arg_index += kernel_set_args(ckFilmConvertKernel,
- start_arg_index,
- d_sample_scale,
- d_x,
- d_y,
- d_w,
- d_h,
- d_offset,
- d_stride);
-
- enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
-}
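The scale above turns an accumulated buffer into an average: task.sample appears to be the zero-based index of the last completed sample, so after 16 samples (task.sample == 15) the kernels multiply by 1.0f / (15 + 1) = 1/16.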
-
-bool OpenCLDevice::denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task)
-{
- int stride = task->buffer.stride;
- int w = task->buffer.width;
- int h = task->buffer.h;
- int r = task->nlm_state.r;
- int f = task->nlm_state.f;
- float a = task->nlm_state.a;
- float k_2 = task->nlm_state.k_2;
-
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2 * r + 1) * (2 * r + 1);
- int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
-
- device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts);
- device_sub_ptr blurDifference(
- task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts);
- device_sub_ptr weightAccum(
- task->buffer.temporary_mem, 2 * pass_stride * num_shifts, pass_stride);
- cl_mem weightAccum_mem = CL_MEM_PTR(*weightAccum);
- cl_mem difference_mem = CL_MEM_PTR(*difference);
- cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference);
-
- cl_mem image_mem = CL_MEM_PTR(image_ptr);
- cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
- cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
- cl_mem out_mem = CL_MEM_PTR(out_ptr);
- cl_mem scale_mem = NULL;
-
- mem_zero_kernel(*weightAccum, sizeof(float) * pass_stride);
- mem_zero_kernel(out_ptr, sizeof(float) * pass_stride);
-
- cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
- cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
- cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
- cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output"));
- cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize"));
-
- kernel_set_args(ckNLMCalcDifference,
- 0,
- guide_mem,
- variance_mem,
- scale_mem,
- difference_mem,
- w,
- h,
- stride,
- pass_stride,
- r,
- channel_offset,
- 0,
- a,
- k_2);
- kernel_set_args(
- ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, f);
- kernel_set_args(
- ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, f);
- kernel_set_args(ckNLMUpdateOutput,
- 0,
- blurDifference_mem,
- image_mem,
- out_mem,
- weightAccum_mem,
- w,
- h,
- stride,
- pass_stride,
- channel_offset,
- r,
- f);
-
- enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true);
- enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
- enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true);
- enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
- enqueue_kernel(ckNLMUpdateOutput, w * h, num_shifts, true);
-
- kernel_set_args(ckNLMNormalize, 0, out_mem, weightAccum_mem, w, h, stride);
- enqueue_kernel(ckNLMNormalize, w, h);
-
- return true;
-}
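
A quick worked example of the temporary-buffer layout above, using an illustrative radius (the value 8 is not taken from this diff):

int r = 8;                                  /* hypothetical NLM half-window radius */
int num_shifts = (2 * r + 1) * (2 * r + 1); /* 17 * 17 = 289 window offsets */
/* temporary_mem is sliced into two planes of pass_stride * num_shifts
 * elements (difference, blurDifference) plus one plane of pass_stride
 * elements (weightAccum). */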
-
-bool OpenCLDevice::denoising_construct_transform(DenoisingTask *task)
-{
- cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
- cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
- cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
- cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
-
- char use_time = task->buffer.use_time ? 1 : 0;
-
- cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
-
- int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0, buffer_mem, tile_info_mem);
- cl_mem buffers[9];
- for (int i = 0; i < 9; i++) {
- buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
- arg_ofs += kernel_set_args(ckFilterConstructTransform, arg_ofs, buffers[i]);
- }
- kernel_set_args(ckFilterConstructTransform,
- arg_ofs,
- transform_mem,
- rank_mem,
- task->filter_area,
- task->rect,
- task->buffer.pass_stride,
- task->buffer.frame_stride,
- use_time,
- task->radius,
- task->pca_threshold);
-
- enqueue_kernel(ckFilterConstructTransform, task->storage.w, task->storage.h, 256);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task)
-{
- cl_mem color_mem = CL_MEM_PTR(color_ptr);
- cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
- cl_mem scale_mem = CL_MEM_PTR(scale_ptr);
-
- cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
- cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
- cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
- cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
- cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
-
- cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
- cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
- cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
- cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
-
- int w = task->reconstruction_state.source_w;
- int h = task->reconstruction_state.source_h;
- int stride = task->buffer.stride;
- int frame_offset = frame * task->buffer.frame_stride;
- int t = task->tile_info->frames[frame];
- char use_time = task->buffer.use_time ? 1 : 0;
-
- int r = task->radius;
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2 * r + 1) * (2 * r + 1);
-
- device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts);
- device_sub_ptr blurDifference(
- task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts);
- cl_mem difference_mem = CL_MEM_PTR(*difference);
- cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference);
-
- kernel_set_args(ckNLMCalcDifference,
- 0,
- color_mem,
- color_variance_mem,
- scale_mem,
- difference_mem,
- w,
- h,
- stride,
- pass_stride,
- r,
- pass_stride,
- frame_offset,
- 1.0f,
- task->nlm_k_2);
- kernel_set_args(
- ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, 4);
- kernel_set_args(
- ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, 4);
- kernel_set_args(ckNLMConstructGramian,
- 0,
- t,
- blurDifference_mem,
- buffer_mem,
- transform_mem,
- rank_mem,
- XtWX_mem,
- XtWY_mem,
- task->reconstruction_state.filter_window,
- w,
- h,
- stride,
- pass_stride,
- r,
- 4,
- frame_offset,
- use_time);
-
- enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true);
- enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
- enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true);
- enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
- enqueue_kernel(ckNLMConstructGramian, w * h, num_shifts, true, 256);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task)
-{
- cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
-
- cl_mem output_mem = CL_MEM_PTR(output_ptr);
- cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
- cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
- cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
-
- int w = task->reconstruction_state.source_w;
- int h = task->reconstruction_state.source_h;
-
- kernel_set_args(ckFinalize,
- 0,
- output_mem,
- rank_mem,
- XtWX_mem,
- XtWY_mem,
- task->filter_area,
- task->reconstruction_state.buffer_params,
- task->render_buffer.samples);
- enqueue_kernel(ckFinalize, w, h);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect,
- DenoisingTask *task)
-{
- cl_mem a_mem = CL_MEM_PTR(a_ptr);
- cl_mem b_mem = CL_MEM_PTR(b_ptr);
- cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
- cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
-
- cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves"));
-
- kernel_set_args(ckFilterCombineHalves, 0, mean_mem, variance_mem, a_mem, b_mem, rect, r);
- enqueue_kernel(ckFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task)
-{
- cl_mem a_mem = CL_MEM_PTR(a_ptr);
- cl_mem b_mem = CL_MEM_PTR(b_ptr);
- cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr);
- cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr);
- cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr);
-
- cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
-
- cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
-
- int arg_ofs = kernel_set_args(
- ckFilterDivideShadow, 0, task->render_buffer.samples, tile_info_mem);
- cl_mem buffers[9];
- for (int i = 0; i < 9; i++) {
- buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
- arg_ofs += kernel_set_args(ckFilterDivideShadow, arg_ofs, buffers[i]);
- }
- kernel_set_args(ckFilterDivideShadow,
- arg_ofs,
- a_mem,
- b_mem,
- sample_variance_mem,
- sv_variance_mem,
- buffer_variance_mem,
- task->rect,
- task->render_buffer.pass_stride,
- task->render_buffer.offset);
- enqueue_kernel(ckFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task)
-{
- cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
- cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
-
- cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
-
- cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
-
- int arg_ofs = kernel_set_args(ckFilterGetFeature, 0, task->render_buffer.samples, tile_info_mem);
- cl_mem buffers[9];
- for (int i = 0; i < 9; i++) {
- buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
- arg_ofs += kernel_set_args(ckFilterGetFeature, arg_ofs, buffers[i]);
- }
- kernel_set_args(ckFilterGetFeature,
- arg_ofs,
- mean_offset,
- variance_offset,
- mean_mem,
- variance_mem,
- scale,
- task->rect,
- task->render_buffer.pass_stride,
- task->render_buffer.offset);
- enqueue_kernel(ckFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_write_feature(int out_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task)
-{
- cl_mem from_mem = CL_MEM_PTR(from_ptr);
- cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr);
-
- cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature"));
-
- kernel_set_args(ckFilterWriteFeature,
- 0,
- task->render_buffer.samples,
- task->reconstruction_state.buffer_params,
- task->filter_area,
- from_mem,
- buffer_mem,
- out_offset,
- task->rect);
- enqueue_kernel(ckFilterWriteFeature, task->filter_area.z, task->filter_area.w);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task)
-{
- cl_mem image_mem = CL_MEM_PTR(image_ptr);
- cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
- cl_mem depth_mem = CL_MEM_PTR(depth_ptr);
- cl_mem output_mem = CL_MEM_PTR(output_ptr);
-
- cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers"));
-
- kernel_set_args(ckFilterDetectOutliers,
- 0,
- image_mem,
- variance_mem,
- depth_mem,
- output_mem,
- task->rect,
- task->buffer.pass_stride);
- enqueue_kernel(ckFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- return true;
-}
-
-void OpenCLDevice::denoise(RenderTile &rtile, DenoisingTask &denoising)
-{
- denoising.functions.construct_transform = function_bind(
- &OpenCLDevice::denoising_construct_transform, this, &denoising);
- denoising.functions.accumulate = function_bind(
- &OpenCLDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
- denoising.functions.solve = function_bind(&OpenCLDevice::denoising_solve, this, _1, &denoising);
- denoising.functions.divide_shadow = function_bind(
- &OpenCLDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.non_local_means = function_bind(
- &OpenCLDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
- denoising.functions.combine_halves = function_bind(
- &OpenCLDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(
- &OpenCLDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.write_feature = function_bind(
- &OpenCLDevice::denoising_write_feature, this, _1, _2, _3, &denoising);
- denoising.functions.detect_outliers = function_bind(
- &OpenCLDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
-
- denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
- denoising.render_buffer.samples = rtile.sample;
- denoising.buffer.gpu_temporary_mem = true;
-
- denoising.run_denoising(rtile);
-}
-
-void OpenCLDevice::shader(DeviceTask &task)
-{
-  /* Cast arguments to cl types. */
- cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
- cl_mem d_input = CL_MEM_PTR(task.shader_input);
- cl_mem d_output = CL_MEM_PTR(task.shader_output);
- cl_int d_shader_eval_type = task.shader_eval_type;
- cl_int d_shader_filter = task.shader_filter;
- cl_int d_shader_x = task.shader_x;
- cl_int d_shader_w = task.shader_w;
- cl_int d_offset = task.offset;
-
- OpenCLDevice::OpenCLProgram *program = &background_program;
- if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
- program = &displace_program;
- }
- program->wait_for_availability();
- cl_kernel kernel = (*program)();
-
- cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_input, d_output);
-
- set_kernel_arg_buffers(kernel, &start_arg_index);
-
- start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_eval_type);
- if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
- start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_filter);
- }
- start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_x, d_shader_w, d_offset);
-
- for (int sample = 0; sample < task.num_samples; sample++) {
-
- if (task.get_cancel())
- break;
-
- kernel_set_args(kernel, start_arg_index, sample);
-
- enqueue_kernel(kernel, task.shader_w, 1);
-
- clFinish(cqCommandQueue);
-
- task.update_progress(NULL);
- }
-}
-
-void OpenCLDevice::bake(DeviceTask &task, RenderTile &rtile)
-{
- scoped_timer timer(&rtile.buffers->render_time);
-
- /* Cast arguments to cl types. */
- cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
- cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
- cl_int d_x = rtile.x;
- cl_int d_y = rtile.y;
- cl_int d_w = rtile.w;
- cl_int d_h = rtile.h;
- cl_int d_offset = rtile.offset;
- cl_int d_stride = rtile.stride;
-
- bake_program.wait_for_availability();
- cl_kernel kernel = bake_program();
-
- cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_buffer);
-
- set_kernel_arg_buffers(kernel, &start_arg_index);
-
- start_arg_index += kernel_set_args(
- kernel, start_arg_index, d_x, d_y, d_w, d_h, d_offset, d_stride);
-
- int start_sample = rtile.start_sample;
- int end_sample = rtile.start_sample + rtile.num_samples;
-
- for (int sample = start_sample; sample < end_sample; sample++) {
- if (task.get_cancel()) {
- if (task.need_finish_queue == false)
- break;
- }
-
- kernel_set_args(kernel, start_arg_index, sample);
-
- enqueue_kernel(kernel, d_w, d_h);
- clFinish(cqCommandQueue);
-
- rtile.sample = sample + 1;
-
- task.update_progress(&rtile, rtile.w * rtile.h);
- }
-}
-
-static bool kernel_build_opencl_2(cl_device_id cdDevice)
-{
-  /* Build with OpenCL 2.0 if available; this improves performance
-   * with AMD OpenCL drivers on Windows and Linux (legacy drivers).
-   * Note that OpenCL selects the highest 1.x version by default;
-   * only for 2.0 do we need the explicit compiler flag. */
- int version_major, version_minor;
- if (OpenCLInfo::get_device_version(cdDevice, &version_major, &version_minor)) {
- if (version_major >= 2) {
-      /* This appears to trigger a driver bug in Radeon RX cards with certain
-       * driver versions, so don't use OpenCL 2.0 for those. */
- string device_name = OpenCLInfo::get_readable_device_name(cdDevice);
- if (string_startswith(device_name, "Radeon RX 4") ||
- string_startswith(device_name, "Radeon (TM) RX 4") ||
- string_startswith(device_name, "Radeon RX 5") ||
- string_startswith(device_name, "Radeon (TM) RX 5")) {
- char version[256] = "";
- int driver_major, driver_minor;
- clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
- if (sscanf(version, "OpenCL 2.0 AMD-APP (%d.%d)", &driver_major, &driver_minor) == 2) {
- return !(driver_major == 3075 && driver_minor <= 12);
- }
- }
-
- return true;
- }
- }
-
- return false;
-}
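
As an illustrative example (the version string and card are hypothetical): a Radeon RX 480 reporting CL_DEVICE_VERSION "OpenCL 2.0 AMD-APP (3075.12)" parses to driver 3075.12, so the function returns false and the build falls back to the default 1.x standard; the same card with driver 3075.13 or newer would again return true and get "-cl-std=CL2.0".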
-
-string OpenCLDevice::kernel_build_options(const string *debug_src)
-{
- string build_options = "-cl-no-signed-zeros -cl-mad-enable ";
-
- if (kernel_build_opencl_2(cdDevice)) {
- build_options += "-cl-std=CL2.0 ";
- }
-
- if (platform_name == "NVIDIA CUDA") {
- build_options +=
- "-D__KERNEL_OPENCL_NVIDIA__ "
- "-cl-nv-maxrregcount=32 "
- "-cl-nv-verbose ";
-
- uint compute_capability_major, compute_capability_minor;
- clGetDeviceInfo(cdDevice,
- CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
- sizeof(cl_uint),
- &compute_capability_major,
- NULL);
- clGetDeviceInfo(cdDevice,
- CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
- sizeof(cl_uint),
- &compute_capability_minor,
- NULL);
-
- build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ",
- compute_capability_major * 100 + compute_capability_minor * 10);
- }
-
- else if (platform_name == "Apple")
- build_options += "-D__KERNEL_OPENCL_APPLE__ ";
-
- else if (platform_name == "AMD Accelerated Parallel Processing")
- build_options += "-D__KERNEL_OPENCL_AMD__ ";
-
- else if (platform_name == "Intel(R) OpenCL") {
- build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
-
-    /* Options for gdb source-level kernel debugging.
-     * This segfaults on Linux currently.
-     */
- if (OpenCLInfo::use_debug() && debug_src)
- build_options += "-g -s \"" + *debug_src + "\" ";
- }
-
- if (info.has_half_images) {
- build_options += "-D__KERNEL_CL_KHR_FP16__ ";
- }
-
- if (OpenCLInfo::use_debug()) {
- build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
- }
-
-# ifdef WITH_NANOVDB
- if (info.has_nanovdb) {
- build_options += "-DWITH_NANOVDB ";
- }
-# endif
-
- return build_options;
-}
-
-/* TODO(sergey): In the future we can use variadic templates, once
- * C++11 is allowed. That should allow cleaning this up a bit.
- */
-int OpenCLDevice::kernel_set_args(cl_kernel kernel,
- int start_argument_index,
- const ArgumentWrapper &arg1,
- const ArgumentWrapper &arg2,
- const ArgumentWrapper &arg3,
- const ArgumentWrapper &arg4,
- const ArgumentWrapper &arg5,
- const ArgumentWrapper &arg6,
- const ArgumentWrapper &arg7,
- const ArgumentWrapper &arg8,
- const ArgumentWrapper &arg9,
- const ArgumentWrapper &arg10,
- const ArgumentWrapper &arg11,
- const ArgumentWrapper &arg12,
- const ArgumentWrapper &arg13,
- const ArgumentWrapper &arg14,
- const ArgumentWrapper &arg15,
- const ArgumentWrapper &arg16,
- const ArgumentWrapper &arg17,
- const ArgumentWrapper &arg18,
- const ArgumentWrapper &arg19,
- const ArgumentWrapper &arg20,
- const ArgumentWrapper &arg21,
- const ArgumentWrapper &arg22,
- const ArgumentWrapper &arg23,
- const ArgumentWrapper &arg24,
- const ArgumentWrapper &arg25,
- const ArgumentWrapper &arg26,
- const ArgumentWrapper &arg27,
- const ArgumentWrapper &arg28,
- const ArgumentWrapper &arg29,
- const ArgumentWrapper &arg30,
- const ArgumentWrapper &arg31,
- const ArgumentWrapper &arg32,
- const ArgumentWrapper &arg33)
-{
- int current_arg_index = 0;
-# define FAKE_VARARG_HANDLE_ARG(arg) \
- do { \
- if (arg.pointer != NULL) { \
- opencl_assert(clSetKernelArg( \
- kernel, start_argument_index + current_arg_index, arg.size, arg.pointer)); \
- ++current_arg_index; \
- } \
- else { \
- return current_arg_index; \
- } \
- } while (false)
- FAKE_VARARG_HANDLE_ARG(arg1);
- FAKE_VARARG_HANDLE_ARG(arg2);
- FAKE_VARARG_HANDLE_ARG(arg3);
- FAKE_VARARG_HANDLE_ARG(arg4);
- FAKE_VARARG_HANDLE_ARG(arg5);
- FAKE_VARARG_HANDLE_ARG(arg6);
- FAKE_VARARG_HANDLE_ARG(arg7);
- FAKE_VARARG_HANDLE_ARG(arg8);
- FAKE_VARARG_HANDLE_ARG(arg9);
- FAKE_VARARG_HANDLE_ARG(arg10);
- FAKE_VARARG_HANDLE_ARG(arg11);
- FAKE_VARARG_HANDLE_ARG(arg12);
- FAKE_VARARG_HANDLE_ARG(arg13);
- FAKE_VARARG_HANDLE_ARG(arg14);
- FAKE_VARARG_HANDLE_ARG(arg15);
- FAKE_VARARG_HANDLE_ARG(arg16);
- FAKE_VARARG_HANDLE_ARG(arg17);
- FAKE_VARARG_HANDLE_ARG(arg18);
- FAKE_VARARG_HANDLE_ARG(arg19);
- FAKE_VARARG_HANDLE_ARG(arg20);
- FAKE_VARARG_HANDLE_ARG(arg21);
- FAKE_VARARG_HANDLE_ARG(arg22);
- FAKE_VARARG_HANDLE_ARG(arg23);
- FAKE_VARARG_HANDLE_ARG(arg24);
- FAKE_VARARG_HANDLE_ARG(arg25);
- FAKE_VARARG_HANDLE_ARG(arg26);
- FAKE_VARARG_HANDLE_ARG(arg27);
- FAKE_VARARG_HANDLE_ARG(arg28);
- FAKE_VARARG_HANDLE_ARG(arg29);
- FAKE_VARARG_HANDLE_ARG(arg30);
- FAKE_VARARG_HANDLE_ARG(arg31);
- FAKE_VARARG_HANDLE_ARG(arg32);
- FAKE_VARARG_HANDLE_ARG(arg33);
-# undef FAKE_VARARG_HANDLE_ARG
- return current_arg_index;
-}
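
A minimal sketch of the variadic-template cleanup the TODO above mentions, assuming C++11, the same opencl_assert macro, and keeping the convention of returning the number of arguments set (the helper name kernel_set_args_impl is hypothetical):

/* Recursion terminator: no arguments left to set. */
static int kernel_set_args_impl(cl_kernel /*kernel*/, int /*index*/)
{
  return 0;
}

/* Set one argument, then recurse on the remaining ones. */
template<typename T, typename... Rest>
static int kernel_set_args_impl(cl_kernel kernel, int index, const T &arg, const Rest &... rest)
{
  opencl_assert(clSetKernelArg(kernel, index, sizeof(T), &arg));
  return 1 + kernel_set_args_impl(kernel, index + 1, rest...);
}

This would replace both the 33-parameter signature and the FAKE_VARARG_HANDLE_ARG macro.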
-
-void OpenCLDevice::release_kernel_safe(cl_kernel kernel)
-{
- if (kernel) {
- clReleaseKernel(kernel);
- }
-}
-
-void OpenCLDevice::release_mem_object_safe(cl_mem mem)
-{
- if (mem != NULL) {
- clReleaseMemObject(mem);
- }
-}
-
-void OpenCLDevice::release_program_safe(cl_program program)
-{
- if (program) {
- clReleaseProgram(program);
- }
-}
-
-/* ** These functions work around some compiler-specific bugs ** */
-
-cl_program OpenCLDevice::load_cached_kernel(ustring key, thread_scoped_lock &cache_locker)
-{
- return OpenCLCache::get_program(cpPlatform, cdDevice, key, cache_locker);
-}
-
-void OpenCLDevice::store_cached_kernel(cl_program program,
- ustring key,
- thread_scoped_lock &cache_locker)
-{
- OpenCLCache::store_program(cpPlatform, cdDevice, program, key, cache_locker);
-}
-
-Device *opencl_create_split_device(DeviceInfo &info,
- Stats &stats,
- Profiler &profiler,
- bool background)
-{
- return new OpenCLDevice(info, stats, profiler, background);
-}
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/device/opencl/memory_manager.cpp b/intern/cycles/device/opencl/memory_manager.cpp
deleted file mode 100644
index 4330e07cb37..00000000000
--- a/intern/cycles/device/opencl/memory_manager.cpp
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPENCL
-
-# include "util/util_foreach.h"
-
-# include "device/opencl/device_opencl.h"
-# include "device/opencl/memory_manager.h"
-
-CCL_NAMESPACE_BEGIN
-
-void MemoryManager::DeviceBuffer::add_allocation(Allocation &allocation)
-{
- allocations.push_back(&allocation);
-}
-
-void MemoryManager::DeviceBuffer::update_device_memory(OpenCLDevice *device)
-{
- bool need_realloc = false;
-
-  /* Calculate the total size and remove any freed allocations. */
- size_t total_size = 0;
-
- for (int i = allocations.size() - 1; i >= 0; i--) {
- Allocation *allocation = allocations[i];
-
- /* Remove allocations that have been freed. */
- if (!allocation->mem || allocation->mem->memory_size() == 0) {
- allocation->device_buffer = NULL;
- allocation->size = 0;
-
- allocations.erase(allocations.begin() + i);
-
- need_realloc = true;
-
- continue;
- }
-
- /* Get actual size for allocation. */
- size_t alloc_size = align_up(allocation->mem->memory_size(), 16);
-
- if (allocation->size != alloc_size) {
- /* Allocation is either new or resized. */
- allocation->size = alloc_size;
- allocation->needs_copy_to_device = true;
-
- need_realloc = true;
- }
-
- total_size += alloc_size;
- }
-
-  /* Always allocate a non-empty buffer; NULL pointers cause problems with some drivers. */
- total_size = std::max(total_size, (size_t)16);
-
- if (need_realloc) {
- cl_ulong max_buffer_size;
- clGetDeviceInfo(
- device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
-
- if (total_size > max_buffer_size) {
- device->set_error("Scene too complex to fit in available memory.");
- return;
- }
-
- device_only_memory<uchar> *new_buffer = new device_only_memory<uchar>(device,
- "memory manager buffer");
-
- new_buffer->alloc_to_device(total_size);
-
- size_t offset = 0;
-
- foreach (Allocation *allocation, allocations) {
- if (allocation->needs_copy_to_device) {
- /* Copy from host to device. */
- opencl_device_assert(device,
- clEnqueueWriteBuffer(device->cqCommandQueue,
- CL_MEM_PTR(new_buffer->device_pointer),
- CL_FALSE,
- offset,
- allocation->mem->memory_size(),
- allocation->mem->host_pointer,
- 0,
- NULL,
- NULL));
-
- allocation->needs_copy_to_device = false;
- }
- else {
- /* Fast copy from memory already on device. */
- opencl_device_assert(device,
- clEnqueueCopyBuffer(device->cqCommandQueue,
- CL_MEM_PTR(buffer->device_pointer),
- CL_MEM_PTR(new_buffer->device_pointer),
- allocation->desc.offset,
- offset,
- allocation->mem->memory_size(),
- 0,
- NULL,
- NULL));
- }
-
- allocation->desc.offset = offset;
- offset += allocation->size;
- }
-
- delete buffer;
-
- buffer = new_buffer;
- }
- else {
- assert(total_size == buffer->data_size);
-
- size_t offset = 0;
-
- foreach (Allocation *allocation, allocations) {
- if (allocation->needs_copy_to_device) {
- /* Copy from host to device. */
- opencl_device_assert(device,
- clEnqueueWriteBuffer(device->cqCommandQueue,
- CL_MEM_PTR(buffer->device_pointer),
- CL_FALSE,
- offset,
- allocation->mem->memory_size(),
- allocation->mem->host_pointer,
- 0,
- NULL,
- NULL));
-
- allocation->needs_copy_to_device = false;
- }
-
- offset += allocation->size;
- }
- }
-
- /* Not really necessary, but seems to improve responsiveness for some reason. */
- clFinish(device->cqCommandQueue);
-}
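
In short: if any allocation was freed or changed size, every surviving allocation is repacked into a freshly allocated buffer (dirty ones written from the host, clean ones copied device-to-device to their new offsets); otherwise only the dirty allocations are rewritten in place at their existing offsets.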
-
-void MemoryManager::DeviceBuffer::free(OpenCLDevice *)
-{
- buffer->free();
-}
-
-MemoryManager::DeviceBuffer *MemoryManager::smallest_device_buffer()
-{
- DeviceBuffer *smallest = device_buffers;
-
- foreach (DeviceBuffer &device_buffer, device_buffers) {
- if (device_buffer.size < smallest->size) {
- smallest = &device_buffer;
- }
- }
-
- return smallest;
-}
-
-MemoryManager::MemoryManager(OpenCLDevice *device) : device(device), need_update(false)
-{
- foreach (DeviceBuffer &device_buffer, device_buffers) {
- device_buffer.buffer = new device_only_memory<uchar>(device, "memory manager buffer");
- }
-}
-
-void MemoryManager::free()
-{
- foreach (DeviceBuffer &device_buffer, device_buffers) {
- device_buffer.free(device);
- }
-}
-
-void MemoryManager::alloc(const char *name, device_memory &mem)
-{
- Allocation &allocation = allocations[name];
-
- allocation.mem = &mem;
- allocation.needs_copy_to_device = true;
-
- if (!allocation.device_buffer) {
- DeviceBuffer *device_buffer = smallest_device_buffer();
- allocation.device_buffer = device_buffer;
-
- allocation.desc.device_buffer = device_buffer - device_buffers;
-
- device_buffer->add_allocation(allocation);
-
- device_buffer->size += mem.memory_size();
- }
-
- need_update = true;
-}
-
-bool MemoryManager::free(device_memory &mem)
-{
- foreach (AllocationsMap::value_type &value, allocations) {
- Allocation &allocation = value.second;
- if (allocation.mem == &mem) {
-
- allocation.device_buffer->size -= mem.memory_size();
-
- allocation.mem = NULL;
- allocation.needs_copy_to_device = false;
-
- need_update = true;
- return true;
- }
- }
-
- return false;
-}
-
-MemoryManager::BufferDescriptor MemoryManager::get_descriptor(string name)
-{
- update_device_memory();
-
- Allocation &allocation = allocations[name];
- return allocation.desc;
-}
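
A minimal usage sketch of this allocator API (the variable names are hypothetical, and the device_memory instance is assumed to exist already):

MemoryManager manager(device);
manager.alloc("texture_info", texture_info_mem); /* registers the allocation and marks it dirty */
MemoryManager::BufferDescriptor desc = manager.get_descriptor("texture_info");
/* desc.device_buffer indexes one of the pooled device buffers;
 * desc.offset is the byte offset of this allocation within that buffer. */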
-
-void MemoryManager::update_device_memory()
-{
- if (!need_update) {
- return;
- }
-
- need_update = false;
-
- foreach (DeviceBuffer &device_buffer, device_buffers) {
- device_buffer.update_device_memory(device);
- }
-}
-
-void MemoryManager::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
-{
- update_device_memory();
-
- foreach (DeviceBuffer &device_buffer, device_buffers) {
- if (device_buffer.buffer->device_pointer) {
- device->kernel_set_args(kernel, (*narg)++, *device_buffer.buffer);
- }
- else {
- device->kernel_set_args(kernel, (*narg)++);
- }
- }
-}
-
-CCL_NAMESPACE_END
-
-#endif /* WITH_OPENCL */
diff --git a/intern/cycles/device/opencl/memory_manager.h b/intern/cycles/device/opencl/memory_manager.h
deleted file mode 100644
index 23624f837a6..00000000000
--- a/intern/cycles/device/opencl/memory_manager.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "device/device.h"
-
-#include "util/util_map.h"
-#include "util/util_string.h"
-#include "util/util_vector.h"
-
-#include "clew.h"
-
-CCL_NAMESPACE_BEGIN
-
-class OpenCLDevice;
-
-class MemoryManager {
- public:
- static const int NUM_DEVICE_BUFFERS = 8;
-
- struct BufferDescriptor {
- uint device_buffer;
- cl_ulong offset;
- };
-
- private:
- struct DeviceBuffer;
-
- struct Allocation {
- device_memory *mem;
-
- DeviceBuffer *device_buffer;
- size_t size; /* Size of actual allocation, may be larger than requested. */
-
- BufferDescriptor desc;
-
- bool needs_copy_to_device;
-
- Allocation() : mem(NULL), device_buffer(NULL), size(0), needs_copy_to_device(false)
- {
- }
- };
-
- struct DeviceBuffer {
- device_only_memory<uchar> *buffer;
- vector<Allocation *> allocations;
- size_t size; /* Size of all allocations. */
-
- DeviceBuffer() : buffer(NULL), size(0)
- {
- }
-
- ~DeviceBuffer()
- {
- delete buffer;
- buffer = NULL;
- }
-
- void add_allocation(Allocation &allocation);
-
- void update_device_memory(OpenCLDevice *device);
-
- void free(OpenCLDevice *device);
- };
-
- OpenCLDevice *device;
-
- DeviceBuffer device_buffers[NUM_DEVICE_BUFFERS];
-
- typedef unordered_map<string, Allocation> AllocationsMap;
- AllocationsMap allocations;
-
- bool need_update;
-
- DeviceBuffer *smallest_device_buffer();
-
- public:
- MemoryManager(OpenCLDevice *device);
-
- void free(); /* Free all memory. */
-
- void alloc(const char *name, device_memory &mem);
- bool free(device_memory &mem);
-
- BufferDescriptor get_descriptor(string name);
-
- void update_device_memory();
- void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
-};
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp
deleted file mode 100644
index 3929cf77f15..00000000000
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ /dev/null
@@ -1,1326 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPENCL
-
-# include "device/device_intern.h"
-# include "device/opencl/device_opencl.h"
-
-# include "util/util_debug.h"
-# include "util/util_logging.h"
-# include "util/util_md5.h"
-# include "util/util_path.h"
-# include "util/util_semaphore.h"
-# include "util/util_system.h"
-# include "util/util_time.h"
-
-using std::cerr;
-using std::endl;
-
-CCL_NAMESPACE_BEGIN
-
-OpenCLCache::Slot::ProgramEntry::ProgramEntry() : program(NULL), mutex(NULL)
-{
-}
-
-OpenCLCache::Slot::ProgramEntry::ProgramEntry(const ProgramEntry &rhs)
- : program(rhs.program), mutex(NULL)
-{
-}
-
-OpenCLCache::Slot::ProgramEntry::~ProgramEntry()
-{
- delete mutex;
-}
-
-OpenCLCache::Slot::Slot() : context_mutex(NULL), context(NULL)
-{
-}
-
-OpenCLCache::Slot::Slot(const Slot &rhs)
- : context_mutex(NULL), context(NULL), programs(rhs.programs)
-{
-}
-
-OpenCLCache::Slot::~Slot()
-{
- delete context_mutex;
-}
-
-OpenCLCache &OpenCLCache::global_instance()
-{
- static OpenCLCache instance;
- return instance;
-}
-
-cl_context OpenCLCache::get_context(cl_platform_id platform,
- cl_device_id device,
- thread_scoped_lock &slot_locker)
-{
- assert(platform != NULL);
-
- OpenCLCache &self = global_instance();
-
- thread_scoped_lock cache_lock(self.cache_lock);
-
- pair<CacheMap::iterator, bool> ins = self.cache.insert(
- CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
-
- Slot &slot = ins.first->second;
-
- /* create slot lock only while holding cache lock */
- if (!slot.context_mutex)
- slot.context_mutex = new thread_mutex;
-
- /* need to unlock cache before locking slot, to allow store to complete */
- cache_lock.unlock();
-
- /* lock the slot */
- slot_locker = thread_scoped_lock(*slot.context_mutex);
-
-  /* If the context isn't cached yet. */
- if (slot.context == NULL) {
- /* return with the caller's lock holder holding the slot lock */
- return NULL;
- }
-
- /* the item was already cached, release the slot lock */
- slot_locker.unlock();
-
- cl_int ciErr = clRetainContext(slot.context);
- assert(ciErr == CL_SUCCESS);
- (void)ciErr;
-
- return slot.context;
-}
-
-cl_program OpenCLCache::get_program(cl_platform_id platform,
- cl_device_id device,
- ustring key,
- thread_scoped_lock &slot_locker)
-{
- assert(platform != NULL);
-
- OpenCLCache &self = global_instance();
-
- thread_scoped_lock cache_lock(self.cache_lock);
-
- pair<CacheMap::iterator, bool> ins = self.cache.insert(
- CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
-
- Slot &slot = ins.first->second;
-
- pair<Slot::EntryMap::iterator, bool> ins2 = slot.programs.insert(
- Slot::EntryMap::value_type(key, Slot::ProgramEntry()));
-
- Slot::ProgramEntry &entry = ins2.first->second;
-
- /* create slot lock only while holding cache lock */
- if (!entry.mutex)
- entry.mutex = new thread_mutex;
-
- /* need to unlock cache before locking slot, to allow store to complete */
- cache_lock.unlock();
-
- /* lock the slot */
- slot_locker = thread_scoped_lock(*entry.mutex);
-
-  /* If the program isn't cached yet. */
- if (entry.program == NULL) {
- /* return with the caller's lock holder holding the slot lock */
- return NULL;
- }
-
- /* the item was already cached, release the slot lock */
- slot_locker.unlock();
-
- cl_int ciErr = clRetainProgram(entry.program);
- assert(ciErr == CL_SUCCESS);
- (void)ciErr;
-
- return entry.program;
-}
-
-void OpenCLCache::store_context(cl_platform_id platform,
- cl_device_id device,
- cl_context context,
- thread_scoped_lock &slot_locker)
-{
- assert(platform != NULL);
- assert(device != NULL);
- assert(context != NULL);
-
- OpenCLCache &self = global_instance();
-
- thread_scoped_lock cache_lock(self.cache_lock);
- CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
- cache_lock.unlock();
-
-  /* Sanity check: the slot must exist and must not be filled yet. */
-  assert(i != self.cache.end());
-
-  Slot &slot = i->second;
-  assert(slot.context == NULL);
-
- slot.context = context;
-
- /* unlock the slot */
- slot_locker.unlock();
-
- /* increment reference count in OpenCL.
- * The caller is going to release the object when done with it. */
- cl_int ciErr = clRetainContext(context);
- assert(ciErr == CL_SUCCESS);
- (void)ciErr;
-}
-
-void OpenCLCache::store_program(cl_platform_id platform,
- cl_device_id device,
- cl_program program,
- ustring key,
- thread_scoped_lock &slot_locker)
-{
- assert(platform != NULL);
- assert(device != NULL);
- assert(program != NULL);
-
- OpenCLCache &self = global_instance();
-
- thread_scoped_lock cache_lock(self.cache_lock);
-
- CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
- assert(i != self.cache.end());
- Slot &slot = i->second;
-
- Slot::EntryMap::iterator i2 = slot.programs.find(key);
- assert(i2 != slot.programs.end());
- Slot::ProgramEntry &entry = i2->second;
-
- assert(entry.program == NULL);
-
- cache_lock.unlock();
-
- entry.program = program;
-
- /* unlock the slot */
- slot_locker.unlock();
-
- /* Increment reference count in OpenCL.
- * The caller is going to release the object when done with it.
- */
- cl_int ciErr = clRetainProgram(program);
- assert(ciErr == CL_SUCCESS);
- (void)ciErr;
-}
-
-string OpenCLCache::get_kernel_md5()
-{
- OpenCLCache &self = global_instance();
- thread_scoped_lock lock(self.kernel_md5_lock);
-
- if (self.kernel_md5.empty()) {
- self.kernel_md5 = path_files_md5_hash(path_get("source"));
- }
- return self.kernel_md5;
-}
-
-static string get_program_source(const string &kernel_file)
-{
- string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n";
-  /* We compile kernels consisting of many files. Unfortunately, OpenCL
-   * kernel caches do not seem to recognize changes in included files,
-   * so we force a recompile on changes by adding the MD5 hash of all files.
-   */
- source = path_source_replace_includes(source, path_get("source"));
- source += "\n// " + util_md5_string(source) + "\n";
- return source;
-}
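
The result is the flattened kernel source (path_source_replace_includes expands the single #include in place) followed by a trailing comment such as "// 1f3870be274f6c49b3e31a0c6728957f" (an illustrative MD5 of the flattened text), so driver-side binary caches see a different program whenever any included file changes.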
-
-OpenCLDevice::OpenCLProgram::OpenCLProgram(OpenCLDevice *device,
- const string &program_name,
- const string &kernel_file,
- const string &kernel_build_options,
- bool use_stdout)
- : device(device),
- program_name(program_name),
- kernel_file(kernel_file),
- kernel_build_options(kernel_build_options),
- use_stdout(use_stdout)
-{
- loaded = false;
- needs_compiling = true;
- program = NULL;
-}
-
-OpenCLDevice::OpenCLProgram::~OpenCLProgram()
-{
- release();
-}
-
-void OpenCLDevice::OpenCLProgram::release()
-{
- for (map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end();
- ++kernel) {
- if (kernel->second) {
- clReleaseKernel(kernel->second);
- kernel->second = NULL;
- }
- }
- if (program) {
- clReleaseProgram(program);
- program = NULL;
- }
-}
-
-void OpenCLDevice::OpenCLProgram::add_log(const string &msg, bool debug)
-{
- if (!use_stdout) {
- log += msg + "\n";
- }
- else if (!debug) {
- printf("%s\n", msg.c_str());
- fflush(stdout);
- }
- else {
- VLOG(2) << msg;
- }
-}
-
-void OpenCLDevice::OpenCLProgram::add_error(const string &msg)
-{
- if (use_stdout) {
- fprintf(stderr, "%s\n", msg.c_str());
- }
- if (error_msg == "") {
- error_msg += "\n";
- }
- error_msg += msg;
-}
-
-void OpenCLDevice::OpenCLProgram::add_kernel(ustring name)
-{
- if (!kernels.count(name)) {
- kernels[name] = NULL;
- }
-}
-
-bool OpenCLDevice::OpenCLProgram::build_kernel(const string *debug_src)
-{
- string build_options;
- build_options = device->kernel_build_options(debug_src) + kernel_build_options;
-
- VLOG(1) << "Build options passed to clBuildProgram: '" << build_options << "'.";
- cl_int ciErr = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL);
-
- /* show warnings even if build is successful */
- size_t ret_val_size = 0;
-
- clGetProgramBuildInfo(program, device->cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
-
- if (ciErr != CL_SUCCESS) {
- add_error(string("OpenCL build failed with error ") + clewErrorString(ciErr) +
- ", errors in console.");
- }
-
- if (ret_val_size > 1) {
- vector<char> build_log(ret_val_size + 1);
- clGetProgramBuildInfo(
- program, device->cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL);
-
- build_log[ret_val_size] = '\0';
- /* Skip meaningless empty output from the NVidia compiler. */
- if (!(ret_val_size == 2 && build_log[0] == '\n')) {
- add_log(string("OpenCL program ") + program_name + " build output: " + string(&build_log[0]),
- ciErr == CL_SUCCESS);
- }
- }
-
- return (ciErr == CL_SUCCESS);
-}
-
-bool OpenCLDevice::OpenCLProgram::compile_kernel(const string *debug_src)
-{
- string source = get_program_source(kernel_file);
-
- if (debug_src) {
- path_write_text(*debug_src, source);
- }
-
- size_t source_len = source.size();
- const char *source_str = source.c_str();
- cl_int ciErr;
-
- program = clCreateProgramWithSource(device->cxContext, 1, &source_str, &source_len, &ciErr);
-
- if (ciErr != CL_SUCCESS) {
- add_error(string("OpenCL program creation failed: ") + clewErrorString(ciErr));
- return false;
- }
-
- double starttime = time_dt();
- add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false);
- add_log(string("Build flags: ") + kernel_build_options, true);
-
- if (!build_kernel(debug_src))
- return false;
-
- double elapsed = time_dt() - starttime;
- add_log(
- string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed),
- false);
-
- return true;
-}
-
-static void escape_python_string(string &str)
-{
-  /* Escape string to be passed as a Python raw string with '' quotes. */
-  string_replace(str, "'", "\\'");
-}
-
-static int opencl_compile_process_limit()
-{
-  /* Limit the number of concurrent compile processes, with a heuristic based
-   * on total physical RAM and an estimate of the memory needed when compiling
-   * with all Cycles features enabled.
-   *
-   * This is somewhat arbitrary, as we don't know the actual available RAM or
-   * how much memory the kernel compilation will need depending on the
-   * features, but it is better than not limiting at all. */
- static const int64_t GB = 1024LL * 1024LL * 1024LL;
- static const int64_t process_memory = 2 * GB;
- static const int64_t base_memory = 2 * GB;
- static const int64_t system_memory = system_physical_ram();
- static const int64_t process_limit = (system_memory - base_memory) / process_memory;
-
- return max((int)process_limit, 1);
-}
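
For example, on a machine with 16 GB of physical RAM the heuristic above allows (16 - 2) / 2 = 7 concurrent compile processes, while on a 4 GB machine it yields (4 - 2) / 2 = 1; the final max() clamp guarantees at least one process even when the subtraction goes negative.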
-
-bool OpenCLDevice::OpenCLProgram::compile_separate(const string &clbin)
-{
- /* Construct arguments. */
- vector<string> args;
- args.push_back("--background");
- args.push_back("--factory-startup");
- args.push_back("--python-expr");
-
- int device_platform_id = device->device_num;
- string device_name = device->device_name;
- string platform_name = device->platform_name;
- string build_options = device->kernel_build_options(NULL) + kernel_build_options;
- string kernel_file_escaped = kernel_file;
- string clbin_escaped = clbin;
-
- escape_python_string(device_name);
- escape_python_string(platform_name);
- escape_python_string(build_options);
- escape_python_string(kernel_file_escaped);
- escape_python_string(clbin_escaped);
-
- args.push_back(string_printf(
- "import _cycles; _cycles.opencl_compile(r'%d', r'%s', r'%s', r'%s', r'%s', r'%s')",
- device_platform_id,
- device_name.c_str(),
- platform_name.c_str(),
- build_options.c_str(),
- kernel_file_escaped.c_str(),
- clbin_escaped.c_str()));
-
- /* Limit number of concurrent processes compiling. */
- static thread_counting_semaphore semaphore(opencl_compile_process_limit());
- semaphore.acquire();
-
- /* Compile. */
- const double starttime = time_dt();
- add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false);
- add_log(string("Build flags: ") + kernel_build_options, true);
- const bool success = system_call_self(args);
- const double elapsed = time_dt() - starttime;
-
- semaphore.release();
-
- if (!success || !path_exists(clbin)) {
- return false;
- }
-
- add_log(
- string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed),
- false);
-
- return load_binary(clbin);
-}
-
-/* Compile OpenCL kernel. This function is called from the _cycles Python
- * module to compile kernels. Parameters must match the function above. */
-bool device_opencl_compile_kernel(const vector<string> &parameters)
-{
- int device_platform_id = std::stoi(parameters[0]);
- const string &device_name = parameters[1];
- const string &platform_name = parameters[2];
- const string &build_options = parameters[3];
- const string &kernel_file = parameters[4];
- const string &binary_path = parameters[5];
-
- if (clewInit() != CLEW_SUCCESS) {
- return false;
- }
-
- vector<OpenCLPlatformDevice> usable_devices;
- OpenCLInfo::get_usable_devices(&usable_devices);
- if (device_platform_id >= usable_devices.size()) {
- return false;
- }
-
- OpenCLPlatformDevice &platform_device = usable_devices[device_platform_id];
- if (platform_device.platform_name != platform_name ||
- platform_device.device_name != device_name) {
- return false;
- }
-
- cl_platform_id platform = platform_device.platform_id;
- cl_device_id device = platform_device.device_id;
- const cl_context_properties context_props[] = {
- CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0, 0};
-
- cl_int err;
- cl_context context = clCreateContext(context_props, 1, &device, NULL, NULL, &err);
- if (err != CL_SUCCESS) {
- return false;
- }
-
- string source = get_program_source(kernel_file);
- size_t source_len = source.size();
- const char *source_str = source.c_str();
- cl_program program = clCreateProgramWithSource(context, 1, &source_str, &source_len, &err);
- bool result = false;
-
- if (err == CL_SUCCESS) {
- err = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL);
-
- if (err == CL_SUCCESS) {
- size_t size = 0;
- clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
- if (size > 0) {
- vector<uint8_t> binary(size);
- uint8_t *bytes = &binary[0];
- clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &bytes, NULL);
- result = path_write_binary(binary_path, binary);
- }
- }
- clReleaseProgram(program);
- }
-
- clReleaseContext(context);
-
- return result;
-}
-
-bool OpenCLDevice::OpenCLProgram::load_binary(const string &clbin, const string *debug_src)
-{
- /* read binary into memory */
- vector<uint8_t> binary;
-
- if (!path_read_binary(clbin, binary)) {
- add_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str()));
- return false;
- }
-
- /* create program */
- cl_int status, ciErr;
- size_t size = binary.size();
- const uint8_t *bytes = &binary[0];
-
- program = clCreateProgramWithBinary(
- device->cxContext, 1, &device->cdDevice, &size, &bytes, &status, &ciErr);
-
- if (status != CL_SUCCESS || ciErr != CL_SUCCESS) {
- add_error(string("OpenCL failed create program from cached binary ") + clbin + ": " +
- clewErrorString(status) + " " + clewErrorString(ciErr));
- return false;
- }
-
- if (!build_kernel(debug_src))
- return false;
-
- return true;
-}
-
-bool OpenCLDevice::OpenCLProgram::save_binary(const string &clbin)
-{
- size_t size = 0;
- clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
-
- if (!size)
- return false;
-
- vector<uint8_t> binary(size);
- uint8_t *bytes = &binary[0];
-
- clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &bytes, NULL);
-
- return path_write_binary(clbin, binary);
-}
-
-bool OpenCLDevice::OpenCLProgram::load()
-{
- loaded = false;
- string device_md5 = device->device_md5_hash(kernel_build_options);
-
- /* Try to use cached kernel. */
- thread_scoped_lock cache_locker;
- ustring cache_key(program_name + device_md5);
- program = device->load_cached_kernel(cache_key, cache_locker);
- if (!program) {
- add_log(string("OpenCL program ") + program_name + " not found in cache.", true);
-
- /* need to create source to get md5 */
- string source = get_program_source(kernel_file);
-
- string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" +
- util_md5_string(source);
- basename = path_cache_get(path_join("kernels", basename));
- string clbin = basename + ".clbin";
-
-    /* If the binary kernel already exists, try to use it. */
- if (path_exists(clbin) && load_binary(clbin)) {
- /* Kernel loaded from binary, nothing to do. */
- add_log(string("Loaded program from ") + clbin + ".", true);
-
- /* Cache the program. */
- device->store_cached_kernel(program, cache_key, cache_locker);
- }
- else {
- add_log(string("OpenCL program ") + program_name + " not found on disk.", true);
- cache_locker.unlock();
- }
- }
-
- if (program) {
- create_kernels();
- loaded = true;
- needs_compiling = false;
- }
-
- return loaded;
-}
-
-void OpenCLDevice::OpenCLProgram::compile()
-{
- assert(device);
-
- string device_md5 = device->device_md5_hash(kernel_build_options);
-
- /* Try to use cached kernel. */
- thread_scoped_lock cache_locker;
- ustring cache_key(program_name + device_md5);
- program = device->load_cached_kernel(cache_key, cache_locker);
-
- if (!program) {
-
- add_log(string("OpenCL program ") + program_name + " not found in cache.", true);
-
- /* need to create source to get md5 */
- string source = get_program_source(kernel_file);
-
- string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" +
- util_md5_string(source);
- basename = path_cache_get(path_join("kernels", basename));
- string clbin = basename + ".clbin";
-
- /* path to preprocessed source for debugging */
- string clsrc, *debug_src = NULL;
-
- if (OpenCLInfo::use_debug()) {
- clsrc = basename + ".cl";
- debug_src = &clsrc;
- }
-
- if (DebugFlags().running_inside_blender && compile_separate(clbin)) {
- add_log(string("Built and loaded program from ") + clbin + ".", true);
- loaded = true;
- }
- else {
- if (DebugFlags().running_inside_blender) {
- add_log(string("Separate-process building of ") + clbin +
- " failed, will fall back to regular building.",
- true);
- }
-
-      /* If the binary does not exist or failed to load, compile the kernel. */
- if (!compile_kernel(debug_src)) {
- needs_compiling = false;
- return;
- }
-
- /* Save binary for reuse. */
- if (!save_binary(clbin)) {
- add_log(string("Saving compiled OpenCL kernel to ") + clbin + " failed!", true);
- }
- }
-
- /* Cache the program. */
- device->store_cached_kernel(program, cache_key, cache_locker);
- }
-
- create_kernels();
- needs_compiling = false;
- loaded = true;
-}
-
-void OpenCLDevice::OpenCLProgram::create_kernels()
-{
- for (map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end();
- ++kernel) {
- assert(kernel->second == NULL);
- cl_int ciErr;
- string name = "kernel_ocl_" + kernel->first.string();
- kernel->second = clCreateKernel(program, name.c_str(), &ciErr);
- if (device->opencl_error(ciErr)) {
- add_error(string("Error getting kernel ") + name + " from program " + program_name + ": " +
- clewErrorString(ciErr));
- return;
- }
- }
-}
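
So a kernel registered earlier via add_kernel(ustring("filter_nlm_blur")) is looked up in the compiled program under the symbol kernel_ocl_filter_nlm_blur; the kernel_ocl_ prefix is the naming convention used for the OpenCL entry points.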
-
-bool OpenCLDevice::OpenCLProgram::wait_for_availability()
-{
- add_log(string("Waiting for availability of ") + program_name + ".", true);
- while (needs_compiling) {
- time_sleep(0.1);
- }
- return loaded;
-}
-
-void OpenCLDevice::OpenCLProgram::report_error()
-{
- /* If loaded is true, there was no error. */
- if (loaded)
- return;
-  /* If use_stdout is true, the error was already reported. */
- if (use_stdout)
- return;
-
- cerr << error_msg << endl;
- if (!compile_output.empty()) {
- cerr << "OpenCL kernel build output for " << program_name << ":" << endl;
- cerr << compile_output << endl;
- }
-}
-
-cl_kernel OpenCLDevice::OpenCLProgram::operator()()
-{
- assert(kernels.size() == 1);
- return kernels.begin()->second;
-}
-
-cl_kernel OpenCLDevice::OpenCLProgram::operator()(ustring name)
-{
- assert(kernels.count(name));
- return kernels[name];
-}
-
-cl_device_type OpenCLInfo::device_type()
-{
- switch (DebugFlags().opencl.device_type) {
- case DebugFlags::OpenCL::DEVICE_NONE:
- return 0;
- case DebugFlags::OpenCL::DEVICE_ALL:
- return CL_DEVICE_TYPE_ALL;
- case DebugFlags::OpenCL::DEVICE_DEFAULT:
- return CL_DEVICE_TYPE_DEFAULT;
- case DebugFlags::OpenCL::DEVICE_CPU:
- return CL_DEVICE_TYPE_CPU;
- case DebugFlags::OpenCL::DEVICE_GPU:
- return CL_DEVICE_TYPE_GPU;
- case DebugFlags::OpenCL::DEVICE_ACCELERATOR:
- return CL_DEVICE_TYPE_ACCELERATOR;
- default:
- return CL_DEVICE_TYPE_ALL;
- }
-}
-
-bool OpenCLInfo::use_debug()
-{
- return DebugFlags().opencl.debug;
-}
-
-bool OpenCLInfo::device_supported(const string &platform_name, const cl_device_id device_id)
-{
- cl_device_type device_type;
- if (!get_device_type(device_id, &device_type)) {
- return false;
- }
- string device_name;
- if (!get_device_name(device_id, &device_name)) {
- return false;
- }
-
- int driver_major = 0;
- int driver_minor = 0;
- if (!get_driver_version(device_id, &driver_major, &driver_minor)) {
- return false;
- }
- VLOG(3) << "OpenCL driver version " << driver_major << "." << driver_minor;
-
- if (getenv("CYCLES_OPENCL_TEST")) {
- return true;
- }
-
- /* Allow Intel GPUs on Intel OpenCL platform. */
- if (platform_name.find("Intel") != string::npos) {
- if (device_type != CL_DEVICE_TYPE_GPU) {
- /* OpenCL on Intel CPU is not an officially supported configuration.
- * Use hybrid CPU+GPU rendering to utilize both GPU and CPU. */
- return false;
- }
-
-# ifdef __APPLE__
-    /* Apple uses its own framework, which can also put Iris onto the AMD
-     * framework. This isn't a supported configuration. */
- return false;
-# else
- if (device_name.find("Iris") != string::npos || device_name.find("Xe") != string::npos) {
- return true;
- }
-# endif
- }
-
- if (platform_name == "AMD Accelerated Parallel Processing" &&
- device_type == CL_DEVICE_TYPE_GPU) {
- if (driver_major < 2236) {
- VLOG(1) << "AMD driver version " << driver_major << "." << driver_minor << " not supported.";
- return false;
- }
- const char *blacklist[] = {/* GCN 1 */
- "Tahiti",
- "Pitcairn",
- "Capeverde",
- "Oland",
- "Hainan",
- NULL};
- for (int i = 0; blacklist[i] != NULL; i++) {
- if (device_name == blacklist[i]) {
- VLOG(1) << "AMD device " << device_name << " not supported";
- return false;
- }
- }
- return true;
- }
- if (platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) {
- return false;
- }
- return false;
-}
-
-bool OpenCLInfo::platform_version_check(cl_platform_id platform, string *error)
-{
- const int req_major = 1, req_minor = 1;
- int major, minor;
- char version[256];
- clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(version), &version, NULL);
- if (sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) {
- if (error != NULL) {
- *error = string_printf("OpenCL: failed to parse platform version string (%s).", version);
- }
- return false;
- }
- if (!((major == req_major && minor >= req_minor) || (major > req_major))) {
- if (error != NULL) {
- *error = string_printf(
- "OpenCL: platform version 1.1 or later required, found %d.%d", major, minor);
- }
- return false;
- }
- if (error != NULL) {
- *error = "";
- }
- return true;
-}
-
-bool OpenCLInfo::get_device_version(cl_device_id device, int *r_major, int *r_minor, string *error)
-{
- char version[256];
- clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(version), &version, NULL);
- if (sscanf(version, "OpenCL C %d.%d", r_major, r_minor) < 2) {
- if (error != NULL) {
- *error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version);
- }
- return false;
- }
- if (error != NULL) {
- *error = "";
- }
- return true;
-}
-
-bool OpenCLInfo::device_version_check(cl_device_id device, string *error)
-{
- const int req_major = 1, req_minor = 1;
- int major, minor;
- if (!get_device_version(device, &major, &minor, error)) {
- return false;
- }
-
- if (!((major == req_major && minor >= req_minor) || (major > req_major))) {
- if (error != NULL) {
- *error = string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor);
- }
- return false;
- }
- if (error != NULL) {
- *error = "";
- }
- return true;
-}
-
-string OpenCLInfo::get_hardware_id(const string &platform_name, cl_device_id device_id)
-{
- if (platform_name == "AMD Accelerated Parallel Processing" || platform_name == "Apple") {
- /* Use cl_amd_device_topology extension. */
- cl_char topology[24];
- if (clGetDeviceInfo(device_id, 0x4037, sizeof(topology), topology, NULL) == CL_SUCCESS &&
- topology[0] == 1) {
- return string_printf("%02x:%02x.%01x",
- (unsigned int)topology[21],
- (unsigned int)topology[22],
- (unsigned int)topology[23]);
- }
- }
- else if (platform_name == "NVIDIA CUDA") {
- /* Use two undocumented options of the cl_nv_device_attribute_query extension. */
- cl_int bus_id, slot_id;
- if (clGetDeviceInfo(device_id, 0x4008, sizeof(cl_int), &bus_id, NULL) == CL_SUCCESS &&
- clGetDeviceInfo(device_id, 0x4009, sizeof(cl_int), &slot_id, NULL) == CL_SUCCESS) {
- return string_printf("%02x:%02x.%01x",
- (unsigned int)(bus_id),
- (unsigned int)(slot_id >> 3),
- (unsigned int)(slot_id & 0x7));
- }
- }
- /* No general way to get a hardware ID from OpenCL => give up. */
- return "";
-}
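-
-/* Editor's note, illustrative only: for the NVIDIA branch above, a
- * hypothetical bus_id = 0x01 and slot_id = 0x43 decode to device 0x08
- * (0x43 >> 3) and function 0x3 (0x43 & 0x7), producing the string "01:08.3". */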
-
-void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices)
-{
- const cl_device_type device_type = OpenCLInfo::device_type();
- static bool first_time = true;
-# define FIRST_VLOG(severity) \
- if (first_time) \
- VLOG(severity)
-
- usable_devices->clear();
-
- if (device_type == 0) {
- FIRST_VLOG(2) << "OpenCL devices are forced to be disabled.";
- first_time = false;
- return;
- }
-
- cl_int error;
- vector<cl_device_id> device_ids;
- vector<cl_platform_id> platform_ids;
-
- /* Get platforms. */
- if (!get_platforms(&platform_ids, &error)) {
-    FIRST_VLOG(2) << "Error fetching platforms: " << string(clewErrorString(error));
- first_time = false;
- return;
- }
- if (platform_ids.size() == 0) {
- FIRST_VLOG(2) << "No OpenCL platforms were found.";
- first_time = false;
- return;
- }
- /* Devices are numbered consecutively across platforms. */
- for (int platform = 0; platform < platform_ids.size(); platform++) {
- cl_platform_id platform_id = platform_ids[platform];
- string platform_name;
- if (!get_platform_name(platform_id, &platform_name)) {
- FIRST_VLOG(2) << "Failed to get platform name, ignoring.";
- continue;
- }
- FIRST_VLOG(2) << "Enumerating devices for platform " << platform_name << ".";
- if (!platform_version_check(platform_id)) {
-      FIRST_VLOG(2) << "Ignoring platform " << platform_name
-                    << " due to too old platform version.";
- continue;
- }
- if (!get_platform_devices(platform_id, device_type, &device_ids, &error)) {
-      FIRST_VLOG(2) << "Ignoring platform " << platform_name
-                    << ", failed to fetch devices: " << string(clewErrorString(error));
- continue;
- }
- if (device_ids.size() == 0) {
- FIRST_VLOG(2) << "Ignoring platform " << platform_name << ", it has no devices.";
- continue;
- }
- for (int num = 0; num < device_ids.size(); num++) {
- const cl_device_id device_id = device_ids[num];
- string device_name;
- if (!get_device_name(device_id, &device_name, &error)) {
- FIRST_VLOG(2) << "Failed to fetch device name: " << string(clewErrorString(error))
- << ", ignoring.";
- continue;
- }
- if (!device_version_check(device_id)) {
- FIRST_VLOG(2) << "Ignoring device " << device_name << " due to old compiler version.";
- continue;
- }
- if (device_supported(platform_name, device_id)) {
- cl_device_type device_type;
- if (!get_device_type(device_id, &device_type, &error)) {
-          FIRST_VLOG(2) << "Ignoring device " << device_name
-                        << ", failed to fetch device type: " << string(clewErrorString(error));
- continue;
- }
- string readable_device_name = get_readable_device_name(device_id);
- if (readable_device_name != device_name) {
- FIRST_VLOG(2) << "Using more readable device name: " << readable_device_name;
- }
- FIRST_VLOG(2) << "Adding new device " << readable_device_name << ".";
- string hardware_id = get_hardware_id(platform_name, device_id);
- string device_extensions = get_device_extensions(device_id);
- usable_devices->push_back(OpenCLPlatformDevice(platform_id,
- platform_name,
- device_id,
- device_type,
- readable_device_name,
- hardware_id,
- device_extensions));
- }
- else {
- FIRST_VLOG(2) << "Ignoring device " << device_name << ", not officially supported yet.";
- }
- }
- }
- first_time = false;
-}
-
-bool OpenCLInfo::get_platforms(vector<cl_platform_id> *platform_ids, cl_int *error)
-{
- /* Reset from possible previous state. */
- platform_ids->resize(0);
- cl_uint num_platforms;
- if (!get_num_platforms(&num_platforms, error)) {
- return false;
- }
- /* Get actual platforms. */
- cl_int err;
- platform_ids->resize(num_platforms);
- if ((err = clGetPlatformIDs(num_platforms, &platform_ids->at(0), NULL)) != CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
-}
-
-vector<cl_platform_id> OpenCLInfo::get_platforms()
-{
- vector<cl_platform_id> platform_ids;
- get_platforms(&platform_ids);
- return platform_ids;
-}
-
-bool OpenCLInfo::get_num_platforms(cl_uint *num_platforms, cl_int *error)
-{
- cl_int err;
- if ((err = clGetPlatformIDs(0, NULL, num_platforms)) != CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *num_platforms = 0;
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
-}
-
-cl_uint OpenCLInfo::get_num_platforms()
-{
- cl_uint num_platforms;
- if (!get_num_platforms(&num_platforms)) {
- return 0;
- }
- return num_platforms;
-}
-
-bool OpenCLInfo::get_platform_name(cl_platform_id platform_id, string *platform_name)
-{
- char buffer[256];
- if (clGetPlatformInfo(platform_id, CL_PLATFORM_NAME, sizeof(buffer), &buffer, NULL) !=
- CL_SUCCESS) {
- *platform_name = "";
- return false;
- }
- *platform_name = buffer;
- return true;
-}
-
-string OpenCLInfo::get_platform_name(cl_platform_id platform_id)
-{
- string platform_name;
- if (!get_platform_name(platform_id, &platform_name)) {
- return "";
- }
- return platform_name;
-}
-
-bool OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type,
- cl_uint *num_devices,
- cl_int *error)
-{
- cl_int err;
- if ((err = clGetDeviceIDs(platform_id, device_type, 0, NULL, num_devices)) != CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *num_devices = 0;
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
-}
-
-cl_uint OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type)
-{
- cl_uint num_devices;
- if (!get_num_platform_devices(platform_id, device_type, &num_devices)) {
- return 0;
- }
- return num_devices;
-}
-
-bool OpenCLInfo::get_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type,
- vector<cl_device_id> *device_ids,
- cl_int *error)
-{
- /* Reset from possible previous state. */
- device_ids->resize(0);
- /* Get number of devices to pre-allocate memory. */
- cl_uint num_devices;
- if (!get_num_platform_devices(platform_id, device_type, &num_devices, error)) {
- return false;
- }
- /* Get actual device list. */
- device_ids->resize(num_devices);
- cl_int err;
- if ((err = clGetDeviceIDs(platform_id, device_type, num_devices, &device_ids->at(0), NULL)) !=
- CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
-}
-
-vector<cl_device_id> OpenCLInfo::get_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type)
-{
- vector<cl_device_id> devices;
- get_platform_devices(platform_id, device_type, &devices);
- return devices;
-}
-
-bool OpenCLInfo::get_device_name(cl_device_id device_id, string *device_name, cl_int *error)
-{
- char buffer[1024];
- cl_int err;
- if ((err = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(buffer), &buffer, NULL)) !=
- CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *device_name = "";
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- *device_name = buffer;
- return true;
-}
-
-string OpenCLInfo::get_device_name(cl_device_id device_id)
-{
- string device_name;
- if (!get_device_name(device_id, &device_name)) {
- return "";
- }
- return device_name;
-}
-
-bool OpenCLInfo::get_device_extensions(cl_device_id device_id,
- string *device_extensions,
- cl_int *error)
-{
- size_t extension_length = 0;
- cl_int err;
- /* Determine the size of the extension string. */
- if ((err = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, 0, 0, &extension_length)) !=
- CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *device_extensions = "";
- return false;
- }
- vector<char> buffer(extension_length);
- if ((err = clGetDeviceInfo(
- device_id, CL_DEVICE_EXTENSIONS, extension_length, buffer.data(), NULL)) !=
- CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *device_extensions = "";
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- *device_extensions = string(buffer.data());
- return true;
-}
-
-string OpenCLInfo::get_device_extensions(cl_device_id device_id)
-{
- string device_extensions;
- if (!get_device_extensions(device_id, &device_extensions)) {
- return "";
- }
- return device_extensions;
-}
-
-bool OpenCLInfo::get_device_type(cl_device_id device_id,
- cl_device_type *device_type,
- cl_int *error)
-{
- cl_int err;
- if ((err = clGetDeviceInfo(
- device_id, CL_DEVICE_TYPE, sizeof(cl_device_type), device_type, NULL)) != CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *device_type = 0;
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
-}
-
-cl_device_type OpenCLInfo::get_device_type(cl_device_id device_id)
-{
- cl_device_type device_type;
- if (!get_device_type(device_id, &device_type)) {
- return 0;
- }
- return device_type;
-}
-
-string OpenCLInfo::get_readable_device_name(cl_device_id device_id)
-{
- string name = "";
- char board_name[1024];
- size_t length = 0;
- if (clGetDeviceInfo(
- device_id, CL_DEVICE_BOARD_NAME_AMD, sizeof(board_name), &board_name, &length) ==
- CL_SUCCESS) {
- if (length != 0 && board_name[0] != '\0') {
- name = board_name;
- }
- }
-
- /* Fallback to standard device name API. */
- if (name.empty()) {
- name = get_device_name(device_id);
- }
-
-  /* Special exception for AMD Vega: we need to be able to tell
-   * Vega 56 and Vega 64 apart.
-   */
- if (name == "Radeon RX Vega") {
- cl_int max_compute_units = 0;
- if (clGetDeviceInfo(device_id,
- CL_DEVICE_MAX_COMPUTE_UNITS,
- sizeof(max_compute_units),
- &max_compute_units,
- NULL) == CL_SUCCESS) {
- name += " " + to_string(max_compute_units);
- }
- }
-
- /* Distinguish from our native CPU device. */
- if (get_device_type(device_id) & CL_DEVICE_TYPE_CPU) {
- name += " (OpenCL)";
- }
-
- return name;
-}
-
-bool OpenCLInfo::get_driver_version(cl_device_id device_id, int *major, int *minor, cl_int *error)
-{
- char buffer[1024];
- cl_int err;
- if ((err = clGetDeviceInfo(device_id, CL_DRIVER_VERSION, sizeof(buffer), &buffer, NULL)) !=
- CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- if (sscanf(buffer, "%d.%d", major, minor) < 2) {
- VLOG(1) << string_printf("OpenCL: failed to parse driver version string (%s).", buffer);
- return false;
- }
- return true;
-}
-
-int OpenCLInfo::mem_sub_ptr_alignment(cl_device_id device_id)
-{
- int base_align_bits;
- if (clGetDeviceInfo(
- device_id, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(int), &base_align_bits, NULL) ==
- CL_SUCCESS) {
- return base_align_bits / 8;
- }
- return 1;
-}
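-
-/* Editor's note, illustrative only: CL_DEVICE_MEM_BASE_ADDR_ALIGN is reported
- * in bits, so a typical value of 1024 yields a sub-buffer alignment of
- * 1024 / 8 = 128 bytes. */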
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/device/optix/device.cpp b/intern/cycles/device/optix/device.cpp
new file mode 100644
index 00000000000..13f23bd229a
--- /dev/null
+++ b/intern/cycles/device/optix/device.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2019, NVIDIA Corporation.
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/optix/device.h"
+
+#include "device/cuda/device.h"
+#include "device/optix/device_impl.h"
+#include "util/util_logging.h"
+
+#ifdef WITH_OPTIX
+# include <optix_function_table_definition.h>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+bool device_optix_init()
+{
+#ifdef WITH_OPTIX
+ if (g_optixFunctionTable.optixDeviceContextCreate != NULL) {
+ /* Already initialized function table. */
+ return true;
+ }
+
+ /* Need to initialize CUDA as well. */
+ if (!device_cuda_init()) {
+ return false;
+ }
+
+ const OptixResult result = optixInit();
+
+ if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) {
+ VLOG(1) << "OptiX initialization failed because the installed NVIDIA driver is too old. "
+ "Please update to the latest driver first!";
+ return false;
+ }
+ else if (result != OPTIX_SUCCESS) {
+ VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result;
+ return false;
+ }
+
+ /* Loaded OptiX successfully! */
+ return true;
+#else
+ return false;
+#endif
+}
+
+void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices)
+{
+#ifdef WITH_OPTIX
+ devices.reserve(cuda_devices.size());
+
+ /* Simply add all supported CUDA devices as OptiX devices again. */
+ for (DeviceInfo info : cuda_devices) {
+ assert(info.type == DEVICE_CUDA);
+
+ int major;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num);
+ if (major < 5) {
+ /* Only Maxwell and up are supported by OptiX. */
+ continue;
+ }
+
+ info.type = DEVICE_OPTIX;
+ info.id += "_OptiX";
+ info.denoisers |= DENOISER_OPTIX;
+
+ devices.push_back(info);
+ }
+#else
+ (void)cuda_devices;
+ (void)devices;
+#endif
+}
+
+Device *device_optix_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+{
+#ifdef WITH_OPTIX
+ return new OptiXDevice(info, stats, profiler);
+#else
+ (void)info;
+ (void)stats;
+ (void)profiler;
+
+ LOG(FATAL) << "Request to create OptiX device without compiled-in support. Should never happen.";
+
+ return nullptr;
+#endif
+}
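+
+/* Editor's sketch of the intended call order (hypothetical caller, not part of
+ * this change): initialize the function table once, then mirror the CUDA
+ * device list into OptiX entries:
+ *
+ *   if (device_optix_init()) {
+ *     vector<DeviceInfo> optix_devices;
+ *     device_optix_info(cuda_devices, optix_devices);
+ *   }
+ */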
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/optix/device.h b/intern/cycles/device/optix/device.h
new file mode 100644
index 00000000000..29fa729c2e4
--- /dev/null
+++ b/intern/cycles/device/optix/device.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_string.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+bool device_optix_init();
+
+Device *device_optix_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp
new file mode 100644
index 00000000000..cd16b8c9f01
--- /dev/null
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -0,0 +1,1573 @@
+/*
+ * Copyright 2019, NVIDIA Corporation.
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_OPTIX
+
+# include "device/optix/device_impl.h"
+
+# include "bvh/bvh.h"
+# include "bvh/bvh_optix.h"
+# include "integrator/pass_accessor_gpu.h"
+# include "render/buffers.h"
+# include "render/hair.h"
+# include "render/mesh.h"
+# include "render/object.h"
+# include "render/pass.h"
+# include "render/scene.h"
+
+# include "util/util_debug.h"
+# include "util/util_logging.h"
+# include "util/util_md5.h"
+# include "util/util_path.h"
+# include "util/util_progress.h"
+# include "util/util_time.h"
+
+# undef __KERNEL_CPU__
+# define __KERNEL_OPTIX__
+# include "kernel/device/optix/globals.h"
+
+CCL_NAMESPACE_BEGIN
+
+OptiXDevice::Denoiser::Denoiser(OptiXDevice *device)
+ : device(device), queue(device), state(device, "__denoiser_state")
+{
+}
+
+OptiXDevice::Denoiser::~Denoiser()
+{
+ const CUDAContextScope scope(device);
+ if (optix_denoiser != nullptr) {
+ optixDenoiserDestroy(optix_denoiser);
+ }
+}
+
+OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+ : CUDADevice(info, stats, profiler),
+ sbt_data(this, "__sbt", MEM_READ_ONLY),
+ launch_params(this, "__params"),
+ denoiser_(this)
+{
+ /* Make the CUDA context current. */
+ if (!cuContext) {
+ /* Do not initialize if CUDA context creation failed already. */
+ return;
+ }
+ const CUDAContextScope scope(this);
+
+ /* Create OptiX context for this device. */
+ OptixDeviceContextOptions options = {};
+# ifdef WITH_CYCLES_LOGGING
+ options.logCallbackLevel = 4; /* Fatal = 1, Error = 2, Warning = 3, Print = 4. */
+ options.logCallbackFunction = [](unsigned int level, const char *, const char *message, void *) {
+ switch (level) {
+ case 1:
+ LOG_IF(FATAL, VLOG_IS_ON(1)) << message;
+ break;
+ case 2:
+ LOG_IF(ERROR, VLOG_IS_ON(1)) << message;
+ break;
+ case 3:
+ LOG_IF(WARNING, VLOG_IS_ON(1)) << message;
+ break;
+ case 4:
+ LOG_IF(INFO, VLOG_IS_ON(1)) << message;
+ break;
+ }
+ };
+# endif
+ if (DebugFlags().optix.use_debug) {
+ options.validationMode = OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_ALL;
+ }
+ optix_assert(optixDeviceContextCreate(cuContext, &options, &context));
+# ifdef WITH_CYCLES_LOGGING
+ optix_assert(optixDeviceContextSetLogCallback(
+ context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel));
+# endif
+
+ /* Fix weird compiler bug that assigns wrong size. */
+ launch_params.data_elements = sizeof(KernelParamsOptiX);
+
+ /* Allocate launch parameter buffer memory on device. */
+ launch_params.alloc_to_device(1);
+}
+
+OptiXDevice::~OptiXDevice()
+{
+ /* Make CUDA context current. */
+ const CUDAContextScope scope(this);
+
+ free_bvh_memory_delayed();
+
+ sbt_data.free();
+ texture_info.free();
+ launch_params.free();
+
+ /* Unload modules. */
+ if (optix_module != NULL) {
+ optixModuleDestroy(optix_module);
+ }
+ for (unsigned int i = 0; i < 2; ++i) {
+ if (builtin_modules[i] != NULL) {
+ optixModuleDestroy(builtin_modules[i]);
+ }
+ }
+ for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
+ if (pipelines[i] != NULL) {
+ optixPipelineDestroy(pipelines[i]);
+ }
+ }
+
+ optixDeviceContextDestroy(context);
+}
+
+unique_ptr<DeviceQueue> OptiXDevice::gpu_queue_create()
+{
+ return make_unique<OptiXDeviceQueue>(this);
+}
+
+BVHLayoutMask OptiXDevice::get_bvh_layout_mask() const
+{
+ /* OptiX has its own internal acceleration structure format. */
+ return BVH_LAYOUT_OPTIX;
+}
+
+string OptiXDevice::compile_kernel_get_common_cflags(const uint kernel_features)
+{
+ string common_cflags = CUDADevice::compile_kernel_get_common_cflags(kernel_features);
+
+ /* Add OptiX SDK include directory to include paths. */
+ const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR");
+ if (optix_sdk_path) {
+ common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path);
+ }
+
+ /* Specialization for shader raytracing. */
+ if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+ common_cflags += " --keep-device-functions";
+ }
+
+ return common_cflags;
+}
+
+bool OptiXDevice::load_kernels(const uint kernel_features)
+{
+ if (have_error()) {
+ /* Abort early if context creation failed already. */
+ return false;
+ }
+
+ /* Load CUDA modules because we need some of the utility kernels. */
+ if (!CUDADevice::load_kernels(kernel_features)) {
+ return false;
+ }
+
+ /* Skip creating OptiX module if only doing denoising. */
+ if (!(kernel_features & (KERNEL_FEATURE_PATH_TRACING | KERNEL_FEATURE_BAKING))) {
+ return true;
+ }
+
+ const CUDAContextScope scope(this);
+
+ /* Unload existing OptiX module and pipelines first. */
+ if (optix_module != NULL) {
+ optixModuleDestroy(optix_module);
+ optix_module = NULL;
+ }
+ for (unsigned int i = 0; i < 2; ++i) {
+ if (builtin_modules[i] != NULL) {
+ optixModuleDestroy(builtin_modules[i]);
+ builtin_modules[i] = NULL;
+ }
+ }
+ for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
+ if (pipelines[i] != NULL) {
+ optixPipelineDestroy(pipelines[i]);
+ pipelines[i] = NULL;
+ }
+ }
+
+ OptixModuleCompileOptions module_options = {};
+ module_options.maxRegisterCount = 0; /* Do not set an explicit register limit. */
+
+ if (DebugFlags().optix.use_debug) {
+ module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_0;
+ module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
+ }
+ else {
+ module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
+ module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
+ }
+
+ module_options.boundValues = nullptr;
+ module_options.numBoundValues = 0;
+
+ OptixPipelineCompileOptions pipeline_options = {};
+ /* Default to no motion blur and two-level graph, since it is the fastest option. */
+ pipeline_options.usesMotionBlur = false;
+ pipeline_options.traversableGraphFlags =
+ OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING;
+ pipeline_options.numPayloadValues = 6;
+ pipeline_options.numAttributeValues = 2; /* u, v */
+ pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE;
+ pipeline_options.pipelineLaunchParamsVariableName = "__params"; /* See globals.h */
+
+ pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE;
+ if (kernel_features & KERNEL_FEATURE_HAIR) {
+ if (kernel_features & KERNEL_FEATURE_HAIR_THICK) {
+ pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE;
+ }
+    else {
+      pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
+    }
+ }
+
+  /* Keep track of whether motion blur is enabled, so as to enable/disable motion in BVH builds.
+   * This is necessary since objects may be reported to have motion if the Vector pass is
+   * active, but may still need to be rendered without motion blur if motion blur itself is
+   * not enabled. */
+ motion_blur = (kernel_features & KERNEL_FEATURE_OBJECT_MOTION) != 0;
+
+ if (motion_blur) {
+ pipeline_options.usesMotionBlur = true;
+ /* Motion blur can insert motion transforms into the traversal graph.
+     * The graph is then no longer two-level, so the flags need to allow any configuration. */
+ pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY;
+ }
+
+ { /* Load and compile PTX module with OptiX kernels. */
+ string ptx_data, ptx_filename = path_get((kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
+ "lib/kernel_optix_shader_raytrace.ptx" :
+ "lib/kernel_optix.ptx");
+ if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
+ if (!getenv("OPTIX_ROOT_DIR")) {
+ set_error(
+ "Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to "
+          "the OptiX SDK to be able to compile OptiX kernels on demand).");
+ return false;
+ }
+ ptx_filename = compile_kernel(
+ kernel_features,
+ (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ? "kernel_shader_raytrace" : "kernel",
+ "optix",
+ true);
+ }
+ if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) {
+ set_error(string_printf("Failed to load OptiX kernel from '%s'", ptx_filename.c_str()));
+ return false;
+ }
+
+ const OptixResult result = optixModuleCreateFromPTX(context,
+ &module_options,
+ &pipeline_options,
+ ptx_data.data(),
+ ptx_data.size(),
+ nullptr,
+ 0,
+ &optix_module);
+ if (result != OPTIX_SUCCESS) {
+ set_error(string_printf("Failed to load OptiX kernel from '%s' (%s)",
+ ptx_filename.c_str(),
+ optixGetErrorName(result)));
+ return false;
+ }
+ }
+
+ /* Create program groups. */
+ OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
+ OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
+ OptixProgramGroupOptions group_options = {}; /* There are no options currently. */
+ group_descs[PG_RGEN_INTERSECT_CLOSEST].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.module = optix_module;
+ group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_intersect_closest";
+ group_descs[PG_RGEN_INTERSECT_SHADOW].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_INTERSECT_SHADOW].raygen.module = optix_module;
+ group_descs[PG_RGEN_INTERSECT_SHADOW].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_intersect_shadow";
+ group_descs[PG_RGEN_INTERSECT_SUBSURFACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_INTERSECT_SUBSURFACE].raygen.module = optix_module;
+ group_descs[PG_RGEN_INTERSECT_SUBSURFACE].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_intersect_subsurface";
+ group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].raygen.module = optix_module;
+ group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_intersect_volume_stack";
+ group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
+ group_descs[PG_MISS].miss.module = optix_module;
+ group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss";
+ group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+ group_descs[PG_HITD].hitgroup.moduleCH = optix_module;
+ group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
+ group_descs[PG_HITD].hitgroup.moduleAH = optix_module;
+ group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test";
+ group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+ group_descs[PG_HITS].hitgroup.moduleAH = optix_module;
+ group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit";
+
+ if (kernel_features & KERNEL_FEATURE_HAIR) {
+ if (kernel_features & KERNEL_FEATURE_HAIR_THICK) {
+ /* Built-in thick curve intersection. */
+ OptixBuiltinISOptions builtin_options = {};
+ builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
+ builtin_options.usesMotionBlur = false;
+
+ optix_assert(optixBuiltinISModuleGet(
+ context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0]));
+
+ group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0];
+ group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr;
+ group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
+ group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;
+
+ if (motion_blur) {
+ builtin_options.usesMotionBlur = true;
+
+ optix_assert(optixBuiltinISModuleGet(
+ context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1]));
+
+ group_descs[PG_HITD_MOTION] = group_descs[PG_HITD];
+ group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1];
+ group_descs[PG_HITS_MOTION] = group_descs[PG_HITS];
+ group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1];
+ }
+ }
+ else {
+ /* Custom ribbon intersection. */
+ group_descs[PG_HITD].hitgroup.moduleIS = optix_module;
+ group_descs[PG_HITS].hitgroup.moduleIS = optix_module;
+ group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
+ group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
+ }
+ }
+
+ if (kernel_features & (KERNEL_FEATURE_SUBSURFACE | KERNEL_FEATURE_NODE_RAYTRACE)) {
+ /* Add hit group for local intersections. */
+ group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+ group_descs[PG_HITL].hitgroup.moduleAH = optix_module;
+ group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit";
+ }
+
+ /* Shader raytracing replaces some functions with direct callables. */
+ if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+ group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.module = optix_module;
+ group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_shade_surface_raytrace";
+ group_descs[PG_CALL_SVM_AO].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+ group_descs[PG_CALL_SVM_AO].callables.moduleDC = optix_module;
+ group_descs[PG_CALL_SVM_AO].callables.entryFunctionNameDC = "__direct_callable__svm_node_ao";
+ group_descs[PG_CALL_SVM_BEVEL].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+ group_descs[PG_CALL_SVM_BEVEL].callables.moduleDC = optix_module;
+ group_descs[PG_CALL_SVM_BEVEL].callables.entryFunctionNameDC =
+ "__direct_callable__svm_node_bevel";
+ group_descs[PG_CALL_AO_PASS].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+ group_descs[PG_CALL_AO_PASS].callables.moduleDC = optix_module;
+ group_descs[PG_CALL_AO_PASS].callables.entryFunctionNameDC = "__direct_callable__ao_pass";
+ }
+
+ optix_assert(optixProgramGroupCreate(
+ context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));
+
+ /* Get program stack sizes. */
+ OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
+ /* Set up SBT, which in this case is used only to select between different programs. */
+ sbt_data.alloc(NUM_PROGRAM_GROUPS);
+ memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS);
+ for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+ optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
+ optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
+ }
+ sbt_data.copy_to_device(); /* Upload SBT to device. */
+
+ /* Calculate maximum trace continuation stack size. */
+ unsigned int trace_css = stack_size[PG_HITD].cssCH;
+ /* This is based on the maximum of closest-hit and any-hit/intersection programs. */
+ trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
+ trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
+ trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
+ trace_css = std::max(trace_css,
+ stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
+ trace_css = std::max(trace_css,
+ stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH);
+
+ OptixPipelineLinkOptions link_options = {};
+ link_options.maxTraceDepth = 1;
+
+ if (DebugFlags().optix.use_debug) {
+ link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
+ }
+ else {
+ link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
+ }
+
+ if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+ /* Create shader raytracing pipeline. */
+ vector<OptixProgramGroup> pipeline_groups;
+ pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
+ pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]);
+ pipeline_groups.push_back(groups[PG_MISS]);
+ pipeline_groups.push_back(groups[PG_HITD]);
+ pipeline_groups.push_back(groups[PG_HITS]);
+ pipeline_groups.push_back(groups[PG_HITL]);
+ if (motion_blur) {
+ pipeline_groups.push_back(groups[PG_HITD_MOTION]);
+ pipeline_groups.push_back(groups[PG_HITS_MOTION]);
+ }
+ pipeline_groups.push_back(groups[PG_CALL_SVM_AO]);
+ pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]);
+
+ optix_assert(optixPipelineCreate(context,
+ &pipeline_options,
+ &link_options,
+ pipeline_groups.data(),
+ pipeline_groups.size(),
+ nullptr,
+ 0,
+ &pipelines[PIP_SHADE_RAYTRACE]));
+
+ /* Combine ray generation and trace continuation stack size. */
+ const unsigned int css = stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG +
+ link_options.maxTraceDepth * trace_css;
+ const unsigned int dss = std::max(stack_size[PG_CALL_SVM_AO].dssDC,
+ stack_size[PG_CALL_SVM_BEVEL].dssDC);
+
+ /* Set stack size depending on pipeline options. */
+ optix_assert(optixPipelineSetStackSize(
+ pipelines[PIP_SHADE_RAYTRACE], 0, dss, css, motion_blur ? 3 : 2));
+ }
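+
+  /* Editor's sketch with hypothetical numbers (not measured values): if
+   * cssRG = 1024 and trace_css = 512, then with maxTraceDepth = 1 the
+   * continuation stack comes out as css = 1024 + 1 * 512 = 1536 bytes. The
+   * last argument of optixPipelineSetStackSize is the maximum traversable
+   * graph depth: 2 for the two-level IAS-over-GAS graph, or 3 once motion
+   * transforms sit between the two levels. */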
+
+ { /* Create intersection-only pipeline. */
+ vector<OptixProgramGroup> pipeline_groups;
+ pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
+ pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_CLOSEST]);
+ pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SHADOW]);
+ pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SUBSURFACE]);
+ pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_VOLUME_STACK]);
+ pipeline_groups.push_back(groups[PG_MISS]);
+ pipeline_groups.push_back(groups[PG_HITD]);
+ pipeline_groups.push_back(groups[PG_HITS]);
+ pipeline_groups.push_back(groups[PG_HITL]);
+ if (motion_blur) {
+ pipeline_groups.push_back(groups[PG_HITD_MOTION]);
+ pipeline_groups.push_back(groups[PG_HITS_MOTION]);
+ }
+
+ optix_assert(optixPipelineCreate(context,
+ &pipeline_options,
+ &link_options,
+ pipeline_groups.data(),
+ pipeline_groups.size(),
+ nullptr,
+ 0,
+ &pipelines[PIP_INTERSECT]));
+
+ /* Calculate continuation stack size based on the maximum of all ray generation stack sizes. */
+ const unsigned int css =
+ std::max(stack_size[PG_RGEN_INTERSECT_CLOSEST].cssRG,
+ std::max(stack_size[PG_RGEN_INTERSECT_SHADOW].cssRG,
+ std::max(stack_size[PG_RGEN_INTERSECT_SUBSURFACE].cssRG,
+ stack_size[PG_RGEN_INTERSECT_VOLUME_STACK].cssRG))) +
+ link_options.maxTraceDepth * trace_css;
+
+ optix_assert(
+ optixPipelineSetStackSize(pipelines[PIP_INTERSECT], 0, 0, css, motion_blur ? 3 : 2));
+ }
+
+ /* Clean up program group objects. */
+ for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+ optixProgramGroupDestroy(groups[i]);
+ }
+
+ return true;
+}
+
+/* --------------------------------------------------------------------
+ * Buffer denoising.
+ */
+
+class OptiXDevice::DenoiseContext {
+ public:
+ explicit DenoiseContext(OptiXDevice *device, const DeviceDenoiseTask &task)
+ : denoise_params(task.params),
+ render_buffers(task.render_buffers),
+ buffer_params(task.buffer_params),
+ guiding_buffer(device, "denoiser guiding passes buffer"),
+ num_samples(task.num_samples)
+ {
+ num_input_passes = 1;
+ if (denoise_params.use_pass_albedo) {
+ num_input_passes += 1;
+ use_pass_albedo = true;
+ pass_denoising_albedo = buffer_params.get_pass_offset(PASS_DENOISING_ALBEDO);
+ if (denoise_params.use_pass_normal) {
+ num_input_passes += 1;
+ use_pass_normal = true;
+ pass_denoising_normal = buffer_params.get_pass_offset(PASS_DENOISING_NORMAL);
+ }
+ }
+
+ const int num_guiding_passes = num_input_passes - 1;
+
+ if (num_guiding_passes) {
+ if (task.allow_inplace_modification) {
+ guiding_params.device_pointer = render_buffers->buffer.device_pointer;
+
+ guiding_params.pass_albedo = pass_denoising_albedo;
+ guiding_params.pass_normal = pass_denoising_normal;
+
+ guiding_params.stride = buffer_params.stride;
+ guiding_params.pass_stride = buffer_params.pass_stride;
+ }
+ else {
+ guiding_params.pass_stride = 0;
+ if (use_pass_albedo) {
+ guiding_params.pass_albedo = guiding_params.pass_stride;
+ guiding_params.pass_stride += 3;
+ }
+ if (use_pass_normal) {
+ guiding_params.pass_normal = guiding_params.pass_stride;
+ guiding_params.pass_stride += 3;
+ }
+
+ guiding_params.stride = buffer_params.width;
+
+ guiding_buffer.alloc_to_device(buffer_params.width * buffer_params.height *
+ guiding_params.pass_stride);
+ guiding_params.device_pointer = guiding_buffer.device_pointer;
+ }
+ }
+
+ pass_sample_count = buffer_params.get_pass_offset(PASS_SAMPLE_COUNT);
+ }
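+
+  /* Editor's sketch of the packed layout built above, assuming both guiding
+   * passes are enabled and in-place modification is not allowed:
+   *   pass_albedo = 0, pass_normal = 3, pass_stride = 6,
+   * so for pixel (x, y) the albedo occupies
+   *   guiding_buffer[(y * stride + x) * 6 + 0 .. 2]
+   * and the normal
+   *   guiding_buffer[(y * stride + x) * 6 + 3 .. 5]. */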
+
+ const DenoiseParams &denoise_params;
+
+ RenderBuffers *render_buffers = nullptr;
+ const BufferParams &buffer_params;
+
+ /* Device-side storage of the guiding passes. */
+ device_only_memory<float> guiding_buffer;
+
+ struct {
+ device_ptr device_pointer = 0;
+
+    /* NOTE: Only initialized when the corresponding guiding pass is enabled. */
+ int pass_albedo = PASS_UNUSED;
+ int pass_normal = PASS_UNUSED;
+
+ int stride = -1;
+ int pass_stride = -1;
+ } guiding_params;
+
+  /* Number of input passes, including the color pass and the extra auxiliary passes. */
+ int num_input_passes = 0;
+ bool use_pass_albedo = false;
+ bool use_pass_normal = false;
+
+ int num_samples = 0;
+
+ int pass_sample_count = PASS_UNUSED;
+
+  /* NOTE: Only initialized when the corresponding guiding pass is enabled. */
+ int pass_denoising_albedo = PASS_UNUSED;
+ int pass_denoising_normal = PASS_UNUSED;
+
+  /* For passes which don't need the albedo channel for denoising we replace the actual albedo
+   * with (0.5, 0.5, 0.5). This flag indicates that the real albedo pass has been replaced with
+   * the fake values, so denoising of passes which do need albedo can no longer happen. */
+ bool albedo_replaced_with_fake = false;
+};
+
+class OptiXDevice::DenoisePass {
+ public:
+ DenoisePass(const PassType type, const BufferParams &buffer_params) : type(type)
+ {
+ noisy_offset = buffer_params.get_pass_offset(type, PassMode::NOISY);
+ denoised_offset = buffer_params.get_pass_offset(type, PassMode::DENOISED);
+
+ const PassInfo pass_info = Pass::get_info(type);
+ num_components = pass_info.num_components;
+ use_compositing = pass_info.use_compositing;
+ use_denoising_albedo = pass_info.use_denoising_albedo;
+ }
+
+ PassType type;
+
+ int noisy_offset;
+ int denoised_offset;
+
+ int num_components;
+ bool use_compositing;
+ bool use_denoising_albedo;
+};
+
+bool OptiXDevice::denoise_buffer(const DeviceDenoiseTask &task)
+{
+ const CUDAContextScope scope(this);
+
+ DenoiseContext context(this, task);
+
+ if (!denoise_ensure(context)) {
+ return false;
+ }
+
+ if (!denoise_filter_guiding_preprocess(context)) {
+ LOG(ERROR) << "Error preprocessing guiding passes.";
+ return false;
+ }
+
+ /* Passes which will use real albedo when it is available. */
+ denoise_pass(context, PASS_COMBINED);
+ denoise_pass(context, PASS_SHADOW_CATCHER_MATTE);
+
+  /* Passes which do not need albedo; if the real albedo is present it has to be replaced with
+   * the fake one first. */
+ denoise_pass(context, PASS_SHADOW_CATCHER);
+
+ return true;
+}
+
+DeviceQueue *OptiXDevice::get_denoise_queue()
+{
+ return &denoiser_.queue;
+}
+
+bool OptiXDevice::denoise_filter_guiding_preprocess(DenoiseContext &context)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ void *args[] = {const_cast<device_ptr *>(&context.guiding_params.device_pointer),
+ const_cast<int *>(&context.guiding_params.pass_stride),
+ const_cast<int *>(&context.guiding_params.pass_albedo),
+ const_cast<int *>(&context.guiding_params.pass_normal),
+ &context.render_buffers->buffer.device_pointer,
+ const_cast<int *>(&buffer_params.offset),
+ const_cast<int *>(&buffer_params.stride),
+ const_cast<int *>(&buffer_params.pass_stride),
+ const_cast<int *>(&context.pass_sample_count),
+ const_cast<int *>(&context.pass_denoising_albedo),
+ const_cast<int *>(&context.pass_denoising_normal),
+ const_cast<int *>(&buffer_params.full_x),
+ const_cast<int *>(&buffer_params.full_y),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.height),
+ const_cast<int *>(&context.num_samples)};
+
+ return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS, work_size, args);
+}
+
+bool OptiXDevice::denoise_filter_guiding_set_fake_albedo(DenoiseContext &context)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ void *args[] = {const_cast<device_ptr *>(&context.guiding_params.device_pointer),
+ const_cast<int *>(&context.guiding_params.pass_stride),
+ const_cast<int *>(&context.guiding_params.pass_albedo),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.height)};
+
+ return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO, work_size, args);
+}
+
+void OptiXDevice::denoise_pass(DenoiseContext &context, PassType pass_type)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const DenoisePass pass(pass_type, buffer_params);
+
+ if (pass.noisy_offset == PASS_UNUSED) {
+ return;
+ }
+ if (pass.denoised_offset == PASS_UNUSED) {
+ LOG(DFATAL) << "Missing denoised pass " << pass_type_as_string(pass_type);
+ return;
+ }
+
+ if (pass.use_denoising_albedo) {
+ if (context.albedo_replaced_with_fake) {
+      LOG(ERROR) << "A pass which requires albedo is denoised after the fake albedo has been set.";
+ return;
+ }
+ }
+ else if (!context.albedo_replaced_with_fake) {
+ context.albedo_replaced_with_fake = true;
+ if (!denoise_filter_guiding_set_fake_albedo(context)) {
+ LOG(ERROR) << "Error replacing real albedo with the fake one.";
+ return;
+ }
+ }
+
+ /* Read and preprocess noisy color input pass. */
+ denoise_color_read(context, pass);
+ if (!denoise_filter_color_preprocess(context, pass)) {
+    LOG(ERROR) << "Error converting denoising passes to RGB buffer.";
+ return;
+ }
+
+ if (!denoise_run(context, pass)) {
+ LOG(ERROR) << "Error running OptiX denoiser.";
+ return;
+ }
+
+  /* Store the result in the denoised pass of the render buffer.
+   *
+   * This will scale the denoiser result up to match the number of, possibly per-pixel, samples. */
+ if (!denoise_filter_color_postprocess(context, pass)) {
+ LOG(ERROR) << "Error copying denoiser result to the denoised pass.";
+ return;
+ }
+
+ denoiser_.queue.synchronize();
+}
+
+void OptiXDevice::denoise_color_read(DenoiseContext &context, const DenoisePass &pass)
+{
+ PassAccessor::PassAccessInfo pass_access_info;
+ pass_access_info.type = pass.type;
+ pass_access_info.mode = PassMode::NOISY;
+ pass_access_info.offset = pass.noisy_offset;
+
+  /* The denoiser operates on the passes which are used to calculate the approximation, and is
+   * never run on the approximation itself. The latter is not even possible because OptiX does
+   * not support denoising of semi-transparent pixels. */
+ pass_access_info.use_approximate_shadow_catcher = false;
+ pass_access_info.use_approximate_shadow_catcher_background = false;
+ pass_access_info.show_active_pixels = false;
+
+ /* TODO(sergey): Consider adding support of actual exposure, to avoid clamping in extreme cases.
+ */
+ const PassAccessorGPU pass_accessor(
+ &denoiser_.queue, pass_access_info, 1.0f, context.num_samples);
+
+ PassAccessor::Destination destination(pass_access_info.type);
+ destination.d_pixels = context.render_buffers->buffer.device_pointer +
+ pass.denoised_offset * sizeof(float);
+ destination.num_components = 3;
+ destination.pixel_stride = context.buffer_params.pass_stride;
+
+ pass_accessor.get_render_tile_pixels(context.render_buffers, context.buffer_params, destination);
+}
+
+bool OptiXDevice::denoise_filter_color_preprocess(DenoiseContext &context, const DenoisePass &pass)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ void *args[] = {&context.render_buffers->buffer.device_pointer,
+ const_cast<int *>(&buffer_params.full_x),
+ const_cast<int *>(&buffer_params.full_y),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.height),
+ const_cast<int *>(&buffer_params.offset),
+ const_cast<int *>(&buffer_params.stride),
+ const_cast<int *>(&buffer_params.pass_stride),
+ const_cast<int *>(&pass.denoised_offset)};
+
+ return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_COLOR_PREPROCESS, work_size, args);
+}
+
+bool OptiXDevice::denoise_filter_color_postprocess(DenoiseContext &context,
+ const DenoisePass &pass)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ void *args[] = {&context.render_buffers->buffer.device_pointer,
+ const_cast<int *>(&buffer_params.full_x),
+ const_cast<int *>(&buffer_params.full_y),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.height),
+ const_cast<int *>(&buffer_params.offset),
+ const_cast<int *>(&buffer_params.stride),
+ const_cast<int *>(&buffer_params.pass_stride),
+ const_cast<int *>(&context.num_samples),
+ const_cast<int *>(&pass.noisy_offset),
+ const_cast<int *>(&pass.denoised_offset),
+ const_cast<int *>(&context.pass_sample_count),
+ const_cast<int *>(&pass.num_components),
+ const_cast<bool *>(&pass.use_compositing)};
+
+ return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS, work_size, args);
+}
+
+bool OptiXDevice::denoise_ensure(DenoiseContext &context)
+{
+ if (!denoise_create_if_needed(context)) {
+ LOG(ERROR) << "OptiX denoiser creation has failed.";
+ return false;
+ }
+
+ if (!denoise_configure_if_needed(context)) {
+ LOG(ERROR) << "OptiX denoiser configuration has failed.";
+ return false;
+ }
+
+ return true;
+}
+
+bool OptiXDevice::denoise_create_if_needed(DenoiseContext &context)
+{
+ const bool recreate_denoiser = (denoiser_.optix_denoiser == nullptr) ||
+ (denoiser_.use_pass_albedo != context.use_pass_albedo) ||
+ (denoiser_.use_pass_normal != context.use_pass_normal);
+ if (!recreate_denoiser) {
+ return true;
+ }
+
+  /* Destroy the existing handle before creating a new one. */
+ if (denoiser_.optix_denoiser) {
+ optixDenoiserDestroy(denoiser_.optix_denoiser);
+ }
+
+ /* Create OptiX denoiser handle on demand when it is first used. */
+ OptixDenoiserOptions denoiser_options = {};
+ denoiser_options.guideAlbedo = context.use_pass_albedo;
+ denoiser_options.guideNormal = context.use_pass_normal;
+ const OptixResult result = optixDenoiserCreate(
+ this->context, OPTIX_DENOISER_MODEL_KIND_HDR, &denoiser_options, &denoiser_.optix_denoiser);
+
+ if (result != OPTIX_SUCCESS) {
+ set_error("Failed to create OptiX denoiser");
+ return false;
+ }
+
+ /* OptiX denoiser handle was created with the requested number of input passes. */
+ denoiser_.use_pass_albedo = context.use_pass_albedo;
+ denoiser_.use_pass_normal = context.use_pass_normal;
+
+ /* OptiX denoiser has been created, but it needs configuration. */
+ denoiser_.is_configured = false;
+
+ return true;
+}
+
+bool OptiXDevice::denoise_configure_if_needed(DenoiseContext &context)
+{
+ if (denoiser_.is_configured && (denoiser_.configured_size.x == context.buffer_params.width &&
+ denoiser_.configured_size.y == context.buffer_params.height)) {
+ return true;
+ }
+
+ const BufferParams &buffer_params = context.buffer_params;
+
+ OptixDenoiserSizes sizes = {};
+ optix_assert(optixDenoiserComputeMemoryResources(
+ denoiser_.optix_denoiser, buffer_params.width, buffer_params.height, &sizes));
+
+ denoiser_.scratch_size = sizes.withOverlapScratchSizeInBytes;
+ denoiser_.scratch_offset = sizes.stateSizeInBytes;
+
+ /* Allocate denoiser state if tile size has changed since last setup. */
+ denoiser_.state.alloc_to_device(denoiser_.scratch_offset + denoiser_.scratch_size);
+
+ /* Initialize denoiser state for the current tile size. */
+ const OptixResult result = optixDenoiserSetup(denoiser_.optix_denoiser,
+ denoiser_.queue.stream(),
+ buffer_params.width,
+ buffer_params.height,
+ denoiser_.state.device_pointer,
+ denoiser_.scratch_offset,
+ denoiser_.state.device_pointer +
+ denoiser_.scratch_offset,
+ denoiser_.scratch_size);
+ if (result != OPTIX_SUCCESS) {
+ set_error("Failed to set up OptiX denoiser");
+ return false;
+ }
+
+ denoiser_.is_configured = true;
+ denoiser_.configured_size.x = buffer_params.width;
+ denoiser_.configured_size.y = buffer_params.height;
+
+ return true;
+}
+
+bool OptiXDevice::denoise_run(DenoiseContext &context, const DenoisePass &pass)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+ const int width = buffer_params.width;
+ const int height = buffer_params.height;
+
+ /* Set up input and output layer information. */
+ OptixImage2D color_layer = {0};
+ OptixImage2D albedo_layer = {0};
+ OptixImage2D normal_layer = {0};
+
+ OptixImage2D output_layer = {0};
+
+ /* Color pass. */
+ {
+ const int pass_denoised = pass.denoised_offset;
+ const int64_t pass_stride_in_bytes = context.buffer_params.pass_stride * sizeof(float);
+
+ color_layer.data = context.render_buffers->buffer.device_pointer +
+ pass_denoised * sizeof(float);
+ color_layer.width = width;
+ color_layer.height = height;
+ color_layer.rowStrideInBytes = pass_stride_in_bytes * context.buffer_params.stride;
+ color_layer.pixelStrideInBytes = pass_stride_in_bytes;
+ color_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
+ }
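+
+  /* Editor's sketch with hypothetical numbers: for pass_stride = 32 floats and
+   * stride = 1920 pixels, pixelStrideInBytes = 32 * 4 = 128 and
+   * rowStrideInBytes = 128 * 1920 = 245760, so the denoiser reads the three
+   * float channels at the denoised pass offset inside each pixel record. */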
+
+ device_vector<float> fake_albedo(this, "fake_albedo", MEM_READ_WRITE);
+
+  /* Optional albedo and normal guiding passes. */
+ if (context.num_input_passes > 1) {
+ const device_ptr d_guiding_buffer = context.guiding_params.device_pointer;
+ const int64_t pixel_stride_in_bytes = context.guiding_params.pass_stride * sizeof(float);
+ const int64_t row_stride_in_bytes = context.guiding_params.stride * pixel_stride_in_bytes;
+
+ if (context.use_pass_albedo) {
+ albedo_layer.data = d_guiding_buffer + context.guiding_params.pass_albedo * sizeof(float);
+ albedo_layer.width = width;
+ albedo_layer.height = height;
+ albedo_layer.rowStrideInBytes = row_stride_in_bytes;
+ albedo_layer.pixelStrideInBytes = pixel_stride_in_bytes;
+ albedo_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
+ }
+
+ if (context.use_pass_normal) {
+ normal_layer.data = d_guiding_buffer + context.guiding_params.pass_normal * sizeof(float);
+ normal_layer.width = width;
+ normal_layer.height = height;
+ normal_layer.rowStrideInBytes = row_stride_in_bytes;
+ normal_layer.pixelStrideInBytes = pixel_stride_in_bytes;
+ normal_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
+ }
+ }
+
+  /* Denoise in place of the noisy input in the render buffers. */
+ output_layer = color_layer;
+
+  /* Finally, run the denoising. */
+ OptixDenoiserParams params = {}; /* All parameters are disabled/zero. */
+ OptixDenoiserLayer image_layers = {};
+ image_layers.input = color_layer;
+ image_layers.output = output_layer;
+
+ OptixDenoiserGuideLayer guide_layers = {};
+ guide_layers.albedo = albedo_layer;
+ guide_layers.normal = normal_layer;
+
+ optix_assert(optixDenoiserInvoke(denoiser_.optix_denoiser,
+ denoiser_.queue.stream(),
+ &params,
+ denoiser_.state.device_pointer,
+ denoiser_.scratch_offset,
+ &guide_layers,
+ &image_layers,
+ 1,
+ 0,
+ 0,
+ denoiser_.state.device_pointer + denoiser_.scratch_offset,
+ denoiser_.scratch_size));
+
+ return true;
+}
+
+bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
+ OptixBuildOperation operation,
+ const OptixBuildInput &build_input,
+ uint16_t num_motion_steps)
+{
+ const CUDAContextScope scope(this);
+
+ const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC);
+
+ /* Compute memory usage. */
+ OptixAccelBufferSizes sizes = {};
+ OptixAccelBuildOptions options = {};
+ options.operation = operation;
+ if (use_fast_trace_bvh) {
+    VLOG(2) << "Using fast-to-trace OptiX BVH";
+ options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
+ }
+ else {
+    VLOG(2) << "Using fast-to-update OptiX BVH";
+ options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE;
+ }
+
+ options.motionOptions.numKeys = num_motion_steps;
+ options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
+ options.motionOptions.timeBegin = 0.0f;
+ options.motionOptions.timeEnd = 1.0f;
+
+ optix_assert(optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
+
+ /* Allocate required output buffers. */
+ device_only_memory<char> temp_mem(this, "optix temp as build mem");
+ temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
+ if (!temp_mem.device_pointer) {
+ /* Make sure temporary memory allocation succeeded. */
+ return false;
+ }
+
+ device_only_memory<char> &out_data = bvh->as_data;
+ if (operation == OPTIX_BUILD_OPERATION_BUILD) {
+ assert(out_data.device == this);
+ out_data.alloc_to_device(sizes.outputSizeInBytes);
+ if (!out_data.device_pointer) {
+ return false;
+ }
+ }
+ else {
+ assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes);
+ }
+
+ /* Finally build the acceleration structure. */
+ OptixAccelEmitDesc compacted_size_prop = {};
+ compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
+ /* A tiny space was allocated for this property at the end of the temporary buffer above.
+ * Make sure this pointer is 8-byte aligned. */
+ compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);
+
+ OptixTraversableHandle out_handle = 0;
+ optix_assert(optixAccelBuild(context,
+ NULL,
+ &options,
+ &build_input,
+ 1,
+ temp_mem.device_pointer,
+ sizes.tempSizeInBytes,
+ out_data.device_pointer,
+ sizes.outputSizeInBytes,
+ &out_handle,
+ use_fast_trace_bvh ? &compacted_size_prop : NULL,
+ use_fast_trace_bvh ? 1 : 0));
+ bvh->traversable_handle = static_cast<uint64_t>(out_handle);
+
+ /* Wait for all operations to finish. */
+ cuda_assert(cuStreamSynchronize(NULL));
+
+ /* Compact acceleration structure to save memory (do not do this in viewport for faster builds).
+ */
+ if (use_fast_trace_bvh) {
+ uint64_t compacted_size = sizes.outputSizeInBytes;
+ cuda_assert(cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size)));
+
+ /* Temporary memory is no longer needed, so free it now to make space. */
+ temp_mem.free();
+
+    /* There is no point in compacting if the size does not decrease. */
+ if (compacted_size < sizes.outputSizeInBytes) {
+ device_only_memory<char> compacted_data(this, "optix compacted as");
+ compacted_data.alloc_to_device(compacted_size);
+      if (!compacted_data.device_pointer) {
+        /* Do not compact if memory allocation for compacted acceleration structure fails.
+         * Can just use the uncompacted one then, so succeed here regardless. */
+        return !have_error();
+      }
+
+ optix_assert(optixAccelCompact(
+ context, NULL, out_handle, compacted_data.device_pointer, compacted_size, &out_handle));
+ bvh->traversable_handle = static_cast<uint64_t>(out_handle);
+
+ /* Wait for compaction to finish. */
+ cuda_assert(cuStreamSynchronize(NULL));
+
+ std::swap(out_data.device_size, compacted_data.device_size);
+ std::swap(out_data.device_pointer, compacted_data.device_pointer);
+ }
+ }
+
+ return !have_error();
+}
+
+void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
+{
+ const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC);
+
+ free_bvh_memory_delayed();
+
+ BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
+
+ progress.set_substatus("Building OptiX acceleration structure");
+
+ if (!bvh->params.top_level) {
+ assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1);
+
+ /* Refit is only possible in viewport for now (because AS is built with
+ * OPTIX_BUILD_FLAG_ALLOW_UPDATE only there, see above). */
+ OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD;
+ if (refit && !use_fast_trace_bvh) {
+ assert(bvh_optix->traversable_handle != 0);
+ operation = OPTIX_BUILD_OPERATION_UPDATE;
+ }
+ else {
+ bvh_optix->as_data.free();
+ bvh_optix->traversable_handle = 0;
+ }
+
+ /* Build bottom level acceleration structures (BLAS). */
+ Geometry *const geom = bvh->geometry[0];
+ if (geom->geometry_type == Geometry::HAIR) {
+ /* Build BLAS for curve primitives. */
+ Hair *const hair = static_cast<Hair *const>(geom);
+ if (hair->num_curves() == 0) {
+ return;
+ }
+
+ const size_t num_segments = hair->num_segments();
+
+ size_t num_motion_steps = 1;
+ Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+ if (motion_blur && hair->get_use_motion_blur() && motion_keys) {
+ num_motion_steps = hair->get_motion_steps();
+ }
+
+ device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY);
+ device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
+ device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
+ /* Four control points for each curve segment. */
+ const size_t num_vertices = num_segments * 4;
+ if (hair->curve_shape == CURVE_THICK) {
+ index_data.alloc(num_segments);
+ vertex_data.alloc(num_vertices * num_motion_steps);
+ }
+      else {
+        aabb_data.alloc(num_segments * num_motion_steps);
+      }
+
+ /* Get AABBs for each motion step. */
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ /* The center step for motion vertices is not stored in the attribute. */
+ const float3 *keys = hair->get_curve_keys().data();
+ size_t center_step = (num_motion_steps - 1) / 2;
+ if (step != center_step) {
+ size_t attr_offset = (step > center_step) ? step - 1 : step;
+ /* Technically this is a float4 array, but sizeof(float3) == sizeof(float4). */
+ keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size();
+ }
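+        /* Editor's example (illustrative): with num_motion_steps = 3,
+         * center_step = 1 uses the regular curve keys, while the attribute
+         * stores only steps 0 and 2, at attr_offset 0 and 1 respectively. */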
+
+ for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) {
+ const Hair::Curve curve = hair->get_curve(j);
+ const array<float> &curve_radius = hair->get_curve_radius();
+
+ for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) {
+ if (hair->curve_shape == CURVE_THICK) {
+ int k0 = curve.first_key + segment;
+ int k1 = k0 + 1;
+ int ka = max(k0 - 1, curve.first_key);
+ int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1);
+
+ const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x);
+ const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y);
+ const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z);
+ const float4 pw = make_float4(
+ curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]);
+
+              /* Convert Catmull-Rom data to B-spline. */
+ static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f;
+ static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f;
+ static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f;
+ static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f;
+
+ index_data[i] = i * 4;
+ float4 *const v = vertex_data.data() + step * num_vertices + index_data[i];
+ v[0] = make_float4(
+ dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw));
+ v[1] = make_float4(
+ dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw));
+ v[2] = make_float4(
+ dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw));
+ v[3] = make_float4(
+ dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw));
+ }
+ else {
+ BoundBox bounds = BoundBox::empty;
+ curve.bounds_grow(segment, keys, hair->get_curve_radius().data(), bounds);
+
+ const size_t index = step * num_segments + i;
+ aabb_data[index].minX = bounds.min.x;
+ aabb_data[index].minY = bounds.min.y;
+ aabb_data[index].minZ = bounds.min.z;
+ aabb_data[index].maxX = bounds.max.x;
+ aabb_data[index].maxY = bounds.max.y;
+ aabb_data[index].maxZ = bounds.max.z;
+ }
+ }
+ }
+ }
+
+ /* Upload AABB data to GPU. */
+ aabb_data.copy_to_device();
+ index_data.copy_to_device();
+ vertex_data.copy_to_device();
+
+ vector<device_ptr> aabb_ptrs;
+ aabb_ptrs.reserve(num_motion_steps);
+ vector<device_ptr> width_ptrs;
+ vector<device_ptr> vertex_ptrs;
+ width_ptrs.reserve(num_motion_steps);
+ vertex_ptrs.reserve(num_motion_steps);
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb));
+ const device_ptr base_ptr = vertex_data.device_pointer +
+ step * num_vertices * sizeof(float4);
+        width_ptrs.push_back(base_ptr + 3 * sizeof(float)); /* Offset to the w (radius) component of each float4 vertex. */
+ vertex_ptrs.push_back(base_ptr);
+ }
+
+ /* Force a single any-hit call, so shadow record-all behavior works correctly. */
+ unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
+ OptixBuildInput build_input = {};
+ if (hair->curve_shape == CURVE_THICK) {
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES;
+ build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
+ build_input.curveArray.numPrimitives = num_segments;
+ build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
+ build_input.curveArray.numVertices = num_vertices;
+ build_input.curveArray.vertexStrideInBytes = sizeof(float4);
+ build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data();
+ build_input.curveArray.widthStrideInBytes = sizeof(float4);
+ build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer;
+ build_input.curveArray.indexStrideInBytes = sizeof(int);
+ build_input.curveArray.flag = build_flags;
+ build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset;
+ }
+ else {
+ /* Disable visibility test any-hit program, since it is already checked during
+ * intersection. Those trace calls that require anyhit can force it with a ray flag. */
+ build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
+
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
+ build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
+ build_input.customPrimitiveArray.numPrimitives = num_segments;
+ build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
+ build_input.customPrimitiveArray.flags = &build_flags;
+ build_input.customPrimitiveArray.numSbtRecords = 1;
+ build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset;
+ }
+
+ if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
+ progress.set_error("Failed to build OptiX acceleration structure");
+ }
+ }
+ else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) {
+ /* Build BLAS for triangle primitives. */
+ Mesh *const mesh = static_cast<Mesh *const>(geom);
+ if (mesh->num_triangles() == 0) {
+ return;
+ }
+
+ const size_t num_verts = mesh->get_verts().size();
+
+ size_t num_motion_steps = 1;
+ Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+ if (motion_blur && mesh->get_use_motion_blur() && motion_keys) {
+ num_motion_steps = mesh->get_motion_steps();
+ }
+
+ device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
+ index_data.alloc(mesh->get_triangles().size());
+ memcpy(index_data.data(),
+ mesh->get_triangles().data(),
+ mesh->get_triangles().size() * sizeof(int));
+ device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
+ vertex_data.alloc(num_verts * num_motion_steps);
+
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ const float3 *verts = mesh->get_verts().data();
+
+ size_t center_step = (num_motion_steps - 1) / 2;
+ /* The center step for motion vertices is not stored in the attribute. */
+ if (step != center_step) {
+ verts = motion_keys->data_float3() + (step > center_step ? step - 1 : step) * num_verts;
+ }
+
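+        /* Cycles float3 is padded to 16 bytes (sizeof(float3) == sizeof(float4)),
+         * so this copy fills exactly one float4 slot per vertex. */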
+ memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3));
+ }
+
+ /* Upload triangle data to GPU. */
+ index_data.copy_to_device();
+ vertex_data.copy_to_device();
+
+ vector<device_ptr> vertex_ptrs;
+ vertex_ptrs.reserve(num_motion_steps);
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3));
+ }
+
+ /* Force a single any-hit call, so shadow record-all behavior works correctly. */
+ unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
+ OptixBuildInput build_input = {};
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
+ build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
+ build_input.triangleArray.numVertices = num_verts;
+ build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
+ build_input.triangleArray.vertexStrideInBytes = sizeof(float4);
+ build_input.triangleArray.indexBuffer = index_data.device_pointer;
+ build_input.triangleArray.numIndexTriplets = mesh->num_triangles();
+ build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3;
+ build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int);
+ build_input.triangleArray.flags = &build_flags;
+ /* The SBT does not store per primitive data since Cycles already allocates separate
+ * buffers for that purpose. OptiX does not allow this to be zero though, so just pass in
+ * one and rely on that having the same meaning in this case. */
+ build_input.triangleArray.numSbtRecords = 1;
+ build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset;
+
+ if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
+ progress.set_error("Failed to build OptiX acceleration structure");
+ }
+ }
+ }
+ else {
+ unsigned int num_instances = 0;
+ unsigned int max_num_instances = 0xFFFFFFFF;
+
+ bvh_optix->as_data.free();
+ bvh_optix->traversable_handle = 0;
+ bvh_optix->motion_transform_data.free();
+
+ optixDeviceContextGetProperty(context,
+ OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID,
+ &max_num_instances,
+ sizeof(max_num_instances));
+ /* Do not count first bit, which is used to distinguish instanced and non-instanced objects. */
+ max_num_instances >>= 1;
+ if (bvh->objects.size() > max_num_instances) {
+ progress.set_error(
+ "Failed to build OptiX acceleration structure because there are too many instances");
+ return;
+ }
+
+ /* Fill instance descriptions. */
+ device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY);
+ instances.alloc(bvh->objects.size());
+
+ /* Calculate total motion transform size and allocate memory for them. */
+ size_t motion_transform_offset = 0;
+ if (motion_blur) {
+ size_t total_motion_transform_size = 0;
+ for (Object *const ob : bvh->objects) {
+ if (ob->is_traceable() && ob->use_motion()) {
+ total_motion_transform_size = align_up(total_motion_transform_size,
+ OPTIX_TRANSFORM_BYTE_ALIGNMENT);
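+          /* OptixSRTMotionTransform already embeds two motion keys, so only
+           * keys beyond the first two need additional OptixSRTData entries. */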
+ const size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
+ total_motion_transform_size = total_motion_transform_size +
+ sizeof(OptixSRTMotionTransform) +
+ motion_keys * sizeof(OptixSRTData);
+ }
+ }
+
+ assert(bvh_optix->motion_transform_data.device == this);
+ bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size);
+ }
+
+ for (Object *ob : bvh->objects) {
+ /* Skip non-traceable objects. */
+ if (!ob->is_traceable()) {
+ continue;
+ }
+
+ BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh);
+ OptixTraversableHandle handle = blas->traversable_handle;
+
+ OptixInstance &instance = instances[num_instances++];
+ memset(&instance, 0, sizeof(instance));
+
+ /* Clear transform to identity matrix. */
+ instance.transform[0] = 1.0f;
+ instance.transform[5] = 1.0f;
+ instance.transform[10] = 1.0f;
+
+ /* Set user instance ID to object index (but leave low bit blank). */
+ instance.instanceId = ob->get_device_index() << 1;
+
+ /* Have to have at least one bit in the mask, or else instance would always be culled. */
+ instance.visibilityMask = 1;
+
+ if (ob->get_geometry()->has_volume) {
+ /* Volumes have a special bit set in the visibility mask so a trace can mask only volumes.
+ */
+ instance.visibilityMask |= 2;
+ }
+
+ if (ob->get_geometry()->geometry_type == Geometry::HAIR) {
+ /* Same applies to curves (so they can be skipped in local trace calls). */
+ instance.visibilityMask |= 4;
+
+ if (motion_blur && ob->get_geometry()->has_motion_blur() &&
+ static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
+ /* Select between motion blur and non-motion blur built-in intersection module. */
+ instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
+ }
+ }
+
+ /* Insert motion traversable if object has motion. */
+ if (motion_blur && ob->use_motion()) {
+ size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
+ size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
+ motion_keys * sizeof(OptixSRTData);
+
+ const CUDAContextScope scope(this);
+
+ motion_transform_offset = align_up(motion_transform_offset,
+ OPTIX_TRANSFORM_BYTE_ALIGNMENT);
+ CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer +
+ motion_transform_offset;
+ motion_transform_offset += motion_transform_size;
+
+ /* Allocate host side memory for motion transform and fill it with transform data. */
+ OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
+ new uint8_t[motion_transform_size]);
+ motion_transform.child = handle;
+ motion_transform.motionOptions.numKeys = ob->get_motion().size();
+ motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
+ motion_transform.motionOptions.timeBegin = 0.0f;
+ motion_transform.motionOptions.timeEnd = 1.0f;
+
+ OptixSRTData *const srt_data = motion_transform.srtData;
+ array<DecomposedTransform> decomp(ob->get_motion().size());
+ transform_motion_decompose(
+ decomp.data(), ob->get_motion().data(), ob->get_motion().size());
+
+ for (size_t i = 0; i < ob->get_motion().size(); ++i) {
+ /* Scale. */
+ srt_data[i].sx = decomp[i].y.w; /* scale.x.x */
+ srt_data[i].sy = decomp[i].z.w; /* scale.y.y */
+ srt_data[i].sz = decomp[i].w.w; /* scale.z.z */
+
+ /* Shear. */
+ srt_data[i].a = decomp[i].z.x; /* scale.x.y */
+ srt_data[i].b = decomp[i].z.y; /* scale.x.z */
+ srt_data[i].c = decomp[i].w.x; /* scale.y.z */
+ assert(decomp[i].z.z == 0.0f); /* scale.y.x */
+ assert(decomp[i].w.y == 0.0f); /* scale.z.x */
+ assert(decomp[i].w.z == 0.0f); /* scale.z.y */
+
+ /* Pivot point. */
+ srt_data[i].pvx = 0.0f;
+ srt_data[i].pvy = 0.0f;
+ srt_data[i].pvz = 0.0f;
+
+ /* Rotation. */
+ srt_data[i].qx = decomp[i].x.x;
+ srt_data[i].qy = decomp[i].x.y;
+ srt_data[i].qz = decomp[i].x.z;
+ srt_data[i].qw = decomp[i].x.w;
+
+ /* Translation. */
+ srt_data[i].tx = decomp[i].y.x;
+ srt_data[i].ty = decomp[i].y.y;
+ srt_data[i].tz = decomp[i].y.z;
+ }
+
+ /* Upload motion transform to GPU. */
+ cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
+ delete[] reinterpret_cast<uint8_t *>(&motion_transform);
+
+ /* Disable instance transform if object uses motion transform already. */
+ instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+
+ /* Get traversable handle to motion transform. */
+ optixConvertPointerToTraversableHandle(context,
+ motion_transform_gpu,
+ OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
+ &instance.traversableHandle);
+ }
+ else {
+ instance.traversableHandle = handle;
+
+ if (ob->get_geometry()->is_instanced()) {
+ /* Set transform matrix. */
+ memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform));
+ }
+ else {
+ /* Disable instance transform if geometry already has it applied to vertex data. */
+ instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+ /* Non-instanced objects read ID from 'prim_object', so distinguish
+ * them from instanced objects with the low bit set. */
+ instance.instanceId |= 1;
+ }
+ }
+ }
+
+ /* Upload instance descriptions. */
+ instances.resize(num_instances);
+ instances.copy_to_device();
+
+ /* Build top-level acceleration structure (TLAS) */
+ OptixBuildInput build_input = {};
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
+ build_input.instanceArray.instances = instances.device_pointer;
+ build_input.instanceArray.numInstances = num_instances;
+
+ if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) {
+ progress.set_error("Failed to build OptiX acceleration structure");
+ }
+ tlas_handle = bvh_optix->traversable_handle;
+ }
+}
+
+void OptiXDevice::release_optix_bvh(BVH *bvh)
+{
+ thread_scoped_lock lock(delayed_free_bvh_mutex);
+ /* Do delayed free of BVH memory, since geometry holding BVH might be deleted
+ * while GPU is still rendering. */
+ BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
+
+ delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->as_data));
+ delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->motion_transform_data));
+ bvh_optix->traversable_handle = 0;
+}
+
+void OptiXDevice::free_bvh_memory_delayed()
+{
+ thread_scoped_lock lock(delayed_free_bvh_mutex);
+ delayed_free_bvh_memory.free_memory();
+}
+
+void OptiXDevice::const_copy_to(const char *name, void *host, size_t size)
+{
+ /* Set constant memory for CUDA module. */
+ CUDADevice::const_copy_to(name, host, size);
+
+ if (strcmp(name, "__data") == 0) {
+ assert(size <= sizeof(KernelData));
+
+    /* Update traversable handle (it differs for each device in a multi-device setup). */
+ KernelData *const data = (KernelData *)host;
+ *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
+
+ update_launch_params(offsetof(KernelParamsOptiX, data), host, size);
+ return;
+ }
+
+ /* Update data storage pointers in launch parameters. */
+# define KERNEL_TEX(data_type, tex_name) \
+ if (strcmp(name, #tex_name) == 0) { \
+ update_launch_params(offsetof(KernelParamsOptiX, tex_name), host, size); \
+ return; \
+ }
+ KERNEL_TEX(IntegratorStateGPU, __integrator_state)
+# include "kernel/kernel_textures.h"
+# undef KERNEL_TEX
+}
+
+void OptiXDevice::update_launch_params(size_t offset, void *data, size_t data_size)
+{
+ const CUDAContextScope scope(this);
+
+ cuda_assert(cuMemcpyHtoD(launch_params.device_pointer + offset, data, data_size));
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_OPTIX */
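The cr2bsp* rows used in build_bvh above form the change-of-basis matrix M_bspline^-1 * M_catmullrom, mapping the four Catmull-Rom points of a segment to B-spline control points of the same cubic (matching OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE). A minimal standalone sketch to verify the conversion numerically; plain C++ with local stand-ins for float4/dot, not part of the patch:

#include <cassert>
#include <cmath>
#include <cstdio>

struct V4 { float x, y, z, w; };
static float dot4(const V4 &a, const V4 &b) { return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w; }

/* Catmull-Rom evaluation of one coordinate from points p0..p3 at t in [0, 1]. */
static float eval_catmull_rom(const float p[4], float t) {
  return 0.5f * ((2.0f * p[1]) + (-p[0] + p[2]) * t +
                 (2.0f * p[0] - 5.0f * p[1] + 4.0f * p[2] - p[3]) * t * t +
                 (-p[0] + 3.0f * p[1] - 3.0f * p[2] + p[3]) * t * t * t);
}

/* Uniform cubic B-spline evaluation from control points q0..q3 at t in [0, 1]. */
static float eval_bspline(const float q[4], float t) {
  const float s = 1.0f - t;
  const float b0 = s * s * s / 6.0f;
  const float b1 = (3.0f * t * t * t - 6.0f * t * t + 4.0f) / 6.0f;
  const float b2 = (-3.0f * t * t * t + 3.0f * t * t + 3.0f * t + 1.0f) / 6.0f;
  const float b3 = t * t * t / 6.0f;
  return b0 * q[0] + b1 * q[1] + b2 * q[2] + b3 * q[3];
}

int main() {
  const V4 cr2bsp[4] = {{+7 / 6.f, -4 / 6.f, +5 / 6.f, -2 / 6.f},
                        {-2 / 6.f, 11 / 6.f, -4 / 6.f, +1 / 6.f},
                        {+1 / 6.f, -4 / 6.f, 11 / 6.f, -2 / 6.f},
                        {-2 / 6.f, +5 / 6.f, -4 / 6.f, +7 / 6.f}};
  const float p[4] = {0.3f, 1.7f, -0.4f, 2.1f}; /* Arbitrary Catmull-Rom points. */
  const V4 pv = {p[0], p[1], p[2], p[3]};
  float q[4];
  for (int i = 0; i < 4; i++)
    q[i] = dot4(cr2bsp[i], pv); /* Same per-row dot products as in build_bvh. */
  for (float t = 0.0f; t <= 1.0f; t += 0.125f)
    assert(std::fabs(eval_catmull_rom(p, t) - eval_bspline(q, t)) < 1e-4f);
  printf("Catmull-Rom and converted B-spline segments match.\n");
}

The conversion matches value and first derivative at t = 0 and t = 1, which is exactly what pins down a cubic, so agreement at the sampled points implies agreement everywhere on the segment.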
diff --git a/intern/cycles/device/optix/device_impl.h b/intern/cycles/device/optix/device_impl.h
new file mode 100644
index 00000000000..742ae0f1bab
--- /dev/null
+++ b/intern/cycles/device/optix/device_impl.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright 2019, NVIDIA Corporation.
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_OPTIX
+
+# include "device/cuda/device_impl.h"
+# include "device/optix/queue.h"
+# include "device/optix/util.h"
+# include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BVHOptiX;
+struct KernelParamsOptiX;
+
+/* List of OptiX program groups. */
+enum {
+ PG_RGEN_INTERSECT_CLOSEST,
+ PG_RGEN_INTERSECT_SHADOW,
+ PG_RGEN_INTERSECT_SUBSURFACE,
+ PG_RGEN_INTERSECT_VOLUME_STACK,
+ PG_RGEN_SHADE_SURFACE_RAYTRACE,
+ PG_MISS,
+ PG_HITD, /* Default hit group. */
+ PG_HITS, /* __SHADOW_RECORD_ALL__ hit group. */
+ PG_HITL, /* __BVH_LOCAL__ hit group (only used for triangles). */
+ PG_HITD_MOTION,
+ PG_HITS_MOTION,
+ PG_CALL_SVM_AO,
+ PG_CALL_SVM_BEVEL,
+ PG_CALL_AO_PASS,
+ NUM_PROGRAM_GROUPS
+};
+
+static const int MISS_PROGRAM_GROUP_OFFSET = PG_MISS;
+static const int NUM_MISS_PROGRAM_GROUPS = 1;
+static const int HIT_PROGRAM_GROUP_OFFSET = PG_HITD;
+static const int NUM_HIT_PROGRAM_GROUPS = 5;
+static const int CALLABLE_PROGRAM_GROUPS_BASE = PG_CALL_SVM_AO;
+static const int NUM_CALLABLE_PROGRAM_GROUPS = 3;
+
+/* List of OptiX pipelines. */
+enum { PIP_SHADE_RAYTRACE, PIP_INTERSECT, NUM_PIPELINES };
+
+/* A single shader binding table entry. */
+struct SbtRecord {
+ char header[OPTIX_SBT_RECORD_HEADER_SIZE];
+};
+
+class OptiXDevice : public CUDADevice {
+ public:
+ OptixDeviceContext context = NULL;
+
+ OptixModule optix_module = NULL; /* All necessary OptiX kernels are in one module. */
+ OptixModule builtin_modules[2] = {};
+ OptixPipeline pipelines[NUM_PIPELINES] = {};
+
+ bool motion_blur = false;
+ device_vector<SbtRecord> sbt_data;
+ device_only_memory<KernelParamsOptiX> launch_params;
+ OptixTraversableHandle tlas_handle = 0;
+
+ vector<device_only_memory<char>> delayed_free_bvh_memory;
+ thread_mutex delayed_free_bvh_mutex;
+
+ class Denoiser {
+ public:
+ explicit Denoiser(OptiXDevice *device);
+ ~Denoiser();
+
+ OptiXDevice *device;
+ OptiXDeviceQueue queue;
+
+ OptixDenoiser optix_denoiser = nullptr;
+
+    /* Configuration size, as provided to `optixDenoiserSetup`.
+     * If `optixDenoiserSetup()` was never called on the current `optix_denoiser`,
+     * `is_configured` is false. */
+ bool is_configured = false;
+ int2 configured_size = make_int2(0, 0);
+
+ /* OptiX denoiser state and scratch buffers, stored in a single memory buffer.
+     * The memory layout is: [denoiser state][scratch buffer]. */
+ device_only_memory<unsigned char> state;
+ size_t scratch_offset = 0;
+ size_t scratch_size = 0;
+
+ bool use_pass_albedo = false;
+ bool use_pass_normal = false;
+ };
+ Denoiser denoiser_;
+
+ public:
+ OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+ ~OptiXDevice();
+
+ private:
+ BVHLayoutMask get_bvh_layout_mask() const override;
+
+ string compile_kernel_get_common_cflags(const uint kernel_features) override;
+
+ bool load_kernels(const uint kernel_features) override;
+
+ bool build_optix_bvh(BVHOptiX *bvh,
+ OptixBuildOperation operation,
+ const OptixBuildInput &build_input,
+ uint16_t num_motion_steps);
+
+ void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
+
+ void release_optix_bvh(BVH *bvh) override;
+ void free_bvh_memory_delayed();
+
+ void const_copy_to(const char *name, void *host, size_t size) override;
+
+ void update_launch_params(size_t offset, void *data, size_t data_size);
+
+ virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
+
+ /* --------------------------------------------------------------------
+ * Denoising.
+ */
+
+ class DenoiseContext;
+ class DenoisePass;
+
+ virtual bool denoise_buffer(const DeviceDenoiseTask &task) override;
+ virtual DeviceQueue *get_denoise_queue() override;
+
+  /* Read guiding passes from the render buffers, preprocess them in the way expected by
+   * OptiX, and store them in the guiding passes memory within the given context.
+   *
+   * Pre-processing of the guiding passes happens only once per context lifetime. Do not
+   * preprocess them for every pass which is being denoised. */
+ bool denoise_filter_guiding_preprocess(DenoiseContext &context);
+
+ /* Set fake albedo pixels in the albedo guiding pass storage.
+ * After this point only passes which do not need albedo for denoising can be processed. */
+ bool denoise_filter_guiding_set_fake_albedo(DenoiseContext &context);
+
+ void denoise_pass(DenoiseContext &context, PassType pass_type);
+
+ /* Read input color pass from the render buffer into the memory which corresponds to the noisy
+ * input within the given context. Pixels are scaled to the number of samples, but are not
+ * preprocessed yet. */
+ void denoise_color_read(DenoiseContext &context, const DenoisePass &pass);
+
+ /* Run corresponding filter kernels, preparing data for the denoiser or copying data from the
+ * denoiser result to the render buffer. */
+ bool denoise_filter_color_preprocess(DenoiseContext &context, const DenoisePass &pass);
+ bool denoise_filter_color_postprocess(DenoiseContext &context, const DenoisePass &pass);
+
+ /* Make sure the OptiX denoiser is created and configured. */
+ bool denoise_ensure(DenoiseContext &context);
+
+  /* Create the OptiX denoiser descriptor if needed.
+   * Does nothing if the current OptiX descriptor is usable for the given parameters.
+   * If the denoiser descriptor was re-allocated here, it is left unconfigured. */
+ bool denoise_create_if_needed(DenoiseContext &context);
+
+  /* Configure the existing OptiX denoiser descriptor for use with the given task. */
+ bool denoise_configure_if_needed(DenoiseContext &context);
+
+ /* Run configured denoiser. */
+ bool denoise_run(DenoiseContext &context, const DenoisePass &pass);
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_OPTIX */
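The MISS/HIT/CALLABLE offset and count constants above hard-code assumptions about the PG_* enum layout (contiguous miss, hit and callable ranges) that the SBT setup in queue.cpp relies on. A compile-time sketch of those assumptions, written against a mirrored copy of the enum; the static_asserts are illustrative and not part of the patch:

/* Mirror of the program group enum from device_impl.h; in the real tree the
 * checks could live next to the original definition instead. */
enum {
  PG_RGEN_INTERSECT_CLOSEST,
  PG_RGEN_INTERSECT_SHADOW,
  PG_RGEN_INTERSECT_SUBSURFACE,
  PG_RGEN_INTERSECT_VOLUME_STACK,
  PG_RGEN_SHADE_SURFACE_RAYTRACE,
  PG_MISS,
  PG_HITD,
  PG_HITS,
  PG_HITL,
  PG_HITD_MOTION,
  PG_HITS_MOTION,
  PG_CALL_SVM_AO,
  PG_CALL_SVM_BEVEL,
  PG_CALL_AO_PASS,
  NUM_PROGRAM_GROUPS
};

/* The hit-group range must be the five consecutive PG_HIT* entries. */
static_assert(PG_HITS_MOTION - PG_HITD + 1 == 5, "hit groups must be contiguous");
/* The callable range must be the three consecutive PG_CALL_* entries. */
static_assert(PG_CALL_AO_PASS - PG_CALL_SVM_AO + 1 == 3, "callables must be contiguous");
/* Miss, hit and callable ranges together must cover the tail of the enum. */
static_assert(PG_MISS + 1 == PG_HITD && PG_CALL_AO_PASS + 1 == NUM_PROGRAM_GROUPS,
              "unexpected program group layout");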
diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp
new file mode 100644
index 00000000000..458ed70baa8
--- /dev/null
+++ b/intern/cycles/device/optix/queue.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_OPTIX
+
+# include "device/optix/queue.h"
+# include "device/optix/device_impl.h"
+
+# include "util/util_time.h"
+
+# undef __KERNEL_CPU__
+# define __KERNEL_OPTIX__
+# include "kernel/device/optix/globals.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* OptiXDeviceQueue */
+
+OptiXDeviceQueue::OptiXDeviceQueue(OptiXDevice *device) : CUDADeviceQueue(device)
+{
+}
+
+void OptiXDeviceQueue::init_execution()
+{
+ CUDADeviceQueue::init_execution();
+}
+
+static bool is_optix_specific_kernel(DeviceKernel kernel)
+{
+ return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
+}
+
+bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
+{
+ if (!is_optix_specific_kernel(kernel)) {
+ return CUDADeviceQueue::enqueue(kernel, work_size, args);
+ }
+
+ if (cuda_device_->have_error()) {
+ return false;
+ }
+
+ debug_enqueue(kernel, work_size);
+
+ const CUDAContextScope scope(cuda_device_);
+
+ OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_);
+
+ const device_ptr sbt_data_ptr = optix_device->sbt_data.device_pointer;
+ const device_ptr launch_params_ptr = optix_device->launch_params.device_pointer;
+
+ cuda_device_assert(
+ cuda_device_,
+ cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, path_index_array),
+ args[0], // &d_path_index
+ sizeof(device_ptr),
+ cuda_stream_));
+
+ if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+ cuda_device_assert(
+ cuda_device_,
+ cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer),
+ args[1], // &d_render_buffer
+ sizeof(device_ptr),
+ cuda_stream_));
+ }
+
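+  /* Wait for the copies above to finish; they read from caller-owned host
+   * memory (args), which is not guaranteed to stay valid after this call. */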
+ cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
+
+ OptixPipeline pipeline = nullptr;
+ OptixShaderBindingTable sbt_params = {};
+
+ switch (kernel) {
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+ pipeline = optix_device->pipelines[PIP_SHADE_RAYTRACE];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_RAYTRACE * sizeof(SbtRecord);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
+ pipeline = optix_device->pipelines[PIP_INTERSECT];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_CLOSEST * sizeof(SbtRecord);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+ pipeline = optix_device->pipelines[PIP_INTERSECT];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_SHADOW * sizeof(SbtRecord);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
+ pipeline = optix_device->pipelines[PIP_INTERSECT];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_SUBSURFACE * sizeof(SbtRecord);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
+ pipeline = optix_device->pipelines[PIP_INTERSECT];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_VOLUME_STACK * sizeof(SbtRecord);
+ break;
+
+ default:
+      LOG(ERROR) << "Attempted to enqueue invalid kernel "
+                 << device_kernel_as_string(kernel) << ".";
+ return false;
+ }
+
+ sbt_params.missRecordBase = sbt_data_ptr + MISS_PROGRAM_GROUP_OFFSET * sizeof(SbtRecord);
+ sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
+  sbt_params.missRecordCount = NUM_MISS_PROGRAM_GROUPS;
+  sbt_params.hitgroupRecordBase = sbt_data_ptr + HIT_PROGRAM_GROUP_OFFSET * sizeof(SbtRecord);
+ sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
+ sbt_params.hitgroupRecordCount = NUM_HIT_PROGRAM_GROUPS;
+ sbt_params.callablesRecordBase = sbt_data_ptr + CALLABLE_PROGRAM_GROUPS_BASE * sizeof(SbtRecord);
+ sbt_params.callablesRecordCount = NUM_CALLABLE_PROGRAM_GROUPS;
+ sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
+
+ /* Launch the ray generation program. */
+ optix_device_assert(optix_device,
+ optixLaunch(pipeline,
+ cuda_stream_,
+ launch_params_ptr,
+ optix_device->launch_params.data_elements,
+ &sbt_params,
+ work_size,
+ 1,
+ 1));
+
+ return !(optix_device->have_error());
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_OPTIX */
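enqueue() above patches individual fields of the device-side KernelParamsOptiX struct by copying at offsetof(...) into the launch_params allocation. A host-only sketch of that pattern, where KernelParamsMock and the byte buffer are made-up stand-ins for the real struct and device memory:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>

/* Hypothetical stand-in for KernelParamsOptiX; only the two fields patched in
 * enqueue() are mirrored here. */
struct KernelParamsMock {
  uint64_t path_index_array;
  uint64_t render_buffer;
};

/* memcpy into a byte buffer stands in for cuMemcpyHtoDAsync(launch_params +
 * offset, data, size, stream). */
static void update_launch_params_mock(uint8_t *params, size_t offset, const void *data, size_t size)
{
  memcpy(params + offset, data, size);
}

int main()
{
  uint8_t device_params[sizeof(KernelParamsMock)] = {}; /* "Device" copy of the struct. */
  uint64_t d_path_index = 0xDEADBEEFu;                  /* "Device pointer" to patch in. */
  update_launch_params_mock(device_params,
                            offsetof(KernelParamsMock, path_index_array),
                            &d_path_index,
                            sizeof(d_path_index));
  KernelParamsMock check;
  memcpy(&check, device_params, sizeof(check));
  printf("path_index_array = 0x%llx\n", (unsigned long long)check.path_index_array);
}

Patching only the changed field keeps the rest of the (large) parameter struct untouched on the device, which is why both the OptiX and CUDA paths update launch parameters by offset rather than re-uploading the whole struct.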
diff --git a/intern/cycles/device/optix/queue.h b/intern/cycles/device/optix/queue.h
new file mode 100644
index 00000000000..0de422ccc71
--- /dev/null
+++ b/intern/cycles/device/optix/queue.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_OPTIX
+
+# include "device/cuda/queue.h"
+
+CCL_NAMESPACE_BEGIN
+
+class OptiXDevice;
+
+/* OptiX-specific queue: extends the CUDA queue with OptiX pipeline launches. */
+class OptiXDeviceQueue : public CUDADeviceQueue {
+ public:
+ OptiXDeviceQueue(OptiXDevice *device);
+
+ virtual void init_execution() override;
+
+ virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_OPTIX */
diff --git a/intern/cycles/device/optix/util.h b/intern/cycles/device/optix/util.h
new file mode 100644
index 00000000000..34ae5bb5609
--- /dev/null
+++ b/intern/cycles/device/optix/util.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_OPTIX
+
+# include "device/cuda/util.h"
+
+# ifdef WITH_CUDA_DYNLOAD
+# include <cuew.h>
+/* Do not use CUDA SDK headers when using CUEW. */
+# define OPTIX_DONT_INCLUDE_CUDA
+# endif
+
+# include <optix_stubs.h>
+
+/* Utility for checking return values of OptiX function calls. */
+# define optix_device_assert(optix_device, stmt) \
+ { \
+ OptixResult result = stmt; \
+ if (result != OPTIX_SUCCESS) { \
+ const char *name = optixGetErrorName(result); \
+ optix_device->set_error( \
+ string_printf("%s in %s (%s:%d)", name, #stmt, __FILE__, __LINE__)); \
+ } \
+ } \
+ (void)0
+
+# define optix_assert(stmt) optix_device_assert(this, stmt)
+
+#endif /* WITH_OPTIX */
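optix_device_assert follows the usual check-and-report macro pattern; the trailing (void)0 forces a semicolon at the call site so the macro behaves like a single statement. A self-contained sketch of the same pattern against a fake API (every name here is illustrative, not a real OptiX call):

#include <cstdio>

enum FakeResult { FAKE_SUCCESS, FAKE_ERROR };

static const char *fake_error_name(FakeResult r)
{
  return r == FAKE_SUCCESS ? "SUCCESS" : "ERROR";
}

/* Same shape as optix_device_assert: evaluate once, report with the
 * stringified statement plus file and line on failure. */
#define fake_assert(stmt) \
  { \
    FakeResult result_ = stmt; \
    if (result_ != FAKE_SUCCESS) { \
      fprintf(stderr, "%s in %s (%s:%d)\n", fake_error_name(result_), #stmt, __FILE__, __LINE__); \
    } \
  } \
  (void)0

static FakeResult fake_call_that_fails()
{
  return FAKE_ERROR;
}

int main()
{
  fake_assert(fake_call_that_fails()); /* The trailing (void)0 makes this ';' mandatory. */
}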