diff options
Diffstat (limited to 'intern/cycles/device')
65 files changed, 6970 insertions, 15812 deletions
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index 928249931a3..d18f4360aef 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -36,49 +36,70 @@ endif() set(SRC device.cpp - device_cpu.cpp - device_cuda.cpp - device_denoising.cpp - device_dummy.cpp + device_denoise.cpp + device_graphics_interop.cpp + device_kernel.cpp device_memory.cpp - device_multi.cpp - device_opencl.cpp - device_optix.cpp - device_split_kernel.cpp - device_task.cpp + device_queue.cpp +) + +set(SRC_CPU + cpu/device.cpp + cpu/device.h + cpu/device_impl.cpp + cpu/device_impl.h + cpu/kernel.cpp + cpu/kernel.h + cpu/kernel_function.h + cpu/kernel_thread_globals.cpp + cpu/kernel_thread_globals.h ) set(SRC_CUDA - cuda/device_cuda.h - cuda/device_cuda_impl.cpp + cuda/device.cpp + cuda/device.h + cuda/device_impl.cpp + cuda/device_impl.h + cuda/graphics_interop.cpp + cuda/graphics_interop.h + cuda/kernel.cpp + cuda/kernel.h + cuda/queue.cpp + cuda/queue.h + cuda/util.cpp + cuda/util.h ) -set(SRC_OPENCL - opencl/device_opencl.h - opencl/device_opencl_impl.cpp - opencl/memory_manager.h - opencl/memory_manager.cpp - opencl/opencl_util.cpp +set(SRC_DUMMY + dummy/device.cpp + dummy/device.h ) -if(WITH_CYCLES_NETWORK) - list(APPEND SRC - device_network.cpp - ) -endif() +set(SRC_MULTI + multi/device.cpp + multi/device.h +) + +set(SRC_OPTIX + optix/device.cpp + optix/device.h + optix/device_impl.cpp + optix/device_impl.h + optix/queue.cpp + optix/queue.h + optix/util.h +) set(SRC_HEADERS device.h - device_denoising.h + device_denoise.h + device_graphics_interop.h device_memory.h - device_intern.h - device_network.h - device_split_kernel.h - device_task.h + device_kernel.h + device_queue.h ) set(LIB - cycles_render cycles_kernel cycles_util ${CYCLES_GL_LIBRARIES} @@ -95,15 +116,7 @@ else() endif() add_definitions(${GL_DEFINITIONS}) -if(WITH_CYCLES_NETWORK) - add_definitions(-DWITH_NETWORK) -endif() -if(WITH_CYCLES_DEVICE_OPENCL) 
- list(APPEND LIB - extern_clew - ) - add_definitions(-DWITH_OPENCL) -endif() + if(WITH_CYCLES_DEVICE_CUDA) add_definitions(-DWITH_CUDA) endif() @@ -115,18 +128,27 @@ if(WITH_CYCLES_DEVICE_MULTI) endif() if(WITH_OPENIMAGEDENOISE) - add_definitions(-DWITH_OPENIMAGEDENOISE) - add_definitions(-DOIDN_STATIC_LIB) - list(APPEND INC_SYS - ${OPENIMAGEDENOISE_INCLUDE_DIRS} - ) list(APPEND LIB ${OPENIMAGEDENOISE_LIBRARIES} - ${TBB_LIBRARIES} ) endif() include_directories(${INC}) include_directories(SYSTEM ${INC_SYS}) -cycles_add_library(cycles_device "${LIB}" ${SRC} ${SRC_CUDA} ${SRC_OPENCL} ${SRC_HEADERS}) +cycles_add_library(cycles_device "${LIB}" + ${SRC} + ${SRC_CPU} + ${SRC_CUDA} + ${SRC_DUMMY} + ${SRC_MULTI} + ${SRC_OPTIX} + ${SRC_HEADERS} +) + +source_group("cpu" FILES ${SRC_CPU}) +source_group("cuda" FILES ${SRC_CUDA}) +source_group("dummy" FILES ${SRC_DUMMY}) +source_group("multi" FILES ${SRC_MULTI}) +source_group("optix" FILES ${SRC_OPTIX}) +source_group("common" FILES ${SRC} ${SRC_HEADERS}) diff --git a/intern/cycles/device/cpu/device.cpp b/intern/cycles/device/cpu/device.cpp new file mode 100644 index 00000000000..68ca8e8bb22 --- /dev/null +++ b/intern/cycles/device/cpu/device.cpp @@ -0,0 +1,64 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/cpu/device.h" +#include "device/cpu/device_impl.h" + +/* Used for `info.denoisers`. 
*/ +/* TODO(sergey): The denoisers are probably to be moved completely out of the device into their + * own class. But until then keep API consistent with how it used to work before. */ +#include "util/util_openimagedenoise.h" + +CCL_NAMESPACE_BEGIN + +Device *device_cpu_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) +{ + return new CPUDevice(info, stats, profiler); +} + +void device_cpu_info(vector<DeviceInfo> &devices) +{ + DeviceInfo info; + + info.type = DEVICE_CPU; + info.description = system_cpu_brand_string(); + info.id = "CPU"; + info.num = 0; + info.has_osl = true; + info.has_half_images = true; + info.has_nanovdb = true; + info.has_profiling = true; + if (openimagedenoise_supported()) { + info.denoisers |= DENOISER_OPENIMAGEDENOISE; + } + + devices.insert(devices.begin(), info); +} + +string device_cpu_capabilities() +{ + string capabilities = ""; + capabilities += system_cpu_support_sse2() ? "SSE2 " : ""; + capabilities += system_cpu_support_sse3() ? "SSE3 " : ""; + capabilities += system_cpu_support_sse41() ? "SSE41 " : ""; + capabilities += system_cpu_support_avx() ? "AVX " : ""; + capabilities += system_cpu_support_avx2() ? "AVX2" : ""; + if (capabilities[capabilities.size() - 1] == ' ') + capabilities.resize(capabilities.size() - 1); + return capabilities; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cpu/device.h b/intern/cycles/device/cpu/device.h new file mode 100644 index 00000000000..9cb2e80068d --- /dev/null +++ b/intern/cycles/device/cpu/device.h @@ -0,0 +1,35 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "util/util_string.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +class Device; +class DeviceInfo; +class Profiler; +class Stats; + +Device *device_cpu_create(const DeviceInfo &info, Stats &stats, Profiler &profiler); + +void device_cpu_info(vector<DeviceInfo> &devices); + +string device_cpu_capabilities(); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cpu/device_impl.cpp b/intern/cycles/device/cpu/device_impl.cpp new file mode 100644 index 00000000000..3b0db6bdd0e --- /dev/null +++ b/intern/cycles/device/cpu/device_impl.cpp @@ -0,0 +1,481 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/cpu/device_impl.h" + +#include <stdlib.h> +#include <string.h> + +/* So ImathMath is included before our kernel_cpu_compat. 
*/ +#ifdef WITH_OSL +/* So no context pollution happens from indirectly included windows.h */ +# include "util/util_windows.h" +# include <OSL/oslexec.h> +#endif + +#ifdef WITH_EMBREE +# include <embree3/rtcore.h> +#endif + +#include "device/cpu/kernel.h" +#include "device/cpu/kernel_thread_globals.h" + +#include "device/device.h" + +// clang-format off +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/globals.h" +#include "kernel/device/cpu/kernel.h" +#include "kernel/kernel_types.h" + +#include "kernel/osl/osl_shader.h" +#include "kernel/osl/osl_globals.h" +// clang-format on + +#include "bvh/bvh_embree.h" + +#include "render/buffers.h" + +#include "util/util_debug.h" +#include "util/util_foreach.h" +#include "util/util_function.h" +#include "util/util_logging.h" +#include "util/util_map.h" +#include "util/util_opengl.h" +#include "util/util_openimagedenoise.h" +#include "util/util_optimization.h" +#include "util/util_progress.h" +#include "util/util_system.h" +#include "util/util_task.h" +#include "util/util_thread.h" + +CCL_NAMESPACE_BEGIN + +CPUDevice::CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_) + : Device(info_, stats_, profiler_), texture_info(this, "__texture_info", MEM_GLOBAL) +{ + /* Pick any kernel, all of them are supposed to have same level of microarchitecture + * optimization. 
*/ + VLOG(1) << "Will be using " << kernels.integrator_init_from_camera.get_uarch_name() + << " kernels."; + + if (info.cpu_threads == 0) { + info.cpu_threads = TaskScheduler::num_threads(); + } + +#ifdef WITH_OSL + kernel_globals.osl = &osl_globals; +#endif +#ifdef WITH_EMBREE + embree_device = rtcNewDevice("verbose=0"); +#endif + need_texture_info = false; +} + +CPUDevice::~CPUDevice() +{ +#ifdef WITH_EMBREE + rtcReleaseDevice(embree_device); +#endif + + texture_info.free(); +} + +bool CPUDevice::show_samples() const +{ + return (info.cpu_threads == 1); +} + +BVHLayoutMask CPUDevice::get_bvh_layout_mask() const +{ + BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2; +#ifdef WITH_EMBREE + bvh_layout_mask |= BVH_LAYOUT_EMBREE; +#endif /* WITH_EMBREE */ + return bvh_layout_mask; +} + +bool CPUDevice::load_texture_info() +{ + if (!need_texture_info) { + return false; + } + + texture_info.copy_to_device(); + need_texture_info = false; + + return true; +} + +void CPUDevice::mem_alloc(device_memory &mem) +{ + if (mem.type == MEM_TEXTURE) { + assert(!"mem_alloc not supported for textures."); + } + else if (mem.type == MEM_GLOBAL) { + assert(!"mem_alloc not supported for global memory."); + } + else { + if (mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. 
(" + << string_human_readable_size(mem.memory_size()) << ")"; + } + + if (mem.type == MEM_DEVICE_ONLY) { + assert(!mem.host_pointer); + size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES; + void *data = util_aligned_malloc(mem.memory_size(), alignment); + mem.device_pointer = (device_ptr)data; + } + else { + mem.device_pointer = (device_ptr)mem.host_pointer; + } + + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); + } +} + +void CPUDevice::mem_copy_to(device_memory &mem) +{ + if (mem.type == MEM_GLOBAL) { + global_free(mem); + global_alloc(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + tex_alloc((device_texture &)mem); + } + else { + if (!mem.device_pointer) { + mem_alloc(mem); + } + + /* copy is no-op */ + } +} + +void CPUDevice::mem_copy_from( + device_memory & /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/) +{ + /* no-op */ +} + +void CPUDevice::mem_zero(device_memory &mem) +{ + if (!mem.device_pointer) { + mem_alloc(mem); + } + + if (mem.device_pointer) { + memset((void *)mem.device_pointer, 0, mem.memory_size()); + } +} + +void CPUDevice::mem_free(device_memory &mem) +{ + if (mem.type == MEM_GLOBAL) { + global_free(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + } + else if (mem.device_pointer) { + if (mem.type == MEM_DEVICE_ONLY) { + util_aligned_free((void *)mem.device_pointer); + } + mem.device_pointer = 0; + stats.mem_free(mem.device_size); + mem.device_size = 0; + } +} + +device_ptr CPUDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) +{ + return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); +} + +void CPUDevice::const_copy_to(const char *name, void *host, size_t size) +{ +#if WITH_EMBREE + if (strcmp(name, "__data") == 0) { + assert(size <= sizeof(KernelData)); + + // Update scene handle (since it is different for each device on multi devices) + KernelData *const data = (KernelData *)host; + 
data->bvh.scene = embree_scene; + } +#endif + kernel_const_copy(&kernel_globals, name, host, size); +} + +void CPUDevice::global_alloc(device_memory &mem) +{ + VLOG(1) << "Global memory allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + kernel_global_memory_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size); + + mem.device_pointer = (device_ptr)mem.host_pointer; + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); +} + +void CPUDevice::global_free(device_memory &mem) +{ + if (mem.device_pointer) { + mem.device_pointer = 0; + stats.mem_free(mem.device_size); + mem.device_size = 0; + } +} + +void CPUDevice::tex_alloc(device_texture &mem) +{ + VLOG(1) << "Texture allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + mem.device_pointer = (device_ptr)mem.host_pointer; + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); + + const uint slot = mem.slot; + if (slot >= texture_info.size()) { + /* Allocate some slots in advance, to reduce amount of re-allocations. 
*/ + texture_info.resize(slot + 128); + } + + texture_info[slot] = mem.info; + texture_info[slot].data = (uint64_t)mem.host_pointer; + need_texture_info = true; +} + +void CPUDevice::tex_free(device_texture &mem) +{ + if (mem.device_pointer) { + mem.device_pointer = 0; + stats.mem_free(mem.device_size); + mem.device_size = 0; + need_texture_info = true; + } +} + +void CPUDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) +{ +#ifdef WITH_EMBREE + if (bvh->params.bvh_layout == BVH_LAYOUT_EMBREE || + bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) { + BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh); + if (refit) { + bvh_embree->refit(progress); + } + else { + bvh_embree->build(progress, &stats, embree_device); + } + + if (bvh->params.top_level) { + embree_scene = bvh_embree->scene; + } + } + else +#endif + Device::build_bvh(bvh, progress, refit); +} + +#if 0 +void CPUDevice::render(DeviceTask &task, RenderTile &tile, KernelGlobals *kg) +{ + const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE; + + scoped_timer timer(&tile.buffers->render_time); + + Coverage coverage(kg, tile); + if (use_coverage) { + coverage.init_path_trace(); + } + + float *render_buffer = (float *)tile.buffer; + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; + + /* Needed for Embree. 
*/ + SIMD_SET_FLUSH_TO_ZERO; + + for (int sample = start_sample; sample < end_sample; sample++) { + if (task.get_cancel() || TaskPool::canceled()) { + if (task.need_finish_queue == false) + break; + } + + if (tile.stealing_state == RenderTile::CAN_BE_STOLEN && task.get_tile_stolen()) { + tile.stealing_state = RenderTile::WAS_STOLEN; + break; + } + + if (tile.task == RenderTile::PATH_TRACE) { + for (int y = tile.y; y < tile.y + tile.h; y++) { + for (int x = tile.x; x < tile.x + tile.w; x++) { + if (use_coverage) { + coverage.init_pixel(x, y); + } + kernels.path_trace(kg, render_buffer, sample, x, y, tile.offset, tile.stride); + } + } + } + else { + for (int y = tile.y; y < tile.y + tile.h; y++) { + for (int x = tile.x; x < tile.x + tile.w; x++) { + kernels.bake(kg, render_buffer, sample, x, y, tile.offset, tile.stride); + } + } + } + tile.sample = sample + 1; + + if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) { + const bool stop = adaptive_sampling_filter(kg, tile, sample); + if (stop) { + const int num_progress_samples = end_sample - sample; + tile.sample = end_sample; + task.update_progress(&tile, tile.w * tile.h * num_progress_samples); + break; + } + } + + task.update_progress(&tile, tile.w * tile.h); + } + if (use_coverage) { + coverage.finalize(); + } + + if (task.adaptive_sampling.use && (tile.stealing_state != RenderTile::WAS_STOLEN)) { + adaptive_sampling_post(tile, kg); + } +} + +void CPUDevice::thread_render(DeviceTask &task) +{ + if (TaskPool::canceled()) { + if (task.need_finish_queue == false) + return; + } + + /* allocate buffer for kernel globals */ + CPUKernelThreadGlobals kg(kernel_globals, get_cpu_osl_memory()); + + profiler.add_state(&kg.profiler); + + /* NLM denoiser. */ + DenoisingTask *denoising = NULL; + + /* OpenImageDenoise: we can only denoise with one thread at a time, so to + * avoid waiting with mutex locks in the denoiser, we let only a single + * thread acquire denoising tiles. 
*/ + uint tile_types = task.tile_types; + bool hold_denoise_lock = false; + if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) { + if (!oidn_task_lock.try_lock()) { + tile_types &= ~RenderTile::DENOISE; + hold_denoise_lock = true; + } + } + + RenderTile tile; + while (task.acquire_tile(this, tile, tile_types)) { + if (tile.task == RenderTile::PATH_TRACE) { + render(task, tile, &kg); + } + else if (tile.task == RenderTile::BAKE) { + render(task, tile, &kg); + } + else if (tile.task == RenderTile::DENOISE) { + denoise_openimagedenoise(task, tile); + task.update_progress(&tile, tile.w * tile.h); + } + + task.release_tile(tile); + + if (TaskPool::canceled()) { + if (task.need_finish_queue == false) + break; + } + } + + if (hold_denoise_lock) { + oidn_task_lock.unlock(); + } + + profiler.remove_state(&kg.profiler); + + delete denoising; +} + +void CPUDevice::thread_denoise(DeviceTask &task) +{ + RenderTile tile; + tile.x = task.x; + tile.y = task.y; + tile.w = task.w; + tile.h = task.h; + tile.buffer = task.buffer; + tile.sample = task.sample + task.num_samples; + tile.num_samples = task.num_samples; + tile.start_sample = task.sample; + tile.offset = task.offset; + tile.stride = task.stride; + tile.buffers = task.buffers; + + denoise_openimagedenoise(task, tile); + + task.update_progress(&tile, tile.w * tile.h); +} +#endif + +const CPUKernels *CPUDevice::get_cpu_kernels() const +{ + return &kernels; +} + +void CPUDevice::get_cpu_kernel_thread_globals( + vector<CPUKernelThreadGlobals> &kernel_thread_globals) +{ + /* Ensure latest texture info is loaded into kernel globals before returning. 
*/ + load_texture_info(); + + kernel_thread_globals.clear(); + void *osl_memory = get_cpu_osl_memory(); + for (int i = 0; i < info.cpu_threads; i++) { + kernel_thread_globals.emplace_back(kernel_globals, osl_memory, profiler); + } +} + +void *CPUDevice::get_cpu_osl_memory() +{ +#ifdef WITH_OSL + return &osl_globals; +#else + return NULL; +#endif +} + +bool CPUDevice::load_kernels(const uint /*kernel_features*/) +{ + return true; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cpu/device_impl.h b/intern/cycles/device/cpu/device_impl.h new file mode 100644 index 00000000000..7d222808652 --- /dev/null +++ b/intern/cycles/device/cpu/device_impl.h @@ -0,0 +1,99 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +/* So ImathMath is included before our kernel_cpu_compat. 
*/ +#ifdef WITH_OSL +/* So no context pollution happens from indirectly included windows.h */ +# include "util/util_windows.h" +# include <OSL/oslexec.h> +#endif + +#ifdef WITH_EMBREE +# include <embree3/rtcore.h> +#endif + +#include "device/cpu/kernel.h" +#include "device/device.h" +#include "device/device_memory.h" + +// clang-format off +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/kernel.h" +#include "kernel/device/cpu/globals.h" + +#include "kernel/osl/osl_shader.h" +#include "kernel/osl/osl_globals.h" +// clang-format on + +CCL_NAMESPACE_BEGIN + +class CPUDevice : public Device { + public: + KernelGlobals kernel_globals; + + device_vector<TextureInfo> texture_info; + bool need_texture_info; + +#ifdef WITH_OSL + OSLGlobals osl_globals; +#endif +#ifdef WITH_EMBREE + RTCScene embree_scene = NULL; + RTCDevice embree_device; +#endif + + CPUKernels kernels; + + CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_); + ~CPUDevice(); + + virtual bool show_samples() const override; + + virtual BVHLayoutMask get_bvh_layout_mask() const override; + + /* Returns true if the texture info was copied to the device (meaning, some more + * re-initialization might be needed). 
*/ + bool load_texture_info(); + + virtual void mem_alloc(device_memory &mem) override; + virtual void mem_copy_to(device_memory &mem) override; + virtual void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override; + virtual void mem_zero(device_memory &mem) override; + virtual void mem_free(device_memory &mem) override; + virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override; + + virtual void const_copy_to(const char *name, void *host, size_t size) override; + + void global_alloc(device_memory &mem); + void global_free(device_memory &mem); + + void tex_alloc(device_texture &mem); + void tex_free(device_texture &mem); + + void build_bvh(BVH *bvh, Progress &progress, bool refit) override; + + virtual const CPUKernels *get_cpu_kernels() const override; + virtual void get_cpu_kernel_thread_globals( + vector<CPUKernelThreadGlobals> &kernel_thread_globals) override; + virtual void *get_cpu_osl_memory() override; + + protected: + virtual bool load_kernels(uint /*kernel_features*/) override; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cpu/kernel.cpp b/intern/cycles/device/cpu/kernel.cpp new file mode 100644 index 00000000000..91282390e27 --- /dev/null +++ b/intern/cycles/device/cpu/kernel.cpp @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "device/cpu/kernel.h" + +#include "kernel/device/cpu/kernel.h" + +CCL_NAMESPACE_BEGIN + +#define KERNEL_FUNCTIONS(name) \ + KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \ + KERNEL_NAME_EVAL(cpu_sse3, name), KERNEL_NAME_EVAL(cpu_sse41, name), \ + KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name) + +#define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name)) + +CPUKernels::CPUKernels() + : /* Integrator. */ + REGISTER_KERNEL(integrator_init_from_camera), + REGISTER_KERNEL(integrator_init_from_bake), + REGISTER_KERNEL(integrator_intersect_closest), + REGISTER_KERNEL(integrator_intersect_shadow), + REGISTER_KERNEL(integrator_intersect_subsurface), + REGISTER_KERNEL(integrator_intersect_volume_stack), + REGISTER_KERNEL(integrator_shade_background), + REGISTER_KERNEL(integrator_shade_light), + REGISTER_KERNEL(integrator_shade_shadow), + REGISTER_KERNEL(integrator_shade_surface), + REGISTER_KERNEL(integrator_shade_volume), + REGISTER_KERNEL(integrator_megakernel), + /* Shader evaluation. */ + REGISTER_KERNEL(shader_eval_displace), + REGISTER_KERNEL(shader_eval_background), + /* Adaptive sampling. */ + REGISTER_KERNEL(adaptive_sampling_convergence_check), + REGISTER_KERNEL(adaptive_sampling_filter_x), + REGISTER_KERNEL(adaptive_sampling_filter_y), + /* Cryptomatte. */ + REGISTER_KERNEL(cryptomatte_postprocess), + /* Bake. */ + REGISTER_KERNEL(bake) +{ +} + +#undef REGISTER_KERNEL +#undef KERNEL_FUNCTIONS + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cpu/kernel.h b/intern/cycles/device/cpu/kernel.h new file mode 100644 index 00000000000..54b18308544 --- /dev/null +++ b/intern/cycles/device/cpu/kernel.h @@ -0,0 +1,111 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "device/cpu/kernel_function.h" +#include "util/util_types.h" + +CCL_NAMESPACE_BEGIN + +struct KernelGlobals; +struct IntegratorStateCPU; +struct TileInfo; + +class CPUKernels { + public: + /* Integrator. */ + + using IntegratorFunction = + CPUKernelFunction<void (*)(const KernelGlobals *kg, IntegratorStateCPU *state)>; + using IntegratorShadeFunction = CPUKernelFunction<void (*)( + const KernelGlobals *kg, IntegratorStateCPU *state, ccl_global float *render_buffer)>; + using IntegratorInitFunction = CPUKernelFunction<bool (*)(const KernelGlobals *kg, + IntegratorStateCPU *state, + KernelWorkTile *tile, + ccl_global float *render_buffer)>; + + IntegratorInitFunction integrator_init_from_camera; + IntegratorInitFunction integrator_init_from_bake; + IntegratorFunction integrator_intersect_closest; + IntegratorFunction integrator_intersect_shadow; + IntegratorFunction integrator_intersect_subsurface; + IntegratorFunction integrator_intersect_volume_stack; + IntegratorShadeFunction integrator_shade_background; + IntegratorShadeFunction integrator_shade_light; + IntegratorShadeFunction integrator_shade_shadow; + IntegratorShadeFunction integrator_shade_surface; + IntegratorShadeFunction integrator_shade_volume; + IntegratorShadeFunction integrator_megakernel; + + /* Shader evaluation. 
*/ + + using ShaderEvalFunction = CPUKernelFunction<void (*)( + const KernelGlobals *kg, const KernelShaderEvalInput *, float4 *, const int)>; + + ShaderEvalFunction shader_eval_displace; + ShaderEvalFunction shader_eval_background; + + /* Adaptive stopping. */ + + using AdaptiveSamplingConvergenceCheckFunction = + CPUKernelFunction<bool (*)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int x, + int y, + float threshold, + bool reset, + int offset, + int stride)>; + + using AdaptiveSamplingFilterXFunction = + CPUKernelFunction<void (*)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int y, + int start_x, + int width, + int offset, + int stride)>; + + using AdaptiveSamplingFilterYFunction = + CPUKernelFunction<void (*)(const KernelGlobals *kg, + ccl_global float *render_buffer, + int x, + int start_y, + int height, + int offset, + int stride)>; + + AdaptiveSamplingConvergenceCheckFunction adaptive_sampling_convergence_check; + + AdaptiveSamplingFilterXFunction adaptive_sampling_filter_x; + AdaptiveSamplingFilterYFunction adaptive_sampling_filter_y; + + /* Cryptomatte. */ + + using CryptomattePostprocessFunction = CPUKernelFunction<void (*)( + const KernelGlobals *kg, ccl_global float *render_buffer, int pixel_index)>; + + CryptomattePostprocessFunction cryptomatte_postprocess; + + /* Bake. */ + + CPUKernelFunction<void (*)(const KernelGlobals *, float *, int, int, int, int, int)> bake; + + CPUKernels(); +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cpu/kernel_function.h b/intern/cycles/device/cpu/kernel_function.h new file mode 100644 index 00000000000..aa18720cc24 --- /dev/null +++ b/intern/cycles/device/cpu/kernel_function.h @@ -0,0 +1,124 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "util/util_debug.h" +#include "util/util_system.h" + +CCL_NAMESPACE_BEGIN + +/* A wrapper around per-microarchitecture variant of a kernel function. + * + * Provides a function-call-like API which gets routed to the most suitable implementation. + * + * For example, on a computer which only has SSE4.1 the kernel_sse41 will be used. */ +template<typename FunctionType> class CPUKernelFunction { + public: + CPUKernelFunction(FunctionType kernel_default, + FunctionType kernel_sse2, + FunctionType kernel_sse3, + FunctionType kernel_sse41, + FunctionType kernel_avx, + FunctionType kernel_avx2) + { + kernel_info_ = get_best_kernel_info( + kernel_default, kernel_sse2, kernel_sse3, kernel_sse41, kernel_avx, kernel_avx2); + } + + template<typename... Args> inline auto operator()(Args... args) const + { + assert(kernel_info_.kernel); + + return kernel_info_.kernel(args...); + } + + const char *get_uarch_name() const + { + return kernel_info_.uarch_name; + } + + protected: + /* Helper class which allows to pass human-readable microarchitecture name together with function + * pointer. */ + class KernelInfo { + public: + KernelInfo() : KernelInfo("", nullptr) + { + } + + /* TODO(sergey): Use string view, to have higher-level functionality (i.e. comparison) without + * memory allocation. 
*/ + KernelInfo(const char *uarch_name, FunctionType kernel) + : uarch_name(uarch_name), kernel(kernel) + { + } + + const char *uarch_name; + FunctionType kernel; + }; + + KernelInfo get_best_kernel_info(FunctionType kernel_default, + FunctionType kernel_sse2, + FunctionType kernel_sse3, + FunctionType kernel_sse41, + FunctionType kernel_avx, + FunctionType kernel_avx2) + { + /* Silence warnings about unused variables when compiling without some architectures. */ + (void)kernel_sse2; + (void)kernel_sse3; + (void)kernel_sse41; + (void)kernel_avx; + (void)kernel_avx2; + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) { + return KernelInfo("AVX2", kernel_avx2); + } +#endif + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX + if (DebugFlags().cpu.has_avx() && system_cpu_support_avx()) { + return KernelInfo("AVX", kernel_avx); + } +#endif + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 + if (DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) { + return KernelInfo("SSE4.1", kernel_sse41); + } +#endif + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 + if (DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) { + return KernelInfo("SSE3", kernel_sse3); + } +#endif + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 + if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) { + return KernelInfo("SSE2", kernel_sse2); + } +#endif + + return KernelInfo("default", kernel_default); + } + + KernelInfo kernel_info_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cpu/kernel_thread_globals.cpp b/intern/cycles/device/cpu/kernel_thread_globals.cpp new file mode 100644 index 00000000000..988b00cd1f0 --- /dev/null +++ b/intern/cycles/device/cpu/kernel_thread_globals.cpp @@ -0,0 +1,85 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/cpu/kernel_thread_globals.h" + +// clang-format off +#include "kernel/osl/osl_shader.h" +#include "kernel/osl/osl_globals.h" +// clang-format on + +#include "util/util_profiling.h" + +CCL_NAMESPACE_BEGIN + +CPUKernelThreadGlobals::CPUKernelThreadGlobals(const KernelGlobals &kernel_globals, + void *osl_globals_memory, + Profiler &cpu_profiler) + : KernelGlobals(kernel_globals), cpu_profiler_(cpu_profiler) +{ + reset_runtime_memory(); + +#ifdef WITH_OSL + OSLShader::thread_init(this, reinterpret_cast<OSLGlobals *>(osl_globals_memory)); +#else + (void)osl_globals_memory; +#endif +} + +CPUKernelThreadGlobals::CPUKernelThreadGlobals(CPUKernelThreadGlobals &&other) noexcept + : KernelGlobals(std::move(other)), cpu_profiler_(other.cpu_profiler_) +{ + other.reset_runtime_memory(); +} + +CPUKernelThreadGlobals::~CPUKernelThreadGlobals() +{ +#ifdef WITH_OSL + OSLShader::thread_free(this); +#endif +} + +CPUKernelThreadGlobals &CPUKernelThreadGlobals::operator=(CPUKernelThreadGlobals &&other) +{ + if (this == &other) { + return *this; + } + + *static_cast<KernelGlobals *>(this) = *static_cast<KernelGlobals *>(&other); + + other.reset_runtime_memory(); + + return *this; +} + +void CPUKernelThreadGlobals::reset_runtime_memory() +{ +#ifdef WITH_OSL + osl = nullptr; +#endif +} + +void CPUKernelThreadGlobals::start_profiling() +{ + cpu_profiler_.add_state(&profiler); +} + +void CPUKernelThreadGlobals::stop_profiling() +{ + cpu_profiler_.remove_state(&profiler); +} + +CCL_NAMESPACE_END diff --git 
a/intern/cycles/device/cpu/kernel_thread_globals.h b/intern/cycles/device/cpu/kernel_thread_globals.h new file mode 100644 index 00000000000..d005c3bb56c --- /dev/null +++ b/intern/cycles/device/cpu/kernel_thread_globals.h @@ -0,0 +1,57 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "kernel/device/cpu/compat.h" +#include "kernel/device/cpu/globals.h" + +CCL_NAMESPACE_BEGIN + +class Profiler; + +/* A special class which extends memory ownership of the `KernelGlobals` decoupling any resource + * which is not thread-safe for access. Every worker thread which needs to operate on + * `KernelGlobals` needs to initialize its own copy of this object. + * + * NOTE: Only minimal subset of objects are copied: `KernelData` is never copied. This means that + * there is no unnecessary data duplication happening when using this object. */ +class CPUKernelThreadGlobals : public KernelGlobals { + public: + /* TODO(sergey): Would be nice to have properly typed OSLGlobals even in the case when building + * without OSL support. Will avoid need to those unnamed pointers and casts. 
*/ + CPUKernelThreadGlobals(const KernelGlobals &kernel_globals, + void *osl_globals_memory, + Profiler &cpu_profiler); + + ~CPUKernelThreadGlobals(); + + CPUKernelThreadGlobals(const CPUKernelThreadGlobals &other) = delete; + CPUKernelThreadGlobals(CPUKernelThreadGlobals &&other) noexcept; + + CPUKernelThreadGlobals &operator=(const CPUKernelThreadGlobals &other) = delete; + CPUKernelThreadGlobals &operator=(CPUKernelThreadGlobals &&other); + + void start_profiling(); + void stop_profiling(); + + protected: + void reset_runtime_memory(); + + Profiler &cpu_profiler_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/cuda/device.cpp index 2e225ecfaf8..84becd6d081 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/cuda/device.cpp @@ -14,21 +14,25 @@ * limitations under the License. */ -#ifdef WITH_CUDA +#include "device/cuda/device.h" + +#include "util/util_logging.h" -# include "device/cuda/device_cuda.h" +#ifdef WITH_CUDA +# include "device/cuda/device_impl.h" # include "device/device.h" -# include "device/device_intern.h" -# include "util/util_logging.h" # include "util/util_string.h" # include "util/util_windows.h" +#endif /* WITH_CUDA */ CCL_NAMESPACE_BEGIN bool device_cuda_init() { -# ifdef WITH_CUDA_DYNLOAD +#if !defined(WITH_CUDA) + return false; +#elif defined(WITH_CUDA_DYNLOAD) static bool initialized = false; static bool result = false; @@ -59,16 +63,27 @@ bool device_cuda_init() } return result; -# else /* WITH_CUDA_DYNLOAD */ +#else /* WITH_CUDA_DYNLOAD */ return true; -# endif /* WITH_CUDA_DYNLOAD */ +#endif /* WITH_CUDA_DYNLOAD */ } -Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) +Device *device_cuda_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) { - return new CUDADevice(info, stats, profiler, background); +#ifdef WITH_CUDA + return new CUDADevice(info, stats, profiler); +#else + (void)info; + (void)stats; + 
(void)profiler; + + LOG(FATAL) << "Request to create CUDA device without compiled-in support. Should never happen."; + + return nullptr; +#endif } +#ifdef WITH_CUDA static CUresult device_cuda_safe_init() { # ifdef _WIN32 @@ -86,9 +101,11 @@ static CUresult device_cuda_safe_init() return cuInit(0); # endif } +#endif /* WITH_CUDA */ void device_cuda_info(vector<DeviceInfo> &devices) { +#ifdef WITH_CUDA CUresult result = device_cuda_safe_init(); if (result != CUDA_SUCCESS) { if (result != CUDA_ERROR_NO_DEVICE) @@ -129,9 +146,9 @@ void device_cuda_info(vector<DeviceInfo> &devices) info.has_half_images = (major >= 3); info.has_nanovdb = true; - info.has_volume_decoupled = false; - info.has_adaptive_stop_per_sample = false; - info.denoisers = DENOISER_NLM; + info.denoisers = 0; + + info.has_gpu_queue = true; /* Check if the device has P2P access to any other device in the system. */ for (int peer_num = 0; peer_num < count && !info.has_peer_memory; peer_num++) { @@ -182,10 +199,14 @@ void device_cuda_info(vector<DeviceInfo> &devices) if (!display_devices.empty()) devices.insert(devices.end(), display_devices.begin(), display_devices.end()); +#else /* WITH_CUDA */ + (void)devices; +#endif /* WITH_CUDA */ } string device_cuda_capabilities() { +#ifdef WITH_CUDA CUresult result = device_cuda_safe_init(); if (result != CUDA_SUCCESS) { if (result != CUDA_ERROR_NO_DEVICE) { @@ -310,8 +331,10 @@ string device_cuda_capabilities() } return capabilities; + +#else /* WITH_CUDA */ + return ""; +#endif /* WITH_CUDA */ } CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/cuda/device.h b/intern/cycles/device/cuda/device.h new file mode 100644 index 00000000000..b0484904d1a --- /dev/null +++ b/intern/cycles/device/cuda/device.h @@ -0,0 +1,37 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "util/util_string.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +class Device; +class DeviceInfo; +class Profiler; +class Stats; + +bool device_cuda_init(); + +Device *device_cuda_create(const DeviceInfo &info, Stats &stats, Profiler &profiler); + +void device_cuda_info(vector<DeviceInfo> &devices); + +string device_cuda_capabilities(); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h deleted file mode 100644 index c3271c3cfcf..00000000000 --- a/intern/cycles/device/cuda/device_cuda.h +++ /dev/null @@ -1,270 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef WITH_CUDA - -# include "device/device.h" -# include "device/device_denoising.h" -# include "device/device_split_kernel.h" - -# include "util/util_map.h" -# include "util/util_task.h" - -# ifdef WITH_CUDA_DYNLOAD -# include "cuew.h" -# else -# include "util/util_opengl.h" -# include <cuda.h> -# include <cudaGL.h> -# endif - -CCL_NAMESPACE_BEGIN - -class CUDASplitKernel; - -class CUDADevice : public Device { - - friend class CUDASplitKernelFunction; - friend class CUDASplitKernel; - friend class CUDAContextScope; - - public: - DedicatedTaskPool task_pool; - CUdevice cuDevice; - CUcontext cuContext; - CUmodule cuModule, cuFilterModule; - size_t device_texture_headroom; - size_t device_working_headroom; - bool move_texture_to_host; - size_t map_host_used; - size_t map_host_limit; - int can_map_host; - int pitch_alignment; - int cuDevId; - int cuDevArchitecture; - bool first_error; - CUDASplitKernel *split_kernel; - - struct CUDAMem { - CUDAMem() : texobject(0), array(0), use_mapped_host(false) - { - } - - CUtexObject texobject; - CUarray array; - - /* If true, a mapped host memory in shared_pointer is being used. 
*/ - bool use_mapped_host; - }; - typedef map<device_memory *, CUDAMem> CUDAMemMap; - CUDAMemMap cuda_mem_map; - thread_mutex cuda_mem_map_mutex; - - struct PixelMem { - GLuint cuPBO; - CUgraphicsResource cuPBOresource; - GLuint cuTexId; - int w, h; - }; - map<device_ptr, PixelMem> pixel_mem_map; - - /* Bindless Textures */ - device_vector<TextureInfo> texture_info; - bool need_texture_info; - - /* Kernels */ - struct { - bool loaded; - - CUfunction adaptive_stopping; - CUfunction adaptive_filter_x; - CUfunction adaptive_filter_y; - CUfunction adaptive_scale_samples; - int adaptive_num_threads_per_block; - } functions; - - static bool have_precompiled_kernels(); - - virtual bool show_samples() const override; - - virtual BVHLayoutMask get_bvh_layout_mask() const override; - - void set_error(const string &error) override; - - CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_); - - virtual ~CUDADevice(); - - bool support_device(const DeviceRequestedFeatures & /*requested_features*/); - - bool check_peer_access(Device *peer_device) override; - - bool use_adaptive_compilation(); - - bool use_split_kernel(); - - virtual string compile_kernel_get_common_cflags( - const DeviceRequestedFeatures &requested_features, bool filter = false, bool split = false); - - string compile_kernel(const DeviceRequestedFeatures &requested_features, - const char *name, - const char *base = "cuda", - bool force_ptx = false); - - virtual bool load_kernels(const DeviceRequestedFeatures &requested_features) override; - - void load_functions(); - - void reserve_local_memory(const DeviceRequestedFeatures &requested_features); - - void init_host_memory(); - - void load_texture_info(); - - void move_textures_to_host(size_t size, bool for_texture); - - CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0); - - void generic_copy_to(device_memory &mem); - - void generic_free(device_memory &mem); - - void mem_alloc(device_memory &mem) override; - - void 
mem_copy_to(device_memory &mem) override; - - void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override; - - void mem_zero(device_memory &mem) override; - - void mem_free(device_memory &mem) override; - - device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override; - - virtual void const_copy_to(const char *name, void *host, size_t size) override; - - void global_alloc(device_memory &mem); - - void global_free(device_memory &mem); - - void tex_alloc(device_texture &mem); - - void tex_free(device_texture &mem); - - bool denoising_non_local_means(device_ptr image_ptr, - device_ptr guide_ptr, - device_ptr variance_ptr, - device_ptr out_ptr, - DenoisingTask *task); - - bool denoising_construct_transform(DenoisingTask *task); - - bool denoising_accumulate(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task); - - bool denoising_solve(device_ptr output_ptr, DenoisingTask *task); - - bool denoising_combine_halves(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, - int4 rect, - DenoisingTask *task); - - bool denoising_divide_shadow(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, - DenoisingTask *task); - - bool denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task); - - bool denoising_write_feature(int out_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task); - - bool denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task); - - void denoise(RenderTile &rtile, DenoisingTask &denoising); - - void adaptive_sampling_filter(uint filter_sample, - WorkTile *wtile, - CUdeviceptr d_wtile, - CUstream stream = 0); - void 
adaptive_sampling_post(RenderTile &rtile, - WorkTile *wtile, - CUdeviceptr d_wtile, - CUstream stream = 0); - - void render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles); - - void film_convert(DeviceTask &task, - device_ptr buffer, - device_ptr rgba_byte, - device_ptr rgba_half); - - void shader(DeviceTask &task); - - CUdeviceptr map_pixels(device_ptr mem); - - void unmap_pixels(device_ptr mem); - - void pixels_alloc(device_memory &mem); - - void pixels_copy_from(device_memory &mem, int y, int w, int h); - - void pixels_free(device_memory &mem); - - void draw_pixels(device_memory &mem, - int y, - int w, - int h, - int width, - int height, - int dx, - int dy, - int dw, - int dh, - bool transparent, - const DeviceDrawParams &draw_params) override; - - void thread_run(DeviceTask &task); - - virtual void task_add(DeviceTask &task) override; - - virtual void task_wait() override; - - virtual void task_cancel() override; -}; - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp deleted file mode 100644 index 2d2fcb38705..00000000000 --- a/intern/cycles/device/cuda/device_cuda_impl.cpp +++ /dev/null @@ -1,2714 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef WITH_CUDA - -# include <climits> -# include <limits.h> -# include <stdio.h> -# include <stdlib.h> -# include <string.h> - -# include "device/cuda/device_cuda.h" -# include "device/device_intern.h" -# include "device/device_split_kernel.h" - -# include "render/buffers.h" - -# include "kernel/filter/filter_defines.h" - -# include "util/util_debug.h" -# include "util/util_foreach.h" -# include "util/util_logging.h" -# include "util/util_map.h" -# include "util/util_md5.h" -# include "util/util_opengl.h" -# include "util/util_path.h" -# include "util/util_string.h" -# include "util/util_system.h" -# include "util/util_time.h" -# include "util/util_types.h" -# include "util/util_windows.h" - -# include "kernel/split/kernel_split_data_types.h" - -CCL_NAMESPACE_BEGIN - -# ifndef WITH_CUDA_DYNLOAD - -/* Transparently implement some functions, so majority of the file does not need - * to worry about difference between dynamically loaded and linked CUDA at all. - */ - -namespace { - -const char *cuewErrorString(CUresult result) -{ - /* We can only give error code here without major code duplication, that - * should be enough since dynamic loading is only being disabled by folks - * who knows what they're doing anyway. - * - * NOTE: Avoid call from several threads. 
- */ - static string error; - error = string_printf("%d", result); - return error.c_str(); -} - -const char *cuewCompilerPath() -{ - return CYCLES_CUDA_NVCC_EXECUTABLE; -} - -int cuewCompilerVersion() -{ - return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10); -} - -} /* namespace */ -# endif /* WITH_CUDA_DYNLOAD */ - -class CUDADevice; - -class CUDASplitKernel : public DeviceSplitKernel { - CUDADevice *device; - - public: - explicit CUDASplitKernel(CUDADevice *device); - - virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads); - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, - RenderTile &rtile, - int num_global_elements, - device_memory &kernel_globals, - device_memory &kernel_data_, - device_memory &split_data, - device_memory &ray_state, - device_memory &queue_index, - device_memory &use_queues_flag, - device_memory &work_pool_wgs); - - virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name, - const DeviceRequestedFeatures &); - virtual int2 split_kernel_local_size(); - virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task); -}; - -/* Utility to push/pop CUDA context. */ -class CUDAContextScope { - public: - CUDAContextScope(CUDADevice *device); - ~CUDAContextScope(); - - private: - CUDADevice *device; -}; - -bool CUDADevice::have_precompiled_kernels() -{ - string cubins_path = path_get("lib"); - return path_exists(cubins_path); -} - -bool CUDADevice::show_samples() const -{ - /* The CUDADevice only processes one tile at a time, so showing samples is fine. 
*/ - return true; -} - -BVHLayoutMask CUDADevice::get_bvh_layout_mask() const -{ - return BVH_LAYOUT_BVH2; -} - -void CUDADevice::set_error(const string &error) -{ - Device::set_error(error); - - if (first_error) { - fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n"); - fprintf(stderr, - "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n"); - first_error = false; - } -} - -# define cuda_assert(stmt) \ - { \ - CUresult result = stmt; \ - if (result != CUDA_SUCCESS) { \ - const char *name = cuewErrorString(result); \ - set_error(string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \ - } \ - } \ - (void)0 - -CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_) - : Device(info, stats, profiler, background_), texture_info(this, "__texture_info", MEM_GLOBAL) -{ - first_error = true; - background = background_; - - cuDevId = info.num; - cuDevice = 0; - cuContext = 0; - - cuModule = 0; - cuFilterModule = 0; - - split_kernel = NULL; - - need_texture_info = false; - - device_texture_headroom = 0; - device_working_headroom = 0; - move_texture_to_host = false; - map_host_limit = 0; - map_host_used = 0; - can_map_host = 0; - pitch_alignment = 0; - - functions.loaded = false; - - /* Initialize CUDA. */ - CUresult result = cuInit(0); - if (result != CUDA_SUCCESS) { - set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result))); - return; - } - - /* Setup device and context. */ - result = cuDeviceGet(&cuDevice, cuDevId); - if (result != CUDA_SUCCESS) { - set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)", - cuewErrorString(result))); - return; - } - - /* CU_CTX_MAP_HOST for mapping host memory when out of device memory. - * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render, - * so we can predict which memory to map to host. 
*/ - cuda_assert( - cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice)); - - cuda_assert(cuDeviceGetAttribute( - &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice)); - - unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX; - if (can_map_host) { - ctx_flags |= CU_CTX_MAP_HOST; - init_host_memory(); - } - - /* Create context. */ - if (background) { - result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); - } - else { - result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice); - - if (result != CUDA_SUCCESS) { - result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); - background = true; - } - } - - if (result != CUDA_SUCCESS) { - set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result))); - return; - } - - int major, minor; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); - cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); - cuDevArchitecture = major * 100 + minor * 10; - - /* Pop context set by cuCtxCreate. 
*/ - cuCtxPopCurrent(NULL); -} - -CUDADevice::~CUDADevice() -{ - task_pool.cancel(); - - delete split_kernel; - - texture_info.free(); - - cuda_assert(cuCtxDestroy(cuContext)); -} - -bool CUDADevice::support_device(const DeviceRequestedFeatures & /*requested_features*/) -{ - int major, minor; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); - cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); - - /* We only support sm_30 and above */ - if (major < 3) { - set_error(string_printf( - "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor)); - return false; - } - - return true; -} - -bool CUDADevice::check_peer_access(Device *peer_device) -{ - if (peer_device == this) { - return false; - } - if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) { - return false; - } - - CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device); - - int can_access = 0; - cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice)); - if (can_access == 0) { - return false; - } - - // Ensure array access over the link is possible as well (for 3D textures) - cuda_assert(cuDeviceGetP2PAttribute(&can_access, - CU_DEVICE_P2P_ATTRIBUTE_ARRAY_ACCESS_ACCESS_SUPPORTED, - cuDevice, - peer_device_cuda->cuDevice)); - if (can_access == 0) { - return false; - } - - // Enable peer access in both directions - { - const CUDAContextScope scope(this); - CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0); - if (result != CUDA_SUCCESS) { - set_error(string_printf("Failed to enable peer access on CUDA context (%s)", - cuewErrorString(result))); - return false; - } - } - { - const CUDAContextScope scope(peer_device_cuda); - CUresult result = cuCtxEnablePeerAccess(cuContext, 0); - if (result != CUDA_SUCCESS) { - set_error(string_printf("Failed to enable peer access on CUDA context (%s)", - cuewErrorString(result))); - 
return false; - } - } - - return true; -} - -bool CUDADevice::use_adaptive_compilation() -{ - return DebugFlags().cuda.adaptive_compile; -} - -bool CUDADevice::use_split_kernel() -{ - return DebugFlags().cuda.split_kernel; -} - -/* Common NVCC flags which stays the same regardless of shading model, - * kernel sources md5 and only depends on compiler or compilation settings. - */ -string CUDADevice::compile_kernel_get_common_cflags( - const DeviceRequestedFeatures &requested_features, bool filter, bool split) -{ - const int machine = system_cpu_bits(); - const string source_path = path_get("source"); - const string include_path = source_path; - string cflags = string_printf( - "-m%d " - "--ptxas-options=\"-v\" " - "--use_fast_math " - "-DNVCC " - "-I\"%s\"", - machine, - include_path.c_str()); - if (!filter && use_adaptive_compilation()) { - cflags += " " + requested_features.get_build_options(); - } - const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS"); - if (extra_cflags) { - cflags += string(" ") + string(extra_cflags); - } - - if (split) { - cflags += " -D__SPLIT__"; - } - -# ifdef WITH_NANOVDB - cflags += " -DWITH_NANOVDB"; -# endif - - return cflags; -} - -string CUDADevice::compile_kernel(const DeviceRequestedFeatures &requested_features, - const char *name, - const char *base, - bool force_ptx) -{ - /* Compute kernel name. */ - int major, minor; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); - cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); - - /* Attempt to use kernel provided with Blender. 
*/ - if (!use_adaptive_compilation()) { - if (!force_ptx) { - const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor)); - VLOG(1) << "Testing for pre-compiled kernel " << cubin << "."; - if (path_exists(cubin)) { - VLOG(1) << "Using precompiled kernel."; - return cubin; - } - } - - /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */ - int ptx_major = major, ptx_minor = minor; - while (ptx_major >= 3) { - const string ptx = path_get( - string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor)); - VLOG(1) << "Testing for pre-compiled kernel " << ptx << "."; - if (path_exists(ptx)) { - VLOG(1) << "Using precompiled kernel."; - return ptx; - } - - if (ptx_minor > 0) { - ptx_minor--; - } - else { - ptx_major--; - ptx_minor = 9; - } - } - } - - /* Try to use locally compiled kernel. */ - string source_path = path_get("source"); - const string source_md5 = path_files_md5_hash(source_path); - - /* We include cflags into md5 so changing cuda toolkit or changing other - * compiler command line arguments makes sure cubin gets re-built. - */ - string common_cflags = compile_kernel_get_common_cflags( - requested_features, strstr(name, "filter") != NULL, strstr(name, "split") != NULL); - const string kernel_md5 = util_md5_string(source_md5 + common_cflags); - - const char *const kernel_ext = force_ptx ? "ptx" : "cubin"; - const char *const kernel_arch = force_ptx ? 
"compute" : "sm"; - const string cubin_file = string_printf( - "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext); - const string cubin = path_cache_get(path_join("kernels", cubin_file)); - VLOG(1) << "Testing for locally compiled kernel " << cubin << "."; - if (path_exists(cubin)) { - VLOG(1) << "Using locally compiled kernel."; - return cubin; - } - -# ifdef _WIN32 - if (!use_adaptive_compilation() && have_precompiled_kernels()) { - if (major < 3) { - set_error( - string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. " - "Your GPU is not supported.", - major, - minor)); - } - else { - set_error( - string_printf("CUDA binary kernel for this graphics card compute " - "capability (%d.%d) not found.", - major, - minor)); - } - return string(); - } -# endif - - /* Compile. */ - const char *const nvcc = cuewCompilerPath(); - if (nvcc == NULL) { - set_error( - "CUDA nvcc compiler not found. " - "Install CUDA toolkit in default location."); - return string(); - } - - const int nvcc_cuda_version = cuewCompilerVersion(); - VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << "."; - if (nvcc_cuda_version < 101) { - printf( - "Unsupported CUDA version %d.%d detected, " - "you need CUDA 10.1 or newer.\n", - nvcc_cuda_version / 10, - nvcc_cuda_version % 10); - return string(); - } - else if (!(nvcc_cuda_version == 101 || nvcc_cuda_version == 102 || nvcc_cuda_version == 111 || - nvcc_cuda_version == 112 || nvcc_cuda_version == 113 || nvcc_cuda_version == 114)) { - printf( - "CUDA version %d.%d detected, build may succeed but only " - "CUDA 10.1 to 11.4 are officially supported.\n", - nvcc_cuda_version / 10, - nvcc_cuda_version % 10); - } - - double starttime = time_dt(); - - path_create_directories(cubin); - - source_path = path_join(path_join(source_path, "kernel"), - path_join("kernels", path_join(base, string_printf("%s.cu", name)))); - - string command = string_printf( - "\"%s\" 
" - "-arch=%s_%d%d " - "--%s \"%s\" " - "-o \"%s\" " - "%s", - nvcc, - kernel_arch, - major, - minor, - kernel_ext, - source_path.c_str(), - cubin.c_str(), - common_cflags.c_str()); - - printf("Compiling CUDA kernel ...\n%s\n", command.c_str()); - -# ifdef _WIN32 - command = "call " + command; -# endif - if (system(command.c_str()) != 0) { - set_error( - "Failed to execute compilation command, " - "see console for details."); - return string(); - } - - /* Verify if compilation succeeded */ - if (!path_exists(cubin)) { - set_error( - "CUDA kernel compilation failed, " - "see console for details."); - return string(); - } - - printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime); - - return cubin; -} - -bool CUDADevice::load_kernels(const DeviceRequestedFeatures &requested_features) -{ - /* TODO(sergey): Support kernels re-load for CUDA devices. - * - * Currently re-loading kernel will invalidate memory pointers, - * causing problems in cuCtxSynchronize. - */ - if (cuFilterModule && cuModule) { - VLOG(1) << "Skipping kernel reload, not currently supported."; - return true; - } - - /* check if cuda init succeeded */ - if (cuContext == 0) - return false; - - /* check if GPU is supported */ - if (!support_device(requested_features)) - return false; - - /* get kernel */ - const char *kernel_name = use_split_kernel() ? 
"kernel_split" : "kernel"; - string cubin = compile_kernel(requested_features, kernel_name); - if (cubin.empty()) - return false; - - const char *filter_name = "filter"; - string filter_cubin = compile_kernel(requested_features, filter_name); - if (filter_cubin.empty()) - return false; - - /* open module */ - CUDAContextScope scope(this); - - string cubin_data; - CUresult result; - - if (path_read_text(cubin, cubin_data)) - result = cuModuleLoadData(&cuModule, cubin_data.c_str()); - else - result = CUDA_ERROR_FILE_NOT_FOUND; - - if (result != CUDA_SUCCESS) - set_error(string_printf( - "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result))); - - if (path_read_text(filter_cubin, cubin_data)) - result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str()); - else - result = CUDA_ERROR_FILE_NOT_FOUND; - - if (result != CUDA_SUCCESS) - set_error(string_printf("Failed to load CUDA kernel from '%s' (%s)", - filter_cubin.c_str(), - cuewErrorString(result))); - - if (result == CUDA_SUCCESS) { - reserve_local_memory(requested_features); - } - - load_functions(); - - return (result == CUDA_SUCCESS); -} - -void CUDADevice::load_functions() -{ - /* TODO: load all functions here. 
*/ - if (functions.loaded) { - return; - } - functions.loaded = true; - - cuda_assert(cuModuleGetFunction( - &functions.adaptive_stopping, cuModule, "kernel_cuda_adaptive_stopping")); - cuda_assert(cuModuleGetFunction( - &functions.adaptive_filter_x, cuModule, "kernel_cuda_adaptive_filter_x")); - cuda_assert(cuModuleGetFunction( - &functions.adaptive_filter_y, cuModule, "kernel_cuda_adaptive_filter_y")); - cuda_assert(cuModuleGetFunction( - &functions.adaptive_scale_samples, cuModule, "kernel_cuda_adaptive_scale_samples")); - - cuda_assert(cuFuncSetCacheConfig(functions.adaptive_stopping, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_x, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_y, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(functions.adaptive_scale_samples, CU_FUNC_CACHE_PREFER_L1)); - - int unused_min_blocks; - cuda_assert(cuOccupancyMaxPotentialBlockSize(&unused_min_blocks, - &functions.adaptive_num_threads_per_block, - functions.adaptive_scale_samples, - NULL, - 0, - 0)); -} - -void CUDADevice::reserve_local_memory(const DeviceRequestedFeatures &requested_features) -{ - if (use_split_kernel()) { - /* Split kernel mostly uses global memory and adaptive compilation, - * difficult to predict how much is needed currently. */ - return; - } - - /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory - * needed for kernel launches, so that we can reliably figure out when - * to allocate scene data in mapped host memory. */ - CUDAContextScope scope(this); - - size_t total = 0, free_before = 0, free_after = 0; - cuMemGetInfo(&free_before, &total); - - /* Get kernel function. 
*/ - CUfunction cuRender; - - if (requested_features.use_baking) { - cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake")); - } - else if (requested_features.use_integrator_branched) { - cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace")); - } - else { - cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace")); - } - - cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1)); - - int min_blocks, num_threads_per_block; - cuda_assert( - cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0)); - - /* Launch kernel, using just 1 block appears sufficient to reserve - * memory for all multiprocessors. It would be good to do this in - * parallel for the multi GPU case still to make it faster. */ - CUdeviceptr d_work_tiles = 0; - uint total_work_size = 0; - - void *args[] = {&d_work_tiles, &total_work_size}; - - cuda_assert(cuLaunchKernel(cuRender, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); - - cuda_assert(cuCtxSynchronize()); - - cuMemGetInfo(&free_after, &total); - VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after) - << " bytes. (" << string_human_readable_size(free_before - free_after) << ")"; - -# if 0 - /* For testing mapped host memory, fill up device memory. */ - const size_t keep_mb = 1024; - - while (free_after > keep_mb * 1024 * 1024LL) { - CUdeviceptr tmp; - cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL)); - cuMemGetInfo(&free_after, &total); - } -# endif -} - -void CUDADevice::init_host_memory() -{ - /* Limit amount of host mapped memory, because allocating too much can - * cause system instability. Leave at least half or 4 GB of system - * memory free, whichever is smaller. 
*/ - size_t default_limit = 4 * 1024 * 1024 * 1024LL; - size_t system_ram = system_physical_ram(); - - if (system_ram > 0) { - if (system_ram / 2 > default_limit) { - map_host_limit = system_ram - default_limit; - } - else { - map_host_limit = system_ram / 2; - } - } - else { - VLOG(1) << "Mapped host memory disabled, failed to get system RAM"; - map_host_limit = 0; - } - - /* Amount of device memory to keep is free after texture memory - * and working memory allocations respectively. We set the working - * memory limit headroom lower so that some space is left after all - * texture memory allocations. */ - device_working_headroom = 32 * 1024 * 1024LL; // 32MB - device_texture_headroom = 128 * 1024 * 1024LL; // 128MB - - VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit) - << " bytes. (" << string_human_readable_size(map_host_limit) << ")"; -} - -void CUDADevice::load_texture_info() -{ - if (need_texture_info) { - /* Unset flag before copying, so this does not loop indefinitely if the copy below calls - * into 'move_textures_to_host' (which calls 'load_texture_info' again). */ - need_texture_info = false; - texture_info.copy_to_device(); - } -} - -void CUDADevice::move_textures_to_host(size_t size, bool for_texture) -{ - /* Break out of recursive call, which can happen when moving memory on a multi device. */ - static bool any_device_moving_textures_to_host = false; - if (any_device_moving_textures_to_host) { - return; - } - - /* Signal to reallocate textures in host memory only. */ - move_texture_to_host = true; - - while (size > 0) { - /* Find suitable memory allocation to move. 
*/ - device_memory *max_mem = NULL; - size_t max_size = 0; - bool max_is_image = false; - - thread_scoped_lock lock(cuda_mem_map_mutex); - foreach (CUDAMemMap::value_type &pair, cuda_mem_map) { - device_memory &mem = *pair.first; - CUDAMem *cmem = &pair.second; - - /* Can only move textures allocated on this device (and not those from peer devices). - * And need to ignore memory that is already on the host. */ - if (!mem.is_resident(this) || cmem->use_mapped_host) { - continue; - } - - bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && - (&mem != &texture_info); - bool is_image = is_texture && (mem.data_height > 1); - - /* Can't move this type of memory. */ - if (!is_texture || cmem->array) { - continue; - } - - /* For other textures, only move image textures. */ - if (for_texture && !is_image) { - continue; - } - - /* Try to move largest allocation, prefer moving images. */ - if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) { - max_is_image = is_image; - max_size = mem.device_size; - max_mem = &mem; - } - } - lock.unlock(); - - /* Move to host memory. This part is mutex protected since - * multiple CUDA devices could be moving the memory. The - * first one will do it, and the rest will adopt the pointer. */ - if (max_mem) { - VLOG(1) << "Move memory from device to host: " << max_mem->name; - - static thread_mutex move_mutex; - thread_scoped_lock lock(move_mutex); - - any_device_moving_textures_to_host = true; - - /* Potentially need to call back into multi device, so pointer mapping - * and peer devices are updated. This is also necessary since the device - * pointer may just be a key here, so cannot be accessed and freed directly. 
- * Unfortunately it does mean that memory is reallocated on all other - * devices as well, which is potentially dangerous when still in use (since - * a thread rendering on another devices would only be caught in this mutex - * if it so happens to do an allocation at the same time as well. */ - max_mem->device_copy_to(); - size = (max_size >= size) ? 0 : size - max_size; - - any_device_moving_textures_to_host = false; - } - else { - break; - } - } - - /* Unset flag before texture info is reloaded, since it should stay in device memory. */ - move_texture_to_host = false; - - /* Update texture info array with new pointers. */ - load_texture_info(); -} - -CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding) -{ - CUDAContextScope scope(this); - - CUdeviceptr device_pointer = 0; - size_t size = mem.memory_size() + pitch_padding; - - CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY; - const char *status = ""; - - /* First try allocating in device memory, respecting headroom. We make - * an exception for texture info. It is small and frequently accessed, - * so treat it as working memory. - * - * If there is not enough room for working memory, we will try to move - * textures to host memory, assuming the performance impact would have - * been worse for working memory. */ - bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info); - bool is_image = is_texture && (mem.data_height > 1); - - size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom; - - size_t total = 0, free = 0; - cuMemGetInfo(&free, &total); - - /* Move textures to host memory if needed. */ - if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) { - move_textures_to_host(size + headroom - free, is_texture); - cuMemGetInfo(&free, &total); - } - - /* Allocate in device memory. 
*/ - if (!move_texture_to_host && (size + headroom) < free) { - mem_alloc_result = cuMemAlloc(&device_pointer, size); - if (mem_alloc_result == CUDA_SUCCESS) { - status = " in device memory"; - } - } - - /* Fall back to mapped host memory if needed and possible. */ - - void *shared_pointer = 0; - - if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) { - if (mem.shared_pointer) { - /* Another device already allocated host memory. */ - mem_alloc_result = CUDA_SUCCESS; - shared_pointer = mem.shared_pointer; - } - else if (map_host_used + size < map_host_limit) { - /* Allocate host memory ourselves. */ - mem_alloc_result = cuMemHostAlloc( - &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED); - - assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) || - (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0)); - } - - if (mem_alloc_result == CUDA_SUCCESS) { - cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0)); - map_host_used += size; - status = " in host memory"; - } - } - - if (mem_alloc_result != CUDA_SUCCESS) { - if (mem.type == MEM_DEVICE_ONLY) { - status = " failed, out of device memory"; - set_error("System is out of GPU memory"); - } - else { - status = " failed, out of device and host memory"; - set_error("System is out of GPU and shared host memory"); - } - } - - if (mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")" << status; - } - - mem.device_pointer = (device_ptr)device_pointer; - mem.device_size = size; - stats.mem_alloc(size); - - if (!mem.device_pointer) { - return NULL; - } - - /* Insert into map of allocations. */ - thread_scoped_lock lock(cuda_mem_map_mutex); - CUDAMem *cmem = &cuda_mem_map[&mem]; - if (shared_pointer != 0) { - /* Replace host pointer with our host allocation. 
Only works if - * CUDA memory layout is the same and has no pitch padding. Also - * does not work if we move textures to host during a render, - * since other devices might be using the memory. */ - - if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer && - mem.host_pointer != shared_pointer) { - memcpy(shared_pointer, mem.host_pointer, size); - - /* A Call to device_memory::host_free() should be preceded by - * a call to device_memory::device_free() for host memory - * allocated by a device to be handled properly. Two exceptions - * are here and a call in OptiXDevice::generic_alloc(), where - * the current host memory can be assumed to be allocated by - * device_memory::host_alloc(), not by a device */ - - mem.host_free(); - mem.host_pointer = shared_pointer; - } - mem.shared_pointer = shared_pointer; - mem.shared_counter++; - cmem->use_mapped_host = true; - } - else { - cmem->use_mapped_host = false; - } - - return cmem; -} - -void CUDADevice::generic_copy_to(device_memory &mem) -{ - if (!mem.host_pointer || !mem.device_pointer) { - return; - } - - /* If use_mapped_host of mem is false, the current device only uses device memory allocated by - * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from - * mem.host_pointer. */ - thread_scoped_lock lock(cuda_mem_map_mutex); - if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { - const CUDAContextScope scope(this); - cuda_assert( - cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size())); - } -} - -void CUDADevice::generic_free(device_memory &mem) -{ - if (mem.device_pointer) { - CUDAContextScope scope(this); - thread_scoped_lock lock(cuda_mem_map_mutex); - const CUDAMem &cmem = cuda_mem_map[&mem]; - - /* If cmem.use_mapped_host is true, reference counting is used - * to safely free a mapped host memory. 
*/ - - if (cmem.use_mapped_host) { - assert(mem.shared_pointer); - if (mem.shared_pointer) { - assert(mem.shared_counter > 0); - if (--mem.shared_counter == 0) { - if (mem.host_pointer == mem.shared_pointer) { - mem.host_pointer = 0; - } - cuMemFreeHost(mem.shared_pointer); - mem.shared_pointer = 0; - } - } - map_host_used -= mem.device_size; - } - else { - /* Free device memory. */ - cuda_assert(cuMemFree(mem.device_pointer)); - } - - stats.mem_free(mem.device_size); - mem.device_pointer = 0; - mem.device_size = 0; - - cuda_mem_map.erase(cuda_mem_map.find(&mem)); - } -} - -void CUDADevice::mem_alloc(device_memory &mem) -{ - if (mem.type == MEM_PIXELS && !background) { - pixels_alloc(mem); - } - else if (mem.type == MEM_TEXTURE) { - assert(!"mem_alloc not supported for textures."); - } - else if (mem.type == MEM_GLOBAL) { - assert(!"mem_alloc not supported for global memory."); - } - else { - generic_alloc(mem); - } -} - -void CUDADevice::mem_copy_to(device_memory &mem) -{ - if (mem.type == MEM_PIXELS) { - assert(!"mem_copy_to not supported for pixels."); - } - else if (mem.type == MEM_GLOBAL) { - global_free(mem); - global_alloc(mem); - } - else if (mem.type == MEM_TEXTURE) { - tex_free((device_texture &)mem); - tex_alloc((device_texture &)mem); - } - else { - if (!mem.device_pointer) { - generic_alloc(mem); - } - generic_copy_to(mem); - } -} - -void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem) -{ - if (mem.type == MEM_PIXELS && !background) { - pixels_copy_from(mem, y, w, h); - } - else if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) { - assert(!"mem_copy_from not supported for textures."); - } - else if (mem.host_pointer) { - const size_t size = elem * w * h; - const size_t offset = elem * y * w; - - if (mem.device_pointer) { - const CUDAContextScope scope(this); - cuda_assert(cuMemcpyDtoH( - (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size)); - } - else { - memset((char *)mem.host_pointer + 
offset, 0, size); - } - } -} - -void CUDADevice::mem_zero(device_memory &mem) -{ - if (!mem.device_pointer) { - mem_alloc(mem); - } - if (!mem.device_pointer) { - return; - } - - /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory - * regardless of mem.host_pointer and mem.shared_pointer. */ - thread_scoped_lock lock(cuda_mem_map_mutex); - if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { - const CUDAContextScope scope(this); - cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size())); - } - else if (mem.host_pointer) { - memset(mem.host_pointer, 0, mem.memory_size()); - } -} - -void CUDADevice::mem_free(device_memory &mem) -{ - if (mem.type == MEM_PIXELS && !background) { - pixels_free(mem); - } - else if (mem.type == MEM_GLOBAL) { - global_free(mem); - } - else if (mem.type == MEM_TEXTURE) { - tex_free((device_texture &)mem); - } - else { - generic_free(mem); - } -} - -device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) -{ - return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); -} - -void CUDADevice::const_copy_to(const char *name, void *host, size_t size) -{ - CUDAContextScope scope(this); - CUdeviceptr mem; - size_t bytes; - - cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name)); - // assert(bytes == size); - cuda_assert(cuMemcpyHtoD(mem, host, size)); -} - -void CUDADevice::global_alloc(device_memory &mem) -{ - if (mem.is_resident(this)) { - generic_alloc(mem); - generic_copy_to(mem); - } - - const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer)); -} - -void CUDADevice::global_free(device_memory &mem) -{ - if (mem.is_resident(this) && mem.device_pointer) { - generic_free(mem); - } -} - -void CUDADevice::tex_alloc(device_texture &mem) -{ - CUDAContextScope scope(this); - - /* General variables for both architectures */ - string bind_name = mem.name; - size_t dsize = 
datatype_size(mem.data_type); - size_t size = mem.memory_size(); - - CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; - switch (mem.info.extension) { - case EXTENSION_REPEAT: - address_mode = CU_TR_ADDRESS_MODE_WRAP; - break; - case EXTENSION_EXTEND: - address_mode = CU_TR_ADDRESS_MODE_CLAMP; - break; - case EXTENSION_CLIP: - address_mode = CU_TR_ADDRESS_MODE_BORDER; - break; - default: - assert(0); - break; - } - - CUfilter_mode filter_mode; - if (mem.info.interpolation == INTERPOLATION_CLOSEST) { - filter_mode = CU_TR_FILTER_MODE_POINT; - } - else { - filter_mode = CU_TR_FILTER_MODE_LINEAR; - } - - /* Image Texture Storage */ - CUarray_format_enum format; - switch (mem.data_type) { - case TYPE_UCHAR: - format = CU_AD_FORMAT_UNSIGNED_INT8; - break; - case TYPE_UINT16: - format = CU_AD_FORMAT_UNSIGNED_INT16; - break; - case TYPE_UINT: - format = CU_AD_FORMAT_UNSIGNED_INT32; - break; - case TYPE_INT: - format = CU_AD_FORMAT_SIGNED_INT32; - break; - case TYPE_FLOAT: - format = CU_AD_FORMAT_FLOAT; - break; - case TYPE_HALF: - format = CU_AD_FORMAT_HALF; - break; - default: - assert(0); - return; - } - - CUDAMem *cmem = NULL; - CUarray array_3d = NULL; - size_t src_pitch = mem.data_width * dsize * mem.data_elements; - size_t dst_pitch = src_pitch; - - if (!mem.is_resident(this)) { - thread_scoped_lock lock(cuda_mem_map_mutex); - cmem = &cuda_mem_map[&mem]; - cmem->texobject = 0; - - if (mem.data_depth > 1) { - array_3d = (CUarray)mem.device_pointer; - cmem->array = array_3d; - } - else if (mem.data_height > 0) { - dst_pitch = align_up(src_pitch, pitch_alignment); - } - } - else if (mem.data_depth > 1) { - /* 3D texture using array, there is no API for linear memory. 
*/ - CUDA_ARRAY3D_DESCRIPTOR desc; - - desc.Width = mem.data_width; - desc.Height = mem.data_height; - desc.Depth = mem.data_depth; - desc.Format = format; - desc.NumChannels = mem.data_elements; - desc.Flags = 0; - - VLOG(1) << "Array 3D allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - cuda_assert(cuArray3DCreate(&array_3d, &desc)); - - if (!array_3d) { - return; - } - - CUDA_MEMCPY3D param; - memset(¶m, 0, sizeof(param)); - param.dstMemoryType = CU_MEMORYTYPE_ARRAY; - param.dstArray = array_3d; - param.srcMemoryType = CU_MEMORYTYPE_HOST; - param.srcHost = mem.host_pointer; - param.srcPitch = src_pitch; - param.WidthInBytes = param.srcPitch; - param.Height = mem.data_height; - param.Depth = mem.data_depth; - - cuda_assert(cuMemcpy3D(¶m)); - - mem.device_pointer = (device_ptr)array_3d; - mem.device_size = size; - stats.mem_alloc(size); - - thread_scoped_lock lock(cuda_mem_map_mutex); - cmem = &cuda_mem_map[&mem]; - cmem->texobject = 0; - cmem->array = array_3d; - } - else if (mem.data_height > 0) { - /* 2D texture, using pitch aligned linear memory. */ - dst_pitch = align_up(src_pitch, pitch_alignment); - size_t dst_size = dst_pitch * mem.data_height; - - cmem = generic_alloc(mem, dst_size - mem.memory_size()); - if (!cmem) { - return; - } - - CUDA_MEMCPY2D param; - memset(¶m, 0, sizeof(param)); - param.dstMemoryType = CU_MEMORYTYPE_DEVICE; - param.dstDevice = mem.device_pointer; - param.dstPitch = dst_pitch; - param.srcMemoryType = CU_MEMORYTYPE_HOST; - param.srcHost = mem.host_pointer; - param.srcPitch = src_pitch; - param.WidthInBytes = param.srcPitch; - param.Height = mem.data_height; - - cuda_assert(cuMemcpy2DUnaligned(¶m)); - } - else { - /* 1D texture, using linear memory. 
*/ - cmem = generic_alloc(mem); - if (!cmem) { - return; - } - - cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size)); - } - - /* Resize once */ - const uint slot = mem.slot; - if (slot >= texture_info.size()) { - /* Allocate some slots in advance, to reduce amount - * of re-allocations. */ - texture_info.resize(slot + 128); - } - - /* Set Mapping and tag that we need to (re-)upload to device */ - texture_info[slot] = mem.info; - need_texture_info = true; - - if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT && - mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { - /* Kepler+, bindless textures. */ - CUDA_RESOURCE_DESC resDesc; - memset(&resDesc, 0, sizeof(resDesc)); - - if (array_3d) { - resDesc.resType = CU_RESOURCE_TYPE_ARRAY; - resDesc.res.array.hArray = array_3d; - resDesc.flags = 0; - } - else if (mem.data_height > 0) { - resDesc.resType = CU_RESOURCE_TYPE_PITCH2D; - resDesc.res.pitch2D.devPtr = mem.device_pointer; - resDesc.res.pitch2D.format = format; - resDesc.res.pitch2D.numChannels = mem.data_elements; - resDesc.res.pitch2D.height = mem.data_height; - resDesc.res.pitch2D.width = mem.data_width; - resDesc.res.pitch2D.pitchInBytes = dst_pitch; - } - else { - resDesc.resType = CU_RESOURCE_TYPE_LINEAR; - resDesc.res.linear.devPtr = mem.device_pointer; - resDesc.res.linear.format = format; - resDesc.res.linear.numChannels = mem.data_elements; - resDesc.res.linear.sizeInBytes = mem.device_size; - } - - CUDA_TEXTURE_DESC texDesc; - memset(&texDesc, 0, sizeof(texDesc)); - texDesc.addressMode[0] = address_mode; - texDesc.addressMode[1] = address_mode; - texDesc.addressMode[2] = address_mode; - texDesc.filterMode = filter_mode; - texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; - - thread_scoped_lock lock(cuda_mem_map_mutex); - cmem = &cuda_mem_map[&mem]; - - cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL)); - - texture_info[slot].data = (uint64_t)cmem->texobject; - } - else { - texture_info[slot].data = 
(uint64_t)mem.device_pointer; - } -} - -void CUDADevice::tex_free(device_texture &mem) -{ - if (mem.device_pointer) { - CUDAContextScope scope(this); - thread_scoped_lock lock(cuda_mem_map_mutex); - const CUDAMem &cmem = cuda_mem_map[&mem]; - - if (cmem.texobject) { - /* Free bindless texture. */ - cuTexObjectDestroy(cmem.texobject); - } - - if (!mem.is_resident(this)) { - /* Do not free memory here, since it was allocated on a different device. */ - cuda_mem_map.erase(cuda_mem_map.find(&mem)); - } - else if (cmem.array) { - /* Free array. */ - cuArrayDestroy(cmem.array); - stats.mem_free(mem.device_size); - mem.device_pointer = 0; - mem.device_size = 0; - - cuda_mem_map.erase(cuda_mem_map.find(&mem)); - } - else { - lock.unlock(); - generic_free(mem); - } - } -} - -# define CUDA_GET_BLOCKSIZE(func, w, h) \ - int threads_per_block; \ - cuda_assert( \ - cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ - int threads = (int)sqrt((float)threads_per_block); \ - int xblocks = ((w) + threads - 1) / threads; \ - int yblocks = ((h) + threads - 1) / threads; - -# define CUDA_LAUNCH_KERNEL(func, args) \ - cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0)); - -/* Similar as above, but for 1-dimensional blocks. 
*/ -# define CUDA_GET_BLOCKSIZE_1D(func, w, h) \ - int threads_per_block; \ - cuda_assert( \ - cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ - int xblocks = ((w) + threads_per_block - 1) / threads_per_block; \ - int yblocks = h; - -# define CUDA_LAUNCH_KERNEL_1D(func, args) \ - cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads_per_block, 1, 1, 0, 0, args, 0)); - -bool CUDADevice::denoising_non_local_means(device_ptr image_ptr, - device_ptr guide_ptr, - device_ptr variance_ptr, - device_ptr out_ptr, - DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - int stride = task->buffer.stride; - int w = task->buffer.width; - int h = task->buffer.h; - int r = task->nlm_state.r; - int f = task->nlm_state.f; - float a = task->nlm_state.a; - float k_2 = task->nlm_state.k_2; - - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2 * r + 1) * (2 * r + 1); - int channel_offset = task->nlm_state.is_color ? 
task->buffer.pass_stride : 0; - int frame_offset = 0; - - if (have_error()) - return false; - - CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer; - CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts; - CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts; - CUdeviceptr scale_ptr = 0; - - cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float) * pass_stride)); - cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float) * pass_stride)); - - { - CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput; - cuda_assert(cuModuleGetFunction( - &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); - cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); - cuda_assert(cuModuleGetFunction( - &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); - cuda_assert(cuModuleGetFunction( - &cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output")); - - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1)); - - CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts); - - void *calc_difference_args[] = {&guide_ptr, - &variance_ptr, - &scale_ptr, - &difference, - &w, - &h, - &stride, - &pass_stride, - &r, - &channel_offset, - &frame_offset, - &a, - &k_2}; - void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; - void *calc_weight_args[] = { - &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; - void *update_output_args[] = {&blurDifference, - &image_ptr, - &out_ptr, - &weightAccum, - &w, - &h, - &stride, - &pass_stride, - &channel_offset, - &r, - &f}; - - 
CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args); - } - - { - CUfunction cuNLMNormalize; - cuda_assert( - cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize")); - cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1)); - void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride}; - CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h); - CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args); - cuda_assert(cuCtxSynchronize()); - } - - return !have_error(); -} - -bool CUDADevice::denoising_construct_transform(DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterConstructTransform; - cuda_assert(cuModuleGetFunction( - &cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform")); - cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED)); - CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h); - - void *args[] = {&task->buffer.mem.device_pointer, - &task->tile_info_mem.device_pointer, - &task->storage.transform.device_pointer, - &task->storage.rank.device_pointer, - &task->filter_area, - &task->rect, - &task->radius, - &task->pca_threshold, - &task->buffer.pass_stride, - &task->buffer.frame_stride, - &task->buffer.use_time}; - CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -bool CUDADevice::denoising_accumulate(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - int r = task->radius; - int f = 4; - float a = 1.0f; - float k_2 = 
task->nlm_k_2; - - int w = task->reconstruction_state.source_w; - int h = task->reconstruction_state.source_h; - int stride = task->buffer.stride; - int frame_offset = frame * task->buffer.frame_stride; - int t = task->tile_info->frames[frame]; - - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2 * r + 1) * (2 * r + 1); - - if (have_error()) - return false; - - CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer; - CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts; - - CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian; - cuda_assert(cuModuleGetFunction( - &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); - cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); - cuda_assert( - cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); - cuda_assert(cuModuleGetFunction( - &cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); - - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); - - CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, - task->reconstruction_state.source_w * task->reconstruction_state.source_h, - num_shifts); - - void *calc_difference_args[] = {&color_ptr, - &color_variance_ptr, - &scale_ptr, - &difference, - &w, - &h, - &stride, - &pass_stride, - &r, - &pass_stride, - &frame_offset, - &a, - &k_2}; - void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; - void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; - void *construct_gramian_args[] = {&t, - &blurDifference, - 
&task->buffer.mem.device_pointer, - &task->storage.transform.device_pointer, - &task->storage.rank.device_pointer, - &task->storage.XtWX.device_pointer, - &task->storage.XtWY.device_pointer, - &task->reconstruction_state.filter_window, - &w, - &h, - &stride, - &pass_stride, - &r, - &f, - &frame_offset, - &task->buffer.use_time}; - - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -bool CUDADevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task) -{ - CUfunction cuFinalize; - cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); - cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); - void *finalize_args[] = {&output_ptr, - &task->storage.rank.device_pointer, - &task->storage.XtWX.device_pointer, - &task->storage.XtWY.device_pointer, - &task->filter_area, - &task->reconstruction_state.buffer_params.x, - &task->render_buffer.samples}; - CUDA_GET_BLOCKSIZE( - cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h); - CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -bool CUDADevice::denoising_combine_halves(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, - int4 rect, - DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterCombineHalves; - cuda_assert(cuModuleGetFunction( - &cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves")); - cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE( - cuFilterCombineHalves, task->rect.z 
- task->rect.x, task->rect.w - task->rect.y); - - void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r}; - CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -bool CUDADevice::denoising_divide_shadow(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, - DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterDivideShadow; - cuda_assert(cuModuleGetFunction( - &cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow")); - cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE( - cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - void *args[] = {&task->render_buffer.samples, - &task->tile_info_mem.device_pointer, - &a_ptr, - &b_ptr, - &sample_variance_ptr, - &sv_variance_ptr, - &buffer_variance_ptr, - &task->rect, - &task->render_buffer.pass_stride, - &task->render_buffer.offset}; - CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -bool CUDADevice::denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterGetFeature; - cuda_assert( - cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature")); - cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE(cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - void *args[] = {&task->render_buffer.samples, - &task->tile_info_mem.device_pointer, - &mean_offset, - &variance_offset, - &mean_ptr, - &variance_ptr, - &scale, - &task->rect, - 
&task->render_buffer.pass_stride, - &task->render_buffer.offset}; - CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -bool CUDADevice::denoising_write_feature(int out_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterWriteFeature; - cuda_assert(cuModuleGetFunction( - &cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature")); - cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w); - - void *args[] = {&task->render_buffer.samples, - &task->reconstruction_state.buffer_params, - &task->filter_area, - &from_ptr, - &buffer_ptr, - &out_offset, - &task->rect}; - CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -bool CUDADevice::denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task) -{ - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterDetectOutliers; - cuda_assert(cuModuleGetFunction( - &cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers")); - cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE( - cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - void *args[] = { - &image_ptr, &variance_ptr, &depth_ptr, &output_ptr, &task->rect, &task->buffer.pass_stride}; - - CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); -} - -void CUDADevice::denoise(RenderTile &rtile, DenoisingTask &denoising) -{ - denoising.functions.construct_transform = function_bind( - &CUDADevice::denoising_construct_transform, 
this, &denoising); - denoising.functions.accumulate = function_bind( - &CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); - denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising); - denoising.functions.divide_shadow = function_bind( - &CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.non_local_means = function_bind( - &CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); - denoising.functions.combine_halves = function_bind( - &CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - denoising.functions.get_feature = function_bind( - &CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.write_feature = function_bind( - &CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising); - denoising.functions.detect_outliers = function_bind( - &CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); - - denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); - denoising.render_buffer.samples = rtile.sample; - denoising.buffer.gpu_temporary_mem = true; - - denoising.run_denoising(rtile); -} - -void CUDADevice::adaptive_sampling_filter(uint filter_sample, - WorkTile *wtile, - CUdeviceptr d_wtile, - CUstream stream) -{ - const int num_threads_per_block = functions.adaptive_num_threads_per_block; - - /* These are a series of tiny kernels because there is no grid synchronization - * from within a kernel, so multiple kernel launches it is. 
*/ - uint total_work_size = wtile->h * wtile->w; - void *args2[] = {&d_wtile, &filter_sample, &total_work_size}; - uint num_blocks = divide_up(total_work_size, num_threads_per_block); - cuda_assert(cuLaunchKernel(functions.adaptive_stopping, - num_blocks, - 1, - 1, - num_threads_per_block, - 1, - 1, - 0, - stream, - args2, - 0)); - total_work_size = wtile->h; - num_blocks = divide_up(total_work_size, num_threads_per_block); - cuda_assert(cuLaunchKernel(functions.adaptive_filter_x, - num_blocks, - 1, - 1, - num_threads_per_block, - 1, - 1, - 0, - stream, - args2, - 0)); - total_work_size = wtile->w; - num_blocks = divide_up(total_work_size, num_threads_per_block); - cuda_assert(cuLaunchKernel(functions.adaptive_filter_y, - num_blocks, - 1, - 1, - num_threads_per_block, - 1, - 1, - 0, - stream, - args2, - 0)); -} - -void CUDADevice::adaptive_sampling_post(RenderTile &rtile, - WorkTile *wtile, - CUdeviceptr d_wtile, - CUstream stream) -{ - const int num_threads_per_block = functions.adaptive_num_threads_per_block; - uint total_work_size = wtile->h * wtile->w; - - void *args[] = {&d_wtile, &rtile.start_sample, &rtile.sample, &total_work_size}; - uint num_blocks = divide_up(total_work_size, num_threads_per_block); - cuda_assert(cuLaunchKernel(functions.adaptive_scale_samples, - num_blocks, - 1, - 1, - num_threads_per_block, - 1, - 1, - 0, - stream, - args, - 0)); -} - -void CUDADevice::render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles) -{ - scoped_timer timer(&rtile.buffers->render_time); - - if (have_error()) - return; - - CUDAContextScope scope(this); - CUfunction cuRender; - - /* Get kernel function. 
*/ - if (rtile.task == RenderTile::BAKE) { - cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake")); - } - else if (task.integrator_branched) { - cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace")); - } - else { - cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace")); - } - - if (have_error()) { - return; - } - - cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1)); - - /* Allocate work tile. */ - work_tiles.alloc(1); - - WorkTile *wtile = work_tiles.data(); - wtile->x = rtile.x; - wtile->y = rtile.y; - wtile->w = rtile.w; - wtile->h = rtile.h; - wtile->offset = rtile.offset; - wtile->stride = rtile.stride; - wtile->buffer = (float *)(CUdeviceptr)rtile.buffer; - - /* Prepare work size. More step samples render faster, but for now we - * remain conservative for GPUs connected to a display to avoid driver - * timeouts and display freezing. */ - int min_blocks, num_threads_per_block; - cuda_assert( - cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0)); - if (!info.display_device) { - min_blocks *= 8; - } - - uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); - - /* Render all samples. */ - int start_sample = rtile.start_sample; - int end_sample = rtile.start_sample + rtile.num_samples; - - for (int sample = start_sample; sample < end_sample;) { - /* Setup and copy work tile to device. 
*/ - wtile->start_sample = sample; - wtile->num_samples = step_samples; - if (task.adaptive_sampling.use) { - wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples); - } - wtile->num_samples = min(wtile->num_samples, end_sample - sample); - work_tiles.copy_to_device(); - - CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer; - uint total_work_size = wtile->w * wtile->h * wtile->num_samples; - uint num_blocks = divide_up(total_work_size, num_threads_per_block); - - /* Launch kernel. */ - void *args[] = {&d_work_tiles, &total_work_size}; - - cuda_assert( - cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); - - /* Run the adaptive sampling kernels at selected samples aligned to step samples. */ - uint filter_sample = sample + wtile->num_samples - 1; - if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { - adaptive_sampling_filter(filter_sample, wtile, d_work_tiles); - } - - cuda_assert(cuCtxSynchronize()); - - /* Update progress. */ - sample += wtile->num_samples; - rtile.sample = sample; - task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); - - if (task.get_cancel()) { - if (task.need_finish_queue == false) - break; - } - } - - /* Finalize adaptive sampling. */ - if (task.adaptive_sampling.use) { - CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer; - adaptive_sampling_post(rtile, wtile, d_work_tiles); - cuda_assert(cuCtxSynchronize()); - task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); - } -} - -void CUDADevice::film_convert(DeviceTask &task, - device_ptr buffer, - device_ptr rgba_byte, - device_ptr rgba_half) -{ - if (have_error()) - return; - - CUDAContextScope scope(this); - - CUfunction cuFilmConvert; - CUdeviceptr d_rgba = map_pixels((rgba_byte) ? 
rgba_byte : rgba_half); - CUdeviceptr d_buffer = (CUdeviceptr)buffer; - - /* get kernel function */ - if (rgba_half) { - cuda_assert( - cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float")); - } - else { - cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte")); - } - - float sample_scale = 1.0f / (task.sample + 1); - - /* pass in parameters */ - void *args[] = {&d_rgba, - &d_buffer, - &sample_scale, - &task.x, - &task.y, - &task.w, - &task.h, - &task.offset, - &task.stride}; - - /* launch kernel */ - int threads_per_block; - cuda_assert(cuFuncGetAttribute( - &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert)); - - int xthreads = (int)sqrt(threads_per_block); - int ythreads = (int)sqrt(threads_per_block); - int xblocks = (task.w + xthreads - 1) / xthreads; - int yblocks = (task.h + ythreads - 1) / ythreads; - - cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1)); - - cuda_assert(cuLaunchKernel(cuFilmConvert, - xblocks, - yblocks, - 1, /* blocks */ - xthreads, - ythreads, - 1, /* threads */ - 0, - 0, - args, - 0)); - - unmap_pixels((rgba_byte) ? 
rgba_byte : rgba_half); - - cuda_assert(cuCtxSynchronize()); -} - -void CUDADevice::shader(DeviceTask &task) -{ - if (have_error()) - return; - - CUDAContextScope scope(this); - - CUfunction cuShader; - CUdeviceptr d_input = (CUdeviceptr)task.shader_input; - CUdeviceptr d_output = (CUdeviceptr)task.shader_output; - - /* get kernel function */ - if (task.shader_eval_type == SHADER_EVAL_DISPLACE) { - cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace")); - } - else { - cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background")); - } - - /* do tasks in smaller chunks, so we can cancel it */ - const int shader_chunk_size = 65536; - const int start = task.shader_x; - const int end = task.shader_x + task.shader_w; - int offset = task.offset; - - bool canceled = false; - for (int sample = 0; sample < task.num_samples && !canceled; sample++) { - for (int shader_x = start; shader_x < end; shader_x += shader_chunk_size) { - int shader_w = min(shader_chunk_size, end - shader_x); - - /* pass in parameters */ - void *args[8]; - int arg = 0; - args[arg++] = &d_input; - args[arg++] = &d_output; - args[arg++] = &task.shader_eval_type; - if (task.shader_eval_type >= SHADER_EVAL_BAKE) { - args[arg++] = &task.shader_filter; - } - args[arg++] = &shader_x; - args[arg++] = &shader_w; - args[arg++] = &offset; - args[arg++] = &sample; - - /* launch kernel */ - int threads_per_block; - cuda_assert(cuFuncGetAttribute( - &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader)); - - int xblocks = (shader_w + threads_per_block - 1) / threads_per_block; - - cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuLaunchKernel(cuShader, - xblocks, - 1, - 1, /* blocks */ - threads_per_block, - 1, - 1, /* threads */ - 0, - 0, - args, - 0)); - - cuda_assert(cuCtxSynchronize()); - - if (task.get_cancel()) { - canceled = true; - break; - } - } - - task.update_progress(NULL); - } -} - -CUdeviceptr 
CUDADevice::map_pixels(device_ptr mem) -{ - if (!background) { - PixelMem pmem = pixel_mem_map[mem]; - CUdeviceptr buffer; - - size_t bytes; - cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0)); - cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource)); - - return buffer; - } - - return (CUdeviceptr)mem; -} - -void CUDADevice::unmap_pixels(device_ptr mem) -{ - if (!background) { - PixelMem pmem = pixel_mem_map[mem]; - - cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0)); - } -} - -void CUDADevice::pixels_alloc(device_memory &mem) -{ - PixelMem pmem; - - pmem.w = mem.data_width; - pmem.h = mem.data_height; - - CUDAContextScope scope(this); - - glGenBuffers(1, &pmem.cuPBO); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - if (mem.data_type == TYPE_HALF) - glBufferData( - GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(GLhalf) * 4, NULL, GL_DYNAMIC_DRAW); - else - glBufferData( - GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(uint8_t) * 4, NULL, GL_DYNAMIC_DRAW); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - - glActiveTexture(GL_TEXTURE0); - glGenTextures(1, &pmem.cuTexId); - glBindTexture(GL_TEXTURE_2D, pmem.cuTexId); - if (mem.data_type == TYPE_HALF) - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL); - else - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - glBindTexture(GL_TEXTURE_2D, 0); - - CUresult result = cuGraphicsGLRegisterBuffer( - &pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE); - - if (result == CUDA_SUCCESS) { - mem.device_pointer = pmem.cuTexId; - pixel_mem_map[mem.device_pointer] = pmem; - - mem.device_size = mem.memory_size(); - stats.mem_alloc(mem.device_size); - - return; - } - else { - /* failed to register buffer, fallback to no interop 
*/ - glDeleteBuffers(1, &pmem.cuPBO); - glDeleteTextures(1, &pmem.cuTexId); - - background = true; - } -} - -void CUDADevice::pixels_copy_from(device_memory &mem, int y, int w, int h) -{ - PixelMem pmem = pixel_mem_map[mem.device_pointer]; - - CUDAContextScope scope(this); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - uchar *pixels = (uchar *)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY); - size_t offset = sizeof(uchar) * 4 * y * w; - memcpy((uchar *)mem.host_pointer + offset, pixels + offset, sizeof(uchar) * 4 * w * h); - glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); -} - -void CUDADevice::pixels_free(device_memory &mem) -{ - if (mem.device_pointer) { - PixelMem pmem = pixel_mem_map[mem.device_pointer]; - - CUDAContextScope scope(this); - - cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource)); - glDeleteBuffers(1, &pmem.cuPBO); - glDeleteTextures(1, &pmem.cuTexId); - - pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer)); - mem.device_pointer = 0; - - stats.mem_free(mem.device_size); - mem.device_size = 0; - } -} - -void CUDADevice::draw_pixels(device_memory &mem, - int y, - int w, - int h, - int width, - int height, - int dx, - int dy, - int dw, - int dh, - bool transparent, - const DeviceDrawParams &draw_params) -{ - assert(mem.type == MEM_PIXELS); - - if (!background) { - const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL); - PixelMem pmem = pixel_mem_map[mem.device_pointer]; - float *vpointer; - - CUDAContextScope scope(this); - - /* for multi devices, this assumes the inefficient method that we allocate - * all pixels on the device even though we only render to a subset */ - size_t offset = 4 * y * w; - - if (mem.data_type == TYPE_HALF) - offset *= sizeof(GLhalf); - else - offset *= sizeof(uint8_t); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - glActiveTexture(GL_TEXTURE0); - glBindTexture(GL_TEXTURE_2D, pmem.cuTexId); - if (mem.data_type == TYPE_HALF) 
{ - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void *)offset); - } - else { - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void *)offset); - } - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - - if (transparent) { - glEnable(GL_BLEND); - glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA); - } - - GLint shader_program; - if (use_fallback_shader) { - if (!bind_fallback_display_space_shader(dw, dh)) { - return; - } - shader_program = fallback_shader_program; - } - else { - draw_params.bind_display_space_shader_cb(); - glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program); - } - - if (!vertex_buffer) { - glGenBuffers(1, &vertex_buffer); - } - - glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer); - /* invalidate old contents - - * avoids stalling if buffer is still waiting in queue to be rendered */ - glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW); - - vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY); - - if (vpointer) { - /* texture coordinate - vertex pair */ - vpointer[0] = 0.0f; - vpointer[1] = 0.0f; - vpointer[2] = dx; - vpointer[3] = dy; - - vpointer[4] = (float)w / (float)pmem.w; - vpointer[5] = 0.0f; - vpointer[6] = (float)width + dx; - vpointer[7] = dy; - - vpointer[8] = (float)w / (float)pmem.w; - vpointer[9] = (float)h / (float)pmem.h; - vpointer[10] = (float)width + dx; - vpointer[11] = (float)height + dy; - - vpointer[12] = 0.0f; - vpointer[13] = (float)h / (float)pmem.h; - vpointer[14] = dx; - vpointer[15] = (float)height + dy; - - glUnmapBuffer(GL_ARRAY_BUFFER); - } - - GLuint vertex_array_object; - GLuint position_attribute, texcoord_attribute; - - glGenVertexArrays(1, &vertex_array_object); - glBindVertexArray(vertex_array_object); - - texcoord_attribute = glGetAttribLocation(shader_program, "texCoord"); - position_attribute = glGetAttribLocation(shader_program, "pos"); - - glEnableVertexAttribArray(texcoord_attribute); - glEnableVertexAttribArray(position_attribute); - - 
glVertexAttribPointer( - texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0); - glVertexAttribPointer(position_attribute, - 2, - GL_FLOAT, - GL_FALSE, - 4 * sizeof(float), - (const GLvoid *)(sizeof(float) * 2)); - - glDrawArrays(GL_TRIANGLE_FAN, 0, 4); - - if (use_fallback_shader) { - glUseProgram(0); - } - else { - draw_params.unbind_display_space_shader_cb(); - } - - if (transparent) { - glDisable(GL_BLEND); - } - - glBindTexture(GL_TEXTURE_2D, 0); - - return; - } - - Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params); -} - -void CUDADevice::thread_run(DeviceTask &task) -{ - CUDAContextScope scope(this); - - if (task.type == DeviceTask::RENDER) { - DeviceRequestedFeatures requested_features; - if (use_split_kernel()) { - if (split_kernel == NULL) { - split_kernel = new CUDASplitKernel(this); - split_kernel->load_kernels(requested_features); - } - } - - device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); - - /* keep rendering tiles until done */ - RenderTile tile; - DenoisingTask denoising(this, task); - - while (task.acquire_tile(this, tile, task.tile_types)) { - if (tile.task == RenderTile::PATH_TRACE) { - if (use_split_kernel()) { - device_only_memory<uchar> void_buffer(this, "void_buffer"); - split_kernel->path_trace(task, tile, void_buffer, void_buffer); - } - else { - render(task, tile, work_tiles); - } - } - else if (tile.task == RenderTile::BAKE) { - render(task, tile, work_tiles); - } - else if (tile.task == RenderTile::DENOISE) { - tile.sample = tile.start_sample + tile.num_samples; - - denoise(tile, denoising); - - task.update_progress(&tile, tile.w * tile.h); - } - - task.release_tile(tile); - - if (task.get_cancel()) { - if (task.need_finish_queue == false) - break; - } - } - - work_tiles.free(); - } - else if (task.type == DeviceTask::SHADER) { - shader(task); - - cuda_assert(cuCtxSynchronize()); - } - else if (task.type == DeviceTask::DENOISE_BUFFER) { - 
RenderTile tile; - tile.x = task.x; - tile.y = task.y; - tile.w = task.w; - tile.h = task.h; - tile.buffer = task.buffer; - tile.sample = task.sample + task.num_samples; - tile.num_samples = task.num_samples; - tile.start_sample = task.sample; - tile.offset = task.offset; - tile.stride = task.stride; - tile.buffers = task.buffers; - - DenoisingTask denoising(this, task); - denoise(tile, denoising); - task.update_progress(&tile, tile.w * tile.h); - } -} - -void CUDADevice::task_add(DeviceTask &task) -{ - CUDAContextScope scope(this); - - /* Load texture info. */ - load_texture_info(); - - /* Synchronize all memory copies before executing task. */ - cuda_assert(cuCtxSynchronize()); - - if (task.type == DeviceTask::FILM_CONVERT) { - /* must be done in main thread due to opengl access */ - film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); - } - else { - task_pool.push([=] { - DeviceTask task_copy = task; - thread_run(task_copy); - }); - } -} - -void CUDADevice::task_wait() -{ - task_pool.wait(); -} - -void CUDADevice::task_cancel() -{ - task_pool.cancel(); -} - -/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class - * now that the definition of that class is complete - */ -# undef cuda_assert -# define cuda_assert(stmt) \ - { \ - CUresult result = stmt; \ - if (result != CUDA_SUCCESS) { \ - const char *name = cuewErrorString(result); \ - device->set_error( \ - string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \ - } \ - } \ - (void)0 - -/* CUDA context scope. 
*/ - -CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device) -{ - cuda_assert(cuCtxPushCurrent(device->cuContext)); -} - -CUDAContextScope::~CUDAContextScope() -{ - cuda_assert(cuCtxPopCurrent(NULL)); -} - -/* split kernel */ - -class CUDASplitKernelFunction : public SplitKernelFunction { - CUDADevice *device; - CUfunction func; - - public: - CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) - { - } - - /* enqueue the kernel, returns false if there is an error */ - bool enqueue(const KernelDimensions &dim, device_memory & /*kg*/, device_memory & /*data*/) - { - return enqueue(dim, NULL); - } - - /* enqueue the kernel, returns false if there is an error */ - bool enqueue(const KernelDimensions &dim, void *args[]) - { - if (device->have_error()) - return false; - - CUDAContextScope scope(device); - - /* we ignore dim.local_size for now, as this is faster */ - int threads_per_block; - cuda_assert( - cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); - - int xblocks = (dim.global_size[0] * dim.global_size[1] + threads_per_block - 1) / - threads_per_block; - - cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); - - cuda_assert(cuLaunchKernel(func, - xblocks, - 1, - 1, /* blocks */ - threads_per_block, - 1, - 1, /* threads */ - 0, - 0, - args, - 0)); - - return !device->have_error(); - } -}; - -CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device) -{ -} - -uint64_t CUDASplitKernel::state_buffer_size(device_memory & /*kg*/, - device_memory & /*data*/, - size_t num_threads) -{ - CUDAContextScope scope(device); - - device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE); - size_buffer.alloc(1); - size_buffer.zero_to_device(); - - uint threads = num_threads; - CUdeviceptr d_size = (CUdeviceptr)size_buffer.device_pointer; - - struct args_t { - uint *num_threads; - CUdeviceptr *size; - }; - - args_t args = 
{&threads, &d_size}; - - CUfunction state_buffer_size; - cuda_assert( - cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size")); - - cuda_assert(cuLaunchKernel(state_buffer_size, 1, 1, 1, 1, 1, 1, 0, 0, (void **)&args, 0)); - - size_buffer.copy_from_device(0, 1, 1); - size_t size = size_buffer[0]; - size_buffer.free(); - - return size; -} - -bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim, - RenderTile &rtile, - int num_global_elements, - device_memory & /*kernel_globals*/, - device_memory & /*kernel_data*/, - device_memory &split_data, - device_memory &ray_state, - device_memory &queue_index, - device_memory &use_queues_flag, - device_memory &work_pool_wgs) -{ - CUDAContextScope scope(device); - - CUdeviceptr d_split_data = (CUdeviceptr)split_data.device_pointer; - CUdeviceptr d_ray_state = (CUdeviceptr)ray_state.device_pointer; - CUdeviceptr d_queue_index = (CUdeviceptr)queue_index.device_pointer; - CUdeviceptr d_use_queues_flag = (CUdeviceptr)use_queues_flag.device_pointer; - CUdeviceptr d_work_pool_wgs = (CUdeviceptr)work_pool_wgs.device_pointer; - - CUdeviceptr d_buffer = (CUdeviceptr)rtile.buffer; - - int end_sample = rtile.start_sample + rtile.num_samples; - int queue_size = dim.global_size[0] * dim.global_size[1]; - - struct args_t { - CUdeviceptr *split_data_buffer; - int *num_elements; - CUdeviceptr *ray_state; - int *start_sample; - int *end_sample; - int *sx; - int *sy; - int *sw; - int *sh; - int *offset; - int *stride; - CUdeviceptr *queue_index; - int *queuesize; - CUdeviceptr *use_queues_flag; - CUdeviceptr *work_pool_wgs; - int *num_samples; - CUdeviceptr *buffer; - }; - - args_t args = {&d_split_data, - &num_global_elements, - &d_ray_state, - &rtile.start_sample, - &end_sample, - &rtile.x, - &rtile.y, - &rtile.w, - &rtile.h, - &rtile.offset, - &rtile.stride, - &d_queue_index, - &queue_size, - &d_use_queues_flag, - &d_work_pool_wgs, - &rtile.num_samples, - &d_buffer}; - - 
CUfunction data_init; - cuda_assert( - cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init")); - if (device->have_error()) { - return false; - } - - CUDASplitKernelFunction(device, data_init).enqueue(dim, (void **)&args); - - return !device->have_error(); -} - -SplitKernelFunction *CUDASplitKernel::get_split_kernel_function(const string &kernel_name, - const DeviceRequestedFeatures &) -{ - const CUDAContextScope scope(device); - - CUfunction func; - const CUresult result = cuModuleGetFunction( - &func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()); - if (result != CUDA_SUCCESS) { - device->set_error(string_printf("Could not find kernel \"kernel_cuda_%s\" in module (%s)", - kernel_name.data(), - cuewErrorString(result))); - return NULL; - } - - return new CUDASplitKernelFunction(device, func); -} - -int2 CUDASplitKernel::split_kernel_local_size() -{ - return make_int2(32, 1); -} - -int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg, - device_memory &data, - DeviceTask & /*task*/) -{ - CUDAContextScope scope(device); - size_t free; - size_t total; - - cuda_assert(cuMemGetInfo(&free, &total)); - - VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(free) - << " bytes. 
(" << string_human_readable_size(free) << ")."; - - size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2); - size_t side = round_down((int)sqrt(num_elements), 32); - int2 global_size = make_int2(side, round_down(num_elements / side, 16)); - VLOG(1) << "Global size: " << global_size << "."; - return global_size; -} - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp new file mode 100644 index 00000000000..37fab8f8293 --- /dev/null +++ b/intern/cycles/device/cuda/device_impl.cpp @@ -0,0 +1,1370 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef WITH_CUDA + +# include <climits> +# include <limits.h> +# include <stdio.h> +# include <stdlib.h> +# include <string.h> + +# include "device/cuda/device_impl.h" + +# include "render/buffers.h" + +# include "util/util_debug.h" +# include "util/util_foreach.h" +# include "util/util_logging.h" +# include "util/util_map.h" +# include "util/util_md5.h" +# include "util/util_opengl.h" +# include "util/util_path.h" +# include "util/util_string.h" +# include "util/util_system.h" +# include "util/util_time.h" +# include "util/util_types.h" +# include "util/util_windows.h" + +CCL_NAMESPACE_BEGIN + +class CUDADevice; + +bool CUDADevice::have_precompiled_kernels() +{ + string cubins_path = path_get("lib"); + return path_exists(cubins_path); +} + +bool CUDADevice::show_samples() const +{ + /* The CUDADevice only processes one tile at a time, so showing samples is fine. */ + return true; +} + +BVHLayoutMask CUDADevice::get_bvh_layout_mask() const +{ + return BVH_LAYOUT_BVH2; +} + +void CUDADevice::set_error(const string &error) +{ + Device::set_error(error); + + if (first_error) { + fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n"); + fprintf(stderr, + "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n"); + first_error = false; + } +} + +CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) + : Device(info, stats, profiler), texture_info(this, "__texture_info", MEM_GLOBAL) +{ + first_error = true; + + cuDevId = info.num; + cuDevice = 0; + cuContext = 0; + + cuModule = 0; + + need_texture_info = false; + + device_texture_headroom = 0; + device_working_headroom = 0; + move_texture_to_host = false; + map_host_limit = 0; + map_host_used = 0; + can_map_host = 0; + pitch_alignment = 0; + + /* Initialize CUDA. 
*/ + CUresult result = cuInit(0); + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result))); + return; + } + + /* Setup device and context. */ + result = cuDeviceGet(&cuDevice, cuDevId); + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)", + cuewErrorString(result))); + return; + } + + /* CU_CTX_MAP_HOST for mapping host memory when out of device memory. + * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render, + * so we can predict which memory to map to host. */ + cuda_assert( + cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice)); + + cuda_assert(cuDeviceGetAttribute( + &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice)); + + unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX; + if (can_map_host) { + ctx_flags |= CU_CTX_MAP_HOST; + init_host_memory(); + } + + /* Create context. */ + result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); + + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result))); + return; + } + + int major, minor; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); + cuDevArchitecture = major * 100 + minor * 10; + + /* Pop context set by cuCtxCreate. 
*/ + cuCtxPopCurrent(NULL); +} + +CUDADevice::~CUDADevice() +{ + texture_info.free(); + + cuda_assert(cuCtxDestroy(cuContext)); +} + +bool CUDADevice::support_device(const uint /*kernel_features*/) +{ + int major, minor; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); + + /* We only support sm_30 and above */ + if (major < 3) { + set_error(string_printf( + "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor)); + return false; + } + + return true; +} + +bool CUDADevice::check_peer_access(Device *peer_device) +{ + if (peer_device == this) { + return false; + } + if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) { + return false; + } + + CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device); + + int can_access = 0; + cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice)); + if (can_access == 0) { + return false; + } + + // Ensure array access over the link is possible as well (for 3D textures) + cuda_assert(cuDeviceGetP2PAttribute(&can_access, + CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED, + cuDevice, + peer_device_cuda->cuDevice)); + if (can_access == 0) { + return false; + } + + // Enable peer access in both directions + { + const CUDAContextScope scope(this); + CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0); + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to enable peer access on CUDA context (%s)", + cuewErrorString(result))); + return false; + } + } + { + const CUDAContextScope scope(peer_device_cuda); + CUresult result = cuCtxEnablePeerAccess(cuContext, 0); + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to enable peer access on CUDA context (%s)", + cuewErrorString(result))); + return false; + } + } + + return true; +} + +bool 
CUDADevice::use_adaptive_compilation() +{ + return DebugFlags().cuda.adaptive_compile; +} + +/* Common NVCC flags which stays the same regardless of shading model, + * kernel sources md5 and only depends on compiler or compilation settings. + */ +string CUDADevice::compile_kernel_get_common_cflags(const uint kernel_features) +{ + const int machine = system_cpu_bits(); + const string source_path = path_get("source"); + const string include_path = source_path; + string cflags = string_printf( + "-m%d " + "--ptxas-options=\"-v\" " + "--use_fast_math " + "-DNVCC " + "-I\"%s\"", + machine, + include_path.c_str()); + if (use_adaptive_compilation()) { + cflags += " -D__KERNEL_FEATURES__=" + to_string(kernel_features); + } + const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS"); + if (extra_cflags) { + cflags += string(" ") + string(extra_cflags); + } + +# ifdef WITH_NANOVDB + cflags += " -DWITH_NANOVDB"; +# endif + + return cflags; +} + +string CUDADevice::compile_kernel(const uint kernel_features, + const char *name, + const char *base, + bool force_ptx) +{ + /* Compute kernel name. */ + int major, minor; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); + + /* Attempt to use kernel provided with Blender. */ + if (!use_adaptive_compilation()) { + if (!force_ptx) { + const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor)); + VLOG(1) << "Testing for pre-compiled kernel " << cubin << "."; + if (path_exists(cubin)) { + VLOG(1) << "Using precompiled kernel."; + return cubin; + } + } + + /* The driver can JIT-compile PTX generated for older generations, so find the closest one. 
*/ + int ptx_major = major, ptx_minor = minor; + while (ptx_major >= 3) { + const string ptx = path_get( + string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor)); + VLOG(1) << "Testing for pre-compiled kernel " << ptx << "."; + if (path_exists(ptx)) { + VLOG(1) << "Using precompiled kernel."; + return ptx; + } + + if (ptx_minor > 0) { + ptx_minor--; + } + else { + ptx_major--; + ptx_minor = 9; + } + } + } + + /* Try to use locally compiled kernel. */ + string source_path = path_get("source"); + const string source_md5 = path_files_md5_hash(source_path); + + /* We include cflags into md5 so changing cuda toolkit or changing other + * compiler command line arguments makes sure cubin gets re-built. + */ + string common_cflags = compile_kernel_get_common_cflags(kernel_features); + const string kernel_md5 = util_md5_string(source_md5 + common_cflags); + + const char *const kernel_ext = force_ptx ? "ptx" : "cubin"; + const char *const kernel_arch = force_ptx ? "compute" : "sm"; + const string cubin_file = string_printf( + "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext); + const string cubin = path_cache_get(path_join("kernels", cubin_file)); + VLOG(1) << "Testing for locally compiled kernel " << cubin << "."; + if (path_exists(cubin)) { + VLOG(1) << "Using locally compiled kernel."; + return cubin; + } + +# ifdef _WIN32 + if (!use_adaptive_compilation() && have_precompiled_kernels()) { + if (major < 3) { + set_error( + string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. " + "Your GPU is not supported.", + major, + minor)); + } + else { + set_error( + string_printf("CUDA binary kernel for this graphics card compute " + "capability (%d.%d) not found.", + major, + minor)); + } + return string(); + } +# endif + + /* Compile. */ + const char *const nvcc = cuewCompilerPath(); + if (nvcc == NULL) { + set_error( + "CUDA nvcc compiler not found. 
" + "Install CUDA toolkit in default location."); + return string(); + } + + const int nvcc_cuda_version = cuewCompilerVersion(); + VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << "."; + if (nvcc_cuda_version < 101) { + printf( + "Unsupported CUDA version %d.%d detected, " + "you need CUDA 10.1 or newer.\n", + nvcc_cuda_version / 10, + nvcc_cuda_version % 10); + return string(); + } + else if (!(nvcc_cuda_version == 101 || nvcc_cuda_version == 102 || nvcc_cuda_version == 111 || + nvcc_cuda_version == 112 || nvcc_cuda_version == 113 || nvcc_cuda_version == 114)) { + printf( + "CUDA version %d.%d detected, build may succeed but only " + "CUDA 10.1 to 11.4 are officially supported.\n", + nvcc_cuda_version / 10, + nvcc_cuda_version % 10); + } + + double starttime = time_dt(); + + path_create_directories(cubin); + + source_path = path_join(path_join(source_path, "kernel"), + path_join("device", path_join(base, string_printf("%s.cu", name)))); + + string command = string_printf( + "\"%s\" " + "-arch=%s_%d%d " + "--%s \"%s\" " + "-o \"%s\" " + "%s", + nvcc, + kernel_arch, + major, + minor, + kernel_ext, + source_path.c_str(), + cubin.c_str(), + common_cflags.c_str()); + + printf("Compiling CUDA kernel ...\n%s\n", command.c_str()); + +# ifdef _WIN32 + command = "call " + command; +# endif + if (system(command.c_str()) != 0) { + set_error( + "Failed to execute compilation command, " + "see console for details."); + return string(); + } + + /* Verify if compilation succeeded */ + if (!path_exists(cubin)) { + set_error( + "CUDA kernel compilation failed, " + "see console for details."); + return string(); + } + + printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime); + + return cubin; +} + +bool CUDADevice::load_kernels(const uint kernel_features) +{ + /* TODO(sergey): Support kernels re-load for CUDA devices. + * + * Currently re-loading kernel will invalidate memory pointers, + * causing problems in cuCtxSynchronize. 
+ */ + if (cuModule) { + VLOG(1) << "Skipping kernel reload, not currently supported."; + return true; + } + + /* check if cuda init succeeded */ + if (cuContext == 0) + return false; + + /* check if GPU is supported */ + if (!support_device(kernel_features)) + return false; + + /* get kernel */ + const char *kernel_name = "kernel"; + string cubin = compile_kernel(kernel_features, kernel_name); + if (cubin.empty()) + return false; + + /* open module */ + CUDAContextScope scope(this); + + string cubin_data; + CUresult result; + + if (path_read_text(cubin, cubin_data)) + result = cuModuleLoadData(&cuModule, cubin_data.c_str()); + else + result = CUDA_ERROR_FILE_NOT_FOUND; + + if (result != CUDA_SUCCESS) + set_error(string_printf( + "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result))); + + if (result == CUDA_SUCCESS) { + kernels.load(this); + reserve_local_memory(kernel_features); + } + + return (result == CUDA_SUCCESS); +} + +void CUDADevice::reserve_local_memory(const uint /* kernel_features */) +{ + /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory + * needed for kernel launches, so that we can reliably figure out when + * to allocate scene data in mapped host memory. */ + size_t total = 0, free_before = 0, free_after = 0; + + { + CUDAContextScope scope(this); + cuMemGetInfo(&free_before, &total); + } + + { + /* Use the biggest kernel for estimation. */ + const DeviceKernel test_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE; + + /* Launch kernel, using just 1 block appears sufficient to reserve memory for all + * multiprocessors. It would be good to do this in parallel for the multi GPU case + * still to make it faster. 
*/ + CUDADeviceQueue queue(this); + + void *d_path_index = nullptr; + void *d_render_buffer = nullptr; + int d_work_size = 0; + void *args[] = {&d_path_index, &d_render_buffer, &d_work_size}; + + queue.init_execution(); + queue.enqueue(test_kernel, 1, args); + queue.synchronize(); + } + + { + CUDAContextScope scope(this); + cuMemGetInfo(&free_after, &total); + } + + VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after) + << " bytes. (" << string_human_readable_size(free_before - free_after) << ")"; + +# if 0 + /* For testing mapped host memory, fill up device memory. */ + const size_t keep_mb = 1024; + + while (free_after > keep_mb * 1024 * 1024LL) { + CUdeviceptr tmp; + cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL)); + cuMemGetInfo(&free_after, &total); + } +# endif +} + +void CUDADevice::init_host_memory() +{ + /* Limit amount of host mapped memory, because allocating too much can + * cause system instability. Leave at least half or 4 GB of system + * memory free, whichever is smaller. */ + size_t default_limit = 4 * 1024 * 1024 * 1024LL; + size_t system_ram = system_physical_ram(); + + if (system_ram > 0) { + if (system_ram / 2 > default_limit) { + map_host_limit = system_ram - default_limit; + } + else { + map_host_limit = system_ram / 2; + } + } + else { + VLOG(1) << "Mapped host memory disabled, failed to get system RAM"; + map_host_limit = 0; + } + + /* Amount of device memory to keep is free after texture memory + * and working memory allocations respectively. We set the working + * memory limit headroom lower so that some space is left after all + * texture memory allocations. */ + device_working_headroom = 32 * 1024 * 1024LL; // 32MB + device_texture_headroom = 128 * 1024 * 1024LL; // 128MB + + VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit) + << " bytes. 
(" << string_human_readable_size(map_host_limit) << ")"; +} + +void CUDADevice::load_texture_info() +{ + if (need_texture_info) { + /* Unset flag before copying, so this does not loop indefinitely if the copy below calls + * into 'move_textures_to_host' (which calls 'load_texture_info' again). */ + need_texture_info = false; + texture_info.copy_to_device(); + } +} + +void CUDADevice::move_textures_to_host(size_t size, bool for_texture) +{ + /* Break out of recursive call, which can happen when moving memory on a multi device. */ + static bool any_device_moving_textures_to_host = false; + if (any_device_moving_textures_to_host) { + return; + } + + /* Signal to reallocate textures in host memory only. */ + move_texture_to_host = true; + + while (size > 0) { + /* Find suitable memory allocation to move. */ + device_memory *max_mem = NULL; + size_t max_size = 0; + bool max_is_image = false; + + thread_scoped_lock lock(cuda_mem_map_mutex); + foreach (CUDAMemMap::value_type &pair, cuda_mem_map) { + device_memory &mem = *pair.first; + CUDAMem *cmem = &pair.second; + + /* Can only move textures allocated on this device (and not those from peer devices). + * And need to ignore memory that is already on the host. */ + if (!mem.is_resident(this) || cmem->use_mapped_host) { + continue; + } + + bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && + (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); + + /* Can't move this type of memory. */ + if (!is_texture || cmem->array) { + continue; + } + + /* For other textures, only move image textures. */ + if (for_texture && !is_image) { + continue; + } + + /* Try to move largest allocation, prefer moving images. */ + if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) { + max_is_image = is_image; + max_size = mem.device_size; + max_mem = &mem; + } + } + lock.unlock(); + + /* Move to host memory. 
This part is mutex protected since + * multiple CUDA devices could be moving the memory. The + * first one will do it, and the rest will adopt the pointer. */ + if (max_mem) { + VLOG(1) << "Move memory from device to host: " << max_mem->name; + + static thread_mutex move_mutex; + thread_scoped_lock lock(move_mutex); + + any_device_moving_textures_to_host = true; + + /* Potentially need to call back into multi device, so pointer mapping + * and peer devices are updated. This is also necessary since the device + * pointer may just be a key here, so cannot be accessed and freed directly. + * Unfortunately it does mean that memory is reallocated on all other + * devices as well, which is potentially dangerous when still in use (since + * a thread rendering on another devices would only be caught in this mutex + * if it so happens to do an allocation at the same time as well. */ + max_mem->device_copy_to(); + size = (max_size >= size) ? 0 : size - max_size; + + any_device_moving_textures_to_host = false; + } + else { + break; + } + } + + /* Unset flag before texture info is reloaded, since it should stay in device memory. */ + move_texture_to_host = false; + + /* Update texture info array with new pointers. */ + load_texture_info(); +} + +CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding) +{ + CUDAContextScope scope(this); + + CUdeviceptr device_pointer = 0; + size_t size = mem.memory_size() + pitch_padding; + + CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY; + const char *status = ""; + + /* First try allocating in device memory, respecting headroom. We make + * an exception for texture info. It is small and frequently accessed, + * so treat it as working memory. + * + * If there is not enough room for working memory, we will try to move + * textures to host memory, assuming the performance impact would have + * been worse for working memory. 
*/ + bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); + + size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom; + + size_t total = 0, free = 0; + cuMemGetInfo(&free, &total); + + /* Move textures to host memory if needed. */ + if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) { + move_textures_to_host(size + headroom - free, is_texture); + cuMemGetInfo(&free, &total); + } + + /* Allocate in device memory. */ + if (!move_texture_to_host && (size + headroom) < free) { + mem_alloc_result = cuMemAlloc(&device_pointer, size); + if (mem_alloc_result == CUDA_SUCCESS) { + status = " in device memory"; + } + } + + /* Fall back to mapped host memory if needed and possible. */ + + void *shared_pointer = 0; + + if (mem_alloc_result != CUDA_SUCCESS && can_map_host) { + if (mem.shared_pointer) { + /* Another device already allocated host memory. */ + mem_alloc_result = CUDA_SUCCESS; + shared_pointer = mem.shared_pointer; + } + else if (map_host_used + size < map_host_limit) { + /* Allocate host memory ourselves. */ + mem_alloc_result = cuMemHostAlloc( + &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED); + + assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) || + (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0)); + } + + if (mem_alloc_result == CUDA_SUCCESS) { + cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0)); + map_host_used += size; + status = " in host memory"; + } + } + + if (mem_alloc_result != CUDA_SUCCESS) { + status = " failed, out of device and host memory"; + set_error("System is out of GPU and shared host memory"); + } + + if (mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. 
(" + << string_human_readable_size(mem.memory_size()) << ")" << status; + } + + mem.device_pointer = (device_ptr)device_pointer; + mem.device_size = size; + stats.mem_alloc(size); + + if (!mem.device_pointer) { + return NULL; + } + + /* Insert into map of allocations. */ + thread_scoped_lock lock(cuda_mem_map_mutex); + CUDAMem *cmem = &cuda_mem_map[&mem]; + if (shared_pointer != 0) { + /* Replace host pointer with our host allocation. Only works if + * CUDA memory layout is the same and has no pitch padding. Also + * does not work if we move textures to host during a render, + * since other devices might be using the memory. */ + + if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer && + mem.host_pointer != shared_pointer) { + memcpy(shared_pointer, mem.host_pointer, size); + + /* A Call to device_memory::host_free() should be preceded by + * a call to device_memory::device_free() for host memory + * allocated by a device to be handled properly. Two exceptions + * are here and a call in OptiXDevice::generic_alloc(), where + * the current host memory can be assumed to be allocated by + * device_memory::host_alloc(), not by a device */ + + mem.host_free(); + mem.host_pointer = shared_pointer; + } + mem.shared_pointer = shared_pointer; + mem.shared_counter++; + cmem->use_mapped_host = true; + } + else { + cmem->use_mapped_host = false; + } + + return cmem; +} + +void CUDADevice::generic_copy_to(device_memory &mem) +{ + if (!mem.host_pointer || !mem.device_pointer) { + return; + } + + /* If use_mapped_host of mem is false, the current device only uses device memory allocated by + * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from + * mem.host_pointer. 
*/ + thread_scoped_lock lock(cuda_mem_map_mutex); + if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { + const CUDAContextScope scope(this); + cuda_assert( + cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size())); + } +} + +void CUDADevice::generic_free(device_memory &mem) +{ + if (mem.device_pointer) { + CUDAContextScope scope(this); + thread_scoped_lock lock(cuda_mem_map_mutex); + const CUDAMem &cmem = cuda_mem_map[&mem]; + + /* If cmem.use_mapped_host is true, reference counting is used + * to safely free a mapped host memory. */ + + if (cmem.use_mapped_host) { + assert(mem.shared_pointer); + if (mem.shared_pointer) { + assert(mem.shared_counter > 0); + if (--mem.shared_counter == 0) { + if (mem.host_pointer == mem.shared_pointer) { + mem.host_pointer = 0; + } + cuMemFreeHost(mem.shared_pointer); + mem.shared_pointer = 0; + } + } + map_host_used -= mem.device_size; + } + else { + /* Free device memory. */ + cuda_assert(cuMemFree(mem.device_pointer)); + } + + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + cuda_mem_map.erase(cuda_mem_map.find(&mem)); + } +} + +void CUDADevice::mem_alloc(device_memory &mem) +{ + if (mem.type == MEM_TEXTURE) { + assert(!"mem_alloc not supported for textures."); + } + else if (mem.type == MEM_GLOBAL) { + assert(!"mem_alloc not supported for global memory."); + } + else { + generic_alloc(mem); + } +} + +void CUDADevice::mem_copy_to(device_memory &mem) +{ + if (mem.type == MEM_GLOBAL) { + global_free(mem); + global_alloc(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + tex_alloc((device_texture &)mem); + } + else { + if (!mem.device_pointer) { + generic_alloc(mem); + } + generic_copy_to(mem); + } +} + +void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem) +{ + if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) { + assert(!"mem_copy_from not supported for textures."); + 
} + else if (mem.host_pointer) { + const size_t size = elem * w * h; + const size_t offset = elem * y * w; + + if (mem.device_pointer) { + const CUDAContextScope scope(this); + cuda_assert(cuMemcpyDtoH( + (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size)); + } + else { + memset((char *)mem.host_pointer + offset, 0, size); + } + } +} + +void CUDADevice::mem_zero(device_memory &mem) +{ + if (!mem.device_pointer) { + mem_alloc(mem); + } + if (!mem.device_pointer) { + return; + } + + /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory + * regardless of mem.host_pointer and mem.shared_pointer. */ + thread_scoped_lock lock(cuda_mem_map_mutex); + if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { + const CUDAContextScope scope(this); + cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size())); + } + else if (mem.host_pointer) { + memset(mem.host_pointer, 0, mem.memory_size()); + } +} + +void CUDADevice::mem_free(device_memory &mem) +{ + if (mem.type == MEM_GLOBAL) { + global_free(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + } + else { + generic_free(mem); + } +} + +device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) +{ + return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); +} + +void CUDADevice::const_copy_to(const char *name, void *host, size_t size) +{ + CUDAContextScope scope(this); + CUdeviceptr mem; + size_t bytes; + + cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name)); + // assert(bytes == size); + cuda_assert(cuMemcpyHtoD(mem, host, size)); +} + +void CUDADevice::global_alloc(device_memory &mem) +{ + if (mem.is_resident(this)) { + generic_alloc(mem); + generic_copy_to(mem); + } + + const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer)); +} + +void CUDADevice::global_free(device_memory &mem) +{ + if 
(mem.is_resident(this) && mem.device_pointer) { + generic_free(mem); + } +} + +void CUDADevice::tex_alloc(device_texture &mem) +{ + CUDAContextScope scope(this); + + /* General variables for both architectures */ + string bind_name = mem.name; + size_t dsize = datatype_size(mem.data_type); + size_t size = mem.memory_size(); + + CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; + switch (mem.info.extension) { + case EXTENSION_REPEAT: + address_mode = CU_TR_ADDRESS_MODE_WRAP; + break; + case EXTENSION_EXTEND: + address_mode = CU_TR_ADDRESS_MODE_CLAMP; + break; + case EXTENSION_CLIP: + address_mode = CU_TR_ADDRESS_MODE_BORDER; + break; + default: + assert(0); + break; + } + + CUfilter_mode filter_mode; + if (mem.info.interpolation == INTERPOLATION_CLOSEST) { + filter_mode = CU_TR_FILTER_MODE_POINT; + } + else { + filter_mode = CU_TR_FILTER_MODE_LINEAR; + } + + /* Image Texture Storage */ + CUarray_format_enum format; + switch (mem.data_type) { + case TYPE_UCHAR: + format = CU_AD_FORMAT_UNSIGNED_INT8; + break; + case TYPE_UINT16: + format = CU_AD_FORMAT_UNSIGNED_INT16; + break; + case TYPE_UINT: + format = CU_AD_FORMAT_UNSIGNED_INT32; + break; + case TYPE_INT: + format = CU_AD_FORMAT_SIGNED_INT32; + break; + case TYPE_FLOAT: + format = CU_AD_FORMAT_FLOAT; + break; + case TYPE_HALF: + format = CU_AD_FORMAT_HALF; + break; + default: + assert(0); + return; + } + + CUDAMem *cmem = NULL; + CUarray array_3d = NULL; + size_t src_pitch = mem.data_width * dsize * mem.data_elements; + size_t dst_pitch = src_pitch; + + if (!mem.is_resident(this)) { + thread_scoped_lock lock(cuda_mem_map_mutex); + cmem = &cuda_mem_map[&mem]; + cmem->texobject = 0; + + if (mem.data_depth > 1) { + array_3d = (CUarray)mem.device_pointer; + cmem->array = array_3d; + } + else if (mem.data_height > 0) { + dst_pitch = align_up(src_pitch, pitch_alignment); + } + } + else if (mem.data_depth > 1) { + /* 3D texture using array, there is no API for linear memory. 
*/ + CUDA_ARRAY3D_DESCRIPTOR desc; + + desc.Width = mem.data_width; + desc.Height = mem.data_height; + desc.Depth = mem.data_depth; + desc.Format = format; + desc.NumChannels = mem.data_elements; + desc.Flags = 0; + + VLOG(1) << "Array 3D allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + cuda_assert(cuArray3DCreate(&array_3d, &desc)); + + if (!array_3d) { + return; + } + + CUDA_MEMCPY3D param; + memset(&param, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_ARRAY; + param.dstArray = array_3d; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + param.Depth = mem.data_depth; + + cuda_assert(cuMemcpy3D(&param)); + + mem.device_pointer = (device_ptr)array_3d; + mem.device_size = size; + stats.mem_alloc(size); + + thread_scoped_lock lock(cuda_mem_map_mutex); + cmem = &cuda_mem_map[&mem]; + cmem->texobject = 0; + cmem->array = array_3d; + } + else if (mem.data_height > 0) { + /* 2D texture, using pitch aligned linear memory. */ + dst_pitch = align_up(src_pitch, pitch_alignment); + size_t dst_size = dst_pitch * mem.data_height; + + cmem = generic_alloc(mem, dst_size - mem.memory_size()); + if (!cmem) { + return; + } + + CUDA_MEMCPY2D param; + memset(&param, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_DEVICE; + param.dstDevice = mem.device_pointer; + param.dstPitch = dst_pitch; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + + cuda_assert(cuMemcpy2DUnaligned(&param)); + } + else { + /* 1D texture, using linear memory. 
*/ + cmem = generic_alloc(mem); + if (!cmem) { + return; + } + + cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size)); + } + + /* Resize once */ + const uint slot = mem.slot; + if (slot >= texture_info.size()) { + /* Allocate some slots in advance, to reduce amount + * of re-allocations. */ + texture_info.resize(slot + 128); + } + + /* Set Mapping and tag that we need to (re-)upload to device */ + texture_info[slot] = mem.info; + need_texture_info = true; + + if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT && + mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { + /* Kepler+, bindless textures. */ + CUDA_RESOURCE_DESC resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + + if (array_3d) { + resDesc.resType = CU_RESOURCE_TYPE_ARRAY; + resDesc.res.array.hArray = array_3d; + resDesc.flags = 0; + } + else if (mem.data_height > 0) { + resDesc.resType = CU_RESOURCE_TYPE_PITCH2D; + resDesc.res.pitch2D.devPtr = mem.device_pointer; + resDesc.res.pitch2D.format = format; + resDesc.res.pitch2D.numChannels = mem.data_elements; + resDesc.res.pitch2D.height = mem.data_height; + resDesc.res.pitch2D.width = mem.data_width; + resDesc.res.pitch2D.pitchInBytes = dst_pitch; + } + else { + resDesc.resType = CU_RESOURCE_TYPE_LINEAR; + resDesc.res.linear.devPtr = mem.device_pointer; + resDesc.res.linear.format = format; + resDesc.res.linear.numChannels = mem.data_elements; + resDesc.res.linear.sizeInBytes = mem.device_size; + } + + CUDA_TEXTURE_DESC texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.addressMode[0] = address_mode; + texDesc.addressMode[1] = address_mode; + texDesc.addressMode[2] = address_mode; + texDesc.filterMode = filter_mode; + texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; + + thread_scoped_lock lock(cuda_mem_map_mutex); + cmem = &cuda_mem_map[&mem]; + + cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL)); + + texture_info[slot].data = (uint64_t)cmem->texobject; + } + else { + texture_info[slot].data = 
(uint64_t)mem.device_pointer; + } +} + +void CUDADevice::tex_free(device_texture &mem) +{ + if (mem.device_pointer) { + CUDAContextScope scope(this); + thread_scoped_lock lock(cuda_mem_map_mutex); + const CUDAMem &cmem = cuda_mem_map[&mem]; + + if (cmem.texobject) { + /* Free bindless texture. */ + cuTexObjectDestroy(cmem.texobject); + } + + if (!mem.is_resident(this)) { + /* Do not free memory here, since it was allocated on a different device. */ + cuda_mem_map.erase(cuda_mem_map.find(&mem)); + } + else if (cmem.array) { + /* Free array. */ + cuArrayDestroy(cmem.array); + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + cuda_mem_map.erase(cuda_mem_map.find(&mem)); + } + else { + lock.unlock(); + generic_free(mem); + } + } +} + +# if 0 +void CUDADevice::render(DeviceTask &task, + RenderTile &rtile, + device_vector<KernelWorkTile> &work_tiles) +{ + scoped_timer timer(&rtile.buffers->render_time); + + if (have_error()) + return; + + CUDAContextScope scope(this); + CUfunction cuRender; + + /* Get kernel function. */ + if (rtile.task == RenderTile::BAKE) { + cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake")); + } + else { + cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace")); + } + + if (have_error()) { + return; + } + + cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1)); + + /* Allocate work tile. */ + work_tiles.alloc(1); + + KernelWorkTile *wtile = work_tiles.data(); + wtile->x = rtile.x; + wtile->y = rtile.y; + wtile->w = rtile.w; + wtile->h = rtile.h; + wtile->offset = rtile.offset; + wtile->stride = rtile.stride; + wtile->buffer = (float *)(CUdeviceptr)rtile.buffer; + + /* Prepare work size. More step samples render faster, but for now we + * remain conservative for GPUs connected to a display to avoid driver + * timeouts and display freezing. 
*/ + int min_blocks, num_threads_per_block; + cuda_assert( + cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0)); + if (!info.display_device) { + min_blocks *= 8; + } + + uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); + + /* Render all samples. */ + uint start_sample = rtile.start_sample; + uint end_sample = rtile.start_sample + rtile.num_samples; + + for (int sample = start_sample; sample < end_sample;) { + /* Setup and copy work tile to device. */ + wtile->start_sample = sample; + wtile->num_samples = step_samples; + if (task.adaptive_sampling.use) { + wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples); + } + wtile->num_samples = min(wtile->num_samples, end_sample - sample); + work_tiles.copy_to_device(); + + CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer; + uint total_work_size = wtile->w * wtile->h * wtile->num_samples; + uint num_blocks = divide_up(total_work_size, num_threads_per_block); + + /* Launch kernel. */ + void *args[] = {&d_work_tiles, &total_work_size}; + + cuda_assert( + cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); + + /* Run the adaptive sampling kernels at selected samples aligned to step samples. */ + uint filter_sample = sample + wtile->num_samples - 1; + if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { + adaptive_sampling_filter(filter_sample, wtile, d_work_tiles); + } + + cuda_assert(cuCtxSynchronize()); + + /* Update progress. */ + sample += wtile->num_samples; + rtile.sample = sample; + task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); + + if (task.get_cancel()) { + if (task.need_finish_queue == false) + break; + } + } + + /* Finalize adaptive sampling. 
*/ + if (task.adaptive_sampling.use) { + CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer; + adaptive_sampling_post(rtile, wtile, d_work_tiles); + cuda_assert(cuCtxSynchronize()); + task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); + } +} + +void CUDADevice::thread_run(DeviceTask &task) +{ + CUDAContextScope scope(this); + + if (task.type == DeviceTask::RENDER) { + device_vector<KernelWorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); + + /* keep rendering tiles until done */ + RenderTile tile; + DenoisingTask denoising(this, task); + + while (task.acquire_tile(this, tile, task.tile_types)) { + if (tile.task == RenderTile::PATH_TRACE) { + render(task, tile, work_tiles); + } + else if (tile.task == RenderTile::BAKE) { + render(task, tile, work_tiles); + } + + task.release_tile(tile); + + if (task.get_cancel()) { + if (task.need_finish_queue == false) + break; + } + } + + work_tiles.free(); + } +} +# endif + +unique_ptr<DeviceQueue> CUDADevice::gpu_queue_create() +{ + return make_unique<CUDADeviceQueue>(this); +} + +bool CUDADevice::should_use_graphics_interop() +{ + /* Check whether this device is part of OpenGL context. + * + * Using CUDA device for graphics interoperability which is not part of the OpenGL context is + * possible, but from the empiric measurements it can be considerably slower than using naive + * pixels copy. 
*/ + + CUDAContextScope scope(this); + + int num_all_devices = 0; + cuda_assert(cuDeviceGetCount(&num_all_devices)); + + if (num_all_devices == 0) { + return false; + } + + vector<CUdevice> gl_devices(num_all_devices); + uint num_gl_devices; + cuGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, CU_GL_DEVICE_LIST_ALL); + + for (CUdevice gl_device : gl_devices) { + if (gl_device == cuDevice) { + return true; + } + } + + return false; +} + +int CUDADevice::get_num_multiprocessors() +{ + return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0); +} + +int CUDADevice::get_max_num_threads_per_multiprocessor() +{ + return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 0); +} + +bool CUDADevice::get_device_attribute(CUdevice_attribute attribute, int *value) +{ + CUDAContextScope scope(this); + + return cuDeviceGetAttribute(value, attribute, cuDevice) == CUDA_SUCCESS; +} + +int CUDADevice::get_device_default_attribute(CUdevice_attribute attribute, int default_value) +{ + int value = 0; + if (!get_device_attribute(attribute, &value)) { + return default_value; + } + return value; +} + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/cuda/device_impl.h b/intern/cycles/device/cuda/device_impl.h new file mode 100644 index 00000000000..6b27db54ab4 --- /dev/null +++ b/intern/cycles/device/cuda/device_impl.h @@ -0,0 +1,155 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_CUDA + +# include "device/cuda/kernel.h" +# include "device/cuda/queue.h" +# include "device/cuda/util.h" +# include "device/device.h" + +# include "util/util_map.h" + +# ifdef WITH_CUDA_DYNLOAD +# include "cuew.h" +# else +# include "util/util_opengl.h" +# include <cuda.h> +# include <cudaGL.h> +# endif + +CCL_NAMESPACE_BEGIN + +class DeviceQueue; + +class CUDADevice : public Device { + + friend class CUDAContextScope; + + public: + CUdevice cuDevice; + CUcontext cuContext; + CUmodule cuModule; + size_t device_texture_headroom; + size_t device_working_headroom; + bool move_texture_to_host; + size_t map_host_used; + size_t map_host_limit; + int can_map_host; + int pitch_alignment; + int cuDevId; + int cuDevArchitecture; + bool first_error; + + struct CUDAMem { + CUDAMem() : texobject(0), array(0), use_mapped_host(false) + { + } + + CUtexObject texobject; + CUarray array; + + /* If true, a mapped host memory in shared_pointer is being used. 
*/ + bool use_mapped_host; + }; + typedef map<device_memory *, CUDAMem> CUDAMemMap; + CUDAMemMap cuda_mem_map; + thread_mutex cuda_mem_map_mutex; + + /* Bindless Textures */ + device_vector<TextureInfo> texture_info; + bool need_texture_info; + + CUDADeviceKernels kernels; + + static bool have_precompiled_kernels(); + + virtual bool show_samples() const override; + + virtual BVHLayoutMask get_bvh_layout_mask() const override; + + void set_error(const string &error) override; + + CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler); + + virtual ~CUDADevice(); + + bool support_device(const uint /*kernel_features*/); + + bool check_peer_access(Device *peer_device) override; + + bool use_adaptive_compilation(); + + virtual string compile_kernel_get_common_cflags(const uint kernel_features); + + string compile_kernel(const uint kernel_features, + const char *name, + const char *base = "cuda", + bool force_ptx = false); + + virtual bool load_kernels(const uint kernel_features) override; + + void reserve_local_memory(const uint kernel_features); + + void init_host_memory(); + + void load_texture_info(); + + void move_textures_to_host(size_t size, bool for_texture); + + CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0); + + void generic_copy_to(device_memory &mem); + + void generic_free(device_memory &mem); + + void mem_alloc(device_memory &mem) override; + + void mem_copy_to(device_memory &mem) override; + + void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override; + + void mem_zero(device_memory &mem) override; + + void mem_free(device_memory &mem) override; + + device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override; + + virtual void const_copy_to(const char *name, void *host, size_t size) override; + + void global_alloc(device_memory &mem); + + void global_free(device_memory &mem); + + void tex_alloc(device_texture &mem); + + void tex_free(device_texture &mem); + + virtual bool 
should_use_graphics_interop() override; + + virtual unique_ptr<DeviceQueue> gpu_queue_create() override; + + int get_num_multiprocessors(); + int get_max_num_threads_per_multiprocessor(); + + protected: + bool get_device_attribute(CUdevice_attribute attribute, int *value); + int get_device_default_attribute(CUdevice_attribute attribute, int default_value); +}; + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/cuda/graphics_interop.cpp b/intern/cycles/device/cuda/graphics_interop.cpp new file mode 100644 index 00000000000..e8ca8b90eae --- /dev/null +++ b/intern/cycles/device/cuda/graphics_interop.cpp @@ -0,0 +1,102 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef WITH_CUDA + +# include "device/cuda/graphics_interop.h" + +# include "device/cuda/device_impl.h" +# include "device/cuda/util.h" + +CCL_NAMESPACE_BEGIN + +CUDADeviceGraphicsInterop::CUDADeviceGraphicsInterop(CUDADeviceQueue *queue) + : queue_(queue), device_(static_cast<CUDADevice *>(queue->device)) +{ +} + +CUDADeviceGraphicsInterop::~CUDADeviceGraphicsInterop() +{ + CUDAContextScope scope(device_); + + if (cu_graphics_resource_) { + cuda_device_assert(device_, cuGraphicsUnregisterResource(cu_graphics_resource_)); + } +} + +void CUDADeviceGraphicsInterop::set_destination( + const DeviceGraphicsInteropDestination &destination) +{ + const int64_t new_buffer_area = int64_t(destination.buffer_width) * destination.buffer_height; + + need_clear_ = destination.need_clear; + + if (opengl_pbo_id_ == destination.opengl_pbo_id && buffer_area_ == new_buffer_area) { + return; + } + + CUDAContextScope scope(device_); + + if (cu_graphics_resource_) { + cuda_device_assert(device_, cuGraphicsUnregisterResource(cu_graphics_resource_)); + } + + const CUresult result = cuGraphicsGLRegisterBuffer( + &cu_graphics_resource_, destination.opengl_pbo_id, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE); + if (result != CUDA_SUCCESS) { + LOG(ERROR) << "Error registering OpenGL buffer: " << cuewErrorString(result); + } + + opengl_pbo_id_ = destination.opengl_pbo_id; + buffer_area_ = new_buffer_area; +} + +device_ptr CUDADeviceGraphicsInterop::map() +{ + if (!cu_graphics_resource_) { + return 0; + } + + CUDAContextScope scope(device_); + + CUdeviceptr cu_buffer; + size_t bytes; + + cuda_device_assert(device_, cuGraphicsMapResources(1, &cu_graphics_resource_, queue_->stream())); + cuda_device_assert( + device_, cuGraphicsResourceGetMappedPointer(&cu_buffer, &bytes, cu_graphics_resource_)); + + if (need_clear_) { + cuda_device_assert( + device_, cuMemsetD8Async(static_cast<CUdeviceptr>(cu_buffer), 0, bytes, queue_->stream())); + + need_clear_ = false; + } + + return 
static_cast<device_ptr>(cu_buffer); +} + +void CUDADeviceGraphicsInterop::unmap() +{ + CUDAContextScope scope(device_); + + cuda_device_assert(device_, + cuGraphicsUnmapResources(1, &cu_graphics_resource_, queue_->stream())); +} + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/cuda/graphics_interop.h b/intern/cycles/device/cuda/graphics_interop.h new file mode 100644 index 00000000000..8a70c8aa71d --- /dev/null +++ b/intern/cycles/device/cuda/graphics_interop.h @@ -0,0 +1,66 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef WITH_CUDA + +# include "device/device_graphics_interop.h" + +# ifdef WITH_CUDA_DYNLOAD +# include "cuew.h" +# else +# include <cuda.h> +# endif + +CCL_NAMESPACE_BEGIN + +class CUDADevice; +class CUDADeviceQueue; + +class CUDADeviceGraphicsInterop : public DeviceGraphicsInterop { + public: + explicit CUDADeviceGraphicsInterop(CUDADeviceQueue *queue); + + CUDADeviceGraphicsInterop(const CUDADeviceGraphicsInterop &other) = delete; + CUDADeviceGraphicsInterop(CUDADeviceGraphicsInterop &&other) noexcept = delete; + + ~CUDADeviceGraphicsInterop(); + + CUDADeviceGraphicsInterop &operator=(const CUDADeviceGraphicsInterop &other) = delete; + CUDADeviceGraphicsInterop &operator=(CUDADeviceGraphicsInterop &&other) = delete; + + virtual void set_destination(const DeviceGraphicsInteropDestination &destination) override; + + virtual device_ptr map() override; + virtual void unmap() override; + + protected: + CUDADeviceQueue *queue_ = nullptr; + CUDADevice *device_ = nullptr; + + /* OpenGL PBO which is currently registered as the destination for the CUDA buffer. */ + uint opengl_pbo_id_ = 0; + /* Buffer area in pixels of the corresponding PBO. */ + int64_t buffer_area_ = 0; + + /* The destination was requested to be cleared. */ + bool need_clear_ = false; + + CUgraphicsResource cu_graphics_resource_ = nullptr; +}; + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/cuda/kernel.cpp b/intern/cycles/device/cuda/kernel.cpp new file mode 100644 index 00000000000..a4a7bfabce0 --- /dev/null +++ b/intern/cycles/device/cuda/kernel.cpp @@ -0,0 +1,69 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_CUDA + +# include "device/cuda/kernel.h" +# include "device/cuda/device_impl.h" + +CCL_NAMESPACE_BEGIN + +void CUDADeviceKernels::load(CUDADevice *device) +{ + CUmodule cuModule = device->cuModule; + + for (int i = 0; i < (int)DEVICE_KERNEL_NUM; i++) { + CUDADeviceKernel &kernel = kernels_[i]; + + /* No mega-kernel used for GPU. */ + if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { + continue; + } + + const std::string function_name = std::string("kernel_gpu_") + + device_kernel_as_string((DeviceKernel)i); + cuda_device_assert(device, + cuModuleGetFunction(&kernel.function, cuModule, function_name.c_str())); + + if (kernel.function) { + cuda_device_assert(device, cuFuncSetCacheConfig(kernel.function, CU_FUNC_CACHE_PREFER_L1)); + + cuda_device_assert( + device, + cuOccupancyMaxPotentialBlockSize( + &kernel.min_blocks, &kernel.num_threads_per_block, kernel.function, NULL, 0, 0)); + } + else { + LOG(ERROR) << "Unable to load kernel " << function_name; + } + } + + loaded = true; +} + +const CUDADeviceKernel &CUDADeviceKernels::get(DeviceKernel kernel) const +{ + return kernels_[(int)kernel]; +} + +bool CUDADeviceKernels::available(DeviceKernel kernel) const +{ + return kernels_[(int)kernel].function != nullptr; +} + +CCL_NAMESPACE_END + +#endif /* WITH_CUDA*/ diff --git a/intern/cycles/device/cuda/kernel.h b/intern/cycles/device/cuda/kernel.h new file mode 100644 index 00000000000..b489547a350 --- /dev/null +++ b/intern/cycles/device/cuda/kernel.h @@ -0,0 +1,56 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under 
the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_CUDA + +# include "device/device_kernel.h" + +# ifdef WITH_CUDA_DYNLOAD +# include "cuew.h" +# else +# include <cuda.h> +# endif + +CCL_NAMESPACE_BEGIN + +class CUDADevice; + +/* CUDA kernel and associate occupancy information. */ +class CUDADeviceKernel { + public: + CUfunction function = nullptr; + + int num_threads_per_block = 0; + int min_blocks = 0; +}; + +/* Cache of CUDA kernels for each DeviceKernel. */ +class CUDADeviceKernels { + public: + void load(CUDADevice *device); + const CUDADeviceKernel &get(DeviceKernel kernel) const; + bool available(DeviceKernel kernel) const; + + protected: + CUDADeviceKernel kernels_[DEVICE_KERNEL_NUM]; + bool loaded = false; +}; + +CCL_NAMESPACE_END + +#endif /* WITH_CUDA */ diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp new file mode 100644 index 00000000000..b7f86c10553 --- /dev/null +++ b/intern/cycles/device/cuda/queue.cpp @@ -0,0 +1,220 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_CUDA + +# include "device/cuda/queue.h" + +# include "device/cuda/device_impl.h" +# include "device/cuda/graphics_interop.h" +# include "device/cuda/kernel.h" + +CCL_NAMESPACE_BEGIN + +/* CUDADeviceQueue */ + +CUDADeviceQueue::CUDADeviceQueue(CUDADevice *device) + : DeviceQueue(device), cuda_device_(device), cuda_stream_(nullptr) +{ + const CUDAContextScope scope(cuda_device_); + cuda_device_assert(cuda_device_, cuStreamCreate(&cuda_stream_, CU_STREAM_NON_BLOCKING)); +} + +CUDADeviceQueue::~CUDADeviceQueue() +{ + const CUDAContextScope scope(cuda_device_); + cuStreamDestroy(cuda_stream_); +} + +int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const +{ + int num_states = max(cuda_device_->get_num_multiprocessors() * + cuda_device_->get_max_num_threads_per_multiprocessor() * 16, + 1048576); + + const char *factor_str = getenv("CYCLES_CONCURRENT_STATES_FACTOR"); + if (factor_str) { + num_states = max((int)(num_states * atof(factor_str)), 1024); + } + + VLOG(3) << "GPU queue concurrent states: " << num_states << ", using up to " + << string_human_readable_size(num_states * state_size); + + return num_states; +} + +int CUDADeviceQueue::num_concurrent_busy_states() const +{ + const int max_num_threads = cuda_device_->get_num_multiprocessors() * + cuda_device_->get_max_num_threads_per_multiprocessor(); + + if (max_num_threads == 0) { + return 65536; + } + + return 4 * max_num_threads; +} + +void CUDADeviceQueue::init_execution() +{ + /* Synchronize all textures and memory copies before executing task. 
*/ + CUDAContextScope scope(cuda_device_); + cuda_device_->load_texture_info(); + cuda_device_assert(cuda_device_, cuCtxSynchronize()); + + debug_init_execution(); +} + +bool CUDADeviceQueue::kernel_available(DeviceKernel kernel) const +{ + return cuda_device_->kernels.available(kernel); +} + +bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[]) +{ + if (cuda_device_->have_error()) { + return false; + } + + debug_enqueue(kernel, work_size); + + const CUDAContextScope scope(cuda_device_); + const CUDADeviceKernel &cuda_kernel = cuda_device_->kernels.get(kernel); + + /* Compute kernel launch parameters. */ + const int num_threads_per_block = cuda_kernel.num_threads_per_block; + const int num_blocks = divide_up(work_size, num_threads_per_block); + + int shared_mem_bytes = 0; + + switch (kernel) { + case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: + /* See parall_active_index.h for why this amount of shared memory is needed. */ + shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int); + break; + + default: + break; + } + + /* Launch kernel. 
*/ + cuda_device_assert(cuda_device_, + cuLaunchKernel(cuda_kernel.function, + num_blocks, + 1, + 1, + num_threads_per_block, + 1, + 1, + shared_mem_bytes, + cuda_stream_, + args, + 0)); + + return !(cuda_device_->have_error()); +} + +bool CUDADeviceQueue::synchronize() +{ + if (cuda_device_->have_error()) { + return false; + } + + const CUDAContextScope scope(cuda_device_); + cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_)); + debug_synchronize(); + + return !(cuda_device_->have_error()); +} + +void CUDADeviceQueue::zero_to_device(device_memory &mem) +{ + assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE); + + if (mem.memory_size() == 0) { + return; + } + + /* Allocate on demand. */ + if (mem.device_pointer == 0) { + cuda_device_->mem_alloc(mem); + } + + /* Zero memory on device. */ + assert(mem.device_pointer != 0); + + const CUDAContextScope scope(cuda_device_); + cuda_device_assert( + cuda_device_, + cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_)); +} + +void CUDADeviceQueue::copy_to_device(device_memory &mem) +{ + assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE); + + if (mem.memory_size() == 0) { + return; + } + + /* Allocate on demand. */ + if (mem.device_pointer == 0) { + cuda_device_->mem_alloc(mem); + } + + assert(mem.device_pointer != 0); + assert(mem.host_pointer != nullptr); + + /* Copy memory to device. */ + const CUDAContextScope scope(cuda_device_); + cuda_device_assert( + cuda_device_, + cuMemcpyHtoDAsync( + (CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_)); +} + +void CUDADeviceQueue::copy_from_device(device_memory &mem) +{ + assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE); + + if (mem.memory_size() == 0) { + return; + } + + assert(mem.device_pointer != 0); + assert(mem.host_pointer != nullptr); + + /* Copy memory from device. 
*/ + const CUDAContextScope scope(cuda_device_); + cuda_device_assert( + cuda_device_, + cuMemcpyDtoHAsync( + mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_)); +} + +unique_ptr<DeviceGraphicsInterop> CUDADeviceQueue::graphics_interop_create() +{ + return make_unique<CUDADeviceGraphicsInterop>(this); +} + +CCL_NAMESPACE_END + +#endif /* WITH_CUDA */ diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h new file mode 100644 index 00000000000..62e3aa3d6c2 --- /dev/null +++ b/intern/cycles/device/cuda/queue.h @@ -0,0 +1,67 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_CUDA + +# include "device/device_kernel.h" +# include "device/device_memory.h" +# include "device/device_queue.h" + +# include "device/cuda/util.h" + +CCL_NAMESPACE_BEGIN + +class CUDADevice; +class device_memory; + +/* Base class for CUDA queues. 
*/ +class CUDADeviceQueue : public DeviceQueue { + public: + CUDADeviceQueue(CUDADevice *device); + ~CUDADeviceQueue(); + + virtual int num_concurrent_states(const size_t state_size) const override; + virtual int num_concurrent_busy_states() const override; + + virtual void init_execution() override; + + virtual bool kernel_available(DeviceKernel kernel) const override; + + virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override; + + virtual bool synchronize() override; + + virtual void zero_to_device(device_memory &mem) override; + virtual void copy_to_device(device_memory &mem) override; + virtual void copy_from_device(device_memory &mem) override; + + virtual CUstream stream() + { + return cuda_stream_; + } + + virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() override; + + protected: + CUDADevice *cuda_device_; + CUstream cuda_stream_; +}; + +CCL_NAMESPACE_END + +#endif /* WITH_CUDA */ diff --git a/intern/cycles/device/cuda/util.cpp b/intern/cycles/device/cuda/util.cpp new file mode 100644 index 00000000000..8f657cc10fe --- /dev/null +++ b/intern/cycles/device/cuda/util.cpp @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef WITH_CUDA + +# include "device/cuda/util.h" +# include "device/cuda/device_impl.h" + +CCL_NAMESPACE_BEGIN + +CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device) +{ + cuda_device_assert(device, cuCtxPushCurrent(device->cuContext)); +} + +CUDAContextScope::~CUDAContextScope() +{ + cuda_device_assert(device, cuCtxPopCurrent(NULL)); +} + +# ifndef WITH_CUDA_DYNLOAD +const char *cuewErrorString(CUresult result) +{ + /* We can only give error code here without major code duplication, that + * should be enough since dynamic loading is only being disabled by folks + * who knows what they're doing anyway. + * + * NOTE: Avoid call from several threads. + */ + static string error; + error = string_printf("%d", result); + return error.c_str(); +} + +const char *cuewCompilerPath() +{ + return CYCLES_CUDA_NVCC_EXECUTABLE; +} + +int cuewCompilerVersion() +{ + return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10); +} +# endif + +CCL_NAMESPACE_END + +#endif /* WITH_CUDA */ diff --git a/intern/cycles/device/cuda/util.h b/intern/cycles/device/cuda/util.h new file mode 100644 index 00000000000..a0898094c08 --- /dev/null +++ b/intern/cycles/device/cuda/util.h @@ -0,0 +1,65 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#ifdef WITH_CUDA + +# ifdef WITH_CUDA_DYNLOAD +# include "cuew.h" +# else +# include <cuda.h> +# endif + +CCL_NAMESPACE_BEGIN + +class CUDADevice; + +/* Utility to push/pop CUDA context. */ +class CUDAContextScope { + public: + CUDAContextScope(CUDADevice *device); + ~CUDAContextScope(); + + private: + CUDADevice *device; +}; + +/* Utility for checking return values of CUDA function calls. */ +# define cuda_device_assert(cuda_device, stmt) \ + { \ + CUresult result = stmt; \ + if (result != CUDA_SUCCESS) { \ + const char *name = cuewErrorString(result); \ + cuda_device->set_error( \ + string_printf("%s in %s (%s:%d)", name, #stmt, __FILE__, __LINE__)); \ + } \ + } \ + (void)0 + +# define cuda_assert(stmt) cuda_device_assert(this, stmt) + +# ifndef WITH_CUDA_DYNLOAD +/* Transparently implement some functions, so majority of the file does not need + * to worry about difference between dynamically loaded and linked CUDA at all. */ +const char *cuewErrorString(CUresult result); +const char *cuewCompilerPath(); +int cuewCompilerVersion(); +# endif /* WITH_CUDA_DYNLOAD */ + +CCL_NAMESPACE_END + +#endif /* WITH_CUDA */ diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index ed53fbb54ae..6ccedcf54ef 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -20,7 +20,13 @@ #include "bvh/bvh2.h" #include "device/device.h" -#include "device/device_intern.h" +#include "device/device_queue.h" + +#include "device/cpu/device.h" +#include "device/cuda/device.h" +#include "device/dummy/device.h" +#include "device/multi/device.h" +#include "device/optix/device.h" #include "util/util_foreach.h" #include "util/util_half.h" @@ -38,332 +44,15 @@ CCL_NAMESPACE_BEGIN bool Device::need_types_update = true; bool Device::need_devices_update = true; thread_mutex Device::device_mutex; -vector<DeviceInfo> Device::opencl_devices; vector<DeviceInfo> Device::cuda_devices; vector<DeviceInfo> Device::optix_devices; 
vector<DeviceInfo> Device::cpu_devices; -vector<DeviceInfo> Device::network_devices; uint Device::devices_initialized_mask = 0; -/* Device Requested Features */ - -std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &requested_features) -{ - os << "Experimental features: " << (requested_features.experimental ? "On" : "Off") << std::endl; - os << "Max nodes group: " << requested_features.max_nodes_group << std::endl; - /* TODO(sergey): Decode bitflag into list of names. */ - os << "Nodes features: " << requested_features.nodes_features << std::endl; - os << "Use Hair: " << string_from_bool(requested_features.use_hair) << std::endl; - os << "Use Object Motion: " << string_from_bool(requested_features.use_object_motion) - << std::endl; - os << "Use Camera Motion: " << string_from_bool(requested_features.use_camera_motion) - << std::endl; - os << "Use Baking: " << string_from_bool(requested_features.use_baking) << std::endl; - os << "Use Subsurface: " << string_from_bool(requested_features.use_subsurface) << std::endl; - os << "Use Volume: " << string_from_bool(requested_features.use_volume) << std::endl; - os << "Use Branched Integrator: " << string_from_bool(requested_features.use_integrator_branched) - << std::endl; - os << "Use Patch Evaluation: " << string_from_bool(requested_features.use_patch_evaluation) - << std::endl; - os << "Use Transparent Shadows: " << string_from_bool(requested_features.use_transparent) - << std::endl; - os << "Use Principled BSDF: " << string_from_bool(requested_features.use_principled) - << std::endl; - os << "Use Denoising: " << string_from_bool(requested_features.use_denoising) << std::endl; - os << "Use Displacement: " << string_from_bool(requested_features.use_true_displacement) - << std::endl; - os << "Use Background Light: " << string_from_bool(requested_features.use_background_light) - << std::endl; - return os; -} - /* Device */ Device::~Device() noexcept(false) { - if (!background) { - if (vertex_buffer != 
0) { - glDeleteBuffers(1, &vertex_buffer); - } - if (fallback_shader_program != 0) { - glDeleteProgram(fallback_shader_program); - } - } -} - -/* TODO move shaders to standalone .glsl file. */ -const char *FALLBACK_VERTEX_SHADER = - "#version 330\n" - "uniform vec2 fullscreen;\n" - "in vec2 texCoord;\n" - "in vec2 pos;\n" - "out vec2 texCoord_interp;\n" - "\n" - "vec2 normalize_coordinates()\n" - "{\n" - " return (vec2(2.0) * (pos / fullscreen)) - vec2(1.0);\n" - "}\n" - "\n" - "void main()\n" - "{\n" - " gl_Position = vec4(normalize_coordinates(), 0.0, 1.0);\n" - " texCoord_interp = texCoord;\n" - "}\n\0"; - -const char *FALLBACK_FRAGMENT_SHADER = - "#version 330\n" - "uniform sampler2D image_texture;\n" - "in vec2 texCoord_interp;\n" - "out vec4 fragColor;\n" - "\n" - "void main()\n" - "{\n" - " fragColor = texture(image_texture, texCoord_interp);\n" - "}\n\0"; - -static void shader_print_errors(const char *task, const char *log, const char *code) -{ - LOG(ERROR) << "Shader: " << task << " error:"; - LOG(ERROR) << "===== shader string ===="; - - stringstream stream(code); - string partial; - - int line = 1; - while (getline(stream, partial, '\n')) { - if (line < 10) { - LOG(ERROR) << " " << line << " " << partial; - } - else { - LOG(ERROR) << line << " " << partial; - } - line++; - } - LOG(ERROR) << log; -} - -static int bind_fallback_shader(void) -{ - GLint status; - GLchar log[5000]; - GLsizei length = 0; - GLuint program = 0; - - struct Shader { - const char *source; - GLenum type; - } shaders[2] = {{FALLBACK_VERTEX_SHADER, GL_VERTEX_SHADER}, - {FALLBACK_FRAGMENT_SHADER, GL_FRAGMENT_SHADER}}; - - program = glCreateProgram(); - - for (int i = 0; i < 2; i++) { - GLuint shader = glCreateShader(shaders[i].type); - - string source_str = shaders[i].source; - const char *c_str = source_str.c_str(); - - glShaderSource(shader, 1, &c_str, NULL); - glCompileShader(shader); - - glGetShaderiv(shader, GL_COMPILE_STATUS, &status); - - if (!status) { - 
glGetShaderInfoLog(shader, sizeof(log), &length, log); - shader_print_errors("compile", log, c_str); - return 0; - } - - glAttachShader(program, shader); - } - - /* Link output. */ - glBindFragDataLocation(program, 0, "fragColor"); - - /* Link and error check. */ - glLinkProgram(program); - - glGetProgramiv(program, GL_LINK_STATUS, &status); - if (!status) { - glGetShaderInfoLog(program, sizeof(log), &length, log); - shader_print_errors("linking", log, FALLBACK_VERTEX_SHADER); - shader_print_errors("linking", log, FALLBACK_FRAGMENT_SHADER); - return 0; - } - - return program; -} - -bool Device::bind_fallback_display_space_shader(const float width, const float height) -{ - if (fallback_status == FALLBACK_SHADER_STATUS_ERROR) { - return false; - } - - if (fallback_status == FALLBACK_SHADER_STATUS_NONE) { - fallback_shader_program = bind_fallback_shader(); - fallback_status = FALLBACK_SHADER_STATUS_ERROR; - - if (fallback_shader_program == 0) { - return false; - } - - glUseProgram(fallback_shader_program); - image_texture_location = glGetUniformLocation(fallback_shader_program, "image_texture"); - if (image_texture_location < 0) { - LOG(ERROR) << "Shader doesn't contain the 'image_texture' uniform."; - return false; - } - - fullscreen_location = glGetUniformLocation(fallback_shader_program, "fullscreen"); - if (fullscreen_location < 0) { - LOG(ERROR) << "Shader doesn't contain the 'fullscreen' uniform."; - return false; - } - - fallback_status = FALLBACK_SHADER_STATUS_SUCCESS; - } - - /* Run this every time. 
*/ - glUseProgram(fallback_shader_program); - glUniform1i(image_texture_location, 0); - glUniform2f(fullscreen_location, width, height); - return true; -} - -void Device::draw_pixels(device_memory &rgba, - int y, - int w, - int h, - int width, - int height, - int dx, - int dy, - int dw, - int dh, - bool transparent, - const DeviceDrawParams &draw_params) -{ - const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL); - - assert(rgba.type == MEM_PIXELS); - mem_copy_from(rgba, y, w, h, rgba.memory_elements_size(1)); - - GLuint texid; - glActiveTexture(GL_TEXTURE0); - glGenTextures(1, &texid); - glBindTexture(GL_TEXTURE_2D, texid); - - if (rgba.data_type == TYPE_HALF) { - GLhalf *data_pointer = (GLhalf *)rgba.host_pointer; - data_pointer += 4 * y * w; - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, w, h, 0, GL_RGBA, GL_HALF_FLOAT, data_pointer); - } - else { - uint8_t *data_pointer = (uint8_t *)rgba.host_pointer; - data_pointer += 4 * y * w; - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, w, h, 0, GL_RGBA, GL_UNSIGNED_BYTE, data_pointer); - } - - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - - if (transparent) { - glEnable(GL_BLEND); - glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA); - } - - GLint shader_program; - if (use_fallback_shader) { - if (!bind_fallback_display_space_shader(dw, dh)) { - return; - } - shader_program = fallback_shader_program; - } - else { - draw_params.bind_display_space_shader_cb(); - glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program); - } - - if (!vertex_buffer) { - glGenBuffers(1, &vertex_buffer); - } - - glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer); - /* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered - */ - glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW); - - float *vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY); - - if (vpointer) { - /* texture 
coordinate - vertex pair */ - vpointer[0] = 0.0f; - vpointer[1] = 0.0f; - vpointer[2] = dx; - vpointer[3] = dy; - - vpointer[4] = 1.0f; - vpointer[5] = 0.0f; - vpointer[6] = (float)width + dx; - vpointer[7] = dy; - - vpointer[8] = 1.0f; - vpointer[9] = 1.0f; - vpointer[10] = (float)width + dx; - vpointer[11] = (float)height + dy; - - vpointer[12] = 0.0f; - vpointer[13] = 1.0f; - vpointer[14] = dx; - vpointer[15] = (float)height + dy; - - if (vertex_buffer) { - glUnmapBuffer(GL_ARRAY_BUFFER); - } - } - - GLuint vertex_array_object; - GLuint position_attribute, texcoord_attribute; - - glGenVertexArrays(1, &vertex_array_object); - glBindVertexArray(vertex_array_object); - - texcoord_attribute = glGetAttribLocation(shader_program, "texCoord"); - position_attribute = glGetAttribLocation(shader_program, "pos"); - - glEnableVertexAttribArray(texcoord_attribute); - glEnableVertexAttribArray(position_attribute); - - glVertexAttribPointer( - texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0); - glVertexAttribPointer(position_attribute, - 2, - GL_FLOAT, - GL_FALSE, - 4 * sizeof(float), - (const GLvoid *)(sizeof(float) * 2)); - - glDrawArrays(GL_TRIANGLE_FAN, 0, 4); - - if (vertex_buffer) { - glBindBuffer(GL_ARRAY_BUFFER, 0); - } - - if (use_fallback_shader) { - glUseProgram(0); - } - else { - draw_params.unbind_display_space_shader_cb(); - } - - glDeleteVertexArrays(1, &vertex_array_object); - glBindTexture(GL_TEXTURE_2D, 0); - glDeleteTextures(1, &texid); - - if (transparent) { - glDisable(GL_BLEND); - } } void Device::build_bvh(BVH *bvh, Progress &progress, bool refit) @@ -379,14 +68,14 @@ void Device::build_bvh(BVH *bvh, Progress &progress, bool refit) } } -Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) +Device *Device::create(const DeviceInfo &info, Stats &stats, Profiler &profiler) { #ifdef WITH_MULTI if (!info.multi_devices.empty()) { /* Always create a multi device when info contains multiple 
devices. * This is done so that the type can still be e.g. DEVICE_CPU to indicate * that it is a homogeneous collection of devices, which simplifies checks. */ - return device_multi_create(info, stats, profiler, background); + return device_multi_create(info, stats, profiler); } #endif @@ -394,29 +83,18 @@ Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool switch (info.type) { case DEVICE_CPU: - device = device_cpu_create(info, stats, profiler, background); + device = device_cpu_create(info, stats, profiler); break; #ifdef WITH_CUDA case DEVICE_CUDA: if (device_cuda_init()) - device = device_cuda_create(info, stats, profiler, background); + device = device_cuda_create(info, stats, profiler); break; #endif #ifdef WITH_OPTIX case DEVICE_OPTIX: if (device_optix_init()) - device = device_optix_create(info, stats, profiler, background); - break; -#endif -#ifdef WITH_NETWORK - case DEVICE_NETWORK: - device = device_network_create(info, stats, profiler, "127.0.0.1"); - break; -#endif -#ifdef WITH_OPENCL - case DEVICE_OPENCL: - if (device_opencl_init()) - device = device_opencl_create(info, stats, profiler, background); + device = device_optix_create(info, stats, profiler); break; #endif default: @@ -424,7 +102,7 @@ Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool } if (device == NULL) { - device = device_dummy_create(info, stats, profiler, background); + device = device_dummy_create(info, stats, profiler); } return device; @@ -438,10 +116,6 @@ DeviceType Device::type_from_string(const char *name) return DEVICE_CUDA; else if (strcmp(name, "OPTIX") == 0) return DEVICE_OPTIX; - else if (strcmp(name, "OPENCL") == 0) - return DEVICE_OPENCL; - else if (strcmp(name, "NETWORK") == 0) - return DEVICE_NETWORK; else if (strcmp(name, "MULTI") == 0) return DEVICE_MULTI; @@ -456,10 +130,6 @@ string Device::string_from_type(DeviceType type) return "CUDA"; else if (type == DEVICE_OPTIX) return "OPTIX"; - else if (type == 
DEVICE_OPENCL) - return "OPENCL"; - else if (type == DEVICE_NETWORK) - return "NETWORK"; else if (type == DEVICE_MULTI) return "MULTI"; @@ -476,12 +146,6 @@ vector<DeviceType> Device::available_types() #ifdef WITH_OPTIX types.push_back(DEVICE_OPTIX); #endif -#ifdef WITH_OPENCL - types.push_back(DEVICE_OPENCL); -#endif -#ifdef WITH_NETWORK - types.push_back(DEVICE_NETWORK); -#endif return types; } @@ -493,20 +157,6 @@ vector<DeviceInfo> Device::available_devices(uint mask) thread_scoped_lock lock(device_mutex); vector<DeviceInfo> devices; -#ifdef WITH_OPENCL - if (mask & DEVICE_MASK_OPENCL) { - if (!(devices_initialized_mask & DEVICE_MASK_OPENCL)) { - if (device_opencl_init()) { - device_opencl_info(opencl_devices); - } - devices_initialized_mask |= DEVICE_MASK_OPENCL; - } - foreach (DeviceInfo &info, opencl_devices) { - devices.push_back(info); - } - } -#endif - #if defined(WITH_CUDA) || defined(WITH_OPTIX) if (mask & (DEVICE_MASK_CUDA | DEVICE_MASK_OPTIX)) { if (!(devices_initialized_mask & DEVICE_MASK_CUDA)) { @@ -547,18 +197,6 @@ vector<DeviceInfo> Device::available_devices(uint mask) } } -#ifdef WITH_NETWORK - if (mask & DEVICE_MASK_NETWORK) { - if (!(devices_initialized_mask & DEVICE_MASK_NETWORK)) { - device_network_info(network_devices); - devices_initialized_mask |= DEVICE_MASK_NETWORK; - } - foreach (DeviceInfo &info, network_devices) { - devices.push_back(info); - } - } -#endif - return devices; } @@ -580,15 +218,6 @@ string Device::device_capabilities(uint mask) capabilities += device_cpu_capabilities() + "\n"; } -#ifdef WITH_OPENCL - if (mask & DEVICE_MASK_OPENCL) { - if (device_opencl_init()) { - capabilities += "\nOpenCL device capabilities:\n"; - capabilities += device_opencl_capabilities(); - } - } -#endif - #ifdef WITH_CUDA if (mask & DEVICE_MASK_CUDA) { if (device_cuda_init()) { @@ -613,16 +242,13 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices, } DeviceInfo info; - info.type = subdevices.front().type; + info.type = 
DEVICE_NONE; info.id = "MULTI"; info.description = "Multi Device"; info.num = 0; info.has_half_images = true; info.has_nanovdb = true; - info.has_volume_decoupled = true; - info.has_branched_path = true; - info.has_adaptive_stop_per_sample = true; info.has_osl = true; info.has_profiling = true; info.has_peer_memory = false; @@ -660,16 +286,16 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices, info.id += device.id; /* Set device type to MULTI if subdevices are not of a common type. */ - if (device.type != info.type) { + if (info.type == DEVICE_NONE) { + info.type = device.type; + } + else if (device.type != info.type) { info.type = DEVICE_MULTI; } /* Accumulate device info. */ info.has_half_images &= device.has_half_images; info.has_nanovdb &= device.has_nanovdb; - info.has_volume_decoupled &= device.has_volume_decoupled; - info.has_branched_path &= device.has_branched_path; - info.has_adaptive_stop_per_sample &= device.has_adaptive_stop_per_sample; info.has_osl &= device.has_osl; info.has_profiling &= device.has_profiling; info.has_peer_memory |= device.has_peer_memory; @@ -689,60 +315,32 @@ void Device::free_memory() devices_initialized_mask = 0; cuda_devices.free_memory(); optix_devices.free_memory(); - opencl_devices.free_memory(); cpu_devices.free_memory(); - network_devices.free_memory(); } -/* DeviceInfo */ - -void DeviceInfo::add_denoising_devices(DenoiserType denoiser_type) +unique_ptr<DeviceQueue> Device::gpu_queue_create() { - assert(denoising_devices.empty()); - - if (denoiser_type == DENOISER_OPTIX && type != DEVICE_OPTIX) { - vector<DeviceInfo> optix_devices = Device::available_devices(DEVICE_MASK_OPTIX); - if (!optix_devices.empty()) { - /* Convert to a special multi device with separate denoising devices. */ - if (multi_devices.empty()) { - multi_devices.push_back(*this); - } - - /* Try to use the same physical devices for denoising. 
*/ - for (const DeviceInfo &cuda_device : multi_devices) { - if (cuda_device.type == DEVICE_CUDA) { - for (const DeviceInfo &optix_device : optix_devices) { - if (cuda_device.num == optix_device.num) { - id += optix_device.id; - denoising_devices.push_back(optix_device); - break; - } - } - } - } - - if (denoising_devices.empty()) { - /* Simply use the first available OptiX device. */ - const DeviceInfo optix_device = optix_devices.front(); - id += optix_device.id; /* Uniquely identify this special multi device. */ - denoising_devices.push_back(optix_device); - } + LOG(FATAL) << "Device does not support queues."; + return nullptr; +} - denoisers = denoiser_type; - } - } - else if (denoiser_type == DENOISER_OPENIMAGEDENOISE && type != DEVICE_CPU) { - /* Convert to a special multi device with separate denoising devices. */ - if (multi_devices.empty()) { - multi_devices.push_back(*this); - } +const CPUKernels *Device::get_cpu_kernels() const +{ + LOG(FATAL) << "Device does not support CPU kernels."; + return nullptr; +} - /* Add CPU denoising devices. 
*/ - DeviceInfo cpu_device = Device::available_devices(DEVICE_MASK_CPU).front(); - denoising_devices.push_back(cpu_device); +void Device::get_cpu_kernel_thread_globals( + vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/) +{ + LOG(FATAL) << "Device does not support CPU kernels."; +} - denoisers = denoiser_type; - } +void *Device::get_cpu_osl_memory() +{ + return nullptr; } +/* DeviceInfo */ + CCL_NAMESPACE_END diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index ecf79bcdfa6..399d5eb91df 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -21,31 +21,34 @@ #include "bvh/bvh_params.h" +#include "device/device_denoise.h" #include "device/device_memory.h" -#include "device/device_task.h" +#include "util/util_function.h" #include "util/util_list.h" +#include "util/util_logging.h" #include "util/util_stats.h" #include "util/util_string.h" #include "util/util_texture.h" #include "util/util_thread.h" #include "util/util_types.h" +#include "util/util_unique_ptr.h" #include "util/util_vector.h" CCL_NAMESPACE_BEGIN class BVH; +class DeviceQueue; class Progress; -class RenderTile; +class CPUKernels; +class CPUKernelThreadGlobals; /* Device Types */ enum DeviceType { DEVICE_NONE = 0, DEVICE_CPU, - DEVICE_OPENCL, DEVICE_CUDA, - DEVICE_NETWORK, DEVICE_MULTI, DEVICE_OPTIX, DEVICE_DUMMY, @@ -53,20 +56,11 @@ enum DeviceType { enum DeviceTypeMask { DEVICE_MASK_CPU = (1 << DEVICE_CPU), - DEVICE_MASK_OPENCL = (1 << DEVICE_OPENCL), DEVICE_MASK_CUDA = (1 << DEVICE_CUDA), DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX), - DEVICE_MASK_NETWORK = (1 << DEVICE_NETWORK), DEVICE_MASK_ALL = ~0 }; -enum DeviceKernelStatus { - DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE, - DEVICE_KERNEL_USING_FEATURE_KERNEL, - DEVICE_KERNEL_FEATURE_KERNEL_INVALID, - DEVICE_KERNEL_UNKNOWN, -}; - #define DEVICE_MASK(type) (DeviceTypeMask)(1 << type) class DeviceInfo { @@ -75,20 +69,16 @@ class DeviceInfo { string description; string id; /* used for user 
preferences, should stay fixed with changing hardware config */ int num; - bool display_device; /* GPU is used as a display device. */ - bool has_half_images; /* Support half-float textures. */ - bool has_nanovdb; /* Support NanoVDB volumes. */ - bool has_volume_decoupled; /* Decoupled volume shading. */ - bool has_branched_path; /* Supports branched path tracing. */ - bool has_adaptive_stop_per_sample; /* Per-sample adaptive sampling stopping. */ - bool has_osl; /* Support Open Shading Language. */ - bool use_split_kernel; /* Use split or mega kernel. */ - bool has_profiling; /* Supports runtime collection of profiling info. */ - bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */ - DenoiserTypeMask denoisers; /* Supported denoiser types. */ + bool display_device; /* GPU is used as a display device. */ + bool has_nanovdb; /* Support NanoVDB volumes. */ + bool has_half_images; /* Support half-float textures. */ + bool has_osl; /* Support Open Shading Language. */ + bool has_profiling; /* Supports runtime collection of profiling info. */ + bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */ + bool has_gpu_queue; /* Device supports GPU queue. */ + DenoiserTypeMask denoisers; /* Supported denoiser types. */ int cpu_threads; vector<DeviceInfo> multi_devices; - vector<DeviceInfo> denoising_devices; string error_msg; DeviceInfo() @@ -100,227 +90,35 @@ class DeviceInfo { display_device = false; has_half_images = false; has_nanovdb = false; - has_volume_decoupled = false; - has_branched_path = true; - has_adaptive_stop_per_sample = false; has_osl = false; - use_split_kernel = false; has_profiling = false; has_peer_memory = false; + has_gpu_queue = false; denoisers = DENOISER_NONE; } - bool operator==(const DeviceInfo &info) + bool operator==(const DeviceInfo &info) const { /* Multiple Devices with the same ID would be very bad. 
*/ assert(id != info.id || (type == info.type && num == info.num && description == info.description)); return id == info.id; } - - /* Add additional devices needed for the specified denoiser. */ - void add_denoising_devices(DenoiserType denoiser_type); -}; - -class DeviceRequestedFeatures { - public: - /* Use experimental feature set. */ - bool experimental; - - /* Selective nodes compilation. */ - - /* Identifier of a node group up to which all the nodes needs to be - * compiled in. Nodes from higher group indices will be ignores. - */ - int max_nodes_group; - - /* Features bitfield indicating which features from the requested group - * will be compiled in. Nodes which corresponds to features which are not - * in this bitfield will be ignored even if they're in the requested group. - */ - int nodes_features; - - /* BVH/sampling kernel features. */ - bool use_hair; - bool use_hair_thick; - bool use_object_motion; - bool use_camera_motion; - - /* Denotes whether baking functionality is needed. */ - bool use_baking; - - /* Use subsurface scattering materials. */ - bool use_subsurface; - - /* Use volume materials. */ - bool use_volume; - - /* Use branched integrator. */ - bool use_integrator_branched; - - /* Use OpenSubdiv patch evaluation */ - bool use_patch_evaluation; - - /* Use Transparent shadows */ - bool use_transparent; - - /* Use various shadow tricks, such as shadow catcher. */ - bool use_shadow_tricks; - - /* Per-uber shader usage flags. */ - bool use_principled; - - /* Denoising features. */ - bool use_denoising; - - /* Use raytracing in shaders. */ - bool use_shader_raytrace; - - /* Use true displacement */ - bool use_true_displacement; - - /* Use background lights */ - bool use_background_light; - - DeviceRequestedFeatures() - { - /* TODO(sergey): Find more meaningful defaults. 
*/ - max_nodes_group = 0; - nodes_features = 0; - use_hair = false; - use_hair_thick = false; - use_object_motion = false; - use_camera_motion = false; - use_baking = false; - use_subsurface = false; - use_volume = false; - use_integrator_branched = false; - use_patch_evaluation = false; - use_transparent = false; - use_shadow_tricks = false; - use_principled = false; - use_denoising = false; - use_shader_raytrace = false; - use_true_displacement = false; - use_background_light = false; - } - - bool modified(const DeviceRequestedFeatures &requested_features) - { - return !(max_nodes_group == requested_features.max_nodes_group && - nodes_features == requested_features.nodes_features && - use_hair == requested_features.use_hair && - use_hair_thick == requested_features.use_hair_thick && - use_object_motion == requested_features.use_object_motion && - use_camera_motion == requested_features.use_camera_motion && - use_baking == requested_features.use_baking && - use_subsurface == requested_features.use_subsurface && - use_volume == requested_features.use_volume && - use_integrator_branched == requested_features.use_integrator_branched && - use_patch_evaluation == requested_features.use_patch_evaluation && - use_transparent == requested_features.use_transparent && - use_shadow_tricks == requested_features.use_shadow_tricks && - use_principled == requested_features.use_principled && - use_denoising == requested_features.use_denoising && - use_shader_raytrace == requested_features.use_shader_raytrace && - use_true_displacement == requested_features.use_true_displacement && - use_background_light == requested_features.use_background_light); - } - - /* Convert the requested features structure to a build options, - * which could then be passed to compilers. 
- */ - string get_build_options() const - { - string build_options = ""; - if (experimental) { - build_options += "-D__KERNEL_EXPERIMENTAL__ "; - } - build_options += "-D__NODES_MAX_GROUP__=" + string_printf("%d", max_nodes_group); - build_options += " -D__NODES_FEATURES__=" + string_printf("%d", nodes_features); - if (!use_hair) { - build_options += " -D__NO_HAIR__"; - } - if (!use_object_motion) { - build_options += " -D__NO_OBJECT_MOTION__"; - } - if (!use_camera_motion) { - build_options += " -D__NO_CAMERA_MOTION__"; - } - if (!use_baking) { - build_options += " -D__NO_BAKING__"; - } - if (!use_volume) { - build_options += " -D__NO_VOLUME__"; - } - if (!use_subsurface) { - build_options += " -D__NO_SUBSURFACE__"; - } - if (!use_integrator_branched) { - build_options += " -D__NO_BRANCHED_PATH__"; - } - if (!use_patch_evaluation) { - build_options += " -D__NO_PATCH_EVAL__"; - } - if (!use_transparent && !use_volume) { - build_options += " -D__NO_TRANSPARENT__"; - } - if (!use_shadow_tricks) { - build_options += " -D__NO_SHADOW_TRICKS__"; - } - if (!use_principled) { - build_options += " -D__NO_PRINCIPLED__"; - } - if (!use_denoising) { - build_options += " -D__NO_DENOISING__"; - } - if (!use_shader_raytrace) { - build_options += " -D__NO_SHADER_RAYTRACE__"; - } - return build_options; - } }; -std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &requested_features); - /* Device */ -struct DeviceDrawParams { - function<void()> bind_display_space_shader_cb; - function<void()> unbind_display_space_shader_cb; -}; - class Device { friend class device_sub_ptr; protected: - enum { - FALLBACK_SHADER_STATUS_NONE = 0, - FALLBACK_SHADER_STATUS_ERROR, - FALLBACK_SHADER_STATUS_SUCCESS, - }; - - Device(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background) - : background(background), - vertex_buffer(0), - fallback_status(FALLBACK_SHADER_STATUS_NONE), - fallback_shader_program(0), - info(info_), - stats(stats_), - profiler(profiler_) + 
Device(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_) + : info(info_), stats(stats_), profiler(profiler_) { } - bool background; string error_msg; - /* used for real time display */ - unsigned int vertex_buffer; - int fallback_status, fallback_shader_program; - int image_texture_location, fullscreen_location; - - bool bind_fallback_display_space_shader(const float width, const float height); - virtual device_ptr mem_alloc_sub_ptr(device_memory & /*mem*/, int /*offset*/, int /*size*/) { /* Only required for devices that implement denoising. */ @@ -361,67 +159,31 @@ class Device { Stats &stats; Profiler &profiler; - /* memory alignment */ - virtual int mem_sub_ptr_alignment() - { - return MIN_ALIGNMENT_CPU_DATA_TYPES; - } - /* constant memory */ virtual void const_copy_to(const char *name, void *host, size_t size) = 0; - /* open shading language, only for CPU device */ - virtual void *osl_memory() - { - return NULL; - } - /* load/compile kernels, must be called before adding tasks */ - virtual bool load_kernels(const DeviceRequestedFeatures & /*requested_features*/) + virtual bool load_kernels(uint /*kernel_features*/) { return true; } - /* Wait for device to become available to upload data and receive tasks - * This method is used by the OpenCL device to load the - * optimized kernels or when not (yet) available load the - * generic kernels (only during foreground rendering) */ - virtual bool wait_for_availability(const DeviceRequestedFeatures & /*requested_features*/) - { - return true; - } - /* Check if there are 'better' kernels available to be used - * We can switch over to these kernels - * This method is used to determine if we can switch the preview kernels - * to regular kernels */ - virtual DeviceKernelStatus get_active_kernel_switch_state() - { - return DEVICE_KERNEL_USING_FEATURE_KERNEL; - } + /* GPU device only functions. + * These may not be used on CPU or multi-devices. 
*/ - /* tasks */ - virtual int get_split_task_count(DeviceTask &) - { - return 1; - } + /* Create new queue for executing kernels in. */ + virtual unique_ptr<DeviceQueue> gpu_queue_create(); + + /* CPU device only functions. + * These may not be used on GPU or multi-devices. */ - virtual void task_add(DeviceTask &task) = 0; - virtual void task_wait() = 0; - virtual void task_cancel() = 0; - - /* opengl drawing */ - virtual void draw_pixels(device_memory &mem, - int y, - int w, - int h, - int width, - int height, - int dx, - int dy, - int dw, - int dh, - bool transparent, - const DeviceDrawParams &draw_params); + /* Get CPU kernel functions for native instruction set. */ + virtual const CPUKernels *get_cpu_kernels() const; + /* Get kernel globals to pass to kernels. */ + virtual void get_cpu_kernel_thread_globals( + vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/); + /* Get OpenShadingLanguage memory buffer. */ + virtual void *get_cpu_osl_memory(); /* acceleration structure building */ virtual void build_bvh(BVH *bvh, Progress &progress, bool refit); @@ -429,25 +191,11 @@ class Device { /* OptiX specific destructor. */ virtual void release_optix_bvh(BVH * /*bvh*/){}; -#ifdef WITH_NETWORK - /* networking */ - void server_run(); -#endif - /* multi device */ - virtual void map_tile(Device * /*sub_device*/, RenderTile & /*tile*/) - { - } virtual int device_number(Device * /*sub_device*/) { return 0; } - virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTileNeighbors & /*neighbors*/) - { - } - virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTileNeighbors & /*neighbors*/) - { - } virtual bool is_resident(device_ptr /*key*/, Device *sub_device) { @@ -460,11 +208,47 @@ class Device { return false; } + /* Graphics resources interoperability. + * + * The interoperability comes here by the meaning that the device is capable of computing result + * directly into an OpenGL (or other graphics library) buffer. 
*/ + + /* Check display is to be updated using graphics interoperability. + * The interoperability can not be used is it is not supported by the device. But the device + * might also force disable the interoperability if it detects that it will be slower than + * copying pixels from the render buffer. */ + virtual bool should_use_graphics_interop() + { + return false; + } + + /* Buffer denoising. */ + + /* Returns true if task is fully handled. */ + virtual bool denoise_buffer(const DeviceDenoiseTask & /*task*/) + { + LOG(ERROR) << "Request buffer denoising from a device which does not support it."; + return false; + } + + virtual DeviceQueue *get_denoise_queue() + { + LOG(ERROR) << "Request denoising queue from a device which does not support it."; + return nullptr; + } + + /* Sub-devices */ + + /* Run given callback for every individual device which will be handling rendering. + * For the single device the callback is called for the device itself. For the multi-device the + * callback is only called for the sub-devices. 
*/ + virtual void foreach_device(const function<void(Device *)> &callback) + { + callback(this); + } + /* static */ - static Device *create(DeviceInfo &info, - Stats &stats, - Profiler &profiler, - bool background = true); + static Device *create(const DeviceInfo &info, Stats &stats, Profiler &profiler); static DeviceType type_from_string(const char *name); static string string_from_type(DeviceType type); @@ -499,9 +283,7 @@ class Device { static thread_mutex device_mutex; static vector<DeviceInfo> cuda_devices; static vector<DeviceInfo> optix_devices; - static vector<DeviceInfo> opencl_devices; static vector<DeviceInfo> cpu_devices; - static vector<DeviceInfo> network_devices; static uint devices_initialized_mask; }; diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp deleted file mode 100644 index 4a6e77d6eaa..00000000000 --- a/intern/cycles/device/device_cpu.cpp +++ /dev/null @@ -1,1680 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <stdlib.h> -#include <string.h> - -/* So ImathMath is included before our kernel_cpu_compat. 
*/ -#ifdef WITH_OSL -/* So no context pollution happens from indirectly included windows.h */ -# include "util/util_windows.h" -# include <OSL/oslexec.h> -#endif - -#ifdef WITH_EMBREE -# include <embree3/rtcore.h> -#endif - -#include "device/device.h" -#include "device/device_denoising.h" -#include "device/device_intern.h" -#include "device/device_split_kernel.h" - -// clang-format off -#include "kernel/kernel.h" -#include "kernel/kernel_compat_cpu.h" -#include "kernel/kernel_types.h" -#include "kernel/split/kernel_split_data.h" -#include "kernel/kernel_globals.h" -#include "kernel/kernel_adaptive_sampling.h" - -#include "kernel/filter/filter.h" - -#include "kernel/osl/osl_shader.h" -#include "kernel/osl/osl_globals.h" -// clang-format on - -#include "bvh/bvh_embree.h" - -#include "render/buffers.h" -#include "render/coverage.h" - -#include "util/util_debug.h" -#include "util/util_foreach.h" -#include "util/util_function.h" -#include "util/util_logging.h" -#include "util/util_map.h" -#include "util/util_opengl.h" -#include "util/util_openimagedenoise.h" -#include "util/util_optimization.h" -#include "util/util_progress.h" -#include "util/util_system.h" -#include "util/util_task.h" -#include "util/util_thread.h" - -CCL_NAMESPACE_BEGIN - -class CPUDevice; - -/* Has to be outside of the class to be shared across template instantiations. */ -static const char *logged_architecture = ""; - -template<typename F> class KernelFunctions { - public: - KernelFunctions() - { - kernel = (F)NULL; - } - - KernelFunctions( - F kernel_default, F kernel_sse2, F kernel_sse3, F kernel_sse41, F kernel_avx, F kernel_avx2) - { - const char *architecture_name = "default"; - kernel = kernel_default; - - /* Silence potential warnings about unused variables - * when compiling without some architectures. 
*/ - (void)kernel_sse2; - (void)kernel_sse3; - (void)kernel_sse41; - (void)kernel_avx; - (void)kernel_avx2; -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) { - architecture_name = "AVX2"; - kernel = kernel_avx2; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if (DebugFlags().cpu.has_avx() && system_cpu_support_avx()) { - architecture_name = "AVX"; - kernel = kernel_avx; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if (DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) { - architecture_name = "SSE4.1"; - kernel = kernel_sse41; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if (DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) { - architecture_name = "SSE3"; - kernel = kernel_sse3; - } - else -#endif -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) { - architecture_name = "SSE2"; - kernel = kernel_sse2; - } -#else - { - /* Dummy to prevent the architecture if below become - * conditional when WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - * is not defined. 
*/ - } -#endif - - if (strcmp(architecture_name, logged_architecture) != 0) { - VLOG(1) << "Will be using " << architecture_name << " kernels."; - logged_architecture = architecture_name; - } - } - - inline F operator()() const - { - assert(kernel); - return kernel; - } - - protected: - F kernel; -}; - -class CPUSplitKernel : public DeviceSplitKernel { - CPUDevice *device; - - public: - explicit CPUSplitKernel(CPUDevice *device); - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, - RenderTile &rtile, - int num_global_elements, - device_memory &kernel_globals, - device_memory &kernel_data_, - device_memory &split_data, - device_memory &ray_state, - device_memory &queue_index, - device_memory &use_queues_flag, - device_memory &work_pool_wgs); - - virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name, - const DeviceRequestedFeatures &); - virtual int2 split_kernel_local_size(); - virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task); - virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads); -}; - -class CPUDevice : public Device { - public: - TaskPool task_pool; - KernelGlobals kernel_globals; - - device_vector<TextureInfo> texture_info; - bool need_texture_info; - -#ifdef WITH_OSL - OSLGlobals osl_globals; -#endif -#ifdef WITH_OPENIMAGEDENOISE - oidn::DeviceRef oidn_device; - oidn::FilterRef oidn_filter; -#endif - thread_spin_lock oidn_task_lock; -#ifdef WITH_EMBREE - RTCScene embree_scene = NULL; - RTCDevice embree_device; -#endif - - bool use_split_kernel; - - DeviceRequestedFeatures requested_features; - - KernelFunctions<void (*)(KernelGlobals *, float *, int, int, int, int, int)> path_trace_kernel; - KernelFunctions<void (*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> - convert_to_half_float_kernel; - KernelFunctions<void (*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> - 
convert_to_byte_kernel; - KernelFunctions<void (*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)> - shader_kernel; - KernelFunctions<void (*)(KernelGlobals *, float *, int, int, int, int, int)> bake_kernel; - - KernelFunctions<void (*)( - int, TileInfo *, int, int, float *, float *, float *, float *, float *, int *, int, int)> - filter_divide_shadow_kernel; - KernelFunctions<void (*)( - int, TileInfo *, int, int, int, int, float *, float *, float, int *, int, int)> - filter_get_feature_kernel; - KernelFunctions<void (*)(int, int, int, int *, float *, float *, int, int *)> - filter_write_feature_kernel; - KernelFunctions<void (*)(int, int, float *, float *, float *, float *, int *, int)> - filter_detect_outliers_kernel; - KernelFunctions<void (*)(int, int, float *, float *, float *, float *, int *, int)> - filter_combine_halves_kernel; - - KernelFunctions<void (*)( - int, int, float *, float *, float *, float *, int *, int, int, int, float, float)> - filter_nlm_calc_difference_kernel; - KernelFunctions<void (*)(float *, float *, int *, int, int)> filter_nlm_blur_kernel; - KernelFunctions<void (*)(float *, float *, int *, int, int)> filter_nlm_calc_weight_kernel; - KernelFunctions<void (*)( - int, int, float *, float *, float *, float *, float *, int *, int, int, int)> - filter_nlm_update_output_kernel; - KernelFunctions<void (*)(float *, float *, int *, int)> filter_nlm_normalize_kernel; - - KernelFunctions<void (*)( - float *, TileInfo *, int, int, int, float *, int *, int *, int, int, bool, int, float)> - filter_construct_transform_kernel; - KernelFunctions<void (*)(int, - int, - int, - float *, - float *, - float *, - int *, - float *, - float3 *, - int *, - int *, - int, - int, - int, - int, - bool)> - filter_nlm_construct_gramian_kernel; - KernelFunctions<void (*)(int, int, int, float *, int *, float *, float3 *, int *, int)> - filter_finalize_kernel; - - KernelFunctions<void (*)(KernelGlobals *, - ccl_constant KernelData *, - ccl_global void *, 
- int, - ccl_global char *, - int, - int, - int, - int, - int, - int, - int, - int, - ccl_global int *, - int, - ccl_global char *, - ccl_global unsigned int *, - unsigned int, - ccl_global float *)> - data_init_kernel; - unordered_map<string, KernelFunctions<void (*)(KernelGlobals *, KernelData *)>> split_kernels; - -#define KERNEL_FUNCTIONS(name) \ - KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \ - KERNEL_NAME_EVAL(cpu_sse3, name), KERNEL_NAME_EVAL(cpu_sse41, name), \ - KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name) - - CPUDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_) - : Device(info_, stats_, profiler_, background_), - texture_info(this, "__texture_info", MEM_GLOBAL), -#define REGISTER_KERNEL(name) name##_kernel(KERNEL_FUNCTIONS(name)) - REGISTER_KERNEL(path_trace), - REGISTER_KERNEL(convert_to_half_float), - REGISTER_KERNEL(convert_to_byte), - REGISTER_KERNEL(shader), - REGISTER_KERNEL(bake), - REGISTER_KERNEL(filter_divide_shadow), - REGISTER_KERNEL(filter_get_feature), - REGISTER_KERNEL(filter_write_feature), - REGISTER_KERNEL(filter_detect_outliers), - REGISTER_KERNEL(filter_combine_halves), - REGISTER_KERNEL(filter_nlm_calc_difference), - REGISTER_KERNEL(filter_nlm_blur), - REGISTER_KERNEL(filter_nlm_calc_weight), - REGISTER_KERNEL(filter_nlm_update_output), - REGISTER_KERNEL(filter_nlm_normalize), - REGISTER_KERNEL(filter_construct_transform), - REGISTER_KERNEL(filter_nlm_construct_gramian), - REGISTER_KERNEL(filter_finalize), - REGISTER_KERNEL(data_init) -#undef REGISTER_KERNEL - { - if (info.cpu_threads == 0) { - info.cpu_threads = TaskScheduler::num_threads(); - } - -#ifdef WITH_OSL - kernel_globals.osl = &osl_globals; -#endif -#ifdef WITH_EMBREE - embree_device = rtcNewDevice("verbose=0"); -#endif - use_split_kernel = DebugFlags().cpu.split_kernel; - if (use_split_kernel) { - VLOG(1) << "Will be using split kernel."; - } - need_texture_info = false; - -#define 
REGISTER_SPLIT_KERNEL(name) \ - split_kernels[#name] = KernelFunctions<void (*)(KernelGlobals *, KernelData *)>( \ - KERNEL_FUNCTIONS(name)) - REGISTER_SPLIT_KERNEL(path_init); - REGISTER_SPLIT_KERNEL(scene_intersect); - REGISTER_SPLIT_KERNEL(lamp_emission); - REGISTER_SPLIT_KERNEL(do_volume); - REGISTER_SPLIT_KERNEL(queue_enqueue); - REGISTER_SPLIT_KERNEL(indirect_background); - REGISTER_SPLIT_KERNEL(shader_setup); - REGISTER_SPLIT_KERNEL(shader_sort); - REGISTER_SPLIT_KERNEL(shader_eval); - REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao); - REGISTER_SPLIT_KERNEL(subsurface_scatter); - REGISTER_SPLIT_KERNEL(direct_lighting); - REGISTER_SPLIT_KERNEL(shadow_blocked_ao); - REGISTER_SPLIT_KERNEL(shadow_blocked_dl); - REGISTER_SPLIT_KERNEL(enqueue_inactive); - REGISTER_SPLIT_KERNEL(next_iteration_setup); - REGISTER_SPLIT_KERNEL(indirect_subsurface); - REGISTER_SPLIT_KERNEL(buffer_update); - REGISTER_SPLIT_KERNEL(adaptive_stopping); - REGISTER_SPLIT_KERNEL(adaptive_filter_x); - REGISTER_SPLIT_KERNEL(adaptive_filter_y); - REGISTER_SPLIT_KERNEL(adaptive_adjust_samples); -#undef REGISTER_SPLIT_KERNEL -#undef KERNEL_FUNCTIONS - } - - ~CPUDevice() - { -#ifdef WITH_EMBREE - rtcReleaseDevice(embree_device); -#endif - task_pool.cancel(); - texture_info.free(); - } - - virtual bool show_samples() const override - { - return (info.cpu_threads == 1); - } - - virtual BVHLayoutMask get_bvh_layout_mask() const override - { - BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2; -#ifdef WITH_EMBREE - bvh_layout_mask |= BVH_LAYOUT_EMBREE; -#endif /* WITH_EMBREE */ - return bvh_layout_mask; - } - - void load_texture_info() - { - if (need_texture_info) { - texture_info.copy_to_device(); - need_texture_info = false; - } - } - - virtual void mem_alloc(device_memory &mem) override - { - if (mem.type == MEM_TEXTURE) { - assert(!"mem_alloc not supported for textures."); - } - else if (mem.type == MEM_GLOBAL) { - assert(!"mem_alloc not supported for global memory."); - } - else 
{ - if (mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - } - - if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) { - size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES; - void *data = util_aligned_malloc(mem.memory_size(), alignment); - mem.device_pointer = (device_ptr)data; - } - else { - mem.device_pointer = (device_ptr)mem.host_pointer; - } - - mem.device_size = mem.memory_size(); - stats.mem_alloc(mem.device_size); - } - } - - virtual void mem_copy_to(device_memory &mem) override - { - if (mem.type == MEM_GLOBAL) { - global_free(mem); - global_alloc(mem); - } - else if (mem.type == MEM_TEXTURE) { - tex_free((device_texture &)mem); - tex_alloc((device_texture &)mem); - } - else if (mem.type == MEM_PIXELS) { - assert(!"mem_copy_to not supported for pixels."); - } - else { - if (!mem.device_pointer) { - mem_alloc(mem); - } - - /* copy is no-op */ - } - } - - virtual void mem_copy_from( - device_memory & /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/) override - { - /* no-op */ - } - - virtual void mem_zero(device_memory &mem) override - { - if (!mem.device_pointer) { - mem_alloc(mem); - } - - if (mem.device_pointer) { - memset((void *)mem.device_pointer, 0, mem.memory_size()); - } - } - - virtual void mem_free(device_memory &mem) override - { - if (mem.type == MEM_GLOBAL) { - global_free(mem); - } - else if (mem.type == MEM_TEXTURE) { - tex_free((device_texture &)mem); - } - else if (mem.device_pointer) { - if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) { - util_aligned_free((void *)mem.device_pointer); - } - mem.device_pointer = 0; - stats.mem_free(mem.device_size); - mem.device_size = 0; - } - } - - virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override - { - return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); - } - - virtual void 
const_copy_to(const char *name, void *host, size_t size) override - { -#if WITH_EMBREE - if (strcmp(name, "__data") == 0) { - assert(size <= sizeof(KernelData)); - - // Update scene handle (since it is different for each device on multi devices) - KernelData *const data = (KernelData *)host; - data->bvh.scene = embree_scene; - } -#endif - kernel_const_copy(&kernel_globals, name, host, size); - } - - void global_alloc(device_memory &mem) - { - VLOG(1) << "Global memory allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - kernel_global_memory_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size); - - mem.device_pointer = (device_ptr)mem.host_pointer; - mem.device_size = mem.memory_size(); - stats.mem_alloc(mem.device_size); - } - - void global_free(device_memory &mem) - { - if (mem.device_pointer) { - mem.device_pointer = 0; - stats.mem_free(mem.device_size); - mem.device_size = 0; - } - } - - void tex_alloc(device_texture &mem) - { - VLOG(1) << "Texture allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - mem.device_pointer = (device_ptr)mem.host_pointer; - mem.device_size = mem.memory_size(); - stats.mem_alloc(mem.device_size); - - const uint slot = mem.slot; - if (slot >= texture_info.size()) { - /* Allocate some slots in advance, to reduce amount of re-allocations. 
*/ - texture_info.resize(slot + 128); - } - - texture_info[slot] = mem.info; - texture_info[slot].data = (uint64_t)mem.host_pointer; - need_texture_info = true; - } - - void tex_free(device_texture &mem) - { - if (mem.device_pointer) { - mem.device_pointer = 0; - stats.mem_free(mem.device_size); - mem.device_size = 0; - need_texture_info = true; - } - } - - virtual void *osl_memory() override - { -#ifdef WITH_OSL - return &osl_globals; -#else - return NULL; -#endif - } - - void build_bvh(BVH *bvh, Progress &progress, bool refit) override - { -#ifdef WITH_EMBREE - if (bvh->params.bvh_layout == BVH_LAYOUT_EMBREE || - bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) { - BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh); - if (refit) { - bvh_embree->refit(progress); - } - else { - bvh_embree->build(progress, &stats, embree_device); - } - - if (bvh->params.top_level) { - embree_scene = bvh_embree->scene; - } - } - else -#endif - Device::build_bvh(bvh, progress, refit); - } - - void thread_run(DeviceTask &task) - { - if (task.type == DeviceTask::RENDER) - thread_render(task); - else if (task.type == DeviceTask::SHADER) - thread_shader(task); - else if (task.type == DeviceTask::FILM_CONVERT) - thread_film_convert(task); - else if (task.type == DeviceTask::DENOISE_BUFFER) - thread_denoise(task); - } - - bool denoising_non_local_means(device_ptr image_ptr, - device_ptr guide_ptr, - device_ptr variance_ptr, - device_ptr out_ptr, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_NON_LOCAL_MEANS); - - int4 rect = task->rect; - int r = task->nlm_state.r; - int f = task->nlm_state.f; - float a = task->nlm_state.a; - float k_2 = task->nlm_state.k_2; - - int w = align_up(rect.z - rect.x, 4); - int h = rect.w - rect.y; - int stride = task->buffer.stride; - int channel_offset = task->nlm_state.is_color ? 
task->buffer.pass_stride : 0; - - float *temporary_mem = (float *)task->buffer.temporary_mem.device_pointer; - float *blurDifference = temporary_mem; - float *difference = temporary_mem + task->buffer.pass_stride; - float *weightAccum = temporary_mem + 2 * task->buffer.pass_stride; - - memset(weightAccum, 0, sizeof(float) * w * h); - memset((float *)out_ptr, 0, sizeof(float) * w * h); - - for (int i = 0; i < (2 * r + 1) * (2 * r + 1); i++) { - int dy = i / (2 * r + 1) - r; - int dx = i % (2 * r + 1) - r; - - int local_rect[4] = { - max(0, -dx), max(0, -dy), rect.z - rect.x - max(0, dx), rect.w - rect.y - max(0, dy)}; - filter_nlm_calc_difference_kernel()(dx, - dy, - (float *)guide_ptr, - (float *)variance_ptr, - NULL, - difference, - local_rect, - w, - channel_offset, - 0, - a, - k_2); - - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f); - filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f); - - filter_nlm_update_output_kernel()(dx, - dy, - blurDifference, - (float *)image_ptr, - difference, - (float *)out_ptr, - weightAccum, - local_rect, - channel_offset, - stride, - f); - } - - int local_rect[4] = {0, 0, rect.z - rect.x, rect.w - rect.y}; - filter_nlm_normalize_kernel()((float *)out_ptr, weightAccum, local_rect, w); - - return true; - } - - bool denoising_construct_transform(DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_CONSTRUCT_TRANSFORM); - - for (int y = 0; y < task->filter_area.w; y++) { - for (int x = 0; x < task->filter_area.z; x++) { - filter_construct_transform_kernel()((float *)task->buffer.mem.device_pointer, - task->tile_info, - x + task->filter_area.x, - y + task->filter_area.y, - y * task->filter_area.z + x, - (float *)task->storage.transform.device_pointer, - (int *)task->storage.rank.device_pointer, - &task->rect.x, - task->buffer.pass_stride, - task->buffer.frame_stride, - 
task->buffer.use_time, - task->radius, - task->pca_threshold); - } - } - return true; - } - - bool denoising_accumulate(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT); - - float *temporary_mem = (float *)task->buffer.temporary_mem.device_pointer; - float *difference = temporary_mem; - float *blurDifference = temporary_mem + task->buffer.pass_stride; - - int r = task->radius; - int frame_offset = frame * task->buffer.frame_stride; - for (int i = 0; i < (2 * r + 1) * (2 * r + 1); i++) { - int dy = i / (2 * r + 1) - r; - int dx = i % (2 * r + 1) - r; - - int local_rect[4] = {max(0, -dx), - max(0, -dy), - task->reconstruction_state.source_w - max(0, dx), - task->reconstruction_state.source_h - max(0, dy)}; - filter_nlm_calc_difference_kernel()(dx, - dy, - (float *)color_ptr, - (float *)color_variance_ptr, - (float *)scale_ptr, - difference, - local_rect, - task->buffer.stride, - task->buffer.pass_stride, - frame_offset, - 1.0f, - task->nlm_k_2); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); - filter_nlm_calc_weight_kernel()( - blurDifference, difference, local_rect, task->buffer.stride, 4); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); - filter_nlm_construct_gramian_kernel()(dx, - dy, - task->tile_info->frames[frame], - blurDifference, - (float *)task->buffer.mem.device_pointer, - (float *)task->storage.transform.device_pointer, - (int *)task->storage.rank.device_pointer, - (float *)task->storage.XtWX.device_pointer, - (float3 *)task->storage.XtWY.device_pointer, - local_rect, - &task->reconstruction_state.filter_window.x, - task->buffer.stride, - 4, - task->buffer.pass_stride, - frame_offset, - task->buffer.use_time); - } - - return true; - } - - bool denoising_solve(device_ptr output_ptr, DenoisingTask *task) - { - for (int y = 0; 
y < task->filter_area.w; y++) { - for (int x = 0; x < task->filter_area.z; x++) { - filter_finalize_kernel()(x, - y, - y * task->filter_area.z + x, - (float *)output_ptr, - (int *)task->storage.rank.device_pointer, - (float *)task->storage.XtWX.device_pointer, - (float3 *)task->storage.XtWY.device_pointer, - &task->reconstruction_state.buffer_params.x, - task->render_buffer.samples); - } - } - return true; - } - - bool denoising_combine_halves(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, - int4 rect, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_COMBINE_HALVES); - - for (int y = rect.y; y < rect.w; y++) { - for (int x = rect.x; x < rect.z; x++) { - filter_combine_halves_kernel()(x, - y, - (float *)mean_ptr, - (float *)variance_ptr, - (float *)a_ptr, - (float *)b_ptr, - &rect.x, - r); - } - } - return true; - } - - bool denoising_divide_shadow(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DIVIDE_SHADOW); - - for (int y = task->rect.y; y < task->rect.w; y++) { - for (int x = task->rect.x; x < task->rect.z; x++) { - filter_divide_shadow_kernel()(task->render_buffer.samples, - task->tile_info, - x, - y, - (float *)a_ptr, - (float *)b_ptr, - (float *)sample_variance_ptr, - (float *)sv_variance_ptr, - (float *)buffer_variance_ptr, - &task->rect.x, - task->render_buffer.pass_stride, - task->render_buffer.offset); - } - } - return true; - } - - bool denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_GET_FEATURE); - - for (int y = task->rect.y; y < task->rect.w; y++) { - for (int x = task->rect.x; x < task->rect.z; x++) { - 
filter_get_feature_kernel()(task->render_buffer.samples, - task->tile_info, - mean_offset, - variance_offset, - x, - y, - (float *)mean_ptr, - (float *)variance_ptr, - scale, - &task->rect.x, - task->render_buffer.pass_stride, - task->render_buffer.offset); - } - } - return true; - } - - bool denoising_write_feature(int out_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task) - { - for (int y = 0; y < task->filter_area.w; y++) { - for (int x = 0; x < task->filter_area.z; x++) { - filter_write_feature_kernel()(task->render_buffer.samples, - x + task->filter_area.x, - y + task->filter_area.y, - &task->reconstruction_state.buffer_params.x, - (float *)from_ptr, - (float *)buffer_ptr, - out_offset, - &task->rect.x); - } - } - return true; - } - - bool denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DETECT_OUTLIERS); - - for (int y = task->rect.y; y < task->rect.w; y++) { - for (int x = task->rect.x; x < task->rect.z; x++) { - filter_detect_outliers_kernel()(x, - y, - (float *)image_ptr, - (float *)variance_ptr, - (float *)depth_ptr, - (float *)output_ptr, - &task->rect.x, - task->buffer.pass_stride); - } - } - return true; - } - - bool adaptive_sampling_filter(KernelGlobals *kg, RenderTile &tile, int sample) - { - WorkTile wtile; - wtile.x = tile.x; - wtile.y = tile.y; - wtile.w = tile.w; - wtile.h = tile.h; - wtile.offset = tile.offset; - wtile.stride = tile.stride; - wtile.buffer = (float *)tile.buffer; - - /* For CPU we do adaptive stopping per sample so we can stop earlier, but - * for combined CPU + GPU rendering we match the GPU and do it per tile - * after a given number of sample steps. 
*/ - if (!kernel_data.integrator.adaptive_stop_per_sample) { - for (int y = wtile.y; y < wtile.y + wtile.h; ++y) { - for (int x = wtile.x; x < wtile.x + wtile.w; ++x) { - const int index = wtile.offset + x + y * wtile.stride; - float *buffer = wtile.buffer + index * kernel_data.film.pass_stride; - kernel_do_adaptive_stopping(kg, buffer, sample); - } - } - } - - bool any = false; - for (int y = wtile.y; y < wtile.y + wtile.h; ++y) { - any |= kernel_do_adaptive_filter_x(kg, y, &wtile); - } - for (int x = wtile.x; x < wtile.x + wtile.w; ++x) { - any |= kernel_do_adaptive_filter_y(kg, x, &wtile); - } - return (!any); - } - - void adaptive_sampling_post(const RenderTile &tile, KernelGlobals *kg) - { - float *render_buffer = (float *)tile.buffer; - for (int y = tile.y; y < tile.y + tile.h; y++) { - for (int x = tile.x; x < tile.x + tile.w; x++) { - int index = tile.offset + x + y * tile.stride; - ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride; - if (buffer[kernel_data.film.pass_sample_count] < 0.0f) { - buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count]; - float sample_multiplier = tile.sample / buffer[kernel_data.film.pass_sample_count]; - if (sample_multiplier != 1.0f) { - kernel_adaptive_post_adjust(kg, buffer, sample_multiplier); - } - } - else { - kernel_adaptive_post_adjust(kg, buffer, tile.sample / (tile.sample - 1.0f)); - } - } - } - } - - void render(DeviceTask &task, RenderTile &tile, KernelGlobals *kg) - { - const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE; - - scoped_timer timer(&tile.buffers->render_time); - - Coverage coverage(kg, tile); - if (use_coverage) { - coverage.init_path_trace(); - } - - float *render_buffer = (float *)tile.buffer; - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; - - /* Needed for Embree. 
*/ - SIMD_SET_FLUSH_TO_ZERO; - - for (int sample = start_sample; sample < end_sample; sample++) { - if (task.get_cancel() || TaskPool::canceled()) { - if (task.need_finish_queue == false) - break; - } - - if (tile.stealing_state == RenderTile::CAN_BE_STOLEN && task.get_tile_stolen()) { - tile.stealing_state = RenderTile::WAS_STOLEN; - break; - } - - if (tile.task == RenderTile::PATH_TRACE) { - for (int y = tile.y; y < tile.y + tile.h; y++) { - for (int x = tile.x; x < tile.x + tile.w; x++) { - if (use_coverage) { - coverage.init_pixel(x, y); - } - path_trace_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride); - } - } - } - else { - for (int y = tile.y; y < tile.y + tile.h; y++) { - for (int x = tile.x; x < tile.x + tile.w; x++) { - bake_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride); - } - } - } - tile.sample = sample + 1; - - if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) { - const bool stop = adaptive_sampling_filter(kg, tile, sample); - if (stop) { - const int num_progress_samples = end_sample - sample; - tile.sample = end_sample; - task.update_progress(&tile, tile.w * tile.h * num_progress_samples); - break; - } - } - - task.update_progress(&tile, tile.w * tile.h); - } - if (use_coverage) { - coverage.finalize(); - } - - if (task.adaptive_sampling.use && (tile.stealing_state != RenderTile::WAS_STOLEN)) { - adaptive_sampling_post(tile, kg); - } - } - - void denoise_openimagedenoise_buffer(DeviceTask &task, - float *buffer, - const size_t offset, - const size_t stride, - const size_t x, - const size_t y, - const size_t w, - const size_t h, - const float scale) - { -#ifdef WITH_OPENIMAGEDENOISE - assert(openimagedenoise_supported()); - - /* Only one at a time, since OpenImageDenoise itself is multithreaded for full - * buffers, and for tiled rendering because creating multiple devices and filters - * is slow and memory hungry as well. 
- * - * TODO: optimize tiled rendering case, by batching together denoising of many - * tiles somehow? */ - static thread_mutex mutex; - thread_scoped_lock lock(mutex); - - /* Create device and filter, cached for reuse. */ - if (!oidn_device) { - oidn_device = oidn::newDevice(); - oidn_device.commit(); - } - if (!oidn_filter) { - oidn_filter = oidn_device.newFilter("RT"); - oidn_filter.set("hdr", true); - oidn_filter.set("srgb", false); - } - - /* Set images with appropriate stride for our interleaved pass storage. */ - struct { - const char *name; - const int offset; - const bool scale; - const bool use; - array<float> scaled_buffer; - } passes[] = {{"color", task.pass_denoising_data + DENOISING_PASS_COLOR, false, true}, - {"albedo", - task.pass_denoising_data + DENOISING_PASS_ALBEDO, - true, - task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO}, - {"normal", - task.pass_denoising_data + DENOISING_PASS_NORMAL, - true, - task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO_NORMAL}, - {"output", 0, false, true}, - { NULL, - 0 }}; - - for (int i = 0; passes[i].name; i++) { - if (!passes[i].use) { - continue; - } - - const int64_t pixel_offset = offset + x + y * stride; - const int64_t buffer_offset = (pixel_offset * task.pass_stride + passes[i].offset); - const int64_t pixel_stride = task.pass_stride; - const int64_t row_stride = stride * pixel_stride; - - if (passes[i].scale && scale != 1.0f) { - /* Normalize albedo and normal passes as they are scaled by the number of samples. - * For the color passes OIDN will perform auto-exposure making it unnecessary. 
*/ - array<float> &scaled_buffer = passes[i].scaled_buffer; - scaled_buffer.resize(w * h * 3); - - for (int y = 0; y < h; y++) { - const float *pass_row = buffer + buffer_offset + y * row_stride; - float *scaled_row = scaled_buffer.data() + y * w * 3; - - for (int x = 0; x < w; x++) { - scaled_row[x * 3 + 0] = pass_row[x * pixel_stride + 0] * scale; - scaled_row[x * 3 + 1] = pass_row[x * pixel_stride + 1] * scale; - scaled_row[x * 3 + 2] = pass_row[x * pixel_stride + 2] * scale; - } - } - - oidn_filter.setImage( - passes[i].name, scaled_buffer.data(), oidn::Format::Float3, w, h, 0, 0, 0); - } - else { - oidn_filter.setImage(passes[i].name, - buffer + buffer_offset, - oidn::Format::Float3, - w, - h, - 0, - pixel_stride * sizeof(float), - row_stride * sizeof(float)); - } - } - - /* Execute filter. */ - oidn_filter.commit(); - oidn_filter.execute(); -#else - (void)task; - (void)buffer; - (void)offset; - (void)stride; - (void)x; - (void)y; - (void)w; - (void)h; - (void)scale; -#endif - } - - void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile) - { - if (task.type == DeviceTask::DENOISE_BUFFER) { - /* Copy pixels from compute device to CPU (no-op for CPU device). */ - rtile.buffers->buffer.copy_from_device(); - - denoise_openimagedenoise_buffer(task, - (float *)rtile.buffer, - rtile.offset, - rtile.stride, - rtile.x, - rtile.y, - rtile.w, - rtile.h, - 1.0f / rtile.sample); - - /* todo: it may be possible to avoid this copy, but we have to ensure that - * when other code copies data from the device it doesn't overwrite the - * denoiser buffers. */ - rtile.buffers->buffer.copy_to_device(); - } - else { - /* Per-tile denoising. */ - rtile.sample = rtile.start_sample + rtile.num_samples; - const float scale = 1.0f / rtile.sample; - const float invscale = rtile.sample; - const size_t pass_stride = task.pass_stride; - - /* Map neighboring tiles into one buffer for denoising. 
*/ - RenderTileNeighbors neighbors(rtile); - task.map_neighbor_tiles(neighbors, this); - RenderTile ¢er_tile = neighbors.tiles[RenderTileNeighbors::CENTER]; - rtile = center_tile; - - /* Calculate size of the tile to denoise (including overlap). The overlap - * size was chosen empirically. OpenImageDenoise specifies an overlap size - * of 128 but this is significantly bigger than typical tile size. */ - const int4 rect = rect_clip(rect_expand(center_tile.bounds(), 64), neighbors.bounds()); - const int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y); - - /* Adjacent tiles are in separate memory regions, copy into single buffer. */ - array<float> merged(rect_size.x * rect_size.y * task.pass_stride); - - for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { - RenderTile &ntile = neighbors.tiles[i]; - if (!ntile.buffer) { - continue; - } - - const int xmin = max(ntile.x, rect.x); - const int ymin = max(ntile.y, rect.y); - const int xmax = min(ntile.x + ntile.w, rect.z); - const int ymax = min(ntile.y + ntile.h, rect.w); - - const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride; - const float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride; - - const size_t merged_stride = rect_size.x; - const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride; - float *merged_buffer = merged.data() + merged_offset * pass_stride; - - for (int y = ymin; y < ymax; y++) { - for (int x = 0; x < pass_stride * (xmax - xmin); x++) { - merged_buffer[x] = tile_buffer[x] * scale; - } - tile_buffer += ntile.stride * pass_stride; - merged_buffer += merged_stride * pass_stride; - } - } - - /* Denoise */ - denoise_openimagedenoise_buffer( - task, merged.data(), 0, rect_size.x, 0, 0, rect_size.x, rect_size.y, 1.0f); - - /* Copy back result from merged buffer. 
*/ - RenderTile &ntile = neighbors.target; - if (ntile.buffer) { - const int xmin = max(ntile.x, rect.x); - const int ymin = max(ntile.y, rect.y); - const int xmax = min(ntile.x + ntile.w, rect.z); - const int ymax = min(ntile.y + ntile.h, rect.w); - - const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride; - float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride; - - const size_t merged_stride = rect_size.x; - const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride; - const float *merged_buffer = merged.data() + merged_offset * pass_stride; - - for (int y = ymin; y < ymax; y++) { - for (int x = 0; x < pass_stride * (xmax - xmin); x += pass_stride) { - tile_buffer[x + 0] = merged_buffer[x + 0] * invscale; - tile_buffer[x + 1] = merged_buffer[x + 1] * invscale; - tile_buffer[x + 2] = merged_buffer[x + 2] * invscale; - } - tile_buffer += ntile.stride * pass_stride; - merged_buffer += merged_stride * pass_stride; - } - } - - task.unmap_neighbor_tiles(neighbors, this); - } - } - - void denoise_nlm(DenoisingTask &denoising, RenderTile &tile) - { - ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING); - - tile.sample = tile.start_sample + tile.num_samples; - - denoising.functions.construct_transform = function_bind( - &CPUDevice::denoising_construct_transform, this, &denoising); - denoising.functions.accumulate = function_bind( - &CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); - denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising); - denoising.functions.divide_shadow = function_bind( - &CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.non_local_means = function_bind( - &CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); - denoising.functions.combine_halves = function_bind( - &CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - 
denoising.functions.get_feature = function_bind( - &CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.write_feature = function_bind( - &CPUDevice::denoising_write_feature, this, _1, _2, _3, &denoising); - denoising.functions.detect_outliers = function_bind( - &CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); - - denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h); - denoising.render_buffer.samples = tile.sample; - denoising.buffer.gpu_temporary_mem = false; - - denoising.run_denoising(tile); - } - - void thread_render(DeviceTask &task) - { - if (TaskPool::canceled()) { - if (task.need_finish_queue == false) - return; - } - - /* allocate buffer for kernel globals */ - device_only_memory<KernelGlobals> kgbuffer(this, "kernel_globals"); - kgbuffer.alloc_to_device(1); - - KernelGlobals *kg = new ((void *)kgbuffer.device_pointer) - KernelGlobals(thread_kernel_globals_init()); - - profiler.add_state(&kg->profiler); - - CPUSplitKernel *split_kernel = NULL; - if (use_split_kernel) { - split_kernel = new CPUSplitKernel(this); - if (!split_kernel->load_kernels(requested_features)) { - thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer); - kgbuffer.free(); - delete split_kernel; - return; - } - } - - /* NLM denoiser. */ - DenoisingTask *denoising = NULL; - - /* OpenImageDenoise: we can only denoise with one thread at a time, so to - * avoid waiting with mutex locks in the denoiser, we let only a single - * thread acquire denoising tiles. 
*/ - uint tile_types = task.tile_types; - bool hold_denoise_lock = false; - if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) { - if (!oidn_task_lock.try_lock()) { - tile_types &= ~RenderTile::DENOISE; - hold_denoise_lock = true; - } - } - - RenderTile tile; - while (task.acquire_tile(this, tile, tile_types)) { - if (tile.task == RenderTile::PATH_TRACE) { - if (use_split_kernel) { - device_only_memory<uchar> void_buffer(this, "void_buffer"); - split_kernel->path_trace(task, tile, kgbuffer, void_buffer); - } - else { - render(task, tile, kg); - } - } - else if (tile.task == RenderTile::BAKE) { - render(task, tile, kg); - } - else if (tile.task == RenderTile::DENOISE) { - if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) { - denoise_openimagedenoise(task, tile); - } - else if (task.denoising.type == DENOISER_NLM) { - if (denoising == NULL) { - denoising = new DenoisingTask(this, task); - denoising->profiler = &kg->profiler; - } - denoise_nlm(*denoising, tile); - } - task.update_progress(&tile, tile.w * tile.h); - } - - task.release_tile(tile); - - if (TaskPool::canceled()) { - if (task.need_finish_queue == false) - break; - } - } - - if (hold_denoise_lock) { - oidn_task_lock.unlock(); - } - - profiler.remove_state(&kg->profiler); - - thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer); - kg->~KernelGlobals(); - kgbuffer.free(); - delete split_kernel; - delete denoising; - } - - void thread_denoise(DeviceTask &task) - { - RenderTile tile; - tile.x = task.x; - tile.y = task.y; - tile.w = task.w; - tile.h = task.h; - tile.buffer = task.buffer; - tile.sample = task.sample + task.num_samples; - tile.num_samples = task.num_samples; - tile.start_sample = task.sample; - tile.offset = task.offset; - tile.stride = task.stride; - tile.buffers = task.buffers; - - if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) { - denoise_openimagedenoise(task, tile); - } - else { - DenoisingTask denoising(this, task); - - 
ProfilingState denoising_profiler_state; - profiler.add_state(&denoising_profiler_state); - denoising.profiler = &denoising_profiler_state; - - denoise_nlm(denoising, tile); - - profiler.remove_state(&denoising_profiler_state); - } - - task.update_progress(&tile, tile.w * tile.h); - } - - void thread_film_convert(DeviceTask &task) - { - float sample_scale = 1.0f / (task.sample + 1); - - if (task.rgba_half) { - for (int y = task.y; y < task.y + task.h; y++) - for (int x = task.x; x < task.x + task.w; x++) - convert_to_half_float_kernel()(&kernel_globals, - (uchar4 *)task.rgba_half, - (float *)task.buffer, - sample_scale, - x, - y, - task.offset, - task.stride); - } - else { - for (int y = task.y; y < task.y + task.h; y++) - for (int x = task.x; x < task.x + task.w; x++) - convert_to_byte_kernel()(&kernel_globals, - (uchar4 *)task.rgba_byte, - (float *)task.buffer, - sample_scale, - x, - y, - task.offset, - task.stride); - } - } - - void thread_shader(DeviceTask &task) - { - KernelGlobals *kg = new KernelGlobals(thread_kernel_globals_init()); - - for (int sample = 0; sample < task.num_samples; sample++) { - for (int x = task.shader_x; x < task.shader_x + task.shader_w; x++) - shader_kernel()(kg, - (uint4 *)task.shader_input, - (float4 *)task.shader_output, - task.shader_eval_type, - task.shader_filter, - x, - task.offset, - sample); - - if (task.get_cancel() || TaskPool::canceled()) - break; - - task.update_progress(NULL); - } - - thread_kernel_globals_free(kg); - delete kg; - } - - virtual int get_split_task_count(DeviceTask &task) override - { - if (task.type == DeviceTask::SHADER) - return task.get_subtask_count(info.cpu_threads, 256); - else - return task.get_subtask_count(info.cpu_threads); - } - - virtual void task_add(DeviceTask &task) override - { - /* Load texture info. 
*/ - load_texture_info(); - - /* split task into smaller ones */ - list<DeviceTask> tasks; - - if (task.type == DeviceTask::DENOISE_BUFFER && - task.denoising.type == DENOISER_OPENIMAGEDENOISE) { - /* Denoise entire buffer at once with OIDN, it has own threading. */ - tasks.push_back(task); - } - else if (task.type == DeviceTask::SHADER) { - task.split(tasks, info.cpu_threads, 256); - } - else { - task.split(tasks, info.cpu_threads); - } - - foreach (DeviceTask &task, tasks) { - task_pool.push([=] { - DeviceTask task_copy = task; - thread_run(task_copy); - }); - } - } - - virtual void task_wait() override - { - task_pool.wait_work(); - } - - virtual void task_cancel() override - { - task_pool.cancel(); - } - - protected: - inline KernelGlobals thread_kernel_globals_init() - { - KernelGlobals kg = kernel_globals; - kg.transparent_shadow_intersections = NULL; - const int decoupled_count = sizeof(kg.decoupled_volume_steps) / - sizeof(*kg.decoupled_volume_steps); - for (int i = 0; i < decoupled_count; ++i) { - kg.decoupled_volume_steps[i] = NULL; - } - kg.decoupled_volume_steps_index = 0; - kg.coverage_asset = kg.coverage_object = kg.coverage_material = NULL; -#ifdef WITH_OSL - OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); -#endif - return kg; - } - - inline void thread_kernel_globals_free(KernelGlobals *kg) - { - if (kg == NULL) { - return; - } - - if (kg->transparent_shadow_intersections != NULL) { - free(kg->transparent_shadow_intersections); - } - const int decoupled_count = sizeof(kg->decoupled_volume_steps) / - sizeof(*kg->decoupled_volume_steps); - for (int i = 0; i < decoupled_count; ++i) { - if (kg->decoupled_volume_steps[i] != NULL) { - free(kg->decoupled_volume_steps[i]); - } - } -#ifdef WITH_OSL - OSLShader::thread_free(kg); -#endif - } - - virtual bool load_kernels(const DeviceRequestedFeatures &requested_features_) override - { - requested_features = requested_features_; - - return true; - } -}; - -/* split kernel */ - -class 
CPUSplitKernelFunction : public SplitKernelFunction { - public: - CPUDevice *device; - void (*func)(KernelGlobals *kg, KernelData *data); - - CPUSplitKernelFunction(CPUDevice *device) : device(device), func(NULL) - { - } - ~CPUSplitKernelFunction() - { - } - - virtual bool enqueue(const KernelDimensions &dim, - device_memory &kernel_globals, - device_memory &data) - { - if (!func) { - return false; - } - - KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer; - kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); - - for (int y = 0; y < dim.global_size[1]; y++) { - for (int x = 0; x < dim.global_size[0]; x++) { - kg->global_id = make_int2(x, y); - - func(kg, (KernelData *)data.device_pointer); - } - } - - return true; - } -}; - -CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device) -{ -} - -bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim, - RenderTile &rtile, - int num_global_elements, - device_memory &kernel_globals, - device_memory &data, - device_memory &split_data, - device_memory &ray_state, - device_memory &queue_index, - device_memory &use_queues_flags, - device_memory &work_pool_wgs) -{ - KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer; - kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); - - for (int y = 0; y < dim.global_size[1]; y++) { - for (int x = 0; x < dim.global_size[0]; x++) { - kg->global_id = make_int2(x, y); - - device->data_init_kernel()((KernelGlobals *)kernel_globals.device_pointer, - (KernelData *)data.device_pointer, - (void *)split_data.device_pointer, - num_global_elements, - (char *)ray_state.device_pointer, - rtile.start_sample, - rtile.start_sample + rtile.num_samples, - rtile.x, - rtile.y, - rtile.w, - rtile.h, - rtile.offset, - rtile.stride, - (int *)queue_index.device_pointer, - dim.global_size[0] * dim.global_size[1], - (char *)use_queues_flags.device_pointer, - (uint 
*)work_pool_wgs.device_pointer, - rtile.num_samples, - (float *)rtile.buffer); - } - } - - return true; -} - -SplitKernelFunction *CPUSplitKernel::get_split_kernel_function(const string &kernel_name, - const DeviceRequestedFeatures &) -{ - CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device); - - kernel->func = device->split_kernels[kernel_name](); - if (!kernel->func) { - delete kernel; - return NULL; - } - - return kernel; -} - -int2 CPUSplitKernel::split_kernel_local_size() -{ - return make_int2(1, 1); -} - -int2 CPUSplitKernel::split_kernel_global_size(device_memory & /*kg*/, - device_memory & /*data*/, - DeviceTask & /*task*/) -{ - return make_int2(1, 1); -} - -uint64_t CPUSplitKernel::state_buffer_size(device_memory &kernel_globals, - device_memory & /*data*/, - size_t num_threads) -{ - KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer; - - return split_data_buffer_size(kg, num_threads); -} - -Device *device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) -{ - return new CPUDevice(info, stats, profiler, background); -} - -void device_cpu_info(vector<DeviceInfo> &devices) -{ - DeviceInfo info; - - info.type = DEVICE_CPU; - info.description = system_cpu_brand_string(); - info.id = "CPU"; - info.num = 0; - info.has_volume_decoupled = true; - info.has_adaptive_stop_per_sample = true; - info.has_osl = true; - info.has_half_images = true; - info.has_nanovdb = true; - info.has_profiling = true; - info.denoisers = DENOISER_NLM; - if (openimagedenoise_supported()) { - info.denoisers |= DENOISER_OPENIMAGEDENOISE; - } - - devices.insert(devices.begin(), info); -} - -string device_cpu_capabilities() -{ - string capabilities = ""; - capabilities += system_cpu_support_sse2() ? "SSE2 " : ""; - capabilities += system_cpu_support_sse3() ? "SSE3 " : ""; - capabilities += system_cpu_support_sse41() ? "SSE41 " : ""; - capabilities += system_cpu_support_avx() ? 
"AVX " : ""; - capabilities += system_cpu_support_avx2() ? "AVX2" : ""; - if (capabilities[capabilities.size() - 1] == ' ') - capabilities.resize(capabilities.size() - 1); - return capabilities; -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_denoise.cpp b/intern/cycles/device/device_denoise.cpp new file mode 100644 index 00000000000..aea7868f65d --- /dev/null +++ b/intern/cycles/device/device_denoise.cpp @@ -0,0 +1,88 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "device/device_denoise.h" + +CCL_NAMESPACE_BEGIN + +const char *denoiserTypeToHumanReadable(DenoiserType type) +{ + switch (type) { + case DENOISER_OPTIX: + return "OptiX"; + case DENOISER_OPENIMAGEDENOISE: + return "OpenImageDenoise"; + + case DENOISER_NUM: + case DENOISER_NONE: + case DENOISER_ALL: + return "UNKNOWN"; + } + + return "UNKNOWN"; +} + +const NodeEnum *DenoiseParams::get_type_enum() +{ + static NodeEnum type_enum; + + if (type_enum.empty()) { + type_enum.insert("optix", DENOISER_OPTIX); + type_enum.insert("openimageio", DENOISER_OPENIMAGEDENOISE); + } + + return &type_enum; +} + +const NodeEnum *DenoiseParams::get_prefilter_enum() +{ + static NodeEnum prefilter_enum; + + if (prefilter_enum.empty()) { + prefilter_enum.insert("none", DENOISER_PREFILTER_NONE); + prefilter_enum.insert("fast", DENOISER_PREFILTER_FAST); + prefilter_enum.insert("accurate", DENOISER_PREFILTER_ACCURATE); + } + + return &prefilter_enum; +} + +NODE_DEFINE(DenoiseParams) +{ + NodeType *type = NodeType::add("denoise_params", create); + + const NodeEnum *type_enum = get_type_enum(); + const NodeEnum *prefilter_enum = get_prefilter_enum(); + + SOCKET_BOOLEAN(use, "Use", false); + + SOCKET_ENUM(type, "Type", *type_enum, DENOISER_OPENIMAGEDENOISE); + + SOCKET_INT(start_sample, "Start Sample", 0); + + SOCKET_BOOLEAN(use_pass_albedo, "Use Pass Albedo", true); + SOCKET_BOOLEAN(use_pass_normal, "Use Pass Normal", false); + + SOCKET_ENUM(prefilter, "Prefilter", *prefilter_enum, DENOISER_PREFILTER_FAST); + + return type; +} + +DenoiseParams::DenoiseParams() : Node(get_node_type()) +{ +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_denoise.h b/intern/cycles/device/device_denoise.h new file mode 100644 index 00000000000..dfdc7cc87b3 --- /dev/null +++ b/intern/cycles/device/device_denoise.h @@ -0,0 +1,110 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file 
except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "device/device_memory.h" +#include "graph/node.h" +#include "render/buffers.h" + +CCL_NAMESPACE_BEGIN + +enum DenoiserType { + DENOISER_OPTIX = 2, + DENOISER_OPENIMAGEDENOISE = 4, + DENOISER_NUM, + + DENOISER_NONE = 0, + DENOISER_ALL = ~0, +}; + +/* Construct human-readable string which denotes the denoiser type. */ +const char *denoiserTypeToHumanReadable(DenoiserType type); + +typedef int DenoiserTypeMask; + +enum DenoiserPrefilter { + /* Best quality of the result without extra processing time, but requires guiding passes to be + * noise-free. */ + DENOISER_PREFILTER_NONE = 1, + + /* Denoise color and guiding passes together. + * Improves quality when guiding passes are noisy using least amount of extra processing time. */ + DENOISER_PREFILTER_FAST = 2, + + /* Prefilter noisy guiding passes before denoising color. + * Improves quality when guiding passes are noisy using extra processing time. */ + DENOISER_PREFILTER_ACCURATE = 3, + + DENOISER_PREFILTER_NUM, +}; + +/* NOTE: Is not a real scene node. Using Node API for ease of (de)serialization. + * The default values here do not really matter as they are always initialized from the + * Integrator node. */ +class DenoiseParams : public Node { + public: + NODE_DECLARE + + /* Apply denoiser to image. */ + bool use = false; + + /* Denoiser type. */ + DenoiserType type = DENOISER_OPENIMAGEDENOISE; + + /* Viewport start sample. */ + int start_sample = 0; + + /* Auxiliary passes. 
*/ + bool use_pass_albedo = true; + bool use_pass_normal = true; + + DenoiserPrefilter prefilter = DENOISER_PREFILTER_FAST; + + static const NodeEnum *get_type_enum(); + static const NodeEnum *get_prefilter_enum(); + + DenoiseParams(); + + bool modified(const DenoiseParams &other) const + { + return !(use == other.use && type == other.type && start_sample == other.start_sample && + use_pass_albedo == other.use_pass_albedo && + use_pass_normal == other.use_pass_normal && prefilter == other.prefilter); + } +}; + +/* All the parameters needed to perform buffer denoising on a device. + * Is not really a task in its canonical terms (as in, is not an asynchronous running task). Is + * more like a wrapper for all the arguments and parameters needed to perform denoising. Is a + * single place where they are all listed, so that it's not required to modify all device methods + * when these parameters do change. */ +class DeviceDenoiseTask { + public: + DenoiseParams params; + + int num_samples; + + RenderBuffers *render_buffers; + BufferParams buffer_params; + + /* Allow to do in-place modification of the input passes (scaling them down i.e.). This will + * lower the memory footprint of the denoiser but will make input passes "invalid" (from path + * tracer) point of view. */ + bool allow_inplace_modification; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp deleted file mode 100644 index 38c42d15cab..00000000000 --- a/intern/cycles/device/device_denoising.cpp +++ /dev/null @@ -1,353 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "device/device_denoising.h" - -#include "kernel/filter/filter_defines.h" - -CCL_NAMESPACE_BEGIN - -DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task) - : tile_info_mem(device, "denoising tile info mem", MEM_READ_WRITE), - profiler(NULL), - storage(device), - buffer(device), - device(device) -{ - radius = task.denoising.radius; - nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising.strength)); - if (task.denoising.relative_pca) { - pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising.feature_strength)); - } - else { - pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising.feature_strength)); - } - - render_buffer.frame_stride = task.frame_stride; - render_buffer.pass_stride = task.pass_stride; - render_buffer.offset = task.pass_denoising_data; - - target_buffer.pass_stride = task.target_pass_stride; - target_buffer.denoising_clean_offset = task.pass_denoising_clean; - target_buffer.offset = 0; - - functions.map_neighbor_tiles = function_bind(task.map_neighbor_tiles, _1, device); - functions.unmap_neighbor_tiles = function_bind(task.unmap_neighbor_tiles, _1, device); - - tile_info = (TileInfo *)tile_info_mem.alloc(sizeof(TileInfo) / sizeof(int)); - tile_info->from_render = task.denoising_from_render ? 
1 : 0; - - tile_info->frames[0] = 0; - tile_info->num_frames = min(task.denoising_frames.size() + 1, DENOISE_MAX_FRAMES); - for (int i = 1; i < tile_info->num_frames; i++) { - tile_info->frames[i] = task.denoising_frames[i - 1]; - } - - do_prefilter = task.denoising.store_passes && task.denoising.type == DENOISER_NLM; - do_filter = task.denoising.use && task.denoising.type == DENOISER_NLM; -} - -DenoisingTask::~DenoisingTask() -{ - storage.XtWX.free(); - storage.XtWY.free(); - storage.transform.free(); - storage.rank.free(); - buffer.mem.free(); - buffer.temporary_mem.free(); - tile_info_mem.free(); -} - -void DenoisingTask::set_render_buffer(RenderTileNeighbors &neighbors) -{ - for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { - RenderTile &rtile = neighbors.tiles[i]; - tile_info->offsets[i] = rtile.offset; - tile_info->strides[i] = rtile.stride; - tile_info->buffers[i] = rtile.buffer; - } - tile_info->x[0] = neighbors.tiles[3].x; - tile_info->x[1] = neighbors.tiles[4].x; - tile_info->x[2] = neighbors.tiles[5].x; - tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w; - tile_info->y[0] = neighbors.tiles[1].y; - tile_info->y[1] = neighbors.tiles[4].y; - tile_info->y[2] = neighbors.tiles[7].y; - tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h; - - target_buffer.offset = neighbors.target.offset; - target_buffer.stride = neighbors.target.stride; - target_buffer.ptr = neighbors.target.buffer; - - if (do_prefilter && neighbors.target.buffers) { - target_buffer.denoising_output_offset = - neighbors.target.buffers->params.get_denoising_prefiltered_offset(); - } - else { - target_buffer.denoising_output_offset = 0; - } - - tile_info_mem.copy_to_device(); -} - -void DenoisingTask::setup_denoising_buffer() -{ - /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring - * tiles */ - rect = rect_from_shape(filter_area.x, filter_area.y, filter_area.z, filter_area.w); - rect = rect_expand(rect, radius); - rect = 
rect_clip(rect, - make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3])); - - buffer.use_intensity = do_prefilter || (tile_info->num_frames > 1); - buffer.passes = buffer.use_intensity ? 15 : 14; - buffer.width = rect.z - rect.x; - buffer.stride = align_up(buffer.width, 4); - buffer.h = rect.w - rect.y; - int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float)); - buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats); - buffer.frame_stride = buffer.pass_stride * buffer.passes; - /* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */ - int mem_size = align_up(tile_info->num_frames * buffer.frame_stride + 4, alignment_floats); - buffer.mem.alloc_to_device(mem_size, false); - buffer.use_time = (tile_info->num_frames > 1); - - /* CPUs process shifts sequentially while GPUs process them in parallel. */ - int num_layers; - if (buffer.gpu_temporary_mem) { - /* Shadowing prefiltering uses a radius of 6, so allocate at least that much. */ - int max_radius = max(radius, 6); - int num_shifts = (2 * max_radius + 1) * (2 * max_radius + 1); - num_layers = 2 * num_shifts + 1; - } - else { - num_layers = 3; - } - /* Allocate two layers per shift as well as one for the weight accumulation. 
*/ - buffer.temporary_mem.alloc_to_device(num_layers * buffer.pass_stride); -} - -void DenoisingTask::prefilter_shadowing() -{ - device_ptr null_ptr = (device_ptr)0; - - device_sub_ptr unfiltered_a(buffer.mem, 0, buffer.pass_stride); - device_sub_ptr unfiltered_b(buffer.mem, 1 * buffer.pass_stride, buffer.pass_stride); - device_sub_ptr sample_var(buffer.mem, 2 * buffer.pass_stride, buffer.pass_stride); - device_sub_ptr sample_var_var(buffer.mem, 3 * buffer.pass_stride, buffer.pass_stride); - device_sub_ptr buffer_var(buffer.mem, 5 * buffer.pass_stride, buffer.pass_stride); - device_sub_ptr filtered_var(buffer.mem, 6 * buffer.pass_stride, buffer.pass_stride); - - /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the - * sample variance and the buffer variance. */ - functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var); - - /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the - * sample variance. */ - nlm_state.set_parameters(6, 3, 4.0f, 1.0f, false); - functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var); - - /* Reuse memory, the previous data isn't needed anymore. */ - device_ptr filtered_a = *buffer_var, filtered_b = *sample_var; - /* Use the smoothed variance to filter the two shadow half images using each other for weight - * calculation. */ - nlm_state.set_parameters(5, 3, 1.0f, 0.25f, false); - functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a); - functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b); - - device_ptr residual_var = *sample_var_var; - /* Estimate the residual variance between the two filtered halves. */ - functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect); - - device_ptr final_a = *unfiltered_a, final_b = *unfiltered_b; - /* Use the residual variance for a second filter pass. 
*/ - nlm_state.set_parameters(4, 2, 1.0f, 0.5f, false); - functions.non_local_means(filtered_a, filtered_b, residual_var, final_a); - functions.non_local_means(filtered_b, filtered_a, residual_var, final_b); - - /* Combine the two double-filtered halves to a final shadow feature. */ - device_sub_ptr shadow_pass(buffer.mem, 4 * buffer.pass_stride, buffer.pass_stride); - functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect); -} - -void DenoisingTask::prefilter_features() -{ - device_sub_ptr unfiltered(buffer.mem, 8 * buffer.pass_stride, buffer.pass_stride); - device_sub_ptr variance(buffer.mem, 9 * buffer.pass_stride, buffer.pass_stride); - - int mean_from[] = {0, 1, 2, 12, 6, 7, 8}; - int variance_from[] = {3, 4, 5, 13, 9, 10, 11}; - int pass_to[] = {1, 2, 3, 0, 5, 6, 7}; - for (int pass = 0; pass < 7; pass++) { - device_sub_ptr feature_pass( - buffer.mem, pass_to[pass] * buffer.pass_stride, buffer.pass_stride); - /* Get the unfiltered pass and its variance from the RenderBuffers. */ - functions.get_feature(mean_from[pass], - variance_from[pass], - *unfiltered, - *variance, - 1.0f / render_buffer.samples); - /* Smooth the pass and store the result in the denoising buffers. 
*/ - nlm_state.set_parameters(2, 2, 1.0f, 0.25f, false); - functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass); - } -} - -void DenoisingTask::prefilter_color() -{ - int mean_from[] = {20, 21, 22}; - int variance_from[] = {23, 24, 25}; - int mean_to[] = {8, 9, 10}; - int variance_to[] = {11, 12, 13}; - int num_color_passes = 3; - - device_only_memory<float> temporary_color(device, "denoising temporary color"); - temporary_color.alloc_to_device(6 * buffer.pass_stride, false); - - for (int pass = 0; pass < num_color_passes; pass++) { - device_sub_ptr color_pass(temporary_color, pass * buffer.pass_stride, buffer.pass_stride); - device_sub_ptr color_var_pass( - temporary_color, (pass + 3) * buffer.pass_stride, buffer.pass_stride); - functions.get_feature(mean_from[pass], - variance_from[pass], - *color_pass, - *color_var_pass, - 1.0f / render_buffer.samples); - } - - device_sub_ptr depth_pass(buffer.mem, 0, buffer.pass_stride); - device_sub_ptr color_var_pass( - buffer.mem, variance_to[0] * buffer.pass_stride, 3 * buffer.pass_stride); - device_sub_ptr output_pass(buffer.mem, mean_to[0] * buffer.pass_stride, 3 * buffer.pass_stride); - functions.detect_outliers( - temporary_color.device_pointer, *color_var_pass, *depth_pass, *output_pass); - - if (buffer.use_intensity) { - device_sub_ptr intensity_pass(buffer.mem, 14 * buffer.pass_stride, buffer.pass_stride); - nlm_state.set_parameters(radius, 4, 2.0f, nlm_k_2 * 4.0f, true); - functions.non_local_means(*output_pass, *output_pass, *color_var_pass, *intensity_pass); - } -} - -void DenoisingTask::load_buffer() -{ - device_ptr null_ptr = (device_ptr)0; - - int original_offset = render_buffer.offset; - - int num_passes = buffer.use_intensity ? 
15 : 14; - for (int i = 0; i < tile_info->num_frames; i++) { - for (int pass = 0; pass < num_passes; pass++) { - device_sub_ptr to_pass( - buffer.mem, i * buffer.frame_stride + pass * buffer.pass_stride, buffer.pass_stride); - bool is_variance = (pass >= 11) && (pass <= 13); - functions.get_feature( - pass, -1, *to_pass, null_ptr, is_variance ? (1.0f / render_buffer.samples) : 1.0f); - } - render_buffer.offset += render_buffer.frame_stride; - } - - render_buffer.offset = original_offset; -} - -void DenoisingTask::write_buffer() -{ - reconstruction_state.buffer_params = make_int4(target_buffer.offset, - target_buffer.stride, - target_buffer.pass_stride, - target_buffer.denoising_clean_offset); - int num_passes = buffer.use_intensity ? 15 : 14; - for (int pass = 0; pass < num_passes; pass++) { - device_sub_ptr from_pass(buffer.mem, pass * buffer.pass_stride, buffer.pass_stride); - int out_offset = pass + target_buffer.denoising_output_offset; - functions.write_feature(out_offset, *from_pass, target_buffer.ptr); - } -} - -void DenoisingTask::construct_transform() -{ - storage.w = filter_area.z; - storage.h = filter_area.w; - - storage.transform.alloc_to_device(storage.w * storage.h * TRANSFORM_SIZE, false); - storage.rank.alloc_to_device(storage.w * storage.h, false); - - functions.construct_transform(); -} - -void DenoisingTask::reconstruct() -{ - storage.XtWX.alloc_to_device(storage.w * storage.h * XTWX_SIZE, false); - storage.XtWY.alloc_to_device(storage.w * storage.h * XTWY_SIZE, false); - storage.XtWX.zero_to_device(); - storage.XtWY.zero_to_device(); - - reconstruction_state.filter_window = rect_from_shape( - filter_area.x - rect.x, filter_area.y - rect.y, storage.w, storage.h); - int tile_coordinate_offset = filter_area.y * target_buffer.stride + filter_area.x; - reconstruction_state.buffer_params = make_int4(target_buffer.offset + tile_coordinate_offset, - target_buffer.stride, - target_buffer.pass_stride, - target_buffer.denoising_clean_offset); - 
reconstruction_state.source_w = rect.z - rect.x; - reconstruction_state.source_h = rect.w - rect.y; - - device_sub_ptr color_ptr(buffer.mem, 8 * buffer.pass_stride, 3 * buffer.pass_stride); - device_sub_ptr color_var_ptr(buffer.mem, 11 * buffer.pass_stride, 3 * buffer.pass_stride); - for (int f = 0; f < tile_info->num_frames; f++) { - device_ptr scale_ptr = 0; - device_sub_ptr *scale_sub_ptr = NULL; - if (tile_info->frames[f] != 0 && (tile_info->num_frames > 1)) { - scale_sub_ptr = new device_sub_ptr(buffer.mem, 14 * buffer.pass_stride, buffer.pass_stride); - scale_ptr = **scale_sub_ptr; - } - - functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr, f); - delete scale_sub_ptr; - } - functions.solve(target_buffer.ptr); -} - -void DenoisingTask::run_denoising(RenderTile &tile) -{ - RenderTileNeighbors neighbors(tile); - functions.map_neighbor_tiles(neighbors); - set_render_buffer(neighbors); - - setup_denoising_buffer(); - - if (tile_info->from_render) { - prefilter_shadowing(); - prefilter_features(); - prefilter_color(); - } - else { - load_buffer(); - } - - if (do_filter) { - construct_transform(); - reconstruct(); - } - - if (do_prefilter) { - write_buffer(); - } - - functions.unmap_neighbor_tiles(neighbors); -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h deleted file mode 100644 index bb8bdfdd225..00000000000 --- a/intern/cycles/device/device_denoising.h +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __DEVICE_DENOISING_H__ -#define __DEVICE_DENOISING_H__ - -#include "device/device.h" - -#include "render/buffers.h" - -#include "kernel/filter/filter_defines.h" - -#include "util/util_profiling.h" - -CCL_NAMESPACE_BEGIN - -class DenoisingTask { - public: - /* Parameters of the denoising algorithm. */ - int radius; - float nlm_k_2; - float pca_threshold; - - /* Parameters of the RenderBuffers. */ - struct RenderBuffers { - int offset; - int pass_stride; - int frame_stride; - int samples; - } render_buffer; - - /* Pointer and parameters of the target buffer. */ - struct TargetBuffer { - int offset; - int stride; - int pass_stride; - int denoising_clean_offset; - int denoising_output_offset; - device_ptr ptr; - } target_buffer; - - TileInfo *tile_info; - device_vector<int> tile_info_mem; - - ProfilingState *profiler; - - int4 rect; - int4 filter_area; - - bool do_prefilter; - bool do_filter; - - struct DeviceFunctions { - function<bool( - device_ptr image_ptr, /* Contains the values that are smoothed. */ - device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */ - device_ptr variance_ptr, /* Contains the variance of the guide image. */ - device_ptr out_ptr /* The filtered output is written into this image. 
*/ - )> - non_local_means; - function<bool( - device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, int frame)> - accumulate; - function<bool(device_ptr output_ptr)> solve; - function<bool()> construct_transform; - - function<bool(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, - int4 rect)> - combine_halves; - function<bool(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr)> - divide_shadow; - function<bool(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale)> - get_feature; - function<bool(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr)> - detect_outliers; - function<bool(int out_offset, device_ptr frop_ptr, device_ptr buffer_ptr)> write_feature; - function<void(RenderTileNeighbors &neighbors)> map_neighbor_tiles; - function<void(RenderTileNeighbors &neighbors)> unmap_neighbor_tiles; - } functions; - - /* Stores state of the current Reconstruction operation, - * which is accessed by the device in order to perform the operation. */ - struct ReconstructionState { - int4 filter_window; - int4 buffer_params; - - int source_w; - int source_h; - } reconstruction_state; - - /* Stores state of the current NLM operation, - * which is accessed by the device in order to perform the operation. */ - struct NLMState { - int r; /* Search radius of the filter. */ - int f; /* Patch size of the filter. */ - float a; /* Variance compensation factor in the MSE estimation. */ - float k_2; /* Squared value of the k parameter of the filter. 
*/ - bool is_color; - - void set_parameters(int r_, int f_, float a_, float k_2_, bool is_color_) - { - r = r_; - f = f_; - a = a_, k_2 = k_2_; - is_color = is_color_; - } - } nlm_state; - - struct Storage { - device_only_memory<float> transform; - device_only_memory<int> rank; - device_only_memory<float> XtWX; - device_only_memory<float3> XtWY; - int w; - int h; - - Storage(Device *device) - : transform(device, "denoising transform"), - rank(device, "denoising rank"), - XtWX(device, "denoising XtWX"), - XtWY(device, "denoising XtWY") - { - } - } storage; - - DenoisingTask(Device *device, const DeviceTask &task); - ~DenoisingTask(); - - void run_denoising(RenderTile &tile); - - struct DenoiseBuffers { - int pass_stride; - int passes; - int stride; - int h; - int width; - int frame_stride; - device_only_memory<float> mem; - device_only_memory<float> temporary_mem; - bool use_time; - bool use_intensity; - - bool gpu_temporary_mem; - - DenoiseBuffers(Device *device) - : mem(device, "denoising pixel buffer"), - temporary_mem(device, "denoising temporary mem", true) - { - } - } buffer; - - protected: - Device *device; - - void set_render_buffer(RenderTileNeighbors &neighbors); - void setup_denoising_buffer(); - void prefilter_shadowing(); - void prefilter_features(); - void prefilter_color(); - void construct_transform(); - void reconstruct(); - - void load_buffer(); - void write_buffer(); -}; - -CCL_NAMESPACE_END - -#endif /* __DEVICE_DENOISING_H__ */ diff --git a/intern/cycles/device/device_graphics_interop.cpp b/intern/cycles/device/device_graphics_interop.cpp new file mode 100644 index 00000000000..a80a236759f --- /dev/null +++ b/intern/cycles/device/device_graphics_interop.cpp @@ -0,0 +1,21 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/device_graphics_interop.h" + +CCL_NAMESPACE_BEGIN + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_graphics_interop.h b/intern/cycles/device/device_graphics_interop.h new file mode 100644 index 00000000000..671b1c189d7 --- /dev/null +++ b/intern/cycles/device/device_graphics_interop.h @@ -0,0 +1,55 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "util/util_types.h" + +CCL_NAMESPACE_BEGIN + +/* Information about interoperability destination. + * Is provided by the GPUDisplay. */ +class DeviceGraphicsInteropDestination { + public: + /* Dimensions of the buffer, in pixels. */ + int buffer_width = 0; + int buffer_height = 0; + + /* OpenGL pixel buffer object. */ + int opengl_pbo_id = 0; + + /* Clear the entire destination before doing partial write to it. */ + bool need_clear = false; +}; + +/* Device-side graphics interoperability support. 
+ * + * Takes care of holding all the handlers needed by the device to implement interoperability with + * the graphics library. */ +class DeviceGraphicsInterop { + public: + DeviceGraphicsInterop() = default; + virtual ~DeviceGraphicsInterop() = default; + + /* Update this device-side graphics interoperability object with the given destination resource + * information. */ + virtual void set_destination(const DeviceGraphicsInteropDestination &destination) = 0; + + virtual device_ptr map() = 0; + virtual void unmap() = 0; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_intern.h b/intern/cycles/device/device_intern.h deleted file mode 100644 index ecc79c5d7ee..00000000000 --- a/intern/cycles/device/device_intern.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __DEVICE_INTERN_H__ -#define __DEVICE_INTERN_H__ - -#include "util/util_string.h" -#include "util/util_vector.h" - -CCL_NAMESPACE_BEGIN - -class Device; -class DeviceInfo; -class Profiler; -class Stats; - -Device *device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); -bool device_opencl_init(); -Device *device_opencl_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); -bool device_opencl_compile_kernel(const vector<string> ¶meters); -bool device_cuda_init(); -Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); -bool device_optix_init(); -Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); -Device *device_dummy_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); - -Device *device_network_create(DeviceInfo &info, - Stats &stats, - Profiler &profiler, - const char *address); -Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); - -void device_cpu_info(vector<DeviceInfo> &devices); -void device_opencl_info(vector<DeviceInfo> &devices); -void device_cuda_info(vector<DeviceInfo> &devices); -void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices); -void device_network_info(vector<DeviceInfo> &devices); - -string device_cpu_capabilities(); -string device_opencl_capabilities(); -string device_cuda_capabilities(); - -CCL_NAMESPACE_END - -#endif /* __DEVICE_INTERN_H__ */ diff --git a/intern/cycles/device/device_kernel.cpp b/intern/cycles/device/device_kernel.cpp new file mode 100644 index 00000000000..ceaddee4756 --- /dev/null +++ b/intern/cycles/device/device_kernel.cpp @@ -0,0 +1,157 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/device_kernel.h" + +#include "util/util_logging.h" + +CCL_NAMESPACE_BEGIN + +const char *device_kernel_as_string(DeviceKernel kernel) +{ + switch (kernel) { + /* Integrator. */ + case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA: + return "integrator_init_from_camera"; + case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE: + return "integrator_init_from_bake"; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: + return "integrator_intersect_closest"; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: + return "integrator_intersect_shadow"; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: + return "integrator_intersect_subsurface"; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: + return "integrator_intersect_volume_stack"; + case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND: + return "integrator_shade_background"; + case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT: + return "integrator_shade_light"; + case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW: + return "integrator_shade_shadow"; + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE: + return "integrator_shade_surface"; + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: + return "integrator_shade_surface_raytrace"; + case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: + return "integrator_shade_volume"; + case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL: + return "integrator_megakernel"; + case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY: + return "integrator_queued_paths_array"; + case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY: + return "integrator_queued_shadow_paths_array"; + case 
DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY: + return "integrator_active_paths_array"; + case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY: + return "integrator_terminated_paths_array"; + case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY: + return "integrator_sorted_paths_array"; + case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: + return "integrator_compact_paths_array"; + case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES: + return "integrator_compact_states"; + case DEVICE_KERNEL_INTEGRATOR_RESET: + return "integrator_reset"; + case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS: + return "integrator_shadow_catcher_count_possible_splits"; + + /* Shader evaluation. */ + case DEVICE_KERNEL_SHADER_EVAL_DISPLACE: + return "shader_eval_displace"; + case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND: + return "shader_eval_background"; + + /* Film. */ + +#define FILM_CONVERT_KERNEL_AS_STRING(variant, variant_lowercase) \ + case DEVICE_KERNEL_FILM_CONVERT_##variant: \ + return "film_convert_" #variant_lowercase; \ + case DEVICE_KERNEL_FILM_CONVERT_##variant##_HALF_RGBA: \ + return "film_convert_" #variant_lowercase "_half_rgba"; + + FILM_CONVERT_KERNEL_AS_STRING(DEPTH, depth) + FILM_CONVERT_KERNEL_AS_STRING(MIST, mist) + FILM_CONVERT_KERNEL_AS_STRING(SAMPLE_COUNT, sample_count) + FILM_CONVERT_KERNEL_AS_STRING(FLOAT, float) + FILM_CONVERT_KERNEL_AS_STRING(LIGHT_PATH, light_path) + FILM_CONVERT_KERNEL_AS_STRING(FLOAT3, float3) + FILM_CONVERT_KERNEL_AS_STRING(MOTION, motion) + FILM_CONVERT_KERNEL_AS_STRING(CRYPTOMATTE, cryptomatte) + FILM_CONVERT_KERNEL_AS_STRING(SHADOW_CATCHER, shadow_catcher) + FILM_CONVERT_KERNEL_AS_STRING(SHADOW_CATCHER_MATTE_WITH_SHADOW, + shadow_catcher_matte_with_shadow) + FILM_CONVERT_KERNEL_AS_STRING(COMBINED, combined) + FILM_CONVERT_KERNEL_AS_STRING(FLOAT4, float4) + +#undef FILM_CONVERT_KERNEL_AS_STRING + + /* Adaptive sampling. 
*/ + case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK: + return "adaptive_sampling_convergence_check"; + case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X: + return "adaptive_sampling_filter_x"; + case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y: + return "adaptive_sampling_filter_y"; + + /* Denoising. */ + case DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS: + return "filter_guiding_preprocess"; + case DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO: + return "filter_guiding_set_fake_albedo"; + case DEVICE_KERNEL_FILTER_COLOR_PREPROCESS: + return "filter_color_preprocess"; + case DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS: + return "filter_color_postprocess"; + + /* Cryptomatte. */ + case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS: + return "cryptomatte_postprocess"; + + /* Generic */ + case DEVICE_KERNEL_PREFIX_SUM: + return "prefix_sum"; + + case DEVICE_KERNEL_NUM: + break; + }; + LOG(FATAL) << "Unhandled kernel " << static_cast<int>(kernel) << ", should never happen."; + return "UNKNOWN"; +} + +std::ostream &operator<<(std::ostream &os, DeviceKernel kernel) +{ + os << device_kernel_as_string(kernel); + return os; +} + +string device_kernel_mask_as_string(DeviceKernelMask mask) +{ + string str; + + for (uint64_t i = 0; i < sizeof(DeviceKernelMask) * 8; i++) { + if (mask & (uint64_t(1) << i)) { + if (!str.empty()) { + str += " "; + } + str += device_kernel_as_string((DeviceKernel)i); + } + } + + return str; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_kernel.h b/intern/cycles/device/device_kernel.h new file mode 100644 index 00000000000..83d959ca87b --- /dev/null +++ b/intern/cycles/device/device_kernel.h @@ -0,0 +1,33 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "kernel/kernel_types.h" + +#include "util/util_string.h" + +#include <ostream> // NOLINT + +CCL_NAMESPACE_BEGIN + +const char *device_kernel_as_string(DeviceKernel kernel); +std::ostream &operator<<(std::ostream &os, DeviceKernel kernel); + +typedef uint64_t DeviceKernelMask; +string device_kernel_mask_as_string(DeviceKernelMask mask); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp index 80a05fc32fe..c4d45829b83 100644 --- a/intern/cycles/device/device_memory.cpp +++ b/intern/cycles/device/device_memory.cpp @@ -23,7 +23,7 @@ CCL_NAMESPACE_BEGIN device_memory::device_memory(Device *device, const char *name, MemoryType type) : data_type(device_type_traits<uchar>::data_type), - data_elements(device_type_traits<uchar>::num_elements), + data_elements(device_type_traits<uchar>::num_elements_cpu), data_size(0), device_size(0), data_width(0), @@ -149,6 +149,11 @@ void device_memory::device_zero() } } +bool device_memory::device_is_cpu() +{ + return (device->info.type == DEVICE_CPU); +} + void device_memory::swap_device(Device *new_device, size_t new_device_size, device_ptr new_device_ptr) diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h index 80f4d7b0468..c51594b8580 100644 --- a/intern/cycles/device/device_memory.h +++ b/intern/cycles/device/device_memory.h @@ -38,7 +38,6 @@ enum MemoryType { MEM_DEVICE_ONLY, MEM_GLOBAL, MEM_TEXTURE, - MEM_PIXELS }; /* Supported Data Types */ @@ -54,7 +53,7 @@ enum DataType { 
TYPE_UINT64, }; -static inline size_t datatype_size(DataType datatype) +static constexpr size_t datatype_size(DataType datatype) { switch (datatype) { case TYPE_UNKNOWN: @@ -82,112 +81,155 @@ static inline size_t datatype_size(DataType datatype) template<typename T> struct device_type_traits { static const DataType data_type = TYPE_UNKNOWN; - static const int num_elements = sizeof(T); + static const int num_elements_cpu = sizeof(T); + static const int num_elements_gpu = sizeof(T); }; template<> struct device_type_traits<uchar> { static const DataType data_type = TYPE_UCHAR; - static const int num_elements = 1; + static const int num_elements_cpu = 1; + static const int num_elements_gpu = 1; + static_assert(sizeof(uchar) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uchar2> { static const DataType data_type = TYPE_UCHAR; - static const int num_elements = 2; + static const int num_elements_cpu = 2; + static const int num_elements_gpu = 2; + static_assert(sizeof(uchar2) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uchar3> { static const DataType data_type = TYPE_UCHAR; - static const int num_elements = 3; + static const int num_elements_cpu = 3; + static const int num_elements_gpu = 3; + static_assert(sizeof(uchar3) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uchar4> { static const DataType data_type = TYPE_UCHAR; - static const int num_elements = 4; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 4; + static_assert(sizeof(uchar4) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uint> { static const DataType data_type = TYPE_UINT; - static const int num_elements = 1; + static const int num_elements_cpu = 1; + static const int num_elements_gpu = 1; + static_assert(sizeof(uint) == num_elements_cpu * datatype_size(data_type)); }; template<> struct 
device_type_traits<uint2> { static const DataType data_type = TYPE_UINT; - static const int num_elements = 2; + static const int num_elements_cpu = 2; + static const int num_elements_gpu = 2; + static_assert(sizeof(uint2) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uint3> { static const DataType data_type = TYPE_UINT; - static const int num_elements = 3; + static const int num_elements_cpu = 3; + static const int num_elements_gpu = 3; + static_assert(sizeof(uint3) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uint4> { static const DataType data_type = TYPE_UINT; - static const int num_elements = 4; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 4; + static_assert(sizeof(uint4) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<int> { static const DataType data_type = TYPE_INT; - static const int num_elements = 1; + static const int num_elements_cpu = 1; + static const int num_elements_gpu = 1; + static_assert(sizeof(int) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<int2> { static const DataType data_type = TYPE_INT; - static const int num_elements = 2; + static const int num_elements_cpu = 2; + static const int num_elements_gpu = 2; + static_assert(sizeof(int2) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<int3> { static const DataType data_type = TYPE_INT; - static const int num_elements = 3; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 3; + static_assert(sizeof(int3) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<int4> { static const DataType data_type = TYPE_INT; - static const int num_elements = 4; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 4; + static_assert(sizeof(int4) == num_elements_cpu * 
datatype_size(data_type)); }; template<> struct device_type_traits<float> { static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 1; + static const int num_elements_cpu = 1; + static const int num_elements_gpu = 1; + static_assert(sizeof(float) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<float2> { static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 2; + static const int num_elements_cpu = 2; + static const int num_elements_gpu = 2; + static_assert(sizeof(float2) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<float3> { static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 4; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 3; + static_assert(sizeof(float3) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<float4> { static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 4; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 4; + static_assert(sizeof(float4) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<half> { static const DataType data_type = TYPE_HALF; - static const int num_elements = 1; + static const int num_elements_cpu = 1; + static const int num_elements_gpu = 1; + static_assert(sizeof(half) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<ushort4> { static const DataType data_type = TYPE_UINT16; - static const int num_elements = 4; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 4; + static_assert(sizeof(ushort4) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uint16_t> { static const DataType data_type = TYPE_UINT16; - static const int num_elements = 1; + static const int num_elements_cpu = 1; + static const int 
num_elements_gpu = 1; + static_assert(sizeof(uint16_t) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<half4> { static const DataType data_type = TYPE_HALF; - static const int num_elements = 4; + static const int num_elements_cpu = 4; + static const int num_elements_gpu = 4; + static_assert(sizeof(half4) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uint64_t> { static const DataType data_type = TYPE_UINT64; - static const int num_elements = 1; + static const int num_elements_cpu = 1; + static const int num_elements_gpu = 1; + static_assert(sizeof(uint64_t) == num_elements_cpu * datatype_size(data_type)); }; /* Device Memory @@ -257,6 +299,8 @@ class device_memory { void device_copy_from(int y, int w, int h, int elem); void device_zero(); + bool device_is_cpu(); + device_ptr original_device_ptr; size_t original_device_size; Device *original_device; @@ -275,7 +319,9 @@ template<typename T> class device_only_memory : public device_memory { : device_memory(device, name, allow_host_memory_fallback ? MEM_READ_WRITE : MEM_DEVICE_ONLY) { data_type = device_type_traits<T>::data_type; - data_elements = max(device_type_traits<T>::num_elements, 1); + data_elements = max(device_is_cpu() ? device_type_traits<T>::num_elements_cpu : + device_type_traits<T>::num_elements_gpu, + 1); } device_only_memory(device_only_memory &&other) noexcept : device_memory(std::move(other)) @@ -331,11 +377,15 @@ template<typename T> class device_only_memory : public device_memory { template<typename T> class device_vector : public device_memory { public: + /* Can only use this for types that have the same size on CPU and GPU. 
*/ + static_assert(device_type_traits<T>::num_elements_cpu == + device_type_traits<T>::num_elements_gpu); + device_vector(Device *device, const char *name, MemoryType type) : device_memory(device, name, type) { data_type = device_type_traits<T>::data_type; - data_elements = device_type_traits<T>::num_elements; + data_elements = device_type_traits<T>::num_elements_cpu; modified = true; need_realloc_ = true; @@ -477,6 +527,11 @@ template<typename T> class device_vector : public device_memory { return (T *)host_pointer; } + const T *data() const + { + return (T *)host_pointer; + } + T &operator[](size_t i) { assert(i < data_size); @@ -507,7 +562,7 @@ template<typename T> class device_vector : public device_memory { void copy_from_device() { - device_copy_from(0, data_width, data_height, sizeof(T)); + device_copy_from(0, data_width, (data_height == 0) ? 1 : data_height, sizeof(T)); } void copy_from_device(int y, int w, int h) @@ -535,33 +590,6 @@ template<typename T> class device_vector : public device_memory { } }; -/* Pixel Memory - * - * Device memory to efficiently draw as pixels to the screen in interactive - * rendering. Only copying pixels from the device is supported, not copying to. */ - -template<typename T> class device_pixels : public device_vector<T> { - public: - device_pixels(Device *device, const char *name) : device_vector<T>(device, name, MEM_PIXELS) - { - } - - void alloc_to_device(size_t width, size_t height, size_t depth = 0) - { - device_vector<T>::alloc(width, height, depth); - - if (!device_memory::device_pointer) { - device_memory::device_alloc(); - } - } - - T *copy_from_device(int y, int w, int h) - { - device_memory::device_copy_from(y, w, h, sizeof(T)); - return device_vector<T>::data(); - } -}; - /* Device Sub Memory * * Pointer into existing memory. 
It is not allocated separately, but created diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp deleted file mode 100644 index 85ffa5fcd52..00000000000 --- a/intern/cycles/device/device_multi.cpp +++ /dev/null @@ -1,826 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <sstream> -#include <stdlib.h> - -#include "bvh/bvh_multi.h" - -#include "device/device.h" -#include "device/device_intern.h" -#include "device/device_network.h" - -#include "render/buffers.h" -#include "render/geometry.h" - -#include "util/util_foreach.h" -#include "util/util_list.h" -#include "util/util_logging.h" -#include "util/util_map.h" -#include "util/util_time.h" - -CCL_NAMESPACE_BEGIN - -class MultiDevice : public Device { - public: - struct SubDevice { - Stats stats; - Device *device; - map<device_ptr, device_ptr> ptr_map; - int peer_island_index = -1; - }; - - list<SubDevice> devices, denoising_devices; - device_ptr unique_key; - vector<vector<SubDevice *>> peer_islands; - bool use_denoising; - bool matching_rendering_and_denoising_devices; - - MultiDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_) - : Device(info, stats, profiler, background_), - unique_key(1), - use_denoising(!info.denoising_devices.empty()) - { - foreach (DeviceInfo &subinfo, info.multi_devices) { - /* Always add CPU devices at the back since GPU devices can change - * host 
memory pointers, which CPU uses as device pointer. */ - SubDevice *sub; - if (subinfo.type == DEVICE_CPU) { - devices.emplace_back(); - sub = &devices.back(); - } - else { - devices.emplace_front(); - sub = &devices.front(); - } - - /* The pointer to 'sub->stats' will stay valid even after new devices - * are added, since 'devices' is a linked list. */ - sub->device = Device::create(subinfo, sub->stats, profiler, background); - } - - foreach (DeviceInfo &subinfo, info.denoising_devices) { - denoising_devices.emplace_front(); - SubDevice *sub = &denoising_devices.front(); - - sub->device = Device::create(subinfo, sub->stats, profiler, background); - } - - /* Build a list of peer islands for the available render devices */ - foreach (SubDevice &sub, devices) { - /* First ensure that every device is in at least once peer island */ - if (sub.peer_island_index < 0) { - peer_islands.emplace_back(); - sub.peer_island_index = (int)peer_islands.size() - 1; - peer_islands[sub.peer_island_index].push_back(&sub); - } - - if (!info.has_peer_memory) { - continue; - } - - /* Second check peer access between devices and fill up the islands accordingly */ - foreach (SubDevice &peer_sub, devices) { - if (peer_sub.peer_island_index < 0 && - peer_sub.device->info.type == sub.device->info.type && - peer_sub.device->check_peer_access(sub.device)) { - peer_sub.peer_island_index = sub.peer_island_index; - peer_islands[sub.peer_island_index].push_back(&peer_sub); - } - } - } - - /* Try to re-use memory when denoising and render devices use the same physical devices - * (e.g. OptiX denoising and CUDA rendering device pointing to the same GPU). - * Ordering has to match as well, so that 'DeviceTask::split' behaves consistent. 
*/ - matching_rendering_and_denoising_devices = denoising_devices.empty() || - (devices.size() == denoising_devices.size()); - if (matching_rendering_and_denoising_devices) { - for (list<SubDevice>::iterator device_it = devices.begin(), - denoising_device_it = denoising_devices.begin(); - device_it != devices.end() && denoising_device_it != denoising_devices.end(); - ++device_it, ++denoising_device_it) { - const DeviceInfo &info = device_it->device->info; - const DeviceInfo &denoising_info = denoising_device_it->device->info; - if ((info.type != DEVICE_CUDA && info.type != DEVICE_OPTIX) || - (denoising_info.type != DEVICE_CUDA && denoising_info.type != DEVICE_OPTIX) || - info.num != denoising_info.num) { - matching_rendering_and_denoising_devices = false; - break; - } - } - } - -#ifdef WITH_NETWORK - /* try to add network devices */ - ServerDiscovery discovery(true); - time_sleep(1.0); - - vector<string> servers = discovery.get_server_list(); - - foreach (string &server, servers) { - Device *device = device_network_create(info, stats, profiler, server.c_str()); - if (device) - devices.push_back(SubDevice(device)); - } -#endif - } - - ~MultiDevice() - { - foreach (SubDevice &sub, devices) - delete sub.device; - foreach (SubDevice &sub, denoising_devices) - delete sub.device; - } - - const string &error_message() override - { - error_msg.clear(); - - foreach (SubDevice &sub, devices) - error_msg += sub.device->error_message(); - foreach (SubDevice &sub, denoising_devices) - error_msg += sub.device->error_message(); - - return error_msg; - } - - virtual bool show_samples() const override - { - if (devices.size() > 1) { - return false; - } - return devices.front().device->show_samples(); - } - - virtual BVHLayoutMask get_bvh_layout_mask() const override - { - BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL; - BVHLayoutMask bvh_layout_mask_all = BVH_LAYOUT_NONE; - foreach (const SubDevice &sub_device, devices) { - BVHLayoutMask device_bvh_layout_mask = 
sub_device.device->get_bvh_layout_mask(); - bvh_layout_mask &= device_bvh_layout_mask; - bvh_layout_mask_all |= device_bvh_layout_mask; - } - - /* With multiple OptiX devices, every device needs its own acceleration structure */ - if (bvh_layout_mask == BVH_LAYOUT_OPTIX) { - return BVH_LAYOUT_MULTI_OPTIX; - } - - /* When devices do not share a common BVH layout, fall back to creating one for each */ - const BVHLayoutMask BVH_LAYOUT_OPTIX_EMBREE = (BVH_LAYOUT_OPTIX | BVH_LAYOUT_EMBREE); - if ((bvh_layout_mask_all & BVH_LAYOUT_OPTIX_EMBREE) == BVH_LAYOUT_OPTIX_EMBREE) { - return BVH_LAYOUT_MULTI_OPTIX_EMBREE; - } - - return bvh_layout_mask; - } - - bool load_kernels(const DeviceRequestedFeatures &requested_features) override - { - foreach (SubDevice &sub, devices) - if (!sub.device->load_kernels(requested_features)) - return false; - - use_denoising = requested_features.use_denoising; - if (requested_features.use_denoising) { - /* Only need denoising feature, everything else is unused. */ - DeviceRequestedFeatures denoising_features; - denoising_features.use_denoising = true; - foreach (SubDevice &sub, denoising_devices) - if (!sub.device->load_kernels(denoising_features)) - return false; - } - - return true; - } - - bool wait_for_availability(const DeviceRequestedFeatures &requested_features) override - { - foreach (SubDevice &sub, devices) - if (!sub.device->wait_for_availability(requested_features)) - return false; - - if (requested_features.use_denoising) { - foreach (SubDevice &sub, denoising_devices) - if (!sub.device->wait_for_availability(requested_features)) - return false; - } - - return true; - } - - DeviceKernelStatus get_active_kernel_switch_state() override - { - DeviceKernelStatus result = DEVICE_KERNEL_USING_FEATURE_KERNEL; - - foreach (SubDevice &sub, devices) { - DeviceKernelStatus subresult = sub.device->get_active_kernel_switch_state(); - switch (subresult) { - case DEVICE_KERNEL_FEATURE_KERNEL_INVALID: - case 
DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE: - return subresult; - - case DEVICE_KERNEL_USING_FEATURE_KERNEL: - case DEVICE_KERNEL_UNKNOWN: - break; - } - } - - return result; - } - - void build_bvh(BVH *bvh, Progress &progress, bool refit) override - { - /* Try to build and share a single acceleration structure, if possible */ - if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2 || bvh->params.bvh_layout == BVH_LAYOUT_EMBREE) { - devices.back().device->build_bvh(bvh, progress, refit); - return; - } - - assert(bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX || - bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE); - - BVHMulti *const bvh_multi = static_cast<BVHMulti *>(bvh); - bvh_multi->sub_bvhs.resize(devices.size()); - - vector<BVHMulti *> geom_bvhs; - geom_bvhs.reserve(bvh->geometry.size()); - foreach (Geometry *geom, bvh->geometry) { - geom_bvhs.push_back(static_cast<BVHMulti *>(geom->bvh)); - } - - /* Broadcast acceleration structure build to all render devices */ - size_t i = 0; - foreach (SubDevice &sub, devices) { - /* Change geometry BVH pointers to the sub BVH */ - for (size_t k = 0; k < bvh->geometry.size(); ++k) { - bvh->geometry[k]->bvh = geom_bvhs[k]->sub_bvhs[i]; - } - - if (!bvh_multi->sub_bvhs[i]) { - BVHParams params = bvh->params; - if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX) - params.bvh_layout = BVH_LAYOUT_OPTIX; - else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) - params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? 
BVH_LAYOUT_OPTIX : - BVH_LAYOUT_EMBREE; - - /* Skip building a bottom level acceleration structure for non-instanced geometry on Embree - * (since they are put into the top level directly, see bvh_embree.cpp) */ - if (!params.top_level && params.bvh_layout == BVH_LAYOUT_EMBREE && - !bvh->geometry[0]->is_instanced()) { - i++; - continue; - } - - bvh_multi->sub_bvhs[i] = BVH::create(params, bvh->geometry, bvh->objects, sub.device); - } - - sub.device->build_bvh(bvh_multi->sub_bvhs[i], progress, refit); - i++; - } - - /* Change geometry BVH pointers back to the multi BVH. */ - for (size_t k = 0; k < bvh->geometry.size(); ++k) { - bvh->geometry[k]->bvh = geom_bvhs[k]; - } - } - - virtual void *osl_memory() override - { - if (devices.size() > 1) { - return NULL; - } - return devices.front().device->osl_memory(); - } - - bool is_resident(device_ptr key, Device *sub_device) override - { - foreach (SubDevice &sub, devices) { - if (sub.device == sub_device) { - return find_matching_mem_device(key, sub)->device == sub_device; - } - } - return false; - } - - SubDevice *find_matching_mem_device(device_ptr key, SubDevice &sub) - { - assert(key != 0 && (sub.peer_island_index >= 0 || sub.ptr_map.find(key) != sub.ptr_map.end())); - - /* Get the memory owner of this key (first try current device, then peer devices) */ - SubDevice *owner_sub = ⊂ - if (owner_sub->ptr_map.find(key) == owner_sub->ptr_map.end()) { - foreach (SubDevice *island_sub, peer_islands[sub.peer_island_index]) { - if (island_sub != owner_sub && - island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) { - owner_sub = island_sub; - } - } - } - return owner_sub; - } - - SubDevice *find_suitable_mem_device(device_ptr key, const vector<SubDevice *> &island) - { - assert(!island.empty()); - - /* Get the memory owner of this key or the device with the lowest memory usage when new */ - SubDevice *owner_sub = island.front(); - foreach (SubDevice *island_sub, island) { - if (key ? 
(island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) : - (island_sub->device->stats.mem_used < owner_sub->device->stats.mem_used)) { - owner_sub = island_sub; - } - } - return owner_sub; - } - - inline device_ptr find_matching_mem(device_ptr key, SubDevice &sub) - { - return find_matching_mem_device(key, sub)->ptr_map[key]; - } - - void mem_alloc(device_memory &mem) override - { - device_ptr key = unique_key++; - - if (mem.type == MEM_PIXELS) { - /* Always allocate pixels memory on all devices - * This is necessary to ensure PBOs are registered everywhere, which FILM_CONVERT uses */ - foreach (SubDevice &sub, devices) { - mem.device = sub.device; - mem.device_pointer = 0; - mem.device_size = 0; - - sub.device->mem_alloc(mem); - sub.ptr_map[key] = mem.device_pointer; - } - } - else { - assert(mem.type == MEM_READ_ONLY || mem.type == MEM_READ_WRITE || - mem.type == MEM_DEVICE_ONLY); - /* The remaining memory types can be distributed across devices */ - foreach (const vector<SubDevice *> &island, peer_islands) { - SubDevice *owner_sub = find_suitable_mem_device(key, island); - mem.device = owner_sub->device; - mem.device_pointer = 0; - mem.device_size = 0; - - owner_sub->device->mem_alloc(mem); - owner_sub->ptr_map[key] = mem.device_pointer; - } - } - - mem.device = this; - mem.device_pointer = key; - stats.mem_alloc(mem.device_size); - } - - void mem_copy_to(device_memory &mem) override - { - device_ptr existing_key = mem.device_pointer; - device_ptr key = (existing_key) ? existing_key : unique_key++; - size_t existing_size = mem.device_size; - - /* The tile buffers are allocated on each device (see below), so copy to all of them */ - if (strcmp(mem.name, "RenderBuffers") == 0 && use_denoising) { - foreach (SubDevice &sub, devices) { - mem.device = sub.device; - mem.device_pointer = (existing_key) ? 
sub.ptr_map[existing_key] : 0; - mem.device_size = existing_size; - - sub.device->mem_copy_to(mem); - sub.ptr_map[key] = mem.device_pointer; - } - } - else { - foreach (const vector<SubDevice *> &island, peer_islands) { - SubDevice *owner_sub = find_suitable_mem_device(existing_key, island); - mem.device = owner_sub->device; - mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0; - mem.device_size = existing_size; - - owner_sub->device->mem_copy_to(mem); - owner_sub->ptr_map[key] = mem.device_pointer; - - if (mem.type == MEM_GLOBAL || mem.type == MEM_TEXTURE) { - /* Need to create texture objects and update pointer in kernel globals on all devices */ - foreach (SubDevice *island_sub, island) { - if (island_sub != owner_sub) { - island_sub->device->mem_copy_to(mem); - } - } - } - } - } - - mem.device = this; - mem.device_pointer = key; - stats.mem_alloc(mem.device_size - existing_size); - } - - void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override - { - device_ptr key = mem.device_pointer; - int i = 0, sub_h = h / devices.size(); - - foreach (SubDevice &sub, devices) { - int sy = y + i * sub_h; - int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h; - - SubDevice *owner_sub = find_matching_mem_device(key, sub); - mem.device = owner_sub->device; - mem.device_pointer = owner_sub->ptr_map[key]; - - owner_sub->device->mem_copy_from(mem, sy, w, sh, elem); - i++; - } - - mem.device = this; - mem.device_pointer = key; - } - - void mem_zero(device_memory &mem) override - { - device_ptr existing_key = mem.device_pointer; - device_ptr key = (existing_key) ? 
existing_key : unique_key++; - size_t existing_size = mem.device_size; - - /* This is a hack to only allocate the tile buffers on denoising devices - * Similarly the tile buffers also need to be allocated separately on all devices so any - * overlap rendered for denoising does not interfere with each other */ - if (strcmp(mem.name, "RenderBuffers") == 0 && use_denoising) { - vector<device_ptr> device_pointers; - device_pointers.reserve(devices.size()); - - foreach (SubDevice &sub, devices) { - mem.device = sub.device; - mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0; - mem.device_size = existing_size; - - sub.device->mem_zero(mem); - sub.ptr_map[key] = mem.device_pointer; - - device_pointers.push_back(mem.device_pointer); - } - foreach (SubDevice &sub, denoising_devices) { - if (matching_rendering_and_denoising_devices) { - sub.ptr_map[key] = device_pointers.front(); - device_pointers.erase(device_pointers.begin()); - } - else { - mem.device = sub.device; - mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0; - mem.device_size = existing_size; - - sub.device->mem_zero(mem); - sub.ptr_map[key] = mem.device_pointer; - } - } - } - else { - foreach (const vector<SubDevice *> &island, peer_islands) { - SubDevice *owner_sub = find_suitable_mem_device(existing_key, island); - mem.device = owner_sub->device; - mem.device_pointer = (existing_key) ? 
owner_sub->ptr_map[existing_key] : 0; - mem.device_size = existing_size; - - owner_sub->device->mem_zero(mem); - owner_sub->ptr_map[key] = mem.device_pointer; - } - } - - mem.device = this; - mem.device_pointer = key; - stats.mem_alloc(mem.device_size - existing_size); - } - - void mem_free(device_memory &mem) override - { - device_ptr key = mem.device_pointer; - size_t existing_size = mem.device_size; - - /* Free memory that was allocated for all devices (see above) on each device */ - if (mem.type == MEM_PIXELS || (strcmp(mem.name, "RenderBuffers") == 0 && use_denoising)) { - foreach (SubDevice &sub, devices) { - mem.device = sub.device; - mem.device_pointer = sub.ptr_map[key]; - mem.device_size = existing_size; - - sub.device->mem_free(mem); - sub.ptr_map.erase(sub.ptr_map.find(key)); - } - foreach (SubDevice &sub, denoising_devices) { - if (matching_rendering_and_denoising_devices) { - sub.ptr_map.erase(key); - } - else { - mem.device = sub.device; - mem.device_pointer = sub.ptr_map[key]; - mem.device_size = existing_size; - - sub.device->mem_free(mem); - sub.ptr_map.erase(sub.ptr_map.find(key)); - } - } - } - else { - foreach (const vector<SubDevice *> &island, peer_islands) { - SubDevice *owner_sub = find_matching_mem_device(key, *island.front()); - mem.device = owner_sub->device; - mem.device_pointer = owner_sub->ptr_map[key]; - mem.device_size = existing_size; - - owner_sub->device->mem_free(mem); - owner_sub->ptr_map.erase(owner_sub->ptr_map.find(key)); - - if (mem.type == MEM_TEXTURE) { - /* Free texture objects on all devices */ - foreach (SubDevice *island_sub, island) { - if (island_sub != owner_sub) { - island_sub->device->mem_free(mem); - } - } - } - } - } - - mem.device = this; - mem.device_pointer = 0; - mem.device_size = 0; - stats.mem_free(existing_size); - } - - void const_copy_to(const char *name, void *host, size_t size) override - { - foreach (SubDevice &sub, devices) - sub.device->const_copy_to(name, host, size); - } - - void 
draw_pixels(device_memory &rgba, - int y, - int w, - int h, - int width, - int height, - int dx, - int dy, - int dw, - int dh, - bool transparent, - const DeviceDrawParams &draw_params) override - { - assert(rgba.type == MEM_PIXELS); - - device_ptr key = rgba.device_pointer; - int i = 0, sub_h = h / devices.size(); - int sub_height = height / devices.size(); - - foreach (SubDevice &sub, devices) { - int sy = y + i * sub_h; - int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h; - int sheight = (i == (int)devices.size() - 1) ? height - sub_height * i : sub_height; - int sdy = dy + i * sub_height; - /* adjust math for w/width */ - - rgba.device_pointer = sub.ptr_map[key]; - sub.device->draw_pixels( - rgba, sy, w, sh, width, sheight, dx, sdy, dw, dh, transparent, draw_params); - i++; - } - - rgba.device_pointer = key; - } - - void map_tile(Device *sub_device, RenderTile &tile) override - { - if (!tile.buffer) { - return; - } - - foreach (SubDevice &sub, devices) { - if (sub.device == sub_device) { - tile.buffer = find_matching_mem(tile.buffer, sub); - return; - } - } - - foreach (SubDevice &sub, denoising_devices) { - if (sub.device == sub_device) { - tile.buffer = sub.ptr_map[tile.buffer]; - return; - } - } - } - - int device_number(Device *sub_device) override - { - int i = 0; - - foreach (SubDevice &sub, devices) { - if (sub.device == sub_device) - return i; - i++; - } - - foreach (SubDevice &sub, denoising_devices) { - if (sub.device == sub_device) - return i; - i++; - } - - return -1; - } - - void map_neighbor_tiles(Device *sub_device, RenderTileNeighbors &neighbors) override - { - for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { - RenderTile &tile = neighbors.tiles[i]; - - if (!tile.buffers) { - continue; - } - - device_vector<float> &mem = tile.buffers->buffer; - tile.buffer = mem.device_pointer; - - if (mem.device == this && matching_rendering_and_denoising_devices) { - /* Skip unnecessary copies in viewport mode (buffer covers the - * whole 
image), but still need to fix up the tile device pointer. */ - map_tile(sub_device, tile); - continue; - } - - /* If the tile was rendered on another device, copy its memory to - * to the current device now, for the duration of the denoising task. - * Note that this temporarily modifies the RenderBuffers and calls - * the device, so this function is not thread safe. */ - if (mem.device != sub_device) { - /* Only copy from device to host once. This is faster, but - * also required for the case where a CPU thread is denoising - * a tile rendered on the GPU. In that case we have to avoid - * overwriting the buffer being de-noised by the CPU thread. */ - if (!tile.buffers->map_neighbor_copied) { - tile.buffers->map_neighbor_copied = true; - mem.copy_from_device(); - } - - if (mem.device == this) { - /* Can re-use memory if tile is already allocated on the sub device. */ - map_tile(sub_device, tile); - mem.swap_device(sub_device, mem.device_size, tile.buffer); - } - else { - mem.swap_device(sub_device, 0, 0); - } - - mem.copy_to_device(); - - tile.buffer = mem.device_pointer; - tile.device_size = mem.device_size; - - mem.restore_device(); - } - } - } - - void unmap_neighbor_tiles(Device *sub_device, RenderTileNeighbors &neighbors) override - { - RenderTile &target_tile = neighbors.target; - device_vector<float> &mem = target_tile.buffers->buffer; - - if (mem.device == this && matching_rendering_and_denoising_devices) { - return; - } - - /* Copy denoised result back to the host. */ - mem.swap_device(sub_device, target_tile.device_size, target_tile.buffer); - mem.copy_from_device(); - mem.restore_device(); - - /* Copy denoised result to the original device. 
*/ - mem.copy_to_device(); - - for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { - RenderTile &tile = neighbors.tiles[i]; - if (!tile.buffers) { - continue; - } - - device_vector<float> &mem = tile.buffers->buffer; - - if (mem.device != sub_device && mem.device != this) { - /* Free up memory again if it was allocated for the copy above. */ - mem.swap_device(sub_device, tile.device_size, tile.buffer); - sub_device->mem_free(mem); - mem.restore_device(); - } - } - } - - int get_split_task_count(DeviceTask &task) override - { - int total_tasks = 0; - list<DeviceTask> tasks; - task.split(tasks, devices.size()); - foreach (SubDevice &sub, devices) { - if (!tasks.empty()) { - DeviceTask subtask = tasks.front(); - tasks.pop_front(); - - total_tasks += sub.device->get_split_task_count(subtask); - } - } - return total_tasks; - } - - void task_add(DeviceTask &task) override - { - list<SubDevice> task_devices = devices; - if (!denoising_devices.empty()) { - if (task.type == DeviceTask::DENOISE_BUFFER) { - /* Denoising tasks should be redirected to the denoising devices entirely. */ - task_devices = denoising_devices; - } - else if (task.type == DeviceTask::RENDER && (task.tile_types & RenderTile::DENOISE)) { - const uint tile_types = task.tile_types; - /* For normal rendering tasks only redirect the denoising part to the denoising devices. - * Do not need to split the task here, since they all run through 'acquire_tile'. */ - task.tile_types = RenderTile::DENOISE; - foreach (SubDevice &sub, denoising_devices) { - sub.device->task_add(task); - } - /* Rendering itself should still be executed on the rendering devices. 
*/ - task.tile_types = tile_types ^ RenderTile::DENOISE; - } - } - - list<DeviceTask> tasks; - task.split(tasks, task_devices.size()); - - foreach (SubDevice &sub, task_devices) { - if (!tasks.empty()) { - DeviceTask subtask = tasks.front(); - tasks.pop_front(); - - if (task.buffer) - subtask.buffer = find_matching_mem(task.buffer, sub); - if (task.rgba_byte) - subtask.rgba_byte = sub.ptr_map[task.rgba_byte]; - if (task.rgba_half) - subtask.rgba_half = sub.ptr_map[task.rgba_half]; - if (task.shader_input) - subtask.shader_input = find_matching_mem(task.shader_input, sub); - if (task.shader_output) - subtask.shader_output = find_matching_mem(task.shader_output, sub); - - sub.device->task_add(subtask); - - if (task.buffers && task.buffers->buffer.device == this) { - /* Synchronize access to RenderBuffers, since 'map_neighbor_tiles' is not thread-safe. */ - sub.device->task_wait(); - } - } - } - } - - void task_wait() override - { - foreach (SubDevice &sub, devices) - sub.device->task_wait(); - foreach (SubDevice &sub, denoising_devices) - sub.device->task_wait(); - } - - void task_cancel() override - { - foreach (SubDevice &sub, devices) - sub.device->task_cancel(); - foreach (SubDevice &sub, denoising_devices) - sub.device->task_cancel(); - } -}; - -Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) -{ - return new MultiDevice(info, stats, profiler, background); -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp deleted file mode 100644 index 8904b517e92..00000000000 --- a/intern/cycles/device/device_network.cpp +++ /dev/null @@ -1,812 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "device/device_network.h" -#include "device/device.h" -#include "device/device_intern.h" - -#include "util/util_foreach.h" -#include "util/util_logging.h" - -#if defined(WITH_NETWORK) - -CCL_NAMESPACE_BEGIN - -typedef map<device_ptr, device_ptr> PtrMap; -typedef vector<uint8_t> DataVector; -typedef map<device_ptr, DataVector> DataMap; - -/* tile list */ -typedef vector<RenderTile> TileList; - -/* search a list of tiles and find the one that matches the passed render tile */ -static TileList::iterator tile_list_find(TileList &tile_list, RenderTile &tile) -{ - for (TileList::iterator it = tile_list.begin(); it != tile_list.end(); ++it) - if (tile.x == it->x && tile.y == it->y && tile.start_sample == it->start_sample) - return it; - return tile_list.end(); -} - -class NetworkDevice : public Device { - public: - boost::asio::io_service io_service; - tcp::socket socket; - device_ptr mem_counter; - DeviceTask the_task; /* todo: handle multiple tasks */ - - thread_mutex rpc_lock; - - virtual bool show_samples() const - { - return false; - } - - NetworkDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, const char *address) - : Device(info, stats, profiler, true), socket(io_service) - { - error_func = NetworkError(); - stringstream portstr; - portstr << SERVER_PORT; - - tcp::resolver resolver(io_service); - tcp::resolver::query query(address, portstr.str()); - tcp::resolver::iterator endpoint_iterator = resolver.resolve(query); - tcp::resolver::iterator end; - - boost::system::error_code error = boost::asio::error::host_not_found; - 
while (error && endpoint_iterator != end) { - socket.close(); - socket.connect(*endpoint_iterator++, error); - } - - if (error) - error_func.network_error(error.message()); - - mem_counter = 0; - } - - ~NetworkDevice() - { - RPCSend snd(socket, &error_func, "stop"); - snd.write(); - } - - virtual BVHLayoutMask get_bvh_layout_mask() const - { - return BVH_LAYOUT_BVH2; - } - - void mem_alloc(device_memory &mem) - { - if (mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - } - - thread_scoped_lock lock(rpc_lock); - - mem.device_pointer = ++mem_counter; - - RPCSend snd(socket, &error_func, "mem_alloc"); - snd.add(mem); - snd.write(); - } - - void mem_copy_to(device_memory &mem) - { - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "mem_copy_to"); - - snd.add(mem); - snd.write(); - snd.write_buffer(mem.host_pointer, mem.memory_size()); - } - - void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) - { - thread_scoped_lock lock(rpc_lock); - - size_t data_size = mem.memory_size(); - - RPCSend snd(socket, &error_func, "mem_copy_from"); - - snd.add(mem); - snd.add(y); - snd.add(w); - snd.add(h); - snd.add(elem); - snd.write(); - - RPCReceive rcv(socket, &error_func); - rcv.read_buffer(mem.host_pointer, data_size); - } - - void mem_zero(device_memory &mem) - { - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "mem_zero"); - - snd.add(mem); - snd.write(); - } - - void mem_free(device_memory &mem) - { - if (mem.device_pointer) { - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "mem_free"); - - snd.add(mem); - snd.write(); - - mem.device_pointer = 0; - } - } - - void const_copy_to(const char *name, void *host, size_t size) - { - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "const_copy_to"); - - string name_string(name); - - 
snd.add(name_string); - snd.add(size); - snd.write(); - snd.write_buffer(host, size); - } - - bool load_kernels(const DeviceRequestedFeatures &requested_features) - { - if (error_func.have_error()) - return false; - - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "load_kernels"); - snd.add(requested_features.experimental); - snd.add(requested_features.max_closure); - snd.add(requested_features.max_nodes_group); - snd.add(requested_features.nodes_features); - snd.write(); - - bool result; - RPCReceive rcv(socket, &error_func); - rcv.read(result); - - return result; - } - - void task_add(DeviceTask &task) - { - thread_scoped_lock lock(rpc_lock); - - the_task = task; - - RPCSend snd(socket, &error_func, "task_add"); - snd.add(task); - snd.write(); - } - - void task_wait() - { - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "task_wait"); - snd.write(); - - lock.unlock(); - - TileList the_tiles; - - /* todo: run this threaded for connecting to multiple clients */ - for (;;) { - if (error_func.have_error()) - break; - - RenderTile tile; - - lock.lock(); - RPCReceive rcv(socket, &error_func); - - if (rcv.name == "acquire_tile") { - lock.unlock(); - - /* todo: watch out for recursive calls! 
*/ - if (the_task.acquire_tile(this, tile)) { /* write return as bool */ - the_tiles.push_back(tile); - - lock.lock(); - RPCSend snd(socket, &error_func, "acquire_tile"); - snd.add(tile); - snd.write(); - lock.unlock(); - } - else { - lock.lock(); - RPCSend snd(socket, &error_func, "acquire_tile_none"); - snd.write(); - lock.unlock(); - } - } - else if (rcv.name == "release_tile") { - rcv.read(tile); - lock.unlock(); - - TileList::iterator it = tile_list_find(the_tiles, tile); - if (it != the_tiles.end()) { - tile.buffers = it->buffers; - the_tiles.erase(it); - } - - assert(tile.buffers != NULL); - - the_task.release_tile(tile); - - lock.lock(); - RPCSend snd(socket, &error_func, "release_tile"); - snd.write(); - lock.unlock(); - } - else if (rcv.name == "task_wait_done") { - lock.unlock(); - break; - } - else - lock.unlock(); - } - } - - void task_cancel() - { - thread_scoped_lock lock(rpc_lock); - RPCSend snd(socket, &error_func, "task_cancel"); - snd.write(); - } - - int get_split_task_count(DeviceTask &) - { - return 1; - } - - private: - NetworkError error_func; -}; - -Device *device_network_create(DeviceInfo &info, - Stats &stats, - Profiler &profiler, - const char *address) -{ - return new NetworkDevice(info, stats, profiler, address); -} - -void device_network_info(vector<DeviceInfo> &devices) -{ - DeviceInfo info; - - info.type = DEVICE_NETWORK; - info.description = "Network Device"; - info.id = "NETWORK"; - info.num = 0; - - /* todo: get this info from device */ - info.has_volume_decoupled = false; - info.has_adaptive_stop_per_sample = false; - info.has_osl = false; - info.denoisers = DENOISER_NONE; - - devices.push_back(info); -} - -class DeviceServer { - public: - thread_mutex rpc_lock; - - void network_error(const string &message) - { - error_func.network_error(message); - } - - bool have_error() - { - return error_func.have_error(); - } - - DeviceServer(Device *device_, tcp::socket &socket_) - : device(device_), socket(socket_), stop(false), 
blocked_waiting(false) - { - error_func = NetworkError(); - } - - void listen() - { - /* receive remote function calls */ - for (;;) { - listen_step(); - - if (stop) - break; - } - } - - protected: - void listen_step() - { - thread_scoped_lock lock(rpc_lock); - RPCReceive rcv(socket, &error_func); - - if (rcv.name == "stop") - stop = true; - else - process(rcv, lock); - } - - /* create a memory buffer for a device buffer and insert it into mem_data */ - DataVector &data_vector_insert(device_ptr client_pointer, size_t data_size) - { - /* create a new DataVector and insert it into mem_data */ - pair<DataMap::iterator, bool> data_ins = mem_data.insert( - DataMap::value_type(client_pointer, DataVector())); - - /* make sure it was a unique insertion */ - assert(data_ins.second); - - /* get a reference to the inserted vector */ - DataVector &data_v = data_ins.first->second; - - /* size the vector */ - data_v.resize(data_size); - - return data_v; - } - - DataVector &data_vector_find(device_ptr client_pointer) - { - DataMap::iterator i = mem_data.find(client_pointer); - assert(i != mem_data.end()); - return i->second; - } - - /* setup mapping and reverse mapping of client_pointer<->real_pointer */ - void pointer_mapping_insert(device_ptr client_pointer, device_ptr real_pointer) - { - pair<PtrMap::iterator, bool> mapins; - - /* insert mapping from client pointer to our real device pointer */ - mapins = ptr_map.insert(PtrMap::value_type(client_pointer, real_pointer)); - assert(mapins.second); - - /* insert reverse mapping from real our device pointer to client pointer */ - mapins = ptr_imap.insert(PtrMap::value_type(real_pointer, client_pointer)); - assert(mapins.second); - } - - device_ptr device_ptr_from_client_pointer(device_ptr client_pointer) - { - PtrMap::iterator i = ptr_map.find(client_pointer); - assert(i != ptr_map.end()); - return i->second; - } - - device_ptr device_ptr_from_client_pointer_erase(device_ptr client_pointer) - { - PtrMap::iterator i = 
ptr_map.find(client_pointer); - assert(i != ptr_map.end()); - - device_ptr result = i->second; - - /* erase the mapping */ - ptr_map.erase(i); - - /* erase the reverse mapping */ - PtrMap::iterator irev = ptr_imap.find(result); - assert(irev != ptr_imap.end()); - ptr_imap.erase(irev); - - /* erase the data vector */ - DataMap::iterator idata = mem_data.find(client_pointer); - assert(idata != mem_data.end()); - mem_data.erase(idata); - - return result; - } - - /* note that the lock must be already acquired upon entry. - * This is necessary because the caller often peeks at - * the header and delegates control to here when it doesn't - * specifically handle the current RPC. - * The lock must be unlocked before returning */ - void process(RPCReceive &rcv, thread_scoped_lock &lock) - { - if (rcv.name == "mem_alloc") { - string name; - network_device_memory mem(device); - rcv.read(mem, name); - lock.unlock(); - - /* Allocate host side data buffer. */ - size_t data_size = mem.memory_size(); - device_ptr client_pointer = mem.device_pointer; - - DataVector &data_v = data_vector_insert(client_pointer, data_size); - mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0; - - /* Perform the allocation on the actual device. */ - device->mem_alloc(mem); - - /* Store a mapping to/from client_pointer and real device pointer. */ - pointer_mapping_insert(client_pointer, mem.device_pointer); - } - else if (rcv.name == "mem_copy_to") { - string name; - network_device_memory mem(device); - rcv.read(mem, name); - lock.unlock(); - - size_t data_size = mem.memory_size(); - device_ptr client_pointer = mem.device_pointer; - - if (client_pointer) { - /* Lookup existing host side data buffer. */ - DataVector &data_v = data_vector_find(client_pointer); - mem.host_pointer = (void *)&data_v[0]; - - /* Translate the client pointer to a real device pointer. */ - mem.device_pointer = device_ptr_from_client_pointer(client_pointer); - } - else { - /* Allocate host side data buffer. 
*/ - DataVector &data_v = data_vector_insert(client_pointer, data_size); - mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0; - } - - /* Copy data from network into memory buffer. */ - rcv.read_buffer((uint8_t *)mem.host_pointer, data_size); - - /* Copy the data from the memory buffer to the device buffer. */ - device->mem_copy_to(mem); - - if (!client_pointer) { - /* Store a mapping to/from client_pointer and real device pointer. */ - pointer_mapping_insert(client_pointer, mem.device_pointer); - } - } - else if (rcv.name == "mem_copy_from") { - string name; - network_device_memory mem(device); - int y, w, h, elem; - - rcv.read(mem, name); - rcv.read(y); - rcv.read(w); - rcv.read(h); - rcv.read(elem); - - device_ptr client_pointer = mem.device_pointer; - mem.device_pointer = device_ptr_from_client_pointer(client_pointer); - - DataVector &data_v = data_vector_find(client_pointer); - - mem.host_pointer = (device_ptr) & (data_v[0]); - - device->mem_copy_from(mem, y, w, h, elem); - - size_t data_size = mem.memory_size(); - - RPCSend snd(socket, &error_func, "mem_copy_from"); - snd.write(); - snd.write_buffer((uint8_t *)mem.host_pointer, data_size); - lock.unlock(); - } - else if (rcv.name == "mem_zero") { - string name; - network_device_memory mem(device); - rcv.read(mem, name); - lock.unlock(); - - size_t data_size = mem.memory_size(); - device_ptr client_pointer = mem.device_pointer; - - if (client_pointer) { - /* Lookup existing host side data buffer. */ - DataVector &data_v = data_vector_find(client_pointer); - mem.host_pointer = (void *)&data_v[0]; - - /* Translate the client pointer to a real device pointer. */ - mem.device_pointer = device_ptr_from_client_pointer(client_pointer); - } - else { - /* Allocate host side data buffer. */ - DataVector &data_v = data_vector_insert(client_pointer, data_size); - mem.host_pointer = (void *) ? (device_ptr) & (data_v[0]) : 0; - } - - /* Zero memory. 
*/ - device->mem_zero(mem); - - if (!client_pointer) { - /* Store a mapping to/from client_pointer and real device pointer. */ - pointer_mapping_insert(client_pointer, mem.device_pointer); - } - } - else if (rcv.name == "mem_free") { - string name; - network_device_memory mem(device); - - rcv.read(mem, name); - lock.unlock(); - - device_ptr client_pointer = mem.device_pointer; - - mem.device_pointer = device_ptr_from_client_pointer_erase(client_pointer); - - device->mem_free(mem); - } - else if (rcv.name == "const_copy_to") { - string name_string; - size_t size; - - rcv.read(name_string); - rcv.read(size); - - vector<char> host_vector(size); - rcv.read_buffer(&host_vector[0], size); - lock.unlock(); - - device->const_copy_to(name_string.c_str(), &host_vector[0], size); - } - else if (rcv.name == "load_kernels") { - DeviceRequestedFeatures requested_features; - rcv.read(requested_features.experimental); - rcv.read(requested_features.max_closure); - rcv.read(requested_features.max_nodes_group); - rcv.read(requested_features.nodes_features); - - bool result; - result = device->load_kernels(requested_features); - RPCSend snd(socket, &error_func, "load_kernels"); - snd.add(result); - snd.write(); - lock.unlock(); - } - else if (rcv.name == "task_add") { - DeviceTask task; - - rcv.read(task); - lock.unlock(); - - if (task.buffer) - task.buffer = device_ptr_from_client_pointer(task.buffer); - - if (task.rgba_half) - task.rgba_half = device_ptr_from_client_pointer(task.rgba_half); - - if (task.rgba_byte) - task.rgba_byte = device_ptr_from_client_pointer(task.rgba_byte); - - if (task.shader_input) - task.shader_input = device_ptr_from_client_pointer(task.shader_input); - - if (task.shader_output) - task.shader_output = device_ptr_from_client_pointer(task.shader_output); - - task.acquire_tile = function_bind(&DeviceServer::task_acquire_tile, this, _1, _2); - task.release_tile = function_bind(&DeviceServer::task_release_tile, this, _1); - task.update_progress_sample = 
function_bind(&DeviceServer::task_update_progress_sample, - this); - task.update_tile_sample = function_bind(&DeviceServer::task_update_tile_sample, this, _1); - task.get_cancel = function_bind(&DeviceServer::task_get_cancel, this); - - device->task_add(task); - } - else if (rcv.name == "task_wait") { - lock.unlock(); - - blocked_waiting = true; - device->task_wait(); - blocked_waiting = false; - - lock.lock(); - RPCSend snd(socket, &error_func, "task_wait_done"); - snd.write(); - lock.unlock(); - } - else if (rcv.name == "task_cancel") { - lock.unlock(); - device->task_cancel(); - } - else if (rcv.name == "acquire_tile") { - AcquireEntry entry; - entry.name = rcv.name; - rcv.read(entry.tile); - acquire_queue.push_back(entry); - lock.unlock(); - } - else if (rcv.name == "acquire_tile_none") { - AcquireEntry entry; - entry.name = rcv.name; - acquire_queue.push_back(entry); - lock.unlock(); - } - else if (rcv.name == "release_tile") { - AcquireEntry entry; - entry.name = rcv.name; - acquire_queue.push_back(entry); - lock.unlock(); - } - else { - cout << "Error: unexpected RPC receive call \"" + rcv.name + "\"\n"; - lock.unlock(); - } - } - - bool task_acquire_tile(Device *, RenderTile &tile) - { - thread_scoped_lock acquire_lock(acquire_mutex); - - bool result = false; - - RPCSend snd(socket, &error_func, "acquire_tile"); - snd.write(); - - do { - if (blocked_waiting) - listen_step(); - - /* todo: avoid busy wait loop */ - thread_scoped_lock lock(rpc_lock); - - if (!acquire_queue.empty()) { - AcquireEntry entry = acquire_queue.front(); - acquire_queue.pop_front(); - - if (entry.name == "acquire_tile") { - tile = entry.tile; - - if (tile.buffer) - tile.buffer = ptr_map[tile.buffer]; - - result = true; - break; - } - else if (entry.name == "acquire_tile_none") { - break; - } - else { - cout << "Error: unexpected acquire RPC receive call \"" + entry.name + "\"\n"; - } - } - } while (acquire_queue.empty() && !stop && !have_error()); - - return result; - } - - void 
task_update_progress_sample() - { - ; /* skip */ - } - - void task_update_tile_sample(RenderTile &) - { - ; /* skip */ - } - - void task_release_tile(RenderTile &tile) - { - thread_scoped_lock acquire_lock(acquire_mutex); - - if (tile.buffer) - tile.buffer = ptr_imap[tile.buffer]; - - { - thread_scoped_lock lock(rpc_lock); - RPCSend snd(socket, &error_func, "release_tile"); - snd.add(tile); - snd.write(); - lock.unlock(); - } - - do { - if (blocked_waiting) - listen_step(); - - /* todo: avoid busy wait loop */ - thread_scoped_lock lock(rpc_lock); - - if (!acquire_queue.empty()) { - AcquireEntry entry = acquire_queue.front(); - acquire_queue.pop_front(); - - if (entry.name == "release_tile") { - lock.unlock(); - break; - } - else { - cout << "Error: unexpected release RPC receive call \"" + entry.name + "\"\n"; - } - } - } while (acquire_queue.empty() && !stop); - } - - bool task_get_cancel() - { - return false; - } - - /* properties */ - Device *device; - tcp::socket &socket; - - /* mapping of remote to local pointer */ - PtrMap ptr_map; - PtrMap ptr_imap; - DataMap mem_data; - - struct AcquireEntry { - string name; - RenderTile tile; - }; - - thread_mutex acquire_mutex; - list<AcquireEntry> acquire_queue; - - bool stop; - bool blocked_waiting; - - private: - NetworkError error_func; - - /* todo: free memory and device (osl) on network error */ -}; - -void Device::server_run() -{ - try { - /* starts thread that responds to discovery requests */ - ServerDiscovery discovery; - - for (;;) { - /* accept connection */ - boost::asio::io_service io_service; - tcp::acceptor acceptor(io_service, tcp::endpoint(tcp::v4(), SERVER_PORT)); - - tcp::socket socket(io_service); - acceptor.accept(socket); - - string remote_address = socket.remote_endpoint().address().to_string(); - printf("Connected to remote client at: %s\n", remote_address.c_str()); - - DeviceServer server(this, socket); - server.listen(); - - printf("Disconnected.\n"); - } - } - catch (exception &e) { - 
fprintf(stderr, "Network server exception: %s\n", e.what()); - } -} - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h deleted file mode 100644 index b3a0f6daa57..00000000000 --- a/intern/cycles/device/device_network.h +++ /dev/null @@ -1,490 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __DEVICE_NETWORK_H__ -#define __DEVICE_NETWORK_H__ - -#ifdef WITH_NETWORK - -# include <boost/archive/binary_iarchive.hpp> -# include <boost/archive/binary_oarchive.hpp> -# include <boost/archive/text_iarchive.hpp> -# include <boost/archive/text_oarchive.hpp> -# include <boost/array.hpp> -# include <boost/asio.hpp> -# include <boost/bind.hpp> -# include <boost/serialization/vector.hpp> -# include <boost/thread.hpp> - -# include <deque> -# include <iostream> -# include <sstream> - -# include "render/buffers.h" - -# include "util/util_foreach.h" -# include "util/util_list.h" -# include "util/util_map.h" -# include "util/util_param.h" -# include "util/util_string.h" - -CCL_NAMESPACE_BEGIN - -using std::cerr; -using std::cout; -using std::exception; -using std::hex; -using std::setw; - -using boost::asio::ip::tcp; - -static const int SERVER_PORT = 5120; -static const int DISCOVER_PORT = 5121; -static const string DISCOVER_REQUEST_MSG = "REQUEST_RENDER_SERVER_IP"; -static const string DISCOVER_REPLY_MSG = "REPLY_RENDER_SERVER_IP"; - -# 
if 0 -typedef boost::archive::text_oarchive o_archive; -typedef boost::archive::text_iarchive i_archive; -# else -typedef boost::archive::binary_oarchive o_archive; -typedef boost::archive::binary_iarchive i_archive; -# endif - -/* Serialization of device memory */ - -class network_device_memory : public device_memory { - public: - network_device_memory(Device *device) : device_memory(device, "", MEM_READ_ONLY) - { - } - - ~network_device_memory() - { - device_pointer = 0; - }; - - vector<char> local_data; -}; - -/* Common network error function / object for both DeviceNetwork and DeviceServer. */ -class NetworkError { - public: - NetworkError() - { - error = ""; - error_count = 0; - } - - ~NetworkError() - { - } - - void network_error(const string &message) - { - error = message; - error_count += 1; - } - - bool have_error() - { - return true ? error_count > 0 : false; - } - - private: - string error; - int error_count; -}; - -/* Remote procedure call Send */ - -class RPCSend { - public: - RPCSend(tcp::socket &socket_, NetworkError *e, const string &name_ = "") - : name(name_), socket(socket_), archive(archive_stream), sent(false) - { - archive &name_; - error_func = e; - fprintf(stderr, "rpc send %s\n", name.c_str()); - } - - ~RPCSend() - { - } - - void add(const device_memory &mem) - { - archive &mem.data_type &mem.data_elements &mem.data_size; - archive &mem.data_width &mem.data_height &mem.data_depth &mem.device_pointer; - archive &mem.type &string(mem.name); - archive &mem.interpolation &mem.extension; - archive &mem.device_pointer; - } - - template<typename T> void add(const T &data) - { - archive &data; - } - - void add(const DeviceTask &task) - { - int type = (int)task.type; - archive &type &task.x &task.y &task.w &task.h; - archive &task.rgba_byte &task.rgba_half &task.buffer &task.sample &task.num_samples; - archive &task.offset &task.stride; - archive &task.shader_input &task.shader_output &task.shader_eval_type; - archive &task.shader_x &task.shader_w; 
- archive &task.need_finish_queue; - } - - void add(const RenderTile &tile) - { - archive &tile.x &tile.y &tile.w &tile.h; - archive &tile.start_sample &tile.num_samples &tile.sample; - archive &tile.resolution &tile.offset &tile.stride; - archive &tile.buffer; - } - - void write() - { - boost::system::error_code error; - - /* get string from stream */ - string archive_str = archive_stream.str(); - - /* first send fixed size header with size of following data */ - ostringstream header_stream; - header_stream << setw(8) << hex << archive_str.size(); - string header_str = header_stream.str(); - - boost::asio::write( - socket, boost::asio::buffer(header_str), boost::asio::transfer_all(), error); - - if (error.value()) - error_func->network_error(error.message()); - - /* then send actual data */ - boost::asio::write( - socket, boost::asio::buffer(archive_str), boost::asio::transfer_all(), error); - - if (error.value()) - error_func->network_error(error.message()); - - sent = true; - } - - void write_buffer(void *buffer, size_t size) - { - boost::system::error_code error; - - boost::asio::write( - socket, boost::asio::buffer(buffer, size), boost::asio::transfer_all(), error); - - if (error.value()) - error_func->network_error(error.message()); - } - - protected: - string name; - tcp::socket &socket; - ostringstream archive_stream; - o_archive archive; - bool sent; - NetworkError *error_func; -}; - -/* Remote procedure call Receive */ - -class RPCReceive { - public: - RPCReceive(tcp::socket &socket_, NetworkError *e) - : socket(socket_), archive_stream(NULL), archive(NULL) - { - error_func = e; - /* read head with fixed size */ - vector<char> header(8); - boost::system::error_code error; - size_t len = boost::asio::read(socket, boost::asio::buffer(header), error); - - if (error.value()) { - error_func->network_error(error.message()); - } - - /* verify if we got something */ - if (len == header.size()) { - /* decode header */ - string header_str(&header[0], 
header.size()); - istringstream header_stream(header_str); - - size_t data_size; - - if ((header_stream >> hex >> data_size)) { - - vector<char> data(data_size); - size_t len = boost::asio::read(socket, boost::asio::buffer(data), error); - - if (error.value()) - error_func->network_error(error.message()); - - if (len == data_size) { - archive_str = (data.size()) ? string(&data[0], data.size()) : string(""); - - archive_stream = new istringstream(archive_str); - archive = new i_archive(*archive_stream); - - *archive &name; - fprintf(stderr, "rpc receive %s\n", name.c_str()); - } - else { - error_func->network_error("Network receive error: data size doesn't match header"); - } - } - else { - error_func->network_error("Network receive error: can't decode data size from header"); - } - } - else { - error_func->network_error("Network receive error: invalid header size"); - } - } - - ~RPCReceive() - { - delete archive; - delete archive_stream; - } - - void read(network_device_memory &mem, string &name) - { - *archive &mem.data_type &mem.data_elements &mem.data_size; - *archive &mem.data_width &mem.data_height &mem.data_depth &mem.device_pointer; - *archive &mem.type &name; - *archive &mem.interpolation &mem.extension; - *archive &mem.device_pointer; - - mem.name = name.c_str(); - mem.host_pointer = 0; - - /* Can't transfer OpenGL texture over network. 
*/ - if (mem.type == MEM_PIXELS) { - mem.type = MEM_READ_WRITE; - } - } - - template<typename T> void read(T &data) - { - *archive &data; - } - - void read_buffer(void *buffer, size_t size) - { - boost::system::error_code error; - size_t len = boost::asio::read(socket, boost::asio::buffer(buffer, size), error); - - if (error.value()) { - error_func->network_error(error.message()); - } - - if (len != size) - cout << "Network receive error: buffer size doesn't match expected size\n"; - } - - void read(DeviceTask &task) - { - int type; - - *archive &type &task.x &task.y &task.w &task.h; - *archive &task.rgba_byte &task.rgba_half &task.buffer &task.sample &task.num_samples; - *archive &task.offset &task.stride; - *archive &task.shader_input &task.shader_output &task.shader_eval_type; - *archive &task.shader_x &task.shader_w; - *archive &task.need_finish_queue; - - task.type = (DeviceTask::Type)type; - } - - void read(RenderTile &tile) - { - *archive &tile.x &tile.y &tile.w &tile.h; - *archive &tile.start_sample &tile.num_samples &tile.sample; - *archive &tile.resolution &tile.offset &tile.stride; - *archive &tile.buffer; - - tile.buffers = NULL; - } - - string name; - - protected: - tcp::socket &socket; - string archive_str; - istringstream *archive_stream; - i_archive *archive; - NetworkError *error_func; -}; - -/* Server auto discovery */ - -class ServerDiscovery { - public: - explicit ServerDiscovery(bool discover = false) - : listen_socket(io_service), collect_servers(false) - { - /* setup listen socket */ - listen_endpoint.address(boost::asio::ip::address_v4::any()); - listen_endpoint.port(DISCOVER_PORT); - - listen_socket.open(listen_endpoint.protocol()); - - boost::asio::socket_base::reuse_address option(true); - listen_socket.set_option(option); - - listen_socket.bind(listen_endpoint); - - /* setup receive callback */ - async_receive(); - - /* start server discovery */ - if (discover) { - collect_servers = true; - servers.clear(); - - 
broadcast_message(DISCOVER_REQUEST_MSG); - } - - /* start thread */ - work = new boost::asio::io_service::work(io_service); - thread = new boost::thread(boost::bind(&boost::asio::io_service::run, &io_service)); - } - - ~ServerDiscovery() - { - io_service.stop(); - thread->join(); - delete thread; - delete work; - } - - vector<string> get_server_list() - { - vector<string> result; - - mutex.lock(); - result = vector<string>(servers.begin(), servers.end()); - mutex.unlock(); - - return result; - } - - private: - void handle_receive_from(const boost::system::error_code &error, size_t size) - { - if (error) { - cout << "Server discovery receive error: " << error.message() << "\n"; - return; - } - - if (size > 0) { - string msg = string(receive_buffer, size); - - /* handle incoming message */ - if (collect_servers) { - if (msg == DISCOVER_REPLY_MSG) { - string address = receive_endpoint.address().to_string(); - - mutex.lock(); - - /* add address if it's not already in the list */ - bool found = std::find(servers.begin(), servers.end(), address) != servers.end(); - - if (!found) - servers.push_back(address); - - mutex.unlock(); - } - } - else { - /* reply to request */ - if (msg == DISCOVER_REQUEST_MSG) - broadcast_message(DISCOVER_REPLY_MSG); - } - } - - async_receive(); - } - - void async_receive() - { - listen_socket.async_receive_from(boost::asio::buffer(receive_buffer), - receive_endpoint, - boost::bind(&ServerDiscovery::handle_receive_from, - this, - boost::asio::placeholders::error, - boost::asio::placeholders::bytes_transferred)); - } - - void broadcast_message(const string &msg) - { - /* setup broadcast socket */ - boost::asio::ip::udp::socket socket(io_service); - - socket.open(boost::asio::ip::udp::v4()); - - boost::asio::socket_base::broadcast option(true); - socket.set_option(option); - - boost::asio::ip::udp::endpoint broadcast_endpoint( - boost::asio::ip::address::from_string("255.255.255.255"), DISCOVER_PORT); - - /* broadcast message */ - 
socket.send_to(boost::asio::buffer(msg), broadcast_endpoint); - } - - /* network service and socket */ - boost::asio::io_service io_service; - boost::asio::ip::udp::endpoint listen_endpoint; - boost::asio::ip::udp::socket listen_socket; - - /* threading */ - boost::thread *thread; - boost::asio::io_service::work *work; - boost::mutex mutex; - - /* buffer and endpoint for receiving messages */ - char receive_buffer[256]; - boost::asio::ip::udp::endpoint receive_endpoint; - - // os, version, devices, status, host name, group name, ip as far as fields go - struct ServerInfo { - string cycles_version; - string os; - int device_count; - string status; - string host_name; - string group_name; - string host_addr; - }; - - /* collection of server addresses in list */ - bool collect_servers; - vector<string> servers; -}; - -CCL_NAMESPACE_END - -#endif - -#endif /* __DEVICE_NETWORK_H__ */ diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp deleted file mode 100644 index 9abb7cfb7fe..00000000000 --- a/intern/cycles/device/device_opencl.cpp +++ /dev/null @@ -1,245 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef WITH_OPENCL - -# include "device/opencl/device_opencl.h" -# include "device/device.h" -# include "device/device_intern.h" - -# include "util/util_foreach.h" -# include "util/util_logging.h" -# include "util/util_set.h" -# include "util/util_string.h" - -CCL_NAMESPACE_BEGIN - -Device *device_opencl_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) -{ - return opencl_create_split_device(info, stats, profiler, background); -} - -bool device_opencl_init() -{ - static bool initialized = false; - static bool result = false; - - if (initialized) - return result; - - initialized = true; - - if (OpenCLInfo::device_type() != 0) { - int clew_result = clewInit(); - if (clew_result == CLEW_SUCCESS) { - VLOG(1) << "CLEW initialization succeeded."; - result = true; - } - else { - VLOG(1) << "CLEW initialization failed: " - << ((clew_result == CLEW_ERROR_ATEXIT_FAILED) ? "Error setting up atexit() handler" : - "Error opening the library"); - } - } - else { - VLOG(1) << "Skip initializing CLEW, platform is force disabled."; - result = false; - } - - return result; -} - -static cl_int device_opencl_get_num_platforms_safe(cl_uint *num_platforms) -{ -# ifdef _WIN32 - __try { - return clGetPlatformIDs(0, NULL, num_platforms); - } - __except (EXCEPTION_EXECUTE_HANDLER) { - /* Ignore crashes inside the OpenCL driver and hope we can - * survive even with corrupted OpenCL installs. */ - fprintf(stderr, "Cycles OpenCL: driver crashed, continuing without OpenCL.\n"); - } - - *num_platforms = 0; - return CL_DEVICE_NOT_FOUND; -# else - return clGetPlatformIDs(0, NULL, num_platforms); -# endif -} - -void device_opencl_info(vector<DeviceInfo> &devices) -{ - cl_uint num_platforms = 0; - device_opencl_get_num_platforms_safe(&num_platforms); - if (num_platforms == 0) { - return; - } - - vector<OpenCLPlatformDevice> usable_devices; - OpenCLInfo::get_usable_devices(&usable_devices); - /* Devices are numbered consecutively across platforms. 
*/ - int num_devices = 0; - set<string> unique_ids; - foreach (OpenCLPlatformDevice &platform_device, usable_devices) { - /* Compute unique ID for persistent user preferences. */ - const string &platform_name = platform_device.platform_name; - const string &device_name = platform_device.device_name; - string hardware_id = platform_device.hardware_id; - if (hardware_id == "") { - hardware_id = string_printf("ID_%d", num_devices); - } - string id = string("OPENCL_") + platform_name + "_" + device_name + "_" + hardware_id; - - /* Hardware ID might not be unique, add device number in that case. */ - if (unique_ids.find(id) != unique_ids.end()) { - id += string_printf("_ID_%d", num_devices); - } - unique_ids.insert(id); - - /* Create DeviceInfo. */ - DeviceInfo info; - info.type = DEVICE_OPENCL; - info.description = string_remove_trademark(string(device_name)); - info.num = num_devices; - /* We don't know if it's used for display, but assume it is. */ - info.display_device = true; - info.use_split_kernel = true; - info.has_volume_decoupled = false; - info.has_adaptive_stop_per_sample = false; - info.denoisers = DENOISER_NLM; - info.id = id; - - /* Check OpenCL extensions */ - info.has_half_images = platform_device.device_extensions.find("cl_khr_fp16") != string::npos; - - /* Disabled for now due to apparent AMD driver bug. */ - info.has_nanovdb = platform_name != "AMD Accelerated Parallel Processing"; - - devices.push_back(info); - num_devices++; - } -} - -string device_opencl_capabilities() -{ - if (OpenCLInfo::device_type() == 0) { - return "All OpenCL devices are forced to be OFF"; - } - string result = ""; - string error_msg = ""; /* Only used by opencl_assert(), but in the future - * it could also be nicely reported to the console. 
- */ - cl_uint num_platforms = 0; - opencl_assert(device_opencl_get_num_platforms_safe(&num_platforms)); - if (num_platforms == 0) { - return "No OpenCL platforms found\n"; - } - result += string_printf("Number of platforms: %u\n", num_platforms); - - vector<cl_platform_id> platform_ids; - platform_ids.resize(num_platforms); - opencl_assert(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL)); - -# define APPEND_INFO(func, id, name, what, type) \ - do { \ - type data; \ - memset(&data, 0, sizeof(data)); \ - opencl_assert(func(id, what, sizeof(data), &data, NULL)); \ - result += string_printf("%s: %s\n", name, to_string(data).c_str()); \ - } while (false) -# define APPEND_STRING_INFO_IMPL(func, id, name, what, is_optional) \ - do { \ - string value; \ - size_t length = 0; \ - if (func(id, what, 0, NULL, &length) == CL_SUCCESS) { \ - vector<char> buffer(length + 1); \ - if (func(id, what, buffer.size(), buffer.data(), NULL) == CL_SUCCESS) { \ - value = string(buffer.data()); \ - } \ - } \ - if (is_optional && !(length != 0 && value[0] != '\0')) { \ - break; \ - } \ - result += string_printf("%s: %s\n", name, value.c_str()); \ - } while (false) -# define APPEND_PLATFORM_STRING_INFO(id, name, what) \ - APPEND_STRING_INFO_IMPL(clGetPlatformInfo, id, "\tPlatform " name, what, false) -# define APPEND_STRING_EXTENSION_INFO(func, id, name, what) \ - APPEND_STRING_INFO_IMPL(clGetPlatformInfo, id, "\tPlatform " name, what, true) -# define APPEND_PLATFORM_INFO(id, name, what, type) \ - APPEND_INFO(clGetPlatformInfo, id, "\tPlatform " name, what, type) -# define APPEND_DEVICE_INFO(id, name, what, type) \ - APPEND_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what, type) -# define APPEND_DEVICE_STRING_INFO(id, name, what) \ - APPEND_STRING_INFO_IMPL(clGetDeviceInfo, id, "\t\t\tDevice " name, what, false) -# define APPEND_DEVICE_STRING_EXTENSION_INFO(id, name, what) \ - APPEND_STRING_INFO_IMPL(clGetDeviceInfo, id, "\t\t\tDevice " name, what, true) - - vector<cl_device_id> 
device_ids; - for (cl_uint platform = 0; platform < num_platforms; ++platform) { - cl_platform_id platform_id = platform_ids[platform]; - - result += string_printf("Platform #%u\n", platform); - - APPEND_PLATFORM_STRING_INFO(platform_id, "Name", CL_PLATFORM_NAME); - APPEND_PLATFORM_STRING_INFO(platform_id, "Vendor", CL_PLATFORM_VENDOR); - APPEND_PLATFORM_STRING_INFO(platform_id, "Version", CL_PLATFORM_VERSION); - APPEND_PLATFORM_STRING_INFO(platform_id, "Profile", CL_PLATFORM_PROFILE); - APPEND_PLATFORM_STRING_INFO(platform_id, "Extensions", CL_PLATFORM_EXTENSIONS); - - cl_uint num_devices = 0; - opencl_assert( - clGetDeviceIDs(platform_ids[platform], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices)); - result += string_printf("\tNumber of devices: %u\n", num_devices); - - device_ids.resize(num_devices); - opencl_assert(clGetDeviceIDs( - platform_ids[platform], CL_DEVICE_TYPE_ALL, num_devices, &device_ids[0], NULL)); - for (cl_uint device = 0; device < num_devices; ++device) { - cl_device_id device_id = device_ids[device]; - - result += string_printf("\t\tDevice: #%u\n", device); - - APPEND_DEVICE_STRING_INFO(device_id, "Name", CL_DEVICE_NAME); - APPEND_DEVICE_STRING_EXTENSION_INFO(device_id, "Board Name", CL_DEVICE_BOARD_NAME_AMD); - APPEND_DEVICE_STRING_INFO(device_id, "Vendor", CL_DEVICE_VENDOR); - APPEND_DEVICE_STRING_INFO(device_id, "OpenCL C Version", CL_DEVICE_OPENCL_C_VERSION); - APPEND_DEVICE_STRING_INFO(device_id, "Profile", CL_DEVICE_PROFILE); - APPEND_DEVICE_STRING_INFO(device_id, "Version", CL_DEVICE_VERSION); - APPEND_DEVICE_STRING_INFO(device_id, "Extensions", CL_DEVICE_EXTENSIONS); - APPEND_DEVICE_INFO( - device_id, "Max clock frequency (MHz)", CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint); - APPEND_DEVICE_INFO(device_id, "Max compute units", CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint); - APPEND_DEVICE_INFO(device_id, "Max work group size", CL_DEVICE_MAX_WORK_GROUP_SIZE, size_t); - } - } - -# undef APPEND_INFO -# undef APPEND_STRING_INFO_IMPL -# undef 
APPEND_PLATFORM_STRING_INFO -# undef APPEND_STRING_EXTENSION_INFO -# undef APPEND_PLATFORM_INFO -# undef APPEND_DEVICE_INFO -# undef APPEND_DEVICE_STRING_INFO -# undef APPEND_DEVICE_STRING_EXTENSION_INFO - - return result; -} - -CCL_NAMESPACE_END - -#endif /* WITH_OPENCL */ diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp deleted file mode 100644 index 6f9a7943722..00000000000 --- a/intern/cycles/device/device_optix.cpp +++ /dev/null @@ -1,1936 +0,0 @@ -/* - * Copyright 2019, NVIDIA Corporation. - * Copyright 2019, Blender Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef WITH_OPTIX - -# include "bvh/bvh.h" -# include "bvh/bvh_optix.h" -# include "device/cuda/device_cuda.h" -# include "device/device_denoising.h" -# include "device/device_intern.h" -# include "render/buffers.h" -# include "render/hair.h" -# include "render/mesh.h" -# include "render/object.h" -# include "render/scene.h" -# include "util/util_debug.h" -# include "util/util_logging.h" -# include "util/util_md5.h" -# include "util/util_path.h" -# include "util/util_progress.h" -# include "util/util_time.h" - -# ifdef WITH_CUDA_DYNLOAD -# include <cuew.h> -// Do not use CUDA SDK headers when using CUEW -# define OPTIX_DONT_INCLUDE_CUDA -# endif -# include <optix_function_table_definition.h> -# include <optix_stubs.h> - -// TODO(pmours): Disable this once drivers have native support -# define OPTIX_DENOISER_NO_PIXEL_STRIDE 1 - -CCL_NAMESPACE_BEGIN - -/* Make sure this stays in sync with kernel_globals.h */ -struct ShaderParams { - uint4 *input; - float4 *output; - int type; - int filter; - int sx; - int offset; - int sample; -}; -struct KernelParams { - WorkTile tile; - KernelData data; - ShaderParams shader; -# define KERNEL_TEX(type, name) const type *name; -# include "kernel/kernel_textures.h" -# undef KERNEL_TEX -}; - -# define check_result_cuda(stmt) \ - { \ - CUresult res = stmt; \ - if (res != CUDA_SUCCESS) { \ - const char *name; \ - cuGetErrorName(res, &name); \ - set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \ - return; \ - } \ - } \ - (void)0 -# define check_result_cuda_ret(stmt) \ - { \ - CUresult res = stmt; \ - if (res != CUDA_SUCCESS) { \ - const char *name; \ - cuGetErrorName(res, &name); \ - set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \ - return false; \ - } \ - } \ - (void)0 - -# define check_result_optix(stmt) \ - { \ - enum OptixResult res = stmt; \ - if (res != OPTIX_SUCCESS) { \ - const char *name = optixGetErrorName(res); \ - set_error(string_printf("%s in %s 
(device_optix.cpp:%d)", name, #stmt, __LINE__)); \ - return; \ - } \ - } \ - (void)0 -# define check_result_optix_ret(stmt) \ - { \ - enum OptixResult res = stmt; \ - if (res != OPTIX_SUCCESS) { \ - const char *name = optixGetErrorName(res); \ - set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \ - return false; \ - } \ - } \ - (void)0 - -# define launch_filter_kernel(func_name, w, h, args) \ - { \ - CUfunction func; \ - check_result_cuda_ret(cuModuleGetFunction(&func, cuFilterModule, func_name)); \ - check_result_cuda_ret(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); \ - int threads; \ - check_result_cuda_ret( \ - cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ - threads = (int)sqrt((float)threads); \ - int xblocks = ((w) + threads - 1) / threads; \ - int yblocks = ((h) + threads - 1) / threads; \ - check_result_cuda_ret( \ - cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0)); \ - } \ - (void)0 - -class OptiXDevice : public CUDADevice { - - // List of OptiX program groups - enum { - PG_RGEN, - PG_MISS, - PG_HITD, // Default hit group - PG_HITS, // __SHADOW_RECORD_ALL__ hit group - PG_HITL, // __BVH_LOCAL__ hit group (only used for triangles) -# if OPTIX_ABI_VERSION >= 36 - PG_HITD_MOTION, - PG_HITS_MOTION, -# endif - PG_BAKE, // kernel_bake_evaluate - PG_DISP, // kernel_displace_evaluate - PG_BACK, // kernel_background_evaluate - PG_CALL, - NUM_PROGRAM_GROUPS = PG_CALL + 3 - }; - - // List of OptiX pipelines - enum { PIP_PATH_TRACE, PIP_SHADER_EVAL, NUM_PIPELINES }; - - // A single shader binding table entry - struct SbtRecord { - char header[OPTIX_SBT_RECORD_HEADER_SIZE]; - }; - - // Information stored about CUDA memory allocations - struct CUDAMem { - bool free_map_host = false; - CUarray array = NULL; - CUtexObject texobject = 0; - bool use_mapped_host = false; - }; - - // Helper class to manage current CUDA context - struct CUDAContextScope { - 
CUDAContextScope(CUcontext ctx) - { - cuCtxPushCurrent(ctx); - } - ~CUDAContextScope() - { - cuCtxPopCurrent(NULL); - } - }; - - // Use a pool with multiple threads to support launches with multiple CUDA streams - TaskPool task_pool; - - vector<CUstream> cuda_stream; - OptixDeviceContext context = NULL; - - OptixModule optix_module = NULL; // All necessary OptiX kernels are in one module - OptixModule builtin_modules[2] = {}; - OptixPipeline pipelines[NUM_PIPELINES] = {}; - - bool motion_blur = false; - device_vector<SbtRecord> sbt_data; - device_only_memory<KernelParams> launch_params; - OptixTraversableHandle tlas_handle = 0; - - OptixDenoiser denoiser = NULL; - device_only_memory<unsigned char> denoiser_state; - int denoiser_input_passes = 0; - - vector<device_only_memory<char>> delayed_free_bvh_memory; - thread_mutex delayed_free_bvh_mutex; - - public: - OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_) - : CUDADevice(info_, stats_, profiler_, background_), - sbt_data(this, "__sbt", MEM_READ_ONLY), - launch_params(this, "__params", false), - denoiser_state(this, "__denoiser_state", true) - { - // Store number of CUDA streams in device info - info.cpu_threads = DebugFlags().optix.cuda_streams; - - // Make the CUDA context current - if (!cuContext) { - return; // Do not initialize if CUDA context creation failed already - } - const CUDAContextScope scope(cuContext); - - // Create OptiX context for this device - OptixDeviceContextOptions options = {}; -# ifdef WITH_CYCLES_LOGGING - options.logCallbackLevel = 4; // Fatal = 1, Error = 2, Warning = 3, Print = 4 - options.logCallbackFunction = - [](unsigned int level, const char *, const char *message, void *) { - switch (level) { - case 1: - LOG_IF(FATAL, VLOG_IS_ON(1)) << message; - break; - case 2: - LOG_IF(ERROR, VLOG_IS_ON(1)) << message; - break; - case 3: - LOG_IF(WARNING, VLOG_IS_ON(1)) << message; - break; - case 4: - LOG_IF(INFO, VLOG_IS_ON(1)) << message; - break; - } - }; 
-# endif - check_result_optix(optixDeviceContextCreate(cuContext, &options, &context)); -# ifdef WITH_CYCLES_LOGGING - check_result_optix(optixDeviceContextSetLogCallback( - context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel)); -# endif - - // Create launch streams - cuda_stream.resize(info.cpu_threads); - for (int i = 0; i < info.cpu_threads; ++i) - check_result_cuda(cuStreamCreate(&cuda_stream[i], CU_STREAM_NON_BLOCKING)); - - // Fix weird compiler bug that assigns wrong size - launch_params.data_elements = sizeof(KernelParams); - // Allocate launch parameter buffer memory on device - launch_params.alloc_to_device(info.cpu_threads); - } - ~OptiXDevice() - { - // Stop processing any more tasks - task_pool.cancel(); - - // Make CUDA context current - const CUDAContextScope scope(cuContext); - - free_bvh_memory_delayed(); - - sbt_data.free(); - texture_info.free(); - launch_params.free(); - denoiser_state.free(); - - // Unload modules - if (optix_module != NULL) - optixModuleDestroy(optix_module); - for (unsigned int i = 0; i < 2; ++i) - if (builtin_modules[i] != NULL) - optixModuleDestroy(builtin_modules[i]); - for (unsigned int i = 0; i < NUM_PIPELINES; ++i) - if (pipelines[i] != NULL) - optixPipelineDestroy(pipelines[i]); - - // Destroy launch streams - for (CUstream stream : cuda_stream) - cuStreamDestroy(stream); - - if (denoiser != NULL) - optixDenoiserDestroy(denoiser); - - optixDeviceContextDestroy(context); - } - - private: - bool show_samples() const override - { - // Only show samples if not rendering multiple tiles in parallel - return info.cpu_threads == 1; - } - - BVHLayoutMask get_bvh_layout_mask() const override - { - // CUDA kernels are used when doing baking, so need to build a BVH those can understand too! 
- if (optix_module == NULL) - return CUDADevice::get_bvh_layout_mask(); - - // OptiX has its own internal acceleration structure format - return BVH_LAYOUT_OPTIX; - } - - string compile_kernel_get_common_cflags(const DeviceRequestedFeatures &requested_features, - bool filter, - bool /*split*/) override - { - // Split kernel is not supported in OptiX - string common_cflags = CUDADevice::compile_kernel_get_common_cflags( - requested_features, filter, false); - - // Add OptiX SDK include directory to include paths - const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR"); - if (optix_sdk_path) { - common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path); - } - - // Specialization for shader raytracing - if (requested_features.use_shader_raytrace) { - common_cflags += " --keep-device-functions"; - } - else { - common_cflags += " -D __NO_SHADER_RAYTRACE__"; - } - - return common_cflags; - } - - bool load_kernels(const DeviceRequestedFeatures &requested_features) override - { - if (have_error()) { - // Abort early if context creation failed already - return false; - } - - // Load CUDA modules because we need some of the utility kernels - if (!CUDADevice::load_kernels(requested_features)) { - return false; - } - - // Baking is currently performed using CUDA, so no need to load OptiX kernels - if (requested_features.use_baking) { - return true; - } - - const CUDAContextScope scope(cuContext); - - // Unload existing OptiX module and pipelines first - if (optix_module != NULL) { - optixModuleDestroy(optix_module); - optix_module = NULL; - } - for (unsigned int i = 0; i < 2; ++i) { - if (builtin_modules[i] != NULL) { - optixModuleDestroy(builtin_modules[i]); - builtin_modules[i] = NULL; - } - } - for (unsigned int i = 0; i < NUM_PIPELINES; ++i) { - if (pipelines[i] != NULL) { - optixPipelineDestroy(pipelines[i]); - pipelines[i] = NULL; - } - } - - OptixModuleCompileOptions module_options = {}; - module_options.maxRegisterCount = 0; // Do not set an explicit register 
limit - module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3; - module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO; - -# if OPTIX_ABI_VERSION >= 41 - module_options.boundValues = nullptr; - module_options.numBoundValues = 0; -# endif - - OptixPipelineCompileOptions pipeline_options = {}; - // Default to no motion blur and two-level graph, since it is the fastest option - pipeline_options.usesMotionBlur = false; - pipeline_options.traversableGraphFlags = - OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING; - pipeline_options.numPayloadValues = 6; - pipeline_options.numAttributeValues = 2; // u, v - pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE; - pipeline_options.pipelineLaunchParamsVariableName = "__params"; // See kernel_globals.h - -# if OPTIX_ABI_VERSION >= 36 - pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE; - if (requested_features.use_hair) { - if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) { - pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE; - } - else { - pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM; - } - } -# endif - - // Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds - // This is necessary since objects may be reported to have motion if the Vector pass is - // active, but may still need to be rendered without motion blur if that isn't active as well - motion_blur = requested_features.use_object_motion; - - if (motion_blur) { - pipeline_options.usesMotionBlur = true; - // Motion blur can insert motion transforms into the traversal graph - // It is no longer a two-level graph then, so need to set flags to allow any configuration - pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY; - } - - { // Load and compile PTX module with OptiX kernels - string ptx_data, ptx_filename = 
path_get(requested_features.use_shader_raytrace ? - "lib/kernel_optix_shader_raytrace.ptx" : - "lib/kernel_optix.ptx"); - if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) { - if (!getenv("OPTIX_ROOT_DIR")) { - set_error( - "Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to " - "the Optix SDK to be able to compile Optix kernels on demand)."); - return false; - } - ptx_filename = compile_kernel(requested_features, "kernel_optix", "optix", true); - } - if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) { - set_error("Failed to load OptiX kernel from '" + ptx_filename + "'"); - return false; - } - - check_result_optix_ret(optixModuleCreateFromPTX(context, - &module_options, - &pipeline_options, - ptx_data.data(), - ptx_data.size(), - nullptr, - 0, - &optix_module)); - } - - // Create program groups - OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {}; - OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {}; - OptixProgramGroupOptions group_options = {}; // There are no options currently - group_descs[PG_RGEN].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; - group_descs[PG_RGEN].raygen.module = optix_module; - // Ignore branched integrator for now (see "requested_features.use_integrator_branched") - group_descs[PG_RGEN].raygen.entryFunctionName = "__raygen__kernel_optix_path_trace"; - group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS; - group_descs[PG_MISS].miss.module = optix_module; - group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss"; - group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; - group_descs[PG_HITD].hitgroup.moduleCH = optix_module; - group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit"; - group_descs[PG_HITD].hitgroup.moduleAH = optix_module; - group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test"; - group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; - 
group_descs[PG_HITS].hitgroup.moduleAH = optix_module; - group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit"; - - if (requested_features.use_hair) { - group_descs[PG_HITD].hitgroup.moduleIS = optix_module; - group_descs[PG_HITS].hitgroup.moduleIS = optix_module; - - // Add curve intersection programs - if (requested_features.use_hair_thick) { - // Slower programs for thick hair since that also slows down ribbons. - // Ideally this should not be needed. - group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_all"; - group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_all"; - } - else { - group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon"; - group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon"; - } - -# if OPTIX_ABI_VERSION >= 36 - if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) { - OptixBuiltinISOptions builtin_options = {}; - builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE; - builtin_options.usesMotionBlur = false; - - check_result_optix_ret(optixBuiltinISModuleGet( - context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0])); - - group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0]; - group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr; - group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0]; - group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr; - - if (motion_blur) { - builtin_options.usesMotionBlur = true; - - check_result_optix_ret(optixBuiltinISModuleGet( - context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1])); - - group_descs[PG_HITD_MOTION] = group_descs[PG_HITD]; - group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1]; - group_descs[PG_HITS_MOTION] = group_descs[PG_HITS]; - group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1]; - } - } -# endif - } 
- - if (requested_features.use_subsurface || requested_features.use_shader_raytrace) { - // Add hit group for local intersections - group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; - group_descs[PG_HITL].hitgroup.moduleAH = optix_module; - group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit"; - } - - if (requested_features.use_baking) { - group_descs[PG_BAKE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; - group_descs[PG_BAKE].raygen.module = optix_module; - group_descs[PG_BAKE].raygen.entryFunctionName = "__raygen__kernel_optix_bake"; - } - - if (requested_features.use_true_displacement) { - group_descs[PG_DISP].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; - group_descs[PG_DISP].raygen.module = optix_module; - group_descs[PG_DISP].raygen.entryFunctionName = "__raygen__kernel_optix_displace"; - } - - if (requested_features.use_background_light) { - group_descs[PG_BACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; - group_descs[PG_BACK].raygen.module = optix_module; - group_descs[PG_BACK].raygen.entryFunctionName = "__raygen__kernel_optix_background"; - } - - // Shader raytracing replaces some functions with direct callables - if (requested_features.use_shader_raytrace) { - group_descs[PG_CALL + 0].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; - group_descs[PG_CALL + 0].callables.moduleDC = optix_module; - group_descs[PG_CALL + 0].callables.entryFunctionNameDC = "__direct_callable__svm_eval_nodes"; - group_descs[PG_CALL + 1].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; - group_descs[PG_CALL + 1].callables.moduleDC = optix_module; - group_descs[PG_CALL + 1].callables.entryFunctionNameDC = - "__direct_callable__kernel_volume_shadow"; - group_descs[PG_CALL + 2].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; - group_descs[PG_CALL + 2].callables.moduleDC = optix_module; - group_descs[PG_CALL + 2].callables.entryFunctionNameDC = - "__direct_callable__subsurface_scatter_multi_setup"; - } - - check_result_optix_ret(optixProgramGroupCreate( - context, 
group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups)); - - // Get program stack sizes - OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {}; - // Set up SBT, which in this case is used only to select between different programs - sbt_data.alloc(NUM_PROGRAM_GROUPS); - memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS); - for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { - check_result_optix_ret(optixSbtRecordPackHeader(groups[i], &sbt_data[i])); - check_result_optix_ret(optixProgramGroupGetStackSize(groups[i], &stack_size[i])); - } - sbt_data.copy_to_device(); // Upload SBT to device - - // Calculate maximum trace continuation stack size - unsigned int trace_css = stack_size[PG_HITD].cssCH; - // This is based on the maximum of closest-hit and any-hit/intersection programs - trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH); - trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH); - trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH); -# if OPTIX_ABI_VERSION >= 36 - trace_css = std::max(trace_css, - stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH); - trace_css = std::max(trace_css, - stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH); -# endif - - OptixPipelineLinkOptions link_options = {}; - link_options.maxTraceDepth = 1; - link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO; -# if OPTIX_ABI_VERSION < 24 - link_options.overrideUsesMotionBlur = motion_blur; -# endif - - { // Create path tracing pipeline - vector<OptixProgramGroup> pipeline_groups; - pipeline_groups.reserve(NUM_PROGRAM_GROUPS); - pipeline_groups.push_back(groups[PG_RGEN]); - pipeline_groups.push_back(groups[PG_MISS]); - pipeline_groups.push_back(groups[PG_HITD]); - pipeline_groups.push_back(groups[PG_HITS]); - pipeline_groups.push_back(groups[PG_HITL]); -# if OPTIX_ABI_VERSION >= 36 - if (motion_blur) { - 
pipeline_groups.push_back(groups[PG_HITD_MOTION]); - pipeline_groups.push_back(groups[PG_HITS_MOTION]); - } -# endif - if (requested_features.use_shader_raytrace) { - pipeline_groups.push_back(groups[PG_CALL + 0]); - pipeline_groups.push_back(groups[PG_CALL + 1]); - pipeline_groups.push_back(groups[PG_CALL + 2]); - } - - check_result_optix_ret(optixPipelineCreate(context, - &pipeline_options, - &link_options, - pipeline_groups.data(), - pipeline_groups.size(), - nullptr, - 0, - &pipelines[PIP_PATH_TRACE])); - - // Combine ray generation and trace continuation stack size - const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css; - // Max direct callable depth is one of the following, so combine accordingly - // - __raygen__ -> svm_eval_nodes - // - __raygen__ -> kernel_volume_shadow -> svm_eval_nodes - // - __raygen__ -> subsurface_scatter_multi_setup -> svm_eval_nodes - const unsigned int dss = stack_size[PG_CALL + 0].dssDC + - std::max(stack_size[PG_CALL + 1].dssDC, - stack_size[PG_CALL + 2].dssDC); - - // Set stack size depending on pipeline options - check_result_optix_ret( - optixPipelineSetStackSize(pipelines[PIP_PATH_TRACE], - 0, - requested_features.use_shader_raytrace ? dss : 0, - css, - motion_blur ? 
3 : 2)); - } - - // Only need to create shader evaluation pipeline if one of these features is used: - const bool use_shader_eval_pipeline = requested_features.use_baking || - requested_features.use_background_light || - requested_features.use_true_displacement; - - if (use_shader_eval_pipeline) { // Create shader evaluation pipeline - vector<OptixProgramGroup> pipeline_groups; - pipeline_groups.reserve(NUM_PROGRAM_GROUPS); - pipeline_groups.push_back(groups[PG_BAKE]); - pipeline_groups.push_back(groups[PG_DISP]); - pipeline_groups.push_back(groups[PG_BACK]); - pipeline_groups.push_back(groups[PG_MISS]); - pipeline_groups.push_back(groups[PG_HITD]); - pipeline_groups.push_back(groups[PG_HITS]); - pipeline_groups.push_back(groups[PG_HITL]); -# if OPTIX_ABI_VERSION >= 36 - if (motion_blur) { - pipeline_groups.push_back(groups[PG_HITD_MOTION]); - pipeline_groups.push_back(groups[PG_HITS_MOTION]); - } -# endif - if (requested_features.use_shader_raytrace) { - pipeline_groups.push_back(groups[PG_CALL + 0]); - pipeline_groups.push_back(groups[PG_CALL + 1]); - pipeline_groups.push_back(groups[PG_CALL + 2]); - } - - check_result_optix_ret(optixPipelineCreate(context, - &pipeline_options, - &link_options, - pipeline_groups.data(), - pipeline_groups.size(), - nullptr, - 0, - &pipelines[PIP_SHADER_EVAL])); - - // Calculate continuation stack size based on the maximum of all ray generation stack sizes - const unsigned int css = std::max(stack_size[PG_BAKE].cssRG, - std::max(stack_size[PG_DISP].cssRG, - stack_size[PG_BACK].cssRG)) + - link_options.maxTraceDepth * trace_css; - const unsigned int dss = stack_size[PG_CALL + 0].dssDC + - std::max(stack_size[PG_CALL + 1].dssDC, - stack_size[PG_CALL + 2].dssDC); - - check_result_optix_ret( - optixPipelineSetStackSize(pipelines[PIP_SHADER_EVAL], - 0, - requested_features.use_shader_raytrace ? dss : 0, - css, - motion_blur ? 
3 : 2)); - } - - // Clean up program group objects - for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { - optixProgramGroupDestroy(groups[i]); - } - - return true; - } - - void thread_run(DeviceTask &task, int thread_index) // Main task entry point - { - if (have_error()) - return; // Abort early if there was an error previously - - if (task.type == DeviceTask::RENDER) { - if (thread_index != 0) { - // Only execute denoising in a single thread (see also 'task_add') - task.tile_types &= ~RenderTile::DENOISE; - } - - RenderTile tile; - while (task.acquire_tile(this, tile, task.tile_types)) { - if (tile.task == RenderTile::PATH_TRACE) - launch_render(task, tile, thread_index); - else if (tile.task == RenderTile::BAKE) { - // Perform baking using CUDA, since it is not currently implemented in OptiX - device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); - CUDADevice::render(task, tile, work_tiles); - } - else if (tile.task == RenderTile::DENOISE) - launch_denoise(task, tile); - task.release_tile(tile); - if (task.get_cancel() && !task.need_finish_queue) - break; // User requested cancellation - else if (have_error()) - break; // Abort rendering when encountering an error - } - } - else if (task.type == DeviceTask::SHADER) { - // CUDA kernels are used when doing baking - if (optix_module == NULL) - CUDADevice::shader(task); - else - launch_shader_eval(task, thread_index); - } - else if (task.type == DeviceTask::DENOISE_BUFFER) { - // Set up a single tile that covers the whole task and denoise it - RenderTile tile; - tile.x = task.x; - tile.y = task.y; - tile.w = task.w; - tile.h = task.h; - tile.buffer = task.buffer; - tile.num_samples = task.num_samples; - tile.start_sample = task.sample; - tile.offset = task.offset; - tile.stride = task.stride; - tile.buffers = task.buffers; - - launch_denoise(task, tile); - } - } - - void launch_render(DeviceTask &task, RenderTile &rtile, int thread_index) - { - assert(thread_index < launch_params.data_size); - 
- // Keep track of total render time of this tile - const scoped_timer timer(&rtile.buffers->render_time); - - WorkTile wtile; - wtile.x = rtile.x; - wtile.y = rtile.y; - wtile.w = rtile.w; - wtile.h = rtile.h; - wtile.offset = rtile.offset; - wtile.stride = rtile.stride; - wtile.buffer = (float *)rtile.buffer; - - const int end_sample = rtile.start_sample + rtile.num_samples; - // Keep this number reasonable to avoid running into TDRs - int step_samples = (info.display_device ? 8 : 32); - - // Offset into launch params buffer so that streams use separate data - device_ptr launch_params_ptr = launch_params.device_pointer + - thread_index * launch_params.data_elements; - - const CUDAContextScope scope(cuContext); - - for (int sample = rtile.start_sample; sample < end_sample;) { - // Copy work tile information to device - wtile.start_sample = sample; - wtile.num_samples = step_samples; - if (task.adaptive_sampling.use) { - wtile.num_samples = task.adaptive_sampling.align_samples(sample, step_samples); - } - wtile.num_samples = min(wtile.num_samples, end_sample - sample); - device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile); - check_result_cuda( - cuMemcpyHtoDAsync(d_wtile_ptr, &wtile, sizeof(wtile), cuda_stream[thread_index])); - - OptixShaderBindingTable sbt_params = {}; - sbt_params.raygenRecord = sbt_data.device_pointer + PG_RGEN * sizeof(SbtRecord); - sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord); - sbt_params.missRecordStrideInBytes = sizeof(SbtRecord); - sbt_params.missRecordCount = 1; - sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord); - sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord); -# if OPTIX_ABI_VERSION >= 36 - sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL -# else - sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL -# endif - sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * 
sizeof(SbtRecord); - sbt_params.callablesRecordCount = 3; - sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord); - - // Launch the ray generation program - check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE], - cuda_stream[thread_index], - launch_params_ptr, - launch_params.data_elements, - &sbt_params, - // Launch with samples close to each other for better locality - wtile.w * wtile.num_samples, - wtile.h, - 1)); - - // Run the adaptive sampling kernels at selected samples aligned to step samples. - uint filter_sample = wtile.start_sample + wtile.num_samples - 1; - if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { - adaptive_sampling_filter(filter_sample, &wtile, d_wtile_ptr, cuda_stream[thread_index]); - } - - // Wait for launch to finish - check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); - - // Update current sample, so it is displayed correctly - sample += wtile.num_samples; - rtile.sample = sample; - // Update task progress after the kernel completed rendering - task.update_progress(&rtile, wtile.w * wtile.h * wtile.num_samples); - - if (task.get_cancel() && !task.need_finish_queue) - return; // Cancel rendering - } - - // Finalize adaptive sampling - if (task.adaptive_sampling.use) { - device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile); - adaptive_sampling_post(rtile, &wtile, d_wtile_ptr, cuda_stream[thread_index]); - check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); - task.update_progress(&rtile, rtile.w * rtile.h * wtile.num_samples); - } - } - - bool launch_denoise(DeviceTask &task, RenderTile &rtile) - { - // Update current sample (for display and NLM denoising task) - rtile.sample = rtile.start_sample + rtile.num_samples; - - // Make CUDA context current now, since it is used for both denoising tasks - const CUDAContextScope scope(cuContext); - - // Choose between OptiX and NLM denoising - if (task.denoising.type == DENOISER_OPTIX) { - // Map 
neighboring tiles onto this device, indices are as following: - // Where index 4 is the center tile and index 9 is the target for the result. - // 0 1 2 - // 3 4 5 - // 6 7 8 9 - RenderTileNeighbors neighbors(rtile); - task.map_neighbor_tiles(neighbors, this); - RenderTile ¢er_tile = neighbors.tiles[RenderTileNeighbors::CENTER]; - RenderTile &target_tile = neighbors.target; - rtile = center_tile; // Tile may have been modified by mapping code - - // Calculate size of the tile to denoise (including overlap) - int4 rect = center_tile.bounds(); - // Overlap between tiles has to be at least 64 pixels - // TODO(pmours): Query this value from OptiX - rect = rect_expand(rect, 64); - int4 clip_rect = neighbors.bounds(); - rect = rect_clip(rect, clip_rect); - int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y); - int2 overlap_offset = make_int2(rtile.x - rect.x, rtile.y - rect.y); - - // Calculate byte offsets and strides - int pixel_stride = task.pass_stride * (int)sizeof(float); - int pixel_offset = (rtile.offset + rtile.x + rtile.y * rtile.stride) * pixel_stride; - const int pass_offset[3] = { - (task.pass_denoising_data + DENOISING_PASS_COLOR) * (int)sizeof(float), - (task.pass_denoising_data + DENOISING_PASS_ALBEDO) * (int)sizeof(float), - (task.pass_denoising_data + DENOISING_PASS_NORMAL) * (int)sizeof(float)}; - - // Start with the current tile pointer offset - int input_stride = pixel_stride; - device_ptr input_ptr = rtile.buffer + pixel_offset; - - // Copy tile data into a common buffer if necessary - device_only_memory<float> input(this, "denoiser input", true); - device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_ONLY); - - bool contiguous_memory = true; - for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { - if (neighbors.tiles[i].buffer && neighbors.tiles[i].buffer != rtile.buffer) { - contiguous_memory = false; - } - } - - if (contiguous_memory) { - // Tiles are in continous memory, so can just subtract overlap offset - 
input_ptr -= (overlap_offset.x + overlap_offset.y * rtile.stride) * pixel_stride; - // Stride covers the whole width of the image and not just a single tile - input_stride *= rtile.stride; - } - else { - // Adjacent tiles are in separate memory regions, so need to copy them into a single one - input.alloc_to_device(rect_size.x * rect_size.y * task.pass_stride); - // Start with the new input buffer - input_ptr = input.device_pointer; - // Stride covers the width of the new input buffer, which includes tile width and overlap - input_stride *= rect_size.x; - - TileInfo *tile_info = tile_info_mem.alloc(1); - for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { - tile_info->offsets[i] = neighbors.tiles[i].offset; - tile_info->strides[i] = neighbors.tiles[i].stride; - tile_info->buffers[i] = neighbors.tiles[i].buffer; - } - tile_info->x[0] = neighbors.tiles[3].x; - tile_info->x[1] = neighbors.tiles[4].x; - tile_info->x[2] = neighbors.tiles[5].x; - tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w; - tile_info->y[0] = neighbors.tiles[1].y; - tile_info->y[1] = neighbors.tiles[4].y; - tile_info->y[2] = neighbors.tiles[7].y; - tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h; - tile_info_mem.copy_to_device(); - - void *args[] = { - &input.device_pointer, &tile_info_mem.device_pointer, &rect.x, &task.pass_stride}; - launch_filter_kernel("kernel_cuda_filter_copy_input", rect_size.x, rect_size.y, args); - } - -# if OPTIX_DENOISER_NO_PIXEL_STRIDE - device_only_memory<float> input_rgb(this, "denoiser input rgb", true); - input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.input_passes); - - void *input_args[] = {&input_rgb.device_pointer, - &input_ptr, - &rect_size.x, - &rect_size.y, - &input_stride, - &task.pass_stride, - const_cast<int *>(pass_offset), - &task.denoising.input_passes, - &rtile.sample}; - launch_filter_kernel( - "kernel_cuda_filter_convert_to_rgb", rect_size.x, rect_size.y, input_args); - - input_ptr = 
input_rgb.device_pointer; - pixel_stride = 3 * sizeof(float); - input_stride = rect_size.x * pixel_stride; -# endif - - const bool recreate_denoiser = (denoiser == NULL) || - (task.denoising.input_passes != denoiser_input_passes); - if (recreate_denoiser) { - // Destroy existing handle before creating new one - if (denoiser != NULL) { - optixDenoiserDestroy(denoiser); - } - - // Create OptiX denoiser handle on demand when it is first used - OptixDenoiserOptions denoiser_options = {}; - assert(task.denoising.input_passes >= 1 && task.denoising.input_passes <= 3); -# if OPTIX_ABI_VERSION >= 47 - denoiser_options.guideAlbedo = task.denoising.input_passes >= 2; - denoiser_options.guideNormal = task.denoising.input_passes >= 3; - check_result_optix_ret(optixDenoiserCreate( - context, OPTIX_DENOISER_MODEL_KIND_HDR, &denoiser_options, &denoiser)); -# else - denoiser_options.inputKind = static_cast<OptixDenoiserInputKind>( - OPTIX_DENOISER_INPUT_RGB + (task.denoising.input_passes - 1)); -# if OPTIX_ABI_VERSION < 28 - denoiser_options.pixelFormat = OPTIX_PIXEL_FORMAT_FLOAT3; -# endif - check_result_optix_ret(optixDenoiserCreate(context, &denoiser_options, &denoiser)); - check_result_optix_ret( - optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, NULL, 0)); -# endif - - // OptiX denoiser handle was created with the requested number of input passes - denoiser_input_passes = task.denoising.input_passes; - } - - OptixDenoiserSizes sizes = {}; - check_result_optix_ret( - optixDenoiserComputeMemoryResources(denoiser, rect_size.x, rect_size.y, &sizes)); - -# if OPTIX_ABI_VERSION < 28 - const size_t scratch_size = sizes.recommendedScratchSizeInBytes; -# else - const size_t scratch_size = sizes.withOverlapScratchSizeInBytes; -# endif - const size_t scratch_offset = sizes.stateSizeInBytes; - - // Allocate denoiser state if tile size has changed since last setup - if (recreate_denoiser || (denoiser_state.data_width != rect_size.x || - denoiser_state.data_height != 
rect_size.y)) { - denoiser_state.alloc_to_device(scratch_offset + scratch_size); - - // Initialize denoiser state for the current tile size - check_result_optix_ret(optixDenoiserSetup(denoiser, - 0, - rect_size.x, - rect_size.y, - denoiser_state.device_pointer, - scratch_offset, - denoiser_state.device_pointer + scratch_offset, - scratch_size)); - - denoiser_state.data_width = rect_size.x; - denoiser_state.data_height = rect_size.y; - } - - // Set up input and output layer information - OptixImage2D input_layers[3] = {}; - OptixImage2D output_layers[1] = {}; - - for (int i = 0; i < 3; ++i) { -# if OPTIX_DENOISER_NO_PIXEL_STRIDE - input_layers[i].data = input_ptr + (rect_size.x * rect_size.y * pixel_stride * i); -# else - input_layers[i].data = input_ptr + pass_offset[i]; -# endif - input_layers[i].width = rect_size.x; - input_layers[i].height = rect_size.y; - input_layers[i].rowStrideInBytes = input_stride; - input_layers[i].pixelStrideInBytes = pixel_stride; - input_layers[i].format = OPTIX_PIXEL_FORMAT_FLOAT3; - } - -# if OPTIX_DENOISER_NO_PIXEL_STRIDE - output_layers[0].data = input_ptr; - output_layers[0].width = rect_size.x; - output_layers[0].height = rect_size.y; - output_layers[0].rowStrideInBytes = input_stride; - output_layers[0].pixelStrideInBytes = pixel_stride; - int2 output_offset = overlap_offset; - overlap_offset = make_int2(0, 0); // Not supported by denoiser API, so apply manually -# else - output_layers[0].data = target_tile.buffer + pixel_offset; - output_layers[0].width = target_tile.w; - output_layers[0].height = target_tile.h; - output_layers[0].rowStrideInBytes = target_tile.stride * pixel_stride; - output_layers[0].pixelStrideInBytes = pixel_stride; -# endif - output_layers[0].format = OPTIX_PIXEL_FORMAT_FLOAT3; - -# if OPTIX_ABI_VERSION >= 47 - OptixDenoiserLayer image_layers = {}; - image_layers.input = input_layers[0]; - image_layers.output = output_layers[0]; - - OptixDenoiserGuideLayer guide_layers = {}; - guide_layers.albedo = 
input_layers[1]; - guide_layers.normal = input_layers[2]; -# endif - - // Finally run denonising - OptixDenoiserParams params = {}; // All parameters are disabled/zero -# if OPTIX_ABI_VERSION >= 47 - check_result_optix_ret(optixDenoiserInvoke(denoiser, - NULL, - ¶ms, - denoiser_state.device_pointer, - scratch_offset, - &guide_layers, - &image_layers, - 1, - overlap_offset.x, - overlap_offset.y, - denoiser_state.device_pointer + scratch_offset, - scratch_size)); -# else - check_result_optix_ret(optixDenoiserInvoke(denoiser, - NULL, - ¶ms, - denoiser_state.device_pointer, - scratch_offset, - input_layers, - task.denoising.input_passes, - overlap_offset.x, - overlap_offset.y, - output_layers, - denoiser_state.device_pointer + scratch_offset, - scratch_size)); -# endif - -# if OPTIX_DENOISER_NO_PIXEL_STRIDE - void *output_args[] = {&input_ptr, - &target_tile.buffer, - &output_offset.x, - &output_offset.y, - &rect_size.x, - &rect_size.y, - &target_tile.x, - &target_tile.y, - &target_tile.w, - &target_tile.h, - &target_tile.offset, - &target_tile.stride, - &task.pass_stride, - &rtile.sample}; - launch_filter_kernel( - "kernel_cuda_filter_convert_from_rgb", target_tile.w, target_tile.h, output_args); -# endif - - check_result_cuda_ret(cuStreamSynchronize(0)); - - task.unmap_neighbor_tiles(neighbors, this); - } - else { - // Run CUDA denoising kernels - DenoisingTask denoising(this, task); - CUDADevice::denoise(rtile, denoising); - } - - // Update task progress after the denoiser completed processing - task.update_progress(&rtile, rtile.w * rtile.h); - - return true; - } - - void launch_shader_eval(DeviceTask &task, int thread_index) - { - unsigned int rgen_index = PG_BACK; - if (task.shader_eval_type >= SHADER_EVAL_BAKE) - rgen_index = PG_BAKE; - if (task.shader_eval_type == SHADER_EVAL_DISPLACE) - rgen_index = PG_DISP; - - const CUDAContextScope scope(cuContext); - - device_ptr launch_params_ptr = launch_params.device_pointer + - thread_index * 
launch_params.data_elements; - - for (int sample = 0; sample < task.num_samples; ++sample) { - ShaderParams params; - params.input = (uint4 *)task.shader_input; - params.output = (float4 *)task.shader_output; - params.type = task.shader_eval_type; - params.filter = task.shader_filter; - params.sx = task.shader_x; - params.offset = task.offset; - params.sample = sample; - - check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, shader), - ¶ms, - sizeof(params), - cuda_stream[thread_index])); - - OptixShaderBindingTable sbt_params = {}; - sbt_params.raygenRecord = sbt_data.device_pointer + rgen_index * sizeof(SbtRecord); - sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord); - sbt_params.missRecordStrideInBytes = sizeof(SbtRecord); - sbt_params.missRecordCount = 1; - sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord); - sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord); -# if OPTIX_ABI_VERSION >= 36 - sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL -# else - sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL -# endif - sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord); - sbt_params.callablesRecordCount = 3; - sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord); - - check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL], - cuda_stream[thread_index], - launch_params_ptr, - launch_params.data_elements, - &sbt_params, - task.shader_w, - 1, - 1)); - - check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); - - task.update_progress(NULL); - } - } - - bool build_optix_bvh(BVHOptiX *bvh, - OptixBuildOperation operation, - const OptixBuildInput &build_input, - uint16_t num_motion_steps) - { - /* Allocate and build acceleration structures only one at a time, to prevent parallel builds - * from running out of memory (since both original and compacted acceleration structure 
memory - * may be allocated at the same time for the duration of this function). The builds would - * otherwise happen on the same CUDA stream anyway. */ - static thread_mutex mutex; - thread_scoped_lock lock(mutex); - - const CUDAContextScope scope(cuContext); - - const bool use_fast_trace_bvh = (bvh->params.bvh_type == SceneParams::BVH_STATIC); - - // Compute memory usage - OptixAccelBufferSizes sizes = {}; - OptixAccelBuildOptions options = {}; - options.operation = operation; - if (use_fast_trace_bvh) { - VLOG(2) << "Using fast to trace OptiX BVH"; - options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION; - } - else { - VLOG(2) << "Using fast to update OptiX BVH"; - options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE; - } - - options.motionOptions.numKeys = num_motion_steps; - options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH; - options.motionOptions.timeBegin = 0.0f; - options.motionOptions.timeEnd = 1.0f; - - check_result_optix_ret( - optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes)); - - // Allocate required output buffers - device_only_memory<char> temp_mem(this, "optix temp as build mem", true); - temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8); - if (!temp_mem.device_pointer) - return false; // Make sure temporary memory allocation succeeded - - // Acceleration structure memory has to be allocated on the device (not allowed to be on host) - device_only_memory<char> &out_data = bvh->as_data; - if (operation == OPTIX_BUILD_OPERATION_BUILD) { - assert(out_data.device == this); - out_data.alloc_to_device(sizes.outputSizeInBytes); - if (!out_data.device_pointer) - return false; - } - else { - assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes); - } - - // Finally build the acceleration structure - OptixAccelEmitDesc compacted_size_prop = {}; - compacted_size_prop.type = 
OPTIX_PROPERTY_TYPE_COMPACTED_SIZE; - // A tiny space was allocated for this property at the end of the temporary buffer above - // Make sure this pointer is 8-byte aligned - compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8); - - OptixTraversableHandle out_handle = 0; - check_result_optix_ret(optixAccelBuild(context, - NULL, - &options, - &build_input, - 1, - temp_mem.device_pointer, - sizes.tempSizeInBytes, - out_data.device_pointer, - sizes.outputSizeInBytes, - &out_handle, - use_fast_trace_bvh ? &compacted_size_prop : NULL, - use_fast_trace_bvh ? 1 : 0)); - bvh->traversable_handle = static_cast<uint64_t>(out_handle); - - // Wait for all operations to finish - check_result_cuda_ret(cuStreamSynchronize(NULL)); - - // Compact acceleration structure to save memory (only if using fast trace as the - // OPTIX_BUILD_FLAG_ALLOW_COMPACTION flag is only set in this case). - if (use_fast_trace_bvh) { - uint64_t compacted_size = sizes.outputSizeInBytes; - check_result_cuda_ret( - cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size))); - - // Temporary memory is no longer needed, so free it now to make space - temp_mem.free(); - - // There is no point compacting if the size does not change - if (compacted_size < sizes.outputSizeInBytes) { - device_only_memory<char> compacted_data(this, "optix compacted as", false); - compacted_data.alloc_to_device(compacted_size); - if (!compacted_data.device_pointer) - // Do not compact if memory allocation for compacted acceleration structure fails - // Can just use the uncompacted one then, so succeed here regardless - return true; - - check_result_optix_ret(optixAccelCompact(context, - NULL, - out_handle, - compacted_data.device_pointer, - compacted_size, - &out_handle)); - bvh->traversable_handle = static_cast<uint64_t>(out_handle); - - // Wait for compaction to finish - check_result_cuda_ret(cuStreamSynchronize(NULL)); - - std::swap(out_data.device_size, 
compacted_data.device_size); - std::swap(out_data.device_pointer, compacted_data.device_pointer); - // Original acceleration structure memory is freed when 'compacted_data' goes out of scope - } - } - - return true; - } - - void build_bvh(BVH *bvh, Progress &progress, bool refit) override - { - if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2) { - /* For baking CUDA is used, build appropriate BVH for that. */ - Device::build_bvh(bvh, progress, refit); - return; - } - - const bool use_fast_trace_bvh = (bvh->params.bvh_type == SceneParams::BVH_STATIC); - - free_bvh_memory_delayed(); - - BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh); - - progress.set_substatus("Building OptiX acceleration structure"); - - if (!bvh->params.top_level) { - assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1); - - OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD; - /* Refit is only possible when using fast to trace BVH (because AS is built with - * OPTIX_BUILD_FLAG_ALLOW_UPDATE only there, see above). 
*/ - if (refit && !use_fast_trace_bvh) { - assert(bvh_optix->traversable_handle != 0); - operation = OPTIX_BUILD_OPERATION_UPDATE; - } - else { - bvh_optix->as_data.free(); - bvh_optix->traversable_handle = 0; - } - - // Build bottom level acceleration structures (BLAS) - Geometry *const geom = bvh->geometry[0]; - if (geom->geometry_type == Geometry::HAIR) { - // Build BLAS for curve primitives - Hair *const hair = static_cast<Hair *const>(geom); - if (hair->num_curves() == 0) { - return; - } - - const size_t num_segments = hair->num_segments(); - - size_t num_motion_steps = 1; - Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - if (motion_blur && hair->get_use_motion_blur() && motion_keys) { - num_motion_steps = hair->get_motion_steps(); - } - - device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY); -# if OPTIX_ABI_VERSION >= 36 - device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY); - device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY); - // Four control points for each curve segment - const size_t num_vertices = num_segments * 4; - if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) { - index_data.alloc(num_segments); - vertex_data.alloc(num_vertices * num_motion_steps); - } - else -# endif - aabb_data.alloc(num_segments * num_motion_steps); - - // Get AABBs for each motion step - for (size_t step = 0; step < num_motion_steps; ++step) { - // The center step for motion vertices is not stored in the attribute - const float3 *keys = hair->get_curve_keys().data(); - size_t center_step = (num_motion_steps - 1) / 2; - if (step != center_step) { - size_t attr_offset = (step > center_step) ? 
step - 1 : step; - // Technically this is a float4 array, but sizeof(float3) == sizeof(float4) - keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size(); - } - - for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) { - const Hair::Curve curve = hair->get_curve(j); -# if OPTIX_ABI_VERSION >= 36 - const array<float> &curve_radius = hair->get_curve_radius(); -# endif - - for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) { -# if OPTIX_ABI_VERSION >= 36 - if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) { - int k0 = curve.first_key + segment; - int k1 = k0 + 1; - int ka = max(k0 - 1, curve.first_key); - int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1); - - const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x); - const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y); - const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z); - const float4 pw = make_float4( - curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]); - - // Convert Catmull-Rom data to Bezier spline - static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f; - static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f; - static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f; - static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f; - - index_data[i] = i * 4; - float4 *const v = vertex_data.data() + step * num_vertices + index_data[i]; - v[0] = make_float4( - dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw)); - v[1] = make_float4( - dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw)); - v[2] = make_float4( - dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw)); - v[3] = make_float4( - dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw)); - } - else -# endif - { - BoundBox bounds = BoundBox::empty; - curve.bounds_grow(segment, keys, 
hair->get_curve_radius().data(), bounds); - - const size_t index = step * num_segments + i; - aabb_data[index].minX = bounds.min.x; - aabb_data[index].minY = bounds.min.y; - aabb_data[index].minZ = bounds.min.z; - aabb_data[index].maxX = bounds.max.x; - aabb_data[index].maxY = bounds.max.y; - aabb_data[index].maxZ = bounds.max.z; - } - } - } - } - - // Upload AABB data to GPU - aabb_data.copy_to_device(); -# if OPTIX_ABI_VERSION >= 36 - index_data.copy_to_device(); - vertex_data.copy_to_device(); -# endif - - vector<device_ptr> aabb_ptrs; - aabb_ptrs.reserve(num_motion_steps); -# if OPTIX_ABI_VERSION >= 36 - vector<device_ptr> width_ptrs; - vector<device_ptr> vertex_ptrs; - width_ptrs.reserve(num_motion_steps); - vertex_ptrs.reserve(num_motion_steps); -# endif - for (size_t step = 0; step < num_motion_steps; ++step) { - aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb)); -# if OPTIX_ABI_VERSION >= 36 - const device_ptr base_ptr = vertex_data.device_pointer + - step * num_vertices * sizeof(float4); - width_ptrs.push_back(base_ptr + 3 * sizeof(float)); // Offset by vertex size - vertex_ptrs.push_back(base_ptr); -# endif - } - - // Force a single any-hit call, so shadow record-all behavior works correctly - unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL; - OptixBuildInput build_input = {}; -# if OPTIX_ABI_VERSION >= 36 - if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) { - build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES; - build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE; - build_input.curveArray.numPrimitives = num_segments; - build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data(); - build_input.curveArray.numVertices = num_vertices; - build_input.curveArray.vertexStrideInBytes = sizeof(float4); - build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data(); - build_input.curveArray.widthStrideInBytes = sizeof(float4); - 
build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer; - build_input.curveArray.indexStrideInBytes = sizeof(int); - build_input.curveArray.flag = build_flags; - build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset; - } - else -# endif - { - // Disable visibility test any-hit program, since it is already checked during - // intersection. Those trace calls that require anyhit can force it with a ray flag. - build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT; - - build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES; -# if OPTIX_ABI_VERSION < 23 - build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data(); - build_input.aabbArray.numPrimitives = num_segments; - build_input.aabbArray.strideInBytes = sizeof(OptixAabb); - build_input.aabbArray.flags = &build_flags; - build_input.aabbArray.numSbtRecords = 1; - build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset; -# else - build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data(); - build_input.customPrimitiveArray.numPrimitives = num_segments; - build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb); - build_input.customPrimitiveArray.flags = &build_flags; - build_input.customPrimitiveArray.numSbtRecords = 1; - build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset; -# endif - } - - if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) { - progress.set_error("Failed to build OptiX acceleration structure"); - } - } - else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) { - // Build BLAS for triangle primitives - Mesh *const mesh = static_cast<Mesh *const>(geom); - if (mesh->num_triangles() == 0) { - return; - } - - const size_t num_verts = mesh->get_verts().size(); - - size_t num_motion_steps = 1; - Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); - if (motion_blur && mesh->get_use_motion_blur() && 
motion_keys) { - num_motion_steps = mesh->get_motion_steps(); - } - - device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY); - index_data.alloc(mesh->get_triangles().size()); - memcpy(index_data.data(), - mesh->get_triangles().data(), - mesh->get_triangles().size() * sizeof(int)); - device_vector<float3> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY); - vertex_data.alloc(num_verts * num_motion_steps); - - for (size_t step = 0; step < num_motion_steps; ++step) { - const float3 *verts = mesh->get_verts().data(); - - size_t center_step = (num_motion_steps - 1) / 2; - // The center step for motion vertices is not stored in the attribute - if (step != center_step) { - verts = motion_keys->data_float3() + - (step > center_step ? step - 1 : step) * num_verts; - } - - memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3)); - } - - // Upload triangle data to GPU - index_data.copy_to_device(); - vertex_data.copy_to_device(); - - vector<device_ptr> vertex_ptrs; - vertex_ptrs.reserve(num_motion_steps); - for (size_t step = 0; step < num_motion_steps; ++step) { - vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3)); - } - - // Force a single any-hit call, so shadow record-all behavior works correctly - unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL; - OptixBuildInput build_input = {}; - build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES; - build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data(); - build_input.triangleArray.numVertices = num_verts; - build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3; - build_input.triangleArray.vertexStrideInBytes = sizeof(float3); - build_input.triangleArray.indexBuffer = index_data.device_pointer; - build_input.triangleArray.numIndexTriplets = mesh->num_triangles(); - build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3; - 
build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int); - build_input.triangleArray.flags = &build_flags; - // The SBT does not store per primitive data since Cycles already allocates separate - // buffers for that purpose. OptiX does not allow this to be zero though, so just pass in - // one and rely on that having the same meaning in this case. - build_input.triangleArray.numSbtRecords = 1; - build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset; - - if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) { - progress.set_error("Failed to build OptiX acceleration structure"); - } - } - } - else { - unsigned int num_instances = 0; - unsigned int max_num_instances = 0xFFFFFFFF; - - bvh_optix->as_data.free(); - bvh_optix->traversable_handle = 0; - bvh_optix->motion_transform_data.free(); - - optixDeviceContextGetProperty(context, - OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID, - &max_num_instances, - sizeof(max_num_instances)); - // Do not count first bit, which is used to distinguish instanced and non-instanced objects - max_num_instances >>= 1; - if (bvh->objects.size() > max_num_instances) { - progress.set_error( - "Failed to build OptiX acceleration structure because there are too many instances"); - return; - } - - // Fill instance descriptions -# if OPTIX_ABI_VERSION < 41 - device_vector<OptixAabb> aabbs(this, "optix tlas aabbs", MEM_READ_ONLY); - aabbs.alloc(bvh->objects.size()); -# endif - device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY); - instances.alloc(bvh->objects.size()); - - // Calculate total motion transform size and allocate memory for them - size_t motion_transform_offset = 0; - if (motion_blur) { - size_t total_motion_transform_size = 0; - for (Object *const ob : bvh->objects) { - if (ob->is_traceable() && ob->use_motion()) { - total_motion_transform_size = align_up(total_motion_transform_size, - OPTIX_TRANSFORM_BYTE_ALIGNMENT); - const size_t motion_keys = 
max(ob->get_motion().size(), 2) - 2; - total_motion_transform_size = total_motion_transform_size + - sizeof(OptixSRTMotionTransform) + - motion_keys * sizeof(OptixSRTData); - } - } - - assert(bvh_optix->motion_transform_data.device == this); - bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size); - } - - for (Object *ob : bvh->objects) { - // Skip non-traceable objects - if (!ob->is_traceable()) - continue; - - BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh); - OptixTraversableHandle handle = blas->traversable_handle; - -# if OPTIX_ABI_VERSION < 41 - OptixAabb &aabb = aabbs[num_instances]; - aabb.minX = ob->bounds.min.x; - aabb.minY = ob->bounds.min.y; - aabb.minZ = ob->bounds.min.z; - aabb.maxX = ob->bounds.max.x; - aabb.maxY = ob->bounds.max.y; - aabb.maxZ = ob->bounds.max.z; -# endif - - OptixInstance &instance = instances[num_instances++]; - memset(&instance, 0, sizeof(instance)); - - // Clear transform to identity matrix - instance.transform[0] = 1.0f; - instance.transform[5] = 1.0f; - instance.transform[10] = 1.0f; - - // Set user instance ID to object index (but leave low bit blank) - instance.instanceId = ob->get_device_index() << 1; - - // Have to have at least one bit in the mask, or else instance would always be culled - instance.visibilityMask = 1; - - if (ob->get_geometry()->has_volume) { - // Volumes have a special bit set in the visibility mask so a trace can mask only volumes - instance.visibilityMask |= 2; - } - - if (ob->get_geometry()->geometry_type == Geometry::HAIR) { - // Same applies to curves (so they can be skipped in local trace calls) - instance.visibilityMask |= 4; - -# if OPTIX_ABI_VERSION >= 36 - if (motion_blur && ob->get_geometry()->has_motion_blur() && - DebugFlags().optix.curves_api && - static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) { - // Select between motion blur and non-motion blur built-in intersection module - instance.sbtOffset = PG_HITD_MOTION - 
PG_HITD; - } -# endif - } - - // Insert motion traversable if object has motion - if (motion_blur && ob->use_motion()) { - size_t motion_keys = max(ob->get_motion().size(), 2) - 2; - size_t motion_transform_size = sizeof(OptixSRTMotionTransform) + - motion_keys * sizeof(OptixSRTData); - - const CUDAContextScope scope(cuContext); - - motion_transform_offset = align_up(motion_transform_offset, - OPTIX_TRANSFORM_BYTE_ALIGNMENT); - CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer + - motion_transform_offset; - motion_transform_offset += motion_transform_size; - - // Allocate host side memory for motion transform and fill it with transform data - OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>( - new uint8_t[motion_transform_size]); - motion_transform.child = handle; - motion_transform.motionOptions.numKeys = ob->get_motion().size(); - motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE; - motion_transform.motionOptions.timeBegin = 0.0f; - motion_transform.motionOptions.timeEnd = 1.0f; - - OptixSRTData *const srt_data = motion_transform.srtData; - array<DecomposedTransform> decomp(ob->get_motion().size()); - transform_motion_decompose( - decomp.data(), ob->get_motion().data(), ob->get_motion().size()); - - for (size_t i = 0; i < ob->get_motion().size(); ++i) { - // Scale - srt_data[i].sx = decomp[i].y.w; // scale.x.x - srt_data[i].sy = decomp[i].z.w; // scale.y.y - srt_data[i].sz = decomp[i].w.w; // scale.z.z - - // Shear - srt_data[i].a = decomp[i].z.x; // scale.x.y - srt_data[i].b = decomp[i].z.y; // scale.x.z - srt_data[i].c = decomp[i].w.x; // scale.y.z - assert(decomp[i].z.z == 0.0f); // scale.y.x - assert(decomp[i].w.y == 0.0f); // scale.z.x - assert(decomp[i].w.z == 0.0f); // scale.z.y - - // Pivot point - srt_data[i].pvx = 0.0f; - srt_data[i].pvy = 0.0f; - srt_data[i].pvz = 0.0f; - - // Rotation - srt_data[i].qx = decomp[i].x.x; - srt_data[i].qy = decomp[i].x.y; - srt_data[i].qz 
= decomp[i].x.z; - srt_data[i].qw = decomp[i].x.w; - - // Translation - srt_data[i].tx = decomp[i].y.x; - srt_data[i].ty = decomp[i].y.y; - srt_data[i].tz = decomp[i].y.z; - } - - // Upload motion transform to GPU - cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size); - delete[] reinterpret_cast<uint8_t *>(&motion_transform); - - // Disable instance transform if object uses motion transform already - instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; - - // Get traversable handle to motion transform - optixConvertPointerToTraversableHandle(context, - motion_transform_gpu, - OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM, - &instance.traversableHandle); - } - else { - instance.traversableHandle = handle; - - if (ob->get_geometry()->is_instanced()) { - // Set transform matrix - memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform)); - } - else { - // Disable instance transform if geometry already has it applied to vertex data - instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; - // Non-instanced objects read ID from 'prim_object', so distinguish - // them from instanced objects with the low bit set - instance.instanceId |= 1; - } - } - } - - // Upload instance descriptions -# if OPTIX_ABI_VERSION < 41 - aabbs.resize(num_instances); - aabbs.copy_to_device(); -# endif - instances.resize(num_instances); - instances.copy_to_device(); - - // Build top-level acceleration structure (TLAS) - OptixBuildInput build_input = {}; - build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES; -# if OPTIX_ABI_VERSION < 41 // Instance AABBs no longer need to be set since OptiX 7.2 - build_input.instanceArray.aabbs = aabbs.device_pointer; - build_input.instanceArray.numAabbs = num_instances; -# endif - build_input.instanceArray.instances = instances.device_pointer; - build_input.instanceArray.numInstances = num_instances; - - if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) { - progress.set_error("Failed to build 
OptiX acceleration structure"); - } - tlas_handle = bvh_optix->traversable_handle; - } - } - - void release_optix_bvh(BVH *bvh) override - { - thread_scoped_lock lock(delayed_free_bvh_mutex); - /* Do delayed free of BVH memory, since geometry holding BVH might be deleted - * while GPU is still rendering. */ - BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh); - - delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->as_data)); - delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->motion_transform_data)); - bvh_optix->traversable_handle = 0; - } - - void free_bvh_memory_delayed() - { - thread_scoped_lock lock(delayed_free_bvh_mutex); - delayed_free_bvh_memory.free_memory(); - } - - void const_copy_to(const char *name, void *host, size_t size) override - { - // Set constant memory for CUDA module - // TODO(pmours): This is only used for tonemapping (see 'film_convert'). - // Could be removed by moving those functions to filter CUDA module. - CUDADevice::const_copy_to(name, host, size); - - if (strcmp(name, "__data") == 0) { - assert(size <= sizeof(KernelData)); - - // Update traversable handle (since it is different for each device on multi devices) - KernelData *const data = (KernelData *)host; - *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle; - - update_launch_params(offsetof(KernelParams, data), host, size); - return; - } - - // Update data storage pointers in launch parameters -# define KERNEL_TEX(data_type, tex_name) \ - if (strcmp(name, #tex_name) == 0) { \ - update_launch_params(offsetof(KernelParams, tex_name), host, size); \ - return; \ - } -# include "kernel/kernel_textures.h" -# undef KERNEL_TEX - } - - void update_launch_params(size_t offset, void *data, size_t data_size) - { - const CUDAContextScope scope(cuContext); - - for (int i = 0; i < info.cpu_threads; ++i) - check_result_cuda( - cuMemcpyHtoD(launch_params.device_pointer + i * launch_params.data_elements + offset, - data, - data_size)); - } - - void task_add(DeviceTask 
&task) override - { - // Upload texture information to device if it has changed since last launch - load_texture_info(); - - if (task.type == DeviceTask::FILM_CONVERT) { - // Execute in main thread because of OpenGL access - film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); - return; - } - - if (task.type == DeviceTask::DENOISE_BUFFER) { - // Execute denoising in a single thread (e.g. to avoid race conditions during creation) - task_pool.push([=] { - DeviceTask task_copy = task; - thread_run(task_copy, 0); - }); - return; - } - - // Split task into smaller ones - list<DeviceTask> tasks; - task.split(tasks, info.cpu_threads); - - // Queue tasks in internal task pool - int task_index = 0; - for (DeviceTask &task : tasks) { - task_pool.push([=] { - // Using task index parameter instead of thread index, since number of CUDA streams may - // differ from number of threads - DeviceTask task_copy = task; - thread_run(task_copy, task_index); - }); - task_index++; - } - } - - void task_wait() override - { - // Wait for all queued tasks to finish - task_pool.wait_work(); - } - - void task_cancel() override - { - // Cancel any remaining tasks in the internal pool - task_pool.cancel(); - } -}; - -bool device_optix_init() -{ - if (g_optixFunctionTable.optixDeviceContextCreate != NULL) - return true; // Already initialized function table - - // Need to initialize CUDA as well - if (!device_cuda_init()) - return false; - - const OptixResult result = optixInit(); - - if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) { - VLOG(1) << "OptiX initialization failed because the installed NVIDIA driver is too old. " - "Please update to the latest driver first!"; - return false; - } - else if (result != OPTIX_SUCCESS) { - VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result; - return false; - } - - // Loaded OptiX successfully! 
- return true; -} - -void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices) -{ - devices.reserve(cuda_devices.size()); - - // Simply add all supported CUDA devices as OptiX devices again - for (DeviceInfo info : cuda_devices) { - assert(info.type == DEVICE_CUDA); - - int major; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num); - if (major < 5) { - continue; // Only Maxwell and up are supported by OptiX - } - - info.type = DEVICE_OPTIX; - info.id += "_OptiX"; - info.denoisers |= DENOISER_OPTIX; - info.has_branched_path = false; - - devices.push_back(info); - } -} - -Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) -{ - return new OptiXDevice(info, stats, profiler, background); -} - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/device_queue.cpp b/intern/cycles/device/device_queue.cpp new file mode 100644 index 00000000000..a89ba68d62c --- /dev/null +++ b/intern/cycles/device/device_queue.cpp @@ -0,0 +1,87 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "device/device_queue.h" + +#include "util/util_algorithm.h" +#include "util/util_logging.h" +#include "util/util_time.h" + +#include <iomanip> + +CCL_NAMESPACE_BEGIN + +DeviceQueue::DeviceQueue(Device *device) + : device(device), last_kernels_enqueued_(0), last_sync_time_(0.0) +{ + DCHECK_NE(device, nullptr); +} + +DeviceQueue::~DeviceQueue() +{ + if (VLOG_IS_ON(3)) { + /* Print kernel execution times sorted by time. */ + vector<pair<DeviceKernelMask, double>> stats_sorted; + for (const auto &stat : stats_kernel_time_) { + stats_sorted.push_back(stat); + } + + sort(stats_sorted.begin(), + stats_sorted.end(), + [](const pair<DeviceKernelMask, double> &a, const pair<DeviceKernelMask, double> &b) { + return a.second > b.second; + }); + + VLOG(3) << "GPU queue stats:"; + for (const auto &[mask, time] : stats_sorted) { + VLOG(3) << " " << std::setfill(' ') << std::setw(10) << std::fixed << std::setprecision(5) + << std::right << time << "s: " << device_kernel_mask_as_string(mask); + } + } +} + +void DeviceQueue::debug_init_execution() +{ + if (VLOG_IS_ON(3)) { + last_sync_time_ = time_dt(); + last_kernels_enqueued_ = 0; + } +} + +void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size) +{ + if (VLOG_IS_ON(3)) { + VLOG(4) << "GPU queue launch " << device_kernel_as_string(kernel) << ", work_size " + << work_size; + last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel); + } +} + +void DeviceQueue::debug_synchronize() +{ + if (VLOG_IS_ON(3)) { + const double new_time = time_dt(); + const double elapsed_time = new_time - last_sync_time_; + VLOG(4) << "GPU queue synchronize, elapsed " << std::setw(10) << elapsed_time << "s"; + + stats_kernel_time_[last_kernels_enqueued_] += elapsed_time; + + last_sync_time_ = new_time; + last_kernels_enqueued_ = 0; + } +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_queue.h b/intern/cycles/device/device_queue.h new file mode 100644 index 00000000000..edda3e61d51 --- /dev/null +++ 
b/intern/cycles/device/device_queue.h @@ -0,0 +1,113 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "device/device_kernel.h" + +#include "device/device_graphics_interop.h" +#include "util/util_logging.h" +#include "util/util_map.h" +#include "util/util_unique_ptr.h" + +CCL_NAMESPACE_BEGIN + +class Device; +class device_memory; + +struct KernelWorkTile; + +/* Abstraction of a command queue for a device. + * Provides API to schedule kernel execution in a specific queue with minimal possible overhead + * from driver side. + * + * This class encapsulates all properties needed for commands execution. */ +class DeviceQueue { + public: + virtual ~DeviceQueue(); + + /* Number of concurrent states to process for integrator, + * based on number of cores and/or available memory. */ + virtual int num_concurrent_states(const size_t state_size) const = 0; + + /* Number of states which keeps the device occupied with work without loosing performance. + * The renderer will add more work (when available) when number of active paths falls below this + * value. */ + virtual int num_concurrent_busy_states() const = 0; + + /* Initialize execution of kernels on this queue. + * + * Will, for example, load all data required by the kernels from Device to global or path state. + * + * Use this method after device synchronization has finished before enqueueing any kernels. 
*/ + virtual void init_execution() = 0; + + /* Test if an optional device kernel is available. */ + virtual bool kernel_available(DeviceKernel kernel) const = 0; + + /* Enqueue kernel execution. + * + * Execute the kernel work_size times on the device. + * Supported arguments types: + * - int: pass pointer to the int + * - device memory: pass pointer to device_memory.device_pointer + * Return false if there was an error executing this or a previous kernel. */ + virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) = 0; + + /* Wait unit all enqueued kernels have finished execution. + * Return false if there was an error executing any of the enqueued kernels. */ + virtual bool synchronize() = 0; + + /* Copy memory to/from device as part of the command queue, to ensure + * operations are done in order without having to synchronize. */ + virtual void zero_to_device(device_memory &mem) = 0; + virtual void copy_to_device(device_memory &mem) = 0; + virtual void copy_from_device(device_memory &mem) = 0; + + /* Graphics resources interoperability. + * + * The interoperability comes here by the meaning that the device is capable of computing result + * directly into an OpenGL (or other graphics library) buffer. */ + + /* Create graphics interoperability context which will be taking care of mapping graphics + * resource as a buffer writable by kernels of this device. */ + virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() + { + LOG(FATAL) << "Request of GPU interop of a device which does not support it."; + return nullptr; + } + + /* Device this queue has been created for. */ + Device *device; + + protected: + /* Hide construction so that allocation via `Device` API is enforced. */ + explicit DeviceQueue(Device *device); + + /* Implementations call these from the corresponding methods to generate debugging logs. 
*/ + void debug_init_execution(); + void debug_enqueue(DeviceKernel kernel, const int work_size); + void debug_synchronize(); + + /* Combination of kernels enqueued together sync last synchronize. */ + DeviceKernelMask last_kernels_enqueued_; + /* Time of synchronize call. */ + double last_sync_time_; + /* Accumulated execution time for combinations of kernels launched together. */ + map<DeviceKernelMask, double> stats_kernel_time_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp deleted file mode 100644 index 9889f688aaa..00000000000 --- a/intern/cycles/device/device_split_kernel.cpp +++ /dev/null @@ -1,389 +0,0 @@ -/* - * Copyright 2011-2016 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "device/device_split_kernel.h" - -#include "kernel/kernel_types.h" -#include "kernel/split/kernel_split_data_types.h" - -#include "util/util_logging.h" -#include "util/util_time.h" - -CCL_NAMESPACE_BEGIN - -static const double alpha = 0.1; /* alpha for rolling average */ - -DeviceSplitKernel::DeviceSplitKernel(Device *device) - : device(device), - split_data(device, "split_data"), - ray_state(device, "ray_state", MEM_READ_WRITE), - queue_index(device, "queue_index"), - use_queues_flag(device, "use_queues_flag"), - work_pool_wgs(device, "work_pool_wgs"), - kernel_data_initialized(false) -{ - avg_time_per_sample = 0.0; - - kernel_path_init = NULL; - kernel_scene_intersect = NULL; - kernel_lamp_emission = NULL; - kernel_do_volume = NULL; - kernel_queue_enqueue = NULL; - kernel_indirect_background = NULL; - kernel_shader_setup = NULL; - kernel_shader_sort = NULL; - kernel_shader_eval = NULL; - kernel_holdout_emission_blurring_pathtermination_ao = NULL; - kernel_subsurface_scatter = NULL; - kernel_direct_lighting = NULL; - kernel_shadow_blocked_ao = NULL; - kernel_shadow_blocked_dl = NULL; - kernel_enqueue_inactive = NULL; - kernel_next_iteration_setup = NULL; - kernel_indirect_subsurface = NULL; - kernel_buffer_update = NULL; - kernel_adaptive_stopping = NULL; - kernel_adaptive_filter_x = NULL; - kernel_adaptive_filter_y = NULL; - kernel_adaptive_adjust_samples = NULL; -} - -DeviceSplitKernel::~DeviceSplitKernel() -{ - split_data.free(); - ray_state.free(); - use_queues_flag.free(); - queue_index.free(); - work_pool_wgs.free(); - - delete kernel_path_init; - delete kernel_scene_intersect; - delete kernel_lamp_emission; - delete kernel_do_volume; - delete kernel_queue_enqueue; - delete kernel_indirect_background; - delete kernel_shader_setup; - delete kernel_shader_sort; - delete kernel_shader_eval; - delete kernel_holdout_emission_blurring_pathtermination_ao; - delete kernel_subsurface_scatter; - delete kernel_direct_lighting; - delete 
kernel_shadow_blocked_ao; - delete kernel_shadow_blocked_dl; - delete kernel_enqueue_inactive; - delete kernel_next_iteration_setup; - delete kernel_indirect_subsurface; - delete kernel_buffer_update; - delete kernel_adaptive_stopping; - delete kernel_adaptive_filter_x; - delete kernel_adaptive_filter_y; - delete kernel_adaptive_adjust_samples; -} - -bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures &requested_features) -{ -#define LOAD_KERNEL(name) \ - kernel_##name = get_split_kernel_function(#name, requested_features); \ - if (!kernel_##name) { \ - device->set_error(string("Split kernel error: failed to load kernel_") + #name); \ - return false; \ - } - - LOAD_KERNEL(path_init); - LOAD_KERNEL(scene_intersect); - LOAD_KERNEL(lamp_emission); - if (requested_features.use_volume) { - LOAD_KERNEL(do_volume); - } - LOAD_KERNEL(queue_enqueue); - LOAD_KERNEL(indirect_background); - LOAD_KERNEL(shader_setup); - LOAD_KERNEL(shader_sort); - LOAD_KERNEL(shader_eval); - LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao); - LOAD_KERNEL(subsurface_scatter); - LOAD_KERNEL(direct_lighting); - LOAD_KERNEL(shadow_blocked_ao); - LOAD_KERNEL(shadow_blocked_dl); - LOAD_KERNEL(enqueue_inactive); - LOAD_KERNEL(next_iteration_setup); - LOAD_KERNEL(indirect_subsurface); - LOAD_KERNEL(buffer_update); - LOAD_KERNEL(adaptive_stopping); - LOAD_KERNEL(adaptive_filter_x); - LOAD_KERNEL(adaptive_filter_y); - LOAD_KERNEL(adaptive_adjust_samples); - -#undef LOAD_KERNEL - - /* Re-initialiaze kernel-dependent data when kernels change. */ - kernel_data_initialized = false; - - return true; -} - -size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory &kg, - device_memory &data, - uint64_t max_buffer_size) -{ - uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024; - VLOG(1) << "Split state element size: " << string_human_readable_number(size_per_element) - << " bytes. 
(" << string_human_readable_size(size_per_element) << ")."; - return max_buffer_size / size_per_element; -} - -bool DeviceSplitKernel::path_trace(DeviceTask &task, - RenderTile &tile, - device_memory &kgbuffer, - device_memory &kernel_data) -{ - if (device->have_error()) { - return false; - } - - /* Allocate all required global memory once. */ - if (!kernel_data_initialized) { - kernel_data_initialized = true; - - /* Set local size */ - int2 lsize = split_kernel_local_size(); - local_size[0] = lsize[0]; - local_size[1] = lsize[1]; - - /* Set global size */ - int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task); - - /* Make sure that set work size is a multiple of local - * work size dimensions. - */ - global_size[0] = round_up(gsize[0], local_size[0]); - global_size[1] = round_up(gsize[1], local_size[1]); - - int num_global_elements = global_size[0] * global_size[1]; - assert(num_global_elements % WORK_POOL_SIZE == 0); - - /* Calculate max groups */ - - /* Denotes the maximum work groups possible w.r.t. current requested tile size. */ - unsigned int work_pool_size = (device->info.type == DEVICE_CPU) ? WORK_POOL_SIZE_CPU : - WORK_POOL_SIZE_GPU; - unsigned int max_work_groups = num_global_elements / work_pool_size + 1; - - /* Allocate work_pool_wgs memory. 
*/ - work_pool_wgs.alloc_to_device(max_work_groups); - queue_index.alloc_to_device(NUM_QUEUES); - use_queues_flag.alloc_to_device(1); - split_data.alloc_to_device(state_buffer_size(kgbuffer, kernel_data, num_global_elements)); - ray_state.alloc(num_global_elements); - } - - /* Number of elements in the global state buffer */ - int num_global_elements = global_size[0] * global_size[1]; - -#define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \ - if (device->have_error()) { \ - return false; \ - } \ - if (!kernel_##name->enqueue( \ - KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \ - return false; \ - } - - tile.sample = tile.start_sample; - - /* for exponential increase between tile updates */ - int time_multiplier = 1; - - while (tile.sample < tile.start_sample + tile.num_samples) { - /* to keep track of how long it takes to run a number of samples */ - double start_time = time_dt(); - - /* initial guess to start rolling average */ - const int initial_num_samples = 1; - /* approx number of samples per second */ - const int samples_per_second = (avg_time_per_sample > 0.0) ? - int(double(time_multiplier) / avg_time_per_sample) + 1 : - initial_num_samples; - - RenderTile subtile = tile; - subtile.start_sample = tile.sample; - subtile.num_samples = samples_per_second; - - if (task.adaptive_sampling.use) { - subtile.num_samples = task.adaptive_sampling.align_samples(subtile.start_sample, - subtile.num_samples); - } - - /* Don't go beyond requested number of samples. 
*/ - subtile.num_samples = min(subtile.num_samples, - tile.start_sample + tile.num_samples - tile.sample); - - if (device->have_error()) { - return false; - } - - /* reset state memory here as global size for data_init - * kernel might not be large enough to do in kernel - */ - work_pool_wgs.zero_to_device(); - split_data.zero_to_device(); - ray_state.zero_to_device(); - - if (!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size), - subtile, - num_global_elements, - kgbuffer, - kernel_data, - split_data, - ray_state, - queue_index, - use_queues_flag, - work_pool_wgs)) { - return false; - } - - ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size); - - bool activeRaysAvailable = true; - double cancel_time = DBL_MAX; - - while (activeRaysAvailable) { - /* Do path-iteration in host [Enqueue Path-iteration kernels. */ - for (int PathIter = 0; PathIter < 16; PathIter++) { - ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size); - if (kernel_do_volume) { - ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size); - } - ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shader_setup, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shader_sort, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size); - ENQUEUE_SPLIT_KERNEL( - holdout_emission_blurring_pathtermination_ao, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(enqueue_inactive, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(next_iteration_setup, 
global_size, local_size); - ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size); - - if (task.get_cancel() && cancel_time == DBL_MAX) { - /* Wait up to twice as many seconds for current samples to finish - * to avoid artifacts in render result from ending too soon. - */ - cancel_time = time_dt() + 2.0 * time_multiplier; - } - - if (time_dt() > cancel_time) { - return true; - } - } - - /* Decide if we should exit path-iteration in host. */ - ray_state.copy_from_device(0, global_size[0] * global_size[1], 1); - - activeRaysAvailable = false; - - for (int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) { - if (!IS_STATE(ray_state.data(), rayStateIter, RAY_INACTIVE)) { - if (IS_STATE(ray_state.data(), rayStateIter, RAY_INVALID)) { - /* Something went wrong, abort to avoid looping endlessly. */ - device->set_error("Split kernel error: invalid ray state"); - return false; - } - - /* Not all rays are RAY_INACTIVE. 
*/ - activeRaysAvailable = true; - break; - } - } - - if (time_dt() > cancel_time) { - return true; - } - } - - int filter_sample = tile.sample + subtile.num_samples - 1; - if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { - size_t buffer_size[2]; - buffer_size[0] = round_up(tile.w, local_size[0]); - buffer_size[1] = round_up(tile.h, local_size[1]); - kernel_adaptive_stopping->enqueue( - KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data); - buffer_size[0] = round_up(tile.h, local_size[0]); - buffer_size[1] = round_up(1, local_size[1]); - kernel_adaptive_filter_x->enqueue( - KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data); - buffer_size[0] = round_up(tile.w, local_size[0]); - buffer_size[1] = round_up(1, local_size[1]); - kernel_adaptive_filter_y->enqueue( - KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data); - } - - double time_per_sample = ((time_dt() - start_time) / subtile.num_samples); - - if (avg_time_per_sample == 0.0) { - /* start rolling average */ - avg_time_per_sample = time_per_sample; - } - else { - avg_time_per_sample = alpha * time_per_sample + (1.0 - alpha) * avg_time_per_sample; - } - -#undef ENQUEUE_SPLIT_KERNEL - - tile.sample += subtile.num_samples; - task.update_progress(&tile, tile.w * tile.h * subtile.num_samples); - - time_multiplier = min(time_multiplier << 1, 10); - - if (task.get_cancel()) { - return true; - } - } - - if (task.adaptive_sampling.use) { - /* Reset the start samples. 
*/ - RenderTile subtile = tile; - subtile.start_sample = tile.start_sample; - subtile.num_samples = tile.sample - tile.start_sample; - enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size), - subtile, - num_global_elements, - kgbuffer, - kernel_data, - split_data, - ray_state, - queue_index, - use_queues_flag, - work_pool_wgs); - size_t buffer_size[2]; - buffer_size[0] = round_up(tile.w, local_size[0]); - buffer_size[1] = round_up(tile.h, local_size[1]); - kernel_adaptive_adjust_samples->enqueue( - KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data); - } - - return true; -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h deleted file mode 100644 index 07a21b10299..00000000000 --- a/intern/cycles/device/device_split_kernel.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright 2011-2016 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __DEVICE_SPLIT_KERNEL_H__ -#define __DEVICE_SPLIT_KERNEL_H__ - -#include "device/device.h" -#include "render/buffers.h" - -CCL_NAMESPACE_BEGIN - -/* When allocate global memory in chunks. We may not be able to - * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks; - * Since some bytes may be needed for aligning chunks of memory; - * This is the amount of memory that we dedicate for that purpose. 
- */ -#define DATA_ALLOCATION_MEM_FACTOR 5000000 // 5MB - -/* Types used for split kernel */ - -class KernelDimensions { - public: - size_t global_size[2]; - size_t local_size[2]; - - KernelDimensions(size_t global_size_[2], size_t local_size_[2]) - { - memcpy(global_size, global_size_, sizeof(global_size)); - memcpy(local_size, local_size_, sizeof(local_size)); - } -}; - -class SplitKernelFunction { - public: - virtual ~SplitKernelFunction() - { - } - - /* enqueue the kernel, returns false if there is an error */ - virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data) = 0; -}; - -class DeviceSplitKernel { - private: - Device *device; - - SplitKernelFunction *kernel_path_init; - SplitKernelFunction *kernel_scene_intersect; - SplitKernelFunction *kernel_lamp_emission; - SplitKernelFunction *kernel_do_volume; - SplitKernelFunction *kernel_queue_enqueue; - SplitKernelFunction *kernel_indirect_background; - SplitKernelFunction *kernel_shader_setup; - SplitKernelFunction *kernel_shader_sort; - SplitKernelFunction *kernel_shader_eval; - SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao; - SplitKernelFunction *kernel_subsurface_scatter; - SplitKernelFunction *kernel_direct_lighting; - SplitKernelFunction *kernel_shadow_blocked_ao; - SplitKernelFunction *kernel_shadow_blocked_dl; - SplitKernelFunction *kernel_enqueue_inactive; - SplitKernelFunction *kernel_next_iteration_setup; - SplitKernelFunction *kernel_indirect_subsurface; - SplitKernelFunction *kernel_buffer_update; - SplitKernelFunction *kernel_adaptive_stopping; - SplitKernelFunction *kernel_adaptive_filter_x; - SplitKernelFunction *kernel_adaptive_filter_y; - SplitKernelFunction *kernel_adaptive_adjust_samples; - - /* Global memory variables [porting]; These memory is used for - * co-operation between different kernels; Data written by one - * kernel will be available to another kernel via this global - * memory. 
- */ - device_only_memory<uchar> split_data; - device_vector<uchar> ray_state; - device_only_memory<int> - queue_index; /* Array of size num_queues that tracks the size of each queue. */ - - /* Flag to make sceneintersect and lampemission kernel use queues. */ - device_only_memory<char> use_queues_flag; - - /* Approximate time it takes to complete one sample */ - double avg_time_per_sample; - - /* Work pool with respect to each work group. */ - device_only_memory<unsigned int> work_pool_wgs; - - /* Cached kernel-dependent data, initialized once. */ - bool kernel_data_initialized; - size_t local_size[2]; - size_t global_size[2]; - - public: - explicit DeviceSplitKernel(Device *device); - virtual ~DeviceSplitKernel(); - - bool load_kernels(const DeviceRequestedFeatures &requested_features); - bool path_trace(DeviceTask &task, - RenderTile &rtile, - device_memory &kgbuffer, - device_memory &kernel_data); - - virtual uint64_t state_buffer_size(device_memory &kg, - device_memory &data, - size_t num_threads) = 0; - size_t max_elements_for_max_buffer_size(device_memory &kg, - device_memory &data, - uint64_t max_buffer_size); - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, - RenderTile &rtile, - int num_global_elements, - device_memory &kernel_globals, - device_memory &kernel_data_, - device_memory &split_data, - device_memory &ray_state, - device_memory &queue_index, - device_memory &use_queues_flag, - device_memory &work_pool_wgs) = 0; - - virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name, - const DeviceRequestedFeatures &) = 0; - virtual int2 split_kernel_local_size() = 0; - virtual int2 split_kernel_global_size(device_memory &kg, - device_memory &data, - DeviceTask &task) = 0; -}; - -CCL_NAMESPACE_END - -#endif /* __DEVICE_SPLIT_KERNEL_H__ */ diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp deleted file mode 100644 index 55fbaa31e42..00000000000 --- 
a/intern/cycles/device/device_task.cpp +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <stdlib.h> -#include <string.h> - -#include "device/device_task.h" - -#include "render/buffers.h" - -#include "util/util_algorithm.h" -#include "util/util_time.h" - -CCL_NAMESPACE_BEGIN - -/* Device Task */ - -DeviceTask::DeviceTask(Type type_) - : type(type_), - x(0), - y(0), - w(0), - h(0), - rgba_byte(0), - rgba_half(0), - buffer(0), - sample(0), - num_samples(1), - shader_input(0), - shader_output(0), - shader_eval_type(0), - shader_filter(0), - shader_x(0), - shader_w(0), - buffers(nullptr), - tile_types(0), - denoising_from_render(false), - pass_stride(0), - frame_stride(0), - target_pass_stride(0), - pass_denoising_data(0), - pass_denoising_clean(0), - need_finish_queue(false), - integrator_branched(false) -{ - last_update_time = time_dt(); -} - -int DeviceTask::get_subtask_count(int num, int max_size) const -{ - if (max_size != 0) { - int max_size_num; - - if (type == SHADER) { - max_size_num = (shader_w + max_size - 1) / max_size; - } - else { - max_size = max(1, max_size / w); - max_size_num = (h + max_size - 1) / max_size; - } - - num = max(max_size_num, num); - } - - if (type == SHADER) { - num = min(shader_w, num); - } - else if (type == RENDER) { - } - else { - num = min(h, num); - } - - return num; -} - -void DeviceTask::split(list<DeviceTask> &tasks, int num, 
int max_size) const -{ - num = get_subtask_count(num, max_size); - - if (type == SHADER) { - for (int i = 0; i < num; i++) { - int tx = shader_x + (shader_w / num) * i; - int tw = (i == num - 1) ? shader_w - i * (shader_w / num) : shader_w / num; - - DeviceTask task = *this; - - task.shader_x = tx; - task.shader_w = tw; - - tasks.push_back(task); - } - } - else if (type == RENDER) { - for (int i = 0; i < num; i++) - tasks.push_back(*this); - } - else { - for (int i = 0; i < num; i++) { - int ty = y + (h / num) * i; - int th = (i == num - 1) ? h - i * (h / num) : h / num; - - DeviceTask task = *this; - - task.y = ty; - task.h = th; - - tasks.push_back(task); - } - } -} - -void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples) -{ - if (type == FILM_CONVERT) - return; - - if (update_progress_sample) { - if (pixel_samples == -1) { - pixel_samples = shader_w; - } - update_progress_sample(pixel_samples, rtile ? rtile->sample : 0); - } - - if (update_tile_sample) { - double current_time = time_dt(); - - if (current_time - last_update_time >= 1.0) { - update_tile_sample(*rtile); - - last_update_time = current_time; - } - } -} - -/* Adaptive Sampling */ - -AdaptiveSampling::AdaptiveSampling() : use(true), adaptive_step(0), min_samples(0) -{ -} - -/* Render samples in steps that align with the adaptive filtering. */ -int AdaptiveSampling::align_samples(int sample, int num_samples) const -{ - int end_sample = sample + num_samples; - - /* Round down end sample to the nearest sample that needs filtering. */ - end_sample &= ~(adaptive_step - 1); - - if (end_sample <= sample) { - /* In order to reach the next sample that needs filtering, we'd need - * to increase num_samples. We don't do that in this function, so - * just keep it as is and don't filter this time around. 
*/ - return num_samples; - } - return end_sample - sample; -} - -bool AdaptiveSampling::need_filter(int sample) const -{ - if (sample > min_samples) { - return (sample & (adaptive_step - 1)) == (adaptive_step - 1); - } - else { - return false; - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h deleted file mode 100644 index 3f7cf47b692..00000000000 --- a/intern/cycles/device/device_task.h +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __DEVICE_TASK_H__ -#define __DEVICE_TASK_H__ - -#include "device/device_memory.h" - -#include "util/util_function.h" -#include "util/util_list.h" - -CCL_NAMESPACE_BEGIN - -/* Device Task */ - -class Device; -class RenderBuffers; -class RenderTile; -class RenderTileNeighbors; -class Tile; - -enum DenoiserType { - DENOISER_NLM = 1, - DENOISER_OPTIX = 2, - DENOISER_OPENIMAGEDENOISE = 4, - DENOISER_NUM, - - DENOISER_NONE = 0, - DENOISER_ALL = ~0, -}; - -enum DenoiserInput { - DENOISER_INPUT_RGB = 1, - DENOISER_INPUT_RGB_ALBEDO = 2, - DENOISER_INPUT_RGB_ALBEDO_NORMAL = 3, - - DENOISER_INPUT_NUM, -}; - -typedef int DenoiserTypeMask; - -class DenoiseParams { - public: - /* Apply denoiser to image. */ - bool use; - /* Output denoising data passes (possibly without applying the denoiser). */ - bool store_passes; - - /* Denoiser type. 
*/ - DenoiserType type; - - /* Viewport start sample. */ - int start_sample; - - /** Native Denoiser. */ - - /* Pixel radius for neighboring pixels to take into account. */ - int radius; - /* Controls neighbor pixel weighting for the denoising filter. */ - float strength; - /* Preserve more or less detail based on feature passes. */ - float feature_strength; - /* When removing pixels that don't carry information, - * use a relative threshold instead of an absolute one. */ - bool relative_pca; - /* How many frames before and after the current center frame are included. */ - int neighbor_frames; - /* Clamp the input to the range of +-1e8. Should be enough for any legitimate data. */ - bool clamp_input; - - /** OIDN/Optix Denoiser. */ - - /* Passes handed over to the OIDN/OptiX denoiser (default to color + albedo). */ - DenoiserInput input_passes; - - DenoiseParams() - { - use = false; - store_passes = false; - - type = DENOISER_NLM; - - radius = 8; - strength = 0.5f; - feature_strength = 0.5f; - relative_pca = false; - neighbor_frames = 2; - clamp_input = true; - - /* Default to color + albedo only, since normal input does not always have the desired effect - * when denoising with OptiX. */ - input_passes = DENOISER_INPUT_RGB_ALBEDO; - - start_sample = 0; - } - - /* Test if a denoising task needs to run, also to prefilter passes for the native - * denoiser when we are not applying denoising to the combined image. 
*/ - bool need_denoising_task() const - { - return (use || (store_passes && type == DENOISER_NLM)); - } -}; - -class AdaptiveSampling { - public: - AdaptiveSampling(); - - int align_samples(int sample, int num_samples) const; - bool need_filter(int sample) const; - - bool use; - int adaptive_step; - int min_samples; -}; - -class DeviceTask { - public: - typedef enum { RENDER, FILM_CONVERT, SHADER, DENOISE_BUFFER } Type; - Type type; - - int x, y, w, h; - device_ptr rgba_byte; - device_ptr rgba_half; - device_ptr buffer; - int sample; - int num_samples; - int offset, stride; - - device_ptr shader_input; - device_ptr shader_output; - int shader_eval_type; - int shader_filter; - int shader_x, shader_w; - - RenderBuffers *buffers; - - explicit DeviceTask(Type type = RENDER); - - int get_subtask_count(int num, int max_size = 0) const; - void split(list<DeviceTask> &tasks, int num, int max_size = 0) const; - - void update_progress(RenderTile *rtile, int pixel_samples = -1); - - function<bool(Device *device, RenderTile &, uint)> acquire_tile; - function<void(long, int)> update_progress_sample; - function<void(RenderTile &)> update_tile_sample; - function<void(RenderTile &)> release_tile; - function<bool()> get_cancel; - function<bool()> get_tile_stolen; - function<void(RenderTileNeighbors &, Device *)> map_neighbor_tiles; - function<void(RenderTileNeighbors &, Device *)> unmap_neighbor_tiles; - - uint tile_types; - DenoiseParams denoising; - bool denoising_from_render; - vector<int> denoising_frames; - - int pass_stride; - int frame_stride; - int target_pass_stride; - int pass_denoising_data; - int pass_denoising_clean; - - bool need_finish_queue; - bool integrator_branched; - AdaptiveSampling adaptive_sampling; - - protected: - double last_update_time; -}; - -CCL_NAMESPACE_END - -#endif /* __DEVICE_TASK_H__ */ diff --git a/intern/cycles/device/device_dummy.cpp b/intern/cycles/device/dummy/device.cpp index 5112fc152e5..678276ed025 100644 --- 
a/intern/cycles/device/device_dummy.cpp +++ b/intern/cycles/device/dummy/device.cpp @@ -14,8 +14,10 @@ * limitations under the License. */ +#include "device/dummy/device.h" + #include "device/device.h" -#include "device/device_intern.h" +#include "device/device_queue.h" CCL_NAMESPACE_BEGIN @@ -23,8 +25,8 @@ CCL_NAMESPACE_BEGIN class DummyDevice : public Device { public: - DummyDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_) - : Device(info_, stats_, profiler_, background_) + DummyDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_) + : Device(info_, stats_, profiler_) { error_msg = info.error_msg; } @@ -61,23 +63,11 @@ class DummyDevice : public Device { virtual void const_copy_to(const char *, void *, size_t) override { } - - virtual void task_add(DeviceTask &) override - { - } - - virtual void task_wait() override - { - } - - virtual void task_cancel() override - { - } }; -Device *device_dummy_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) +Device *device_dummy_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) { - return new DummyDevice(info, stats, profiler, background); + return new DummyDevice(info, stats, profiler); } CCL_NAMESPACE_END diff --git a/intern/cycles/device/dummy/device.h b/intern/cycles/device/dummy/device.h new file mode 100644 index 00000000000..832a9568129 --- /dev/null +++ b/intern/cycles/device/dummy/device.h @@ -0,0 +1,31 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "util/util_string.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +class Device; +class DeviceInfo; +class Profiler; +class Stats; + +Device *device_dummy_create(const DeviceInfo &info, Stats &stats, Profiler &profiler); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/multi/device.cpp b/intern/cycles/device/multi/device.cpp new file mode 100644 index 00000000000..6dbcce2d9a5 --- /dev/null +++ b/intern/cycles/device/multi/device.cpp @@ -0,0 +1,423 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "device/multi/device.h" + +#include <sstream> +#include <stdlib.h> + +#include "bvh/bvh_multi.h" + +#include "device/device.h" +#include "device/device_queue.h" + +#include "render/buffers.h" +#include "render/geometry.h" + +#include "util/util_foreach.h" +#include "util/util_list.h" +#include "util/util_logging.h" +#include "util/util_map.h" +#include "util/util_time.h" + +CCL_NAMESPACE_BEGIN + +class MultiDevice : public Device { + public: + struct SubDevice { + Stats stats; + Device *device; + map<device_ptr, device_ptr> ptr_map; + int peer_island_index = -1; + }; + + list<SubDevice> devices; + device_ptr unique_key; + vector<vector<SubDevice *>> peer_islands; + + MultiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) + : Device(info, stats, profiler), unique_key(1) + { + foreach (const DeviceInfo &subinfo, info.multi_devices) { + /* Always add CPU devices at the back since GPU devices can change + * host memory pointers, which CPU uses as device pointer. */ + SubDevice *sub; + if (subinfo.type == DEVICE_CPU) { + devices.emplace_back(); + sub = &devices.back(); + } + else { + devices.emplace_front(); + sub = &devices.front(); + } + + /* The pointer to 'sub->stats' will stay valid even after new devices + * are added, since 'devices' is a linked list. 
*/ + sub->device = Device::create(subinfo, sub->stats, profiler); + } + + /* Build a list of peer islands for the available render devices */ + foreach (SubDevice &sub, devices) { + /* First ensure that every device is in at least once peer island */ + if (sub.peer_island_index < 0) { + peer_islands.emplace_back(); + sub.peer_island_index = (int)peer_islands.size() - 1; + peer_islands[sub.peer_island_index].push_back(&sub); + } + + if (!info.has_peer_memory) { + continue; + } + + /* Second check peer access between devices and fill up the islands accordingly */ + foreach (SubDevice &peer_sub, devices) { + if (peer_sub.peer_island_index < 0 && + peer_sub.device->info.type == sub.device->info.type && + peer_sub.device->check_peer_access(sub.device)) { + peer_sub.peer_island_index = sub.peer_island_index; + peer_islands[sub.peer_island_index].push_back(&peer_sub); + } + } + } + } + + ~MultiDevice() + { + foreach (SubDevice &sub, devices) + delete sub.device; + } + + const string &error_message() override + { + error_msg.clear(); + + foreach (SubDevice &sub, devices) + error_msg += sub.device->error_message(); + + return error_msg; + } + + virtual bool show_samples() const override + { + if (devices.size() > 1) { + return false; + } + return devices.front().device->show_samples(); + } + + virtual BVHLayoutMask get_bvh_layout_mask() const override + { + BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL; + BVHLayoutMask bvh_layout_mask_all = BVH_LAYOUT_NONE; + foreach (const SubDevice &sub_device, devices) { + BVHLayoutMask device_bvh_layout_mask = sub_device.device->get_bvh_layout_mask(); + bvh_layout_mask &= device_bvh_layout_mask; + bvh_layout_mask_all |= device_bvh_layout_mask; + } + + /* With multiple OptiX devices, every device needs its own acceleration structure */ + if (bvh_layout_mask == BVH_LAYOUT_OPTIX) { + return BVH_LAYOUT_MULTI_OPTIX; + } + + /* When devices do not share a common BVH layout, fall back to creating one for each */ + const BVHLayoutMask 
BVH_LAYOUT_OPTIX_EMBREE = (BVH_LAYOUT_OPTIX | BVH_LAYOUT_EMBREE); + if ((bvh_layout_mask_all & BVH_LAYOUT_OPTIX_EMBREE) == BVH_LAYOUT_OPTIX_EMBREE) { + return BVH_LAYOUT_MULTI_OPTIX_EMBREE; + } + + return bvh_layout_mask; + } + + bool load_kernels(const uint kernel_features) override + { + foreach (SubDevice &sub, devices) + if (!sub.device->load_kernels(kernel_features)) + return false; + + return true; + } + + void build_bvh(BVH *bvh, Progress &progress, bool refit) override + { + /* Try to build and share a single acceleration structure, if possible */ + if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2 || bvh->params.bvh_layout == BVH_LAYOUT_EMBREE) { + devices.back().device->build_bvh(bvh, progress, refit); + return; + } + + assert(bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX || + bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE); + + BVHMulti *const bvh_multi = static_cast<BVHMulti *>(bvh); + bvh_multi->sub_bvhs.resize(devices.size()); + + vector<BVHMulti *> geom_bvhs; + geom_bvhs.reserve(bvh->geometry.size()); + foreach (Geometry *geom, bvh->geometry) { + geom_bvhs.push_back(static_cast<BVHMulti *>(geom->bvh)); + } + + /* Broadcast acceleration structure build to all render devices */ + size_t i = 0; + foreach (SubDevice &sub, devices) { + /* Change geometry BVH pointers to the sub BVH */ + for (size_t k = 0; k < bvh->geometry.size(); ++k) { + bvh->geometry[k]->bvh = geom_bvhs[k]->sub_bvhs[i]; + } + + if (!bvh_multi->sub_bvhs[i]) { + BVHParams params = bvh->params; + if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX) + params.bvh_layout = BVH_LAYOUT_OPTIX; + else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) + params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? 
BVH_LAYOUT_OPTIX : + BVH_LAYOUT_EMBREE; + + /* Skip building a bottom level acceleration structure for non-instanced geometry on Embree + * (since they are put into the top level directly, see bvh_embree.cpp) */ + if (!params.top_level && params.bvh_layout == BVH_LAYOUT_EMBREE && + !bvh->geometry[0]->is_instanced()) { + i++; + continue; + } + + bvh_multi->sub_bvhs[i] = BVH::create(params, bvh->geometry, bvh->objects, sub.device); + } + + sub.device->build_bvh(bvh_multi->sub_bvhs[i], progress, refit); + i++; + } + + /* Change geometry BVH pointers back to the multi BVH. */ + for (size_t k = 0; k < bvh->geometry.size(); ++k) { + bvh->geometry[k]->bvh = geom_bvhs[k]; + } + } + + virtual void *get_cpu_osl_memory() override + { + if (devices.size() > 1) { + return NULL; + } + return devices.front().device->get_cpu_osl_memory(); + } + + bool is_resident(device_ptr key, Device *sub_device) override + { + foreach (SubDevice &sub, devices) { + if (sub.device == sub_device) { + return find_matching_mem_device(key, sub)->device == sub_device; + } + } + return false; + } + + SubDevice *find_matching_mem_device(device_ptr key, SubDevice &sub) + { + assert(key != 0 && (sub.peer_island_index >= 0 || sub.ptr_map.find(key) != sub.ptr_map.end())); + + /* Get the memory owner of this key (first try current device, then peer devices) */ + SubDevice *owner_sub = ⊂ + if (owner_sub->ptr_map.find(key) == owner_sub->ptr_map.end()) { + foreach (SubDevice *island_sub, peer_islands[sub.peer_island_index]) { + if (island_sub != owner_sub && + island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) { + owner_sub = island_sub; + } + } + } + return owner_sub; + } + + SubDevice *find_suitable_mem_device(device_ptr key, const vector<SubDevice *> &island) + { + assert(!island.empty()); + + /* Get the memory owner of this key or the device with the lowest memory usage when new */ + SubDevice *owner_sub = island.front(); + foreach (SubDevice *island_sub, island) { + if (key ? 
(island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) : + (island_sub->device->stats.mem_used < owner_sub->device->stats.mem_used)) { + owner_sub = island_sub; + } + } + return owner_sub; + } + + inline device_ptr find_matching_mem(device_ptr key, SubDevice &sub) + { + return find_matching_mem_device(key, sub)->ptr_map[key]; + } + + void mem_alloc(device_memory &mem) override + { + device_ptr key = unique_key++; + + assert(mem.type == MEM_READ_ONLY || mem.type == MEM_READ_WRITE || mem.type == MEM_DEVICE_ONLY); + /* The remaining memory types can be distributed across devices */ + foreach (const vector<SubDevice *> &island, peer_islands) { + SubDevice *owner_sub = find_suitable_mem_device(key, island); + mem.device = owner_sub->device; + mem.device_pointer = 0; + mem.device_size = 0; + + owner_sub->device->mem_alloc(mem); + owner_sub->ptr_map[key] = mem.device_pointer; + } + + mem.device = this; + mem.device_pointer = key; + stats.mem_alloc(mem.device_size); + } + + void mem_copy_to(device_memory &mem) override + { + device_ptr existing_key = mem.device_pointer; + device_ptr key = (existing_key) ? existing_key : unique_key++; + size_t existing_size = mem.device_size; + + /* The tile buffers are allocated on each device (see below), so copy to all of them */ + foreach (const vector<SubDevice *> &island, peer_islands) { + SubDevice *owner_sub = find_suitable_mem_device(existing_key, island); + mem.device = owner_sub->device; + mem.device_pointer = (existing_key) ? 
owner_sub->ptr_map[existing_key] : 0; + mem.device_size = existing_size; + + owner_sub->device->mem_copy_to(mem); + owner_sub->ptr_map[key] = mem.device_pointer; + + if (mem.type == MEM_GLOBAL || mem.type == MEM_TEXTURE) { + /* Need to create texture objects and update pointer in kernel globals on all devices */ + foreach (SubDevice *island_sub, island) { + if (island_sub != owner_sub) { + island_sub->device->mem_copy_to(mem); + } + } + } + } + + mem.device = this; + mem.device_pointer = key; + stats.mem_alloc(mem.device_size - existing_size); + } + + void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override + { + device_ptr key = mem.device_pointer; + int i = 0, sub_h = h / devices.size(); + + foreach (SubDevice &sub, devices) { + int sy = y + i * sub_h; + int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h; + + SubDevice *owner_sub = find_matching_mem_device(key, sub); + mem.device = owner_sub->device; + mem.device_pointer = owner_sub->ptr_map[key]; + + owner_sub->device->mem_copy_from(mem, sy, w, sh, elem); + i++; + } + + mem.device = this; + mem.device_pointer = key; + } + + void mem_zero(device_memory &mem) override + { + device_ptr existing_key = mem.device_pointer; + device_ptr key = (existing_key) ? existing_key : unique_key++; + size_t existing_size = mem.device_size; + + foreach (const vector<SubDevice *> &island, peer_islands) { + SubDevice *owner_sub = find_suitable_mem_device(existing_key, island); + mem.device = owner_sub->device; + mem.device_pointer = (existing_key) ? 
owner_sub->ptr_map[existing_key] : 0; + mem.device_size = existing_size; + + owner_sub->device->mem_zero(mem); + owner_sub->ptr_map[key] = mem.device_pointer; + } + + mem.device = this; + mem.device_pointer = key; + stats.mem_alloc(mem.device_size - existing_size); + } + + void mem_free(device_memory &mem) override + { + device_ptr key = mem.device_pointer; + size_t existing_size = mem.device_size; + + /* Free memory that was allocated for all devices (see above) on each device */ + foreach (const vector<SubDevice *> &island, peer_islands) { + SubDevice *owner_sub = find_matching_mem_device(key, *island.front()); + mem.device = owner_sub->device; + mem.device_pointer = owner_sub->ptr_map[key]; + mem.device_size = existing_size; + + owner_sub->device->mem_free(mem); + owner_sub->ptr_map.erase(owner_sub->ptr_map.find(key)); + + if (mem.type == MEM_TEXTURE) { + /* Free texture objects on all devices */ + foreach (SubDevice *island_sub, island) { + if (island_sub != owner_sub) { + island_sub->device->mem_free(mem); + } + } + } + } + + mem.device = this; + mem.device_pointer = 0; + mem.device_size = 0; + stats.mem_free(existing_size); + } + + void const_copy_to(const char *name, void *host, size_t size) override + { + foreach (SubDevice &sub, devices) + sub.device->const_copy_to(name, host, size); + } + + int device_number(Device *sub_device) override + { + int i = 0; + + foreach (SubDevice &sub, devices) { + if (sub.device == sub_device) + return i; + i++; + } + + return -1; + } + + virtual void foreach_device(const function<void(Device *)> &callback) override + { + foreach (SubDevice &sub, devices) { + sub.device->foreach_device(callback); + } + } +}; + +Device *device_multi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) +{ + return new MultiDevice(info, stats, profiler); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/multi/device.h b/intern/cycles/device/multi/device.h new file mode 100644 index 00000000000..6e121014a1f --- /dev/null +++ 
b/intern/cycles/device/multi/device.h @@ -0,0 +1,31 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "util/util_string.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +class Device; +class DeviceInfo; +class Profiler; +class Stats; + +Device *device_multi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/opencl/device_opencl.h b/intern/cycles/device/opencl/device_opencl.h deleted file mode 100644 index a65e764b0d4..00000000000 --- a/intern/cycles/device/opencl/device_opencl.h +++ /dev/null @@ -1,658 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef WITH_OPENCL - -# include "device/device.h" -# include "device/device_denoising.h" -# include "device/device_split_kernel.h" - -# include "util/util_map.h" -# include "util/util_param.h" -# include "util/util_string.h" -# include "util/util_task.h" - -# include "clew.h" - -# include "device/opencl/memory_manager.h" - -CCL_NAMESPACE_BEGIN - -/* Disable workarounds, seems to be working fine on latest drivers. */ -# define CYCLES_DISABLE_DRIVER_WORKAROUNDS - -/* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workarounds for testing. */ -# ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS -/* Work around AMD driver hangs by ensuring each command is finished before doing anything else. */ -# undef clEnqueueNDRangeKernel -# define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \ - CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \ - clFinish(a); - -# undef clEnqueueWriteBuffer -# define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \ - CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \ - clFinish(a); - -# undef clEnqueueReadBuffer -# define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \ - CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \ - clFinish(a); -# endif /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */ - -# define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p)) - -struct OpenCLPlatformDevice { - OpenCLPlatformDevice(cl_platform_id platform_id, - const string &platform_name, - cl_device_id device_id, - cl_device_type device_type, - const string &device_name, - const string &hardware_id, - const string &device_extensions) - : platform_id(platform_id), - platform_name(platform_name), - device_id(device_id), - device_type(device_type), - device_name(device_name), - hardware_id(hardware_id), - device_extensions(device_extensions) - { - } - cl_platform_id platform_id; - string platform_name; - cl_device_id device_id; - cl_device_type device_type; - string device_name; - string hardware_id; - string 
device_extensions; -}; - -/* Contains all static OpenCL helper functions. */ -class OpenCLInfo { - public: - static cl_device_type device_type(); - static bool use_debug(); - static bool device_supported(const string &platform_name, const cl_device_id device_id); - static bool platform_version_check(cl_platform_id platform, string *error = NULL); - static bool device_version_check(cl_device_id device, string *error = NULL); - static bool get_device_version(cl_device_id device, - int *r_major, - int *r_minor, - string *error = NULL); - static string get_hardware_id(const string &platform_name, cl_device_id device_id); - static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices); - - /* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */ - - /* Platform information. */ - static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL); - static cl_uint get_num_platforms(); - - static bool get_platforms(vector<cl_platform_id> *platform_ids, cl_int *error = NULL); - static vector<cl_platform_id> get_platforms(); - - static bool get_platform_name(cl_platform_id platform_id, string *platform_name); - static string get_platform_name(cl_platform_id platform_id); - - static bool get_num_platform_devices(cl_platform_id platform_id, - cl_device_type device_type, - cl_uint *num_devices, - cl_int *error = NULL); - static cl_uint get_num_platform_devices(cl_platform_id platform_id, cl_device_type device_type); - - static bool get_platform_devices(cl_platform_id platform_id, - cl_device_type device_type, - vector<cl_device_id> *device_ids, - cl_int *error = NULL); - static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id, - cl_device_type device_type); - - /* Device information. 
*/ - static bool get_device_name(cl_device_id device_id, string *device_name, cl_int *error = NULL); - - static string get_device_name(cl_device_id device_id); - - static bool get_device_extensions(cl_device_id device_id, - string *device_extensions, - cl_int *error = NULL); - - static string get_device_extensions(cl_device_id device_id); - - static bool get_device_type(cl_device_id device_id, - cl_device_type *device_type, - cl_int *error = NULL); - static cl_device_type get_device_type(cl_device_id device_id); - - static bool get_driver_version(cl_device_id device_id, - int *major, - int *minor, - cl_int *error = NULL); - - static int mem_sub_ptr_alignment(cl_device_id device_id); - - /* Get somewhat more readable device name. - * Main difference is AMD OpenCL here which only gives code name - * for the regular device name. This will give more sane device - * name using some extensions. - */ - static string get_readable_device_name(cl_device_id device_id); -}; - -/* Thread safe cache for contexts and programs. - */ -class OpenCLCache { - struct Slot { - struct ProgramEntry { - ProgramEntry(); - ProgramEntry(const ProgramEntry &rhs); - ~ProgramEntry(); - cl_program program; - thread_mutex *mutex; - }; - - Slot(); - Slot(const Slot &rhs); - ~Slot(); - - thread_mutex *context_mutex; - cl_context context; - typedef map<ustring, ProgramEntry> EntryMap; - EntryMap programs; - }; - - /* key is combination of platform ID and device ID */ - typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair; - - /* map of Slot objects */ - typedef map<PlatformDevicePair, Slot> CacheMap; - CacheMap cache; - - /* MD5 hash of the kernel source. */ - string kernel_md5; - - thread_mutex cache_lock; - thread_mutex kernel_md5_lock; - - /* lazy instantiate */ - static OpenCLCache &global_instance(); - - public: - enum ProgramName { - OCL_DEV_BASE_PROGRAM, - OCL_DEV_MEGAKERNEL_PROGRAM, - }; - - /* Lookup context in the cache. 
If this returns NULL, slot_locker - * will be holding a lock for the cache. slot_locker should refer to a - * default constructed thread_scoped_lock. */ - static cl_context get_context(cl_platform_id platform, - cl_device_id device, - thread_scoped_lock &slot_locker); - /* Same as above. */ - static cl_program get_program(cl_platform_id platform, - cl_device_id device, - ustring key, - thread_scoped_lock &slot_locker); - - /* Store context in the cache. You MUST have tried to get the item before storing to it. */ - static void store_context(cl_platform_id platform, - cl_device_id device, - cl_context context, - thread_scoped_lock &slot_locker); - /* Same as above. */ - static void store_program(cl_platform_id platform, - cl_device_id device, - cl_program program, - ustring key, - thread_scoped_lock &slot_locker); - - static string get_kernel_md5(); -}; - -# define opencl_device_assert(device, stmt) \ - { \ - cl_int err = stmt; \ -\ - if (err != CL_SUCCESS) { \ - string message = string_printf( \ - "OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ - if ((device)->error_message() == "") { \ - (device)->set_error(message); \ - } \ - fprintf(stderr, "%s\n", message.c_str()); \ - } \ - } \ - (void)0 - -# define opencl_assert(stmt) \ - { \ - cl_int err = stmt; \ -\ - if (err != CL_SUCCESS) { \ - string message = string_printf( \ - "OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ - if (error_msg == "") { \ - error_msg = message; \ - } \ - fprintf(stderr, "%s\n", message.c_str()); \ - } \ - } \ - (void)0 - -class OpenCLDevice : public Device { - public: - DedicatedTaskPool task_pool; - - /* Task pool for required kernels (base, AO kernels during foreground rendering) */ - TaskPool load_required_kernel_task_pool; - /* Task pool for optional kernels (feature kernels during foreground rendering) */ - TaskPool load_kernel_task_pool; - std::atomic<int> load_kernel_num_compiling; - - cl_context cxContext; - 
cl_command_queue cqCommandQueue; - cl_platform_id cpPlatform; - cl_device_id cdDevice; - cl_int ciErr; - int device_num; - - class OpenCLProgram { - public: - OpenCLProgram() : loaded(false), needs_compiling(true), program(NULL), device(NULL) - { - } - OpenCLProgram(OpenCLDevice *device, - const string &program_name, - const string &kernel_name, - const string &kernel_build_options, - bool use_stdout = true); - ~OpenCLProgram(); - - void add_kernel(ustring name); - - /* Try to load the program from device cache or disk */ - bool load(); - /* Compile the kernel (first separate, fail-back to local). */ - void compile(); - /* Create the OpenCL kernels after loading or compiling */ - void create_kernels(); - - bool is_loaded() const - { - return loaded; - } - const string &get_log() const - { - return log; - } - void report_error(); - - /* Wait until this kernel is available to be used - * It will return true when the kernel is available. - * It will return false when the kernel is not available - * or could not be loaded. */ - bool wait_for_availability(); - - cl_kernel operator()(); - cl_kernel operator()(ustring name); - - void release(); - - private: - bool build_kernel(const string *debug_src); - /* Build the program by calling the own process. - * This is required for multithreaded OpenCL compilation, since most Frameworks serialize - * build calls internally if they come from the same process. - * If that is not supported, this function just returns false. - */ - bool compile_separate(const string &clbin); - /* Build the program by calling OpenCL directly. */ - bool compile_kernel(const string *debug_src); - /* Loading and saving the program from/to disk. 
*/ - bool load_binary(const string &clbin, const string *debug_src = NULL); - bool save_binary(const string &clbin); - - void add_log(const string &msg, bool is_debug); - void add_error(const string &msg); - - bool loaded; - bool needs_compiling; - - cl_program program; - OpenCLDevice *device; - - /* Used for the OpenCLCache key. */ - string program_name; - - string kernel_file, kernel_build_options, device_md5; - - bool use_stdout; - string log, error_msg; - string compile_output; - - map<ustring, cl_kernel> kernels; - }; - - /* Container for all types of split programs. */ - class OpenCLSplitPrograms { - public: - OpenCLDevice *device; - OpenCLProgram program_split; - OpenCLProgram program_lamp_emission; - OpenCLProgram program_do_volume; - OpenCLProgram program_indirect_background; - OpenCLProgram program_shader_eval; - OpenCLProgram program_holdout_emission_blurring_pathtermination_ao; - OpenCLProgram program_subsurface_scatter; - OpenCLProgram program_direct_lighting; - OpenCLProgram program_shadow_blocked_ao; - OpenCLProgram program_shadow_blocked_dl; - - OpenCLSplitPrograms(OpenCLDevice *device); - ~OpenCLSplitPrograms(); - - /* Load the kernels and put the created kernels in the given - * `programs` parameter. 
*/ - void load_kernels(vector<OpenCLProgram *> &programs, - const DeviceRequestedFeatures &requested_features); - }; - - DeviceSplitKernel *split_kernel; - - OpenCLProgram base_program; - OpenCLProgram bake_program; - OpenCLProgram displace_program; - OpenCLProgram background_program; - OpenCLProgram denoising_program; - - OpenCLSplitPrograms kernel_programs; - - typedef map<string, device_vector<uchar> *> ConstMemMap; - typedef map<string, device_ptr> MemMap; - - ConstMemMap const_mem_map; - MemMap mem_map; - - bool device_initialized; - string platform_name; - string device_name; - - bool opencl_error(cl_int err); - void opencl_error(const string &message); - void opencl_assert_err(cl_int err, const char *where); - - OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); - ~OpenCLDevice(); - - static void CL_CALLBACK context_notify_callback(const char *err_info, - const void * /*private_info*/, - size_t /*cb*/, - void *user_data); - - bool opencl_version_check(); - OpenCLSplitPrograms *get_split_programs(); - - string device_md5_hash(string kernel_custom_build_options = ""); - bool load_kernels(const DeviceRequestedFeatures &requested_features); - void load_required_kernels(const DeviceRequestedFeatures &requested_features); - - bool wait_for_availability(const DeviceRequestedFeatures &requested_features); - DeviceKernelStatus get_active_kernel_switch_state(); - - /* Get the name of the opencl program for the given kernel */ - const string get_opencl_program_name(const string &kernel_name); - /* Get the program file name to compile (*.cl) for the given kernel */ - const string get_opencl_program_filename(const string &kernel_name); - string get_build_options(const DeviceRequestedFeatures &requested_features, - const string &opencl_program_name); - /* Enable the default features to reduce recompilation events */ - void enable_default_features(DeviceRequestedFeatures &features); - - void mem_alloc(device_memory &mem); - void 
mem_copy_to(device_memory &mem); - void mem_copy_from(device_memory &mem, int y, int w, int h, int elem); - void mem_zero(device_memory &mem); - void mem_free(device_memory &mem); - - int mem_sub_ptr_alignment(); - - void const_copy_to(const char *name, void *host, size_t size); - void global_alloc(device_memory &mem); - void global_free(device_memory &mem); - void tex_alloc(device_texture &mem); - void tex_free(device_texture &mem); - - size_t global_size_round_up(int group_size, int global_size); - void enqueue_kernel(cl_kernel kernel, - size_t w, - size_t h, - bool x_workgroups = false, - size_t max_workgroup_size = -1); - void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name); - void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg); - - void film_convert(DeviceTask &task, - device_ptr buffer, - device_ptr rgba_byte, - device_ptr rgba_half); - void shader(DeviceTask &task); - void update_adaptive(DeviceTask &task, RenderTile &tile, int sample); - void bake(DeviceTask &task, RenderTile &tile); - - void denoise(RenderTile &tile, DenoisingTask &denoising); - - int get_split_task_count(DeviceTask & /*task*/) - { - return 1; - } - - void task_add(DeviceTask &task) - { - task_pool.push([=] { - DeviceTask task_copy = task; - thread_run(task_copy); - }); - } - - void task_wait() - { - task_pool.wait(); - } - - void task_cancel() - { - task_pool.cancel(); - } - - void thread_run(DeviceTask &task); - - virtual BVHLayoutMask get_bvh_layout_mask() const - { - return BVH_LAYOUT_BVH2; - } - - virtual bool show_samples() const - { - return true; - } - - protected: - string kernel_build_options(const string *debug_src = NULL); - - void mem_zero_kernel(device_ptr ptr, size_t size); - - bool denoising_non_local_means(device_ptr image_ptr, - device_ptr guide_ptr, - device_ptr variance_ptr, - device_ptr out_ptr, - DenoisingTask *task); - bool denoising_construct_transform(DenoisingTask *task); - bool denoising_accumulate(device_ptr color_ptr, - device_ptr 
color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task); - bool denoising_solve(device_ptr output_ptr, DenoisingTask *task); - bool denoising_combine_halves(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, - int4 rect, - DenoisingTask *task); - bool denoising_divide_shadow(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, - DenoisingTask *task); - bool denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task); - bool denoising_write_feature(int to_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task); - bool denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task); - - device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int size); - void mem_free_sub_ptr(device_ptr ptr); - - class ArgumentWrapper { - public: - ArgumentWrapper() : size(0), pointer(NULL) - { - } - - ArgumentWrapper(device_memory &argument) - : size(sizeof(void *)), pointer((void *)(&argument.device_pointer)) - { - } - - template<typename T> - ArgumentWrapper(device_vector<T> &argument) - : size(sizeof(void *)), pointer((void *)(&argument.device_pointer)) - { - } - - template<typename T> - ArgumentWrapper(device_only_memory<T> &argument) - : size(sizeof(void *)), pointer((void *)(&argument.device_pointer)) - { - } - template<typename T> ArgumentWrapper(T &argument) : size(sizeof(argument)), pointer(&argument) - { - } - - ArgumentWrapper(int argument) : size(sizeof(int)), int_value(argument), pointer(&int_value) - { - } - - ArgumentWrapper(float argument) - : size(sizeof(float)), float_value(argument), pointer(&float_value) - { - } - - size_t size; - int int_value; - float float_value; - void *pointer; - }; - - /* TODO(sergey): In the 
future we can use variadic templates, once - * C++0x is allowed. Should allow to clean this up a bit. - */ - int kernel_set_args(cl_kernel kernel, - int start_argument_index, - const ArgumentWrapper &arg1 = ArgumentWrapper(), - const ArgumentWrapper &arg2 = ArgumentWrapper(), - const ArgumentWrapper &arg3 = ArgumentWrapper(), - const ArgumentWrapper &arg4 = ArgumentWrapper(), - const ArgumentWrapper &arg5 = ArgumentWrapper(), - const ArgumentWrapper &arg6 = ArgumentWrapper(), - const ArgumentWrapper &arg7 = ArgumentWrapper(), - const ArgumentWrapper &arg8 = ArgumentWrapper(), - const ArgumentWrapper &arg9 = ArgumentWrapper(), - const ArgumentWrapper &arg10 = ArgumentWrapper(), - const ArgumentWrapper &arg11 = ArgumentWrapper(), - const ArgumentWrapper &arg12 = ArgumentWrapper(), - const ArgumentWrapper &arg13 = ArgumentWrapper(), - const ArgumentWrapper &arg14 = ArgumentWrapper(), - const ArgumentWrapper &arg15 = ArgumentWrapper(), - const ArgumentWrapper &arg16 = ArgumentWrapper(), - const ArgumentWrapper &arg17 = ArgumentWrapper(), - const ArgumentWrapper &arg18 = ArgumentWrapper(), - const ArgumentWrapper &arg19 = ArgumentWrapper(), - const ArgumentWrapper &arg20 = ArgumentWrapper(), - const ArgumentWrapper &arg21 = ArgumentWrapper(), - const ArgumentWrapper &arg22 = ArgumentWrapper(), - const ArgumentWrapper &arg23 = ArgumentWrapper(), - const ArgumentWrapper &arg24 = ArgumentWrapper(), - const ArgumentWrapper &arg25 = ArgumentWrapper(), - const ArgumentWrapper &arg26 = ArgumentWrapper(), - const ArgumentWrapper &arg27 = ArgumentWrapper(), - const ArgumentWrapper &arg28 = ArgumentWrapper(), - const ArgumentWrapper &arg29 = ArgumentWrapper(), - const ArgumentWrapper &arg30 = ArgumentWrapper(), - const ArgumentWrapper &arg31 = ArgumentWrapper(), - const ArgumentWrapper &arg32 = ArgumentWrapper(), - const ArgumentWrapper &arg33 = ArgumentWrapper()); - - void release_kernel_safe(cl_kernel kernel); - void release_mem_object_safe(cl_mem mem); - void 
release_program_safe(cl_program program); - - /* ** Those guys are for working around some compiler-specific bugs ** */ - - cl_program load_cached_kernel(ustring key, thread_scoped_lock &cache_locker); - - void store_cached_kernel(cl_program program, ustring key, thread_scoped_lock &cache_locker); - - private: - MemoryManager memory_manager; - friend class MemoryManager; - - static_assert_align(TextureInfo, 16); - device_vector<TextureInfo> texture_info; - - typedef map<string, device_memory *> TexturesMap; - TexturesMap textures; - - bool textures_need_update; - - protected: - void flush_texture_buffers(); - - friend class OpenCLSplitKernel; - friend class OpenCLSplitKernelFunction; -}; - -Device *opencl_create_split_device(DeviceInfo &info, - Stats &stats, - Profiler &profiler, - bool background); - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/opencl/device_opencl_impl.cpp b/intern/cycles/device/opencl/device_opencl_impl.cpp deleted file mode 100644 index 31a2265700c..00000000000 --- a/intern/cycles/device/opencl/device_opencl_impl.cpp +++ /dev/null @@ -1,2113 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef WITH_OPENCL - -# include "device/opencl/device_opencl.h" - -# include "kernel/kernel_types.h" -# include "kernel/split/kernel_split_data_types.h" - -# include "util/util_algorithm.h" -# include "util/util_debug.h" -# include "util/util_foreach.h" -# include "util/util_logging.h" -# include "util/util_md5.h" -# include "util/util_path.h" -# include "util/util_time.h" - -CCL_NAMESPACE_BEGIN - -struct texture_slot_t { - texture_slot_t(const string &name, int slot) : name(name), slot(slot) - { - } - string name; - int slot; -}; - -static const string NON_SPLIT_KERNELS = - "denoising " - "base " - "background " - "displace "; - -static const string SPLIT_BUNDLE_KERNELS = - "data_init " - "path_init " - "state_buffer_size " - "scene_intersect " - "queue_enqueue " - "shader_setup " - "shader_sort " - "enqueue_inactive " - "next_iteration_setup " - "indirect_subsurface " - "buffer_update " - "adaptive_stopping " - "adaptive_filter_x " - "adaptive_filter_y " - "adaptive_adjust_samples"; - -const string OpenCLDevice::get_opencl_program_name(const string &kernel_name) -{ - if (NON_SPLIT_KERNELS.find(kernel_name) != std::string::npos) { - return kernel_name; - } - else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) { - return "split_bundle"; - } - else { - return "split_" + kernel_name; - } -} - -const string OpenCLDevice::get_opencl_program_filename(const string &kernel_name) -{ - if (kernel_name == "denoising") { - return "filter.cl"; - } - else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) { - return "kernel_split_bundle.cl"; - } - else { - return "kernel_" + kernel_name + ".cl"; - } -} - -/* Enable features that we always want to compile to reduce recompilation events */ -void OpenCLDevice::enable_default_features(DeviceRequestedFeatures &features) -{ - features.use_transparent = true; - features.use_shadow_tricks = true; - features.use_principled = true; - features.use_denoising = true; - - if (!background) { - 
features.max_nodes_group = NODE_GROUP_LEVEL_MAX; - features.nodes_features = NODE_FEATURE_ALL; - features.use_hair = true; - features.use_subsurface = true; - features.use_camera_motion = false; - features.use_object_motion = false; - } -} - -string OpenCLDevice::get_build_options(const DeviceRequestedFeatures &requested_features, - const string &opencl_program_name) -{ - /* first check for non-split kernel programs */ - if (opencl_program_name == "base" || opencl_program_name == "denoising") { - return ""; - } - else if (opencl_program_name == "bake") { - /* Note: get_build_options for bake is only requested when baking is enabled. - * displace and background are always requested. - * `__SPLIT_KERNEL__` must not be present in the compile directives for bake */ - DeviceRequestedFeatures features(requested_features); - enable_default_features(features); - features.use_denoising = false; - features.use_object_motion = false; - features.use_camera_motion = false; - features.use_hair = true; - features.use_subsurface = true; - features.max_nodes_group = NODE_GROUP_LEVEL_MAX; - features.nodes_features = NODE_FEATURE_ALL; - features.use_integrator_branched = false; - return features.get_build_options(); - } - else if (opencl_program_name == "displace") { - /* As displacement does not use any nodes from the Shading group (eg BSDF). - * We disable all features that are related to shading. 
*/ - DeviceRequestedFeatures features(requested_features); - enable_default_features(features); - features.use_denoising = false; - features.use_object_motion = false; - features.use_camera_motion = false; - features.use_baking = false; - features.use_transparent = false; - features.use_shadow_tricks = false; - features.use_subsurface = false; - features.use_volume = false; - features.nodes_features &= ~NODE_FEATURE_VOLUME; - features.use_denoising = false; - features.use_principled = false; - features.use_integrator_branched = false; - return features.get_build_options(); - } - else if (opencl_program_name == "background") { - /* Background uses Background shading - * It is save to disable shadow features, subsurface and volumetric. */ - DeviceRequestedFeatures features(requested_features); - enable_default_features(features); - features.use_baking = false; - features.use_object_motion = false; - features.use_camera_motion = false; - features.use_transparent = false; - features.use_shadow_tricks = false; - features.use_denoising = false; - /* NOTE: currently possible to use surface nodes like `Hair Info`, `Bump` node. - * Perhaps we should remove them in UI as it does not make any sense when - * rendering background. */ - features.nodes_features &= ~NODE_FEATURE_VOLUME; - features.use_subsurface = false; - features.use_volume = false; - features.use_shader_raytrace = false; - features.use_patch_evaluation = false; - features.use_integrator_branched = false; - return features.get_build_options(); - } - - string build_options = "-D__SPLIT_KERNEL__ "; - /* Set compute device build option. 
*/ - cl_device_type device_type; - OpenCLInfo::get_device_type(this->cdDevice, &device_type, &this->ciErr); - assert(this->ciErr == CL_SUCCESS); - if (device_type == CL_DEVICE_TYPE_GPU) { - build_options += "-D__COMPUTE_DEVICE_GPU__ "; - } - - DeviceRequestedFeatures nofeatures; - enable_default_features(nofeatures); - - /* Add program specific optimized compile directives */ - if (opencl_program_name == "split_do_volume" && !requested_features.use_volume) { - build_options += nofeatures.get_build_options(); - } - else { - DeviceRequestedFeatures features(requested_features); - enable_default_features(features); - - /* Always turn off baking at this point. Baking is only useful when building the bake kernel. - * this also makes sure that the kernels that are build during baking can be reused - * when not doing any baking. */ - features.use_baking = false; - - /* Do not vary on shaders when program doesn't do any shading. - * We have bundled them in a single program. */ - if (opencl_program_name == "split_bundle") { - features.max_nodes_group = 0; - features.nodes_features = 0; - features.use_shader_raytrace = false; - } - - /* No specific settings, just add the regular ones */ - build_options += features.get_build_options(); - } - - return build_options; -} - -OpenCLDevice::OpenCLSplitPrograms::OpenCLSplitPrograms(OpenCLDevice *device_) -{ - device = device_; -} - -OpenCLDevice::OpenCLSplitPrograms::~OpenCLSplitPrograms() -{ - program_split.release(); - program_lamp_emission.release(); - program_do_volume.release(); - program_indirect_background.release(); - program_shader_eval.release(); - program_holdout_emission_blurring_pathtermination_ao.release(); - program_subsurface_scatter.release(); - program_direct_lighting.release(); - program_shadow_blocked_ao.release(); - program_shadow_blocked_dl.release(); -} - -void OpenCLDevice::OpenCLSplitPrograms::load_kernels( - vector<OpenCLProgram *> &programs, const DeviceRequestedFeatures &requested_features) -{ - if 
(!requested_features.use_baking) { -# define ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(kernel_name) \ - program_split.add_kernel(ustring("path_trace_" #kernel_name)); -# define ADD_SPLIT_KERNEL_PROGRAM(kernel_name) \ - const string program_name_##kernel_name = "split_" #kernel_name; \ - program_##kernel_name = OpenCLDevice::OpenCLProgram( \ - device, \ - program_name_##kernel_name, \ - "kernel_" #kernel_name ".cl", \ - device->get_build_options(requested_features, program_name_##kernel_name)); \ - program_##kernel_name.add_kernel(ustring("path_trace_" #kernel_name)); \ - programs.push_back(&program_##kernel_name); - - /* Ordered with most complex kernels first, to reduce overall compile time. */ - ADD_SPLIT_KERNEL_PROGRAM(subsurface_scatter); - ADD_SPLIT_KERNEL_PROGRAM(direct_lighting); - ADD_SPLIT_KERNEL_PROGRAM(indirect_background); - if (requested_features.use_volume) { - ADD_SPLIT_KERNEL_PROGRAM(do_volume); - } - ADD_SPLIT_KERNEL_PROGRAM(shader_eval); - ADD_SPLIT_KERNEL_PROGRAM(lamp_emission); - ADD_SPLIT_KERNEL_PROGRAM(holdout_emission_blurring_pathtermination_ao); - ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_dl); - ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_ao); - - /* Quick kernels bundled in a single program to reduce overhead of starting - * Blender processes. 
*/ - program_split = OpenCLDevice::OpenCLProgram( - device, - "split_bundle", - "kernel_split_bundle.cl", - device->get_build_options(requested_features, "split_bundle")); - - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(data_init); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(state_buffer_size); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(path_init); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(scene_intersect); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(queue_enqueue); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_setup); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_sort); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(enqueue_inactive); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(next_iteration_setup); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(indirect_subsurface); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(buffer_update); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_stopping); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_x); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_y); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_adjust_samples); - programs.push_back(&program_split); - -# undef ADD_SPLIT_KERNEL_PROGRAM -# undef ADD_SPLIT_KERNEL_BUNDLE_PROGRAM - } -} - -namespace { - -/* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to - * fetch its size. 
- */ -typedef struct KernelGlobalsDummy { - ccl_constant KernelData *data; - ccl_global char *buffers[8]; - -# define KERNEL_TEX(type, name) TextureInfo name; -# include "kernel/kernel_textures.h" -# undef KERNEL_TEX - SplitData split_data; - SplitParams split_param_data; -} KernelGlobalsDummy; - -} // namespace - -struct CachedSplitMemory { - int id; - device_memory *split_data; - device_memory *ray_state; - device_memory *queue_index; - device_memory *use_queues_flag; - device_memory *work_pools; - device_ptr *buffer; -}; - -class OpenCLSplitKernelFunction : public SplitKernelFunction { - public: - OpenCLDevice *device; - OpenCLDevice::OpenCLProgram program; - CachedSplitMemory &cached_memory; - int cached_id; - - OpenCLSplitKernelFunction(OpenCLDevice *device, CachedSplitMemory &cached_memory) - : device(device), cached_memory(cached_memory), cached_id(cached_memory.id - 1) - { - } - - ~OpenCLSplitKernelFunction() - { - program.release(); - } - - virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data) - { - if (cached_id != cached_memory.id) { - cl_uint start_arg_index = device->kernel_set_args( - program(), 0, kg, data, *cached_memory.split_data, *cached_memory.ray_state); - - device->set_kernel_arg_buffers(program(), &start_arg_index); - - start_arg_index += device->kernel_set_args(program(), - start_arg_index, - *cached_memory.queue_index, - *cached_memory.use_queues_flag, - *cached_memory.work_pools, - *cached_memory.buffer); - - cached_id = cached_memory.id; - } - - device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, - program(), - 2, - NULL, - dim.global_size, - dim.local_size, - 0, - NULL, - NULL); - - device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); - - if (device->ciErr != CL_SUCCESS) { - string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", - clewErrorString(device->ciErr)); - device->opencl_error(message); - return false; - } - - return true; - } -}; - -class 
OpenCLSplitKernel : public DeviceSplitKernel { - OpenCLDevice *device; - CachedSplitMemory cached_memory; - - public: - explicit OpenCLSplitKernel(OpenCLDevice *device) : DeviceSplitKernel(device), device(device) - { - } - - virtual SplitKernelFunction *get_split_kernel_function( - const string &kernel_name, const DeviceRequestedFeatures &requested_features) - { - OpenCLSplitKernelFunction *kernel = new OpenCLSplitKernelFunction(device, cached_memory); - - const string program_name = device->get_opencl_program_name(kernel_name); - kernel->program = OpenCLDevice::OpenCLProgram( - device, - program_name, - device->get_opencl_program_filename(kernel_name), - device->get_build_options(requested_features, program_name)); - - kernel->program.add_kernel(ustring("path_trace_" + kernel_name)); - kernel->program.load(); - - if (!kernel->program.is_loaded()) { - delete kernel; - return NULL; - } - - return kernel; - } - - virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads) - { - device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE); - size_buffer.alloc(1); - size_buffer.zero_to_device(); - - uint threads = num_threads; - OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs(); - cl_kernel kernel_state_buffer_size = programs->program_split( - ustring("path_trace_state_buffer_size")); - device->kernel_set_args(kernel_state_buffer_size, 0, kg, data, threads, size_buffer); - - size_t global_size = 64; - device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, - kernel_state_buffer_size, - 1, - NULL, - &global_size, - NULL, - 0, - NULL, - NULL); - - device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); - - size_buffer.copy_from_device(0, 1, 1); - size_t size = size_buffer[0]; - size_buffer.free(); - - if (device->ciErr != CL_SUCCESS) { - string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", - clewErrorString(device->ciErr)); - 
device->opencl_error(message); - return 0; - } - - return size; - } - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, - RenderTile &rtile, - int num_global_elements, - device_memory &kernel_globals, - device_memory &kernel_data, - device_memory &split_data, - device_memory &ray_state, - device_memory &queue_index, - device_memory &use_queues_flag, - device_memory &work_pool_wgs) - { - cl_int dQueue_size = dim.global_size[0] * dim.global_size[1]; - - /* Set the range of samples to be processed for every ray in - * path-regeneration logic. - */ - cl_int start_sample = rtile.start_sample; - cl_int end_sample = rtile.start_sample + rtile.num_samples; - - OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs(); - cl_kernel kernel_data_init = programs->program_split(ustring("path_trace_data_init")); - - cl_uint start_arg_index = device->kernel_set_args(kernel_data_init, - 0, - kernel_globals, - kernel_data, - split_data, - num_global_elements, - ray_state); - - device->set_kernel_arg_buffers(kernel_data_init, &start_arg_index); - - start_arg_index += device->kernel_set_args(kernel_data_init, - start_arg_index, - start_sample, - end_sample, - rtile.x, - rtile.y, - rtile.w, - rtile.h, - rtile.offset, - rtile.stride, - queue_index, - dQueue_size, - use_queues_flag, - work_pool_wgs, - rtile.num_samples, - rtile.buffer); - - /* Enqueue ckPathTraceKernel_data_init kernel. 
*/ - device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, - kernel_data_init, - 2, - NULL, - dim.global_size, - dim.local_size, - 0, - NULL, - NULL); - - device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); - - if (device->ciErr != CL_SUCCESS) { - string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", - clewErrorString(device->ciErr)); - device->opencl_error(message); - return false; - } - - cached_memory.split_data = &split_data; - cached_memory.ray_state = &ray_state; - cached_memory.queue_index = &queue_index; - cached_memory.use_queues_flag = &use_queues_flag; - cached_memory.work_pools = &work_pool_wgs; - cached_memory.buffer = &rtile.buffer; - cached_memory.id++; - - return true; - } - - virtual int2 split_kernel_local_size() - { - return make_int2(64, 1); - } - - virtual int2 split_kernel_global_size(device_memory &kg, - device_memory &data, - DeviceTask & /*task*/) - { - cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice); - /* Use small global size on CPU devices as it seems to be much faster. */ - if (type == CL_DEVICE_TYPE_CPU) { - VLOG(1) << "Global size: (64, 64)."; - return make_int2(64, 64); - } - - cl_ulong max_buffer_size; - clGetDeviceInfo( - device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); - - if (DebugFlags().opencl.mem_limit) { - max_buffer_size = min(max_buffer_size, - cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used)); - } - - VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(max_buffer_size) - << " bytes. (" << string_human_readable_size(max_buffer_size) << ")."; - - /* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. 
*/ - max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l * 1024 * 1024 * 1024); - - size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size); - int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64), - (int)sqrt(num_elements)); - - if (device->info.description.find("Intel") != string::npos) { - global_size = make_int2(min(512, global_size.x), min(512, global_size.y)); - } - - VLOG(1) << "Global size: " << global_size << "."; - return global_size; - } -}; - -bool OpenCLDevice::opencl_error(cl_int err) -{ - if (err != CL_SUCCESS) { - string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err)); - if (error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); - return true; - } - - return false; -} - -void OpenCLDevice::opencl_error(const string &message) -{ - if (error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); -} - -void OpenCLDevice::opencl_assert_err(cl_int err, const char *where) -{ - if (err != CL_SUCCESS) { - string message = string_printf( - "OpenCL error (%d): %s in %s", err, clewErrorString(err), where); - if (error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); -# ifndef NDEBUG - abort(); -# endif - } -} - -OpenCLDevice::OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) - : Device(info, stats, profiler, background), - load_kernel_num_compiling(0), - kernel_programs(this), - memory_manager(this), - texture_info(this, "__texture_info", MEM_GLOBAL) -{ - cpPlatform = NULL; - cdDevice = NULL; - cxContext = NULL; - cqCommandQueue = NULL; - device_initialized = false; - textures_need_update = true; - - vector<OpenCLPlatformDevice> usable_devices; - OpenCLInfo::get_usable_devices(&usable_devices); - if (usable_devices.size() == 0) { - opencl_error("OpenCL: no devices found."); - return; - } - assert(info.num < usable_devices.size()); - OpenCLPlatformDevice 
&platform_device = usable_devices[info.num]; - device_num = info.num; - cpPlatform = platform_device.platform_id; - cdDevice = platform_device.device_id; - platform_name = platform_device.platform_name; - device_name = platform_device.device_name; - VLOG(2) << "Creating new Cycles device for OpenCL platform " << platform_name << ", device " - << device_name << "."; - - { - /* try to use cached context */ - thread_scoped_lock cache_locker; - cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker); - - if (cxContext == NULL) { - /* create context properties array to specify platform */ - const cl_context_properties context_props[] = { - CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 0, 0}; - - /* create context */ - cxContext = clCreateContext( - context_props, 1, &cdDevice, context_notify_callback, cdDevice, &ciErr); - - if (opencl_error(ciErr)) { - opencl_error("OpenCL: clCreateContext failed"); - return; - } - - /* cache it */ - OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker); - } - } - - cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr); - if (opencl_error(ciErr)) { - opencl_error("OpenCL: Error creating command queue"); - return; - } - - /* Allocate this right away so that texture_info - * is placed at offset 0 in the device memory buffers. 
*/ - texture_info.resize(1); - memory_manager.alloc("texture_info", texture_info); - - device_initialized = true; - - split_kernel = new OpenCLSplitKernel(this); -} - -OpenCLDevice::~OpenCLDevice() -{ - task_pool.cancel(); - load_required_kernel_task_pool.cancel(); - load_kernel_task_pool.cancel(); - - memory_manager.free(); - - ConstMemMap::iterator mt; - for (mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) { - delete mt->second; - } - - base_program.release(); - bake_program.release(); - displace_program.release(); - background_program.release(); - denoising_program.release(); - - if (cqCommandQueue) - clReleaseCommandQueue(cqCommandQueue); - if (cxContext) - clReleaseContext(cxContext); - - delete split_kernel; -} - -void CL_CALLBACK OpenCLDevice::context_notify_callback(const char *err_info, - const void * /*private_info*/, - size_t /*cb*/, - void *user_data) -{ - string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data); - fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info); -} - -bool OpenCLDevice::opencl_version_check() -{ - string error; - if (!OpenCLInfo::platform_version_check(cpPlatform, &error)) { - opencl_error(error); - return false; - } - if (!OpenCLInfo::device_version_check(cdDevice, &error)) { - opencl_error(error); - return false; - } - return true; -} - -string OpenCLDevice::device_md5_hash(string kernel_custom_build_options) -{ - MD5Hash md5; - char version[256], driver[256], name[256], vendor[256]; - - clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL); - clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL); - clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL); - clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL); - - md5.append((uint8_t *)vendor, strlen(vendor)); - md5.append((uint8_t *)version, strlen(version)); - md5.append((uint8_t *)name, strlen(name)); - md5.append((uint8_t *)driver, 
strlen(driver)); - - string options = kernel_build_options(); - options += kernel_custom_build_options; - md5.append((uint8_t *)options.c_str(), options.size()); - - return md5.get_hex(); -} - -bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures &requested_features) -{ - VLOG(2) << "Loading kernels for platform " << platform_name << ", device " << device_name << "."; - /* Verify if device was initialized. */ - if (!device_initialized) { - fprintf(stderr, "OpenCL: failed to initialize device.\n"); - return false; - } - - /* Verify we have right opencl version. */ - if (!opencl_version_check()) - return false; - - load_required_kernels(requested_features); - - vector<OpenCLProgram *> programs; - kernel_programs.load_kernels(programs, requested_features); - - if (!requested_features.use_baking && requested_features.use_denoising) { - denoising_program = OpenCLProgram( - this, "denoising", "filter.cl", get_build_options(requested_features, "denoising")); - denoising_program.add_kernel(ustring("filter_divide_shadow")); - denoising_program.add_kernel(ustring("filter_get_feature")); - denoising_program.add_kernel(ustring("filter_write_feature")); - denoising_program.add_kernel(ustring("filter_detect_outliers")); - denoising_program.add_kernel(ustring("filter_combine_halves")); - denoising_program.add_kernel(ustring("filter_construct_transform")); - denoising_program.add_kernel(ustring("filter_nlm_calc_difference")); - denoising_program.add_kernel(ustring("filter_nlm_blur")); - denoising_program.add_kernel(ustring("filter_nlm_calc_weight")); - denoising_program.add_kernel(ustring("filter_nlm_update_output")); - denoising_program.add_kernel(ustring("filter_nlm_normalize")); - denoising_program.add_kernel(ustring("filter_nlm_construct_gramian")); - denoising_program.add_kernel(ustring("filter_finalize")); - programs.push_back(&denoising_program); - } - - load_required_kernel_task_pool.wait_work(); - - /* Parallel compilation of Cycles kernels, this launches 
multiple - * processes to workaround OpenCL frameworks serializing the calls - * internally within a single process. */ - foreach (OpenCLProgram *program, programs) { - if (!program->load()) { - load_kernel_num_compiling++; - load_kernel_task_pool.push([=] { - program->compile(); - load_kernel_num_compiling--; - }); - } - } - return true; -} - -void OpenCLDevice::load_required_kernels(const DeviceRequestedFeatures &requested_features) -{ - vector<OpenCLProgram *> programs; - base_program = OpenCLProgram( - this, "base", "kernel_base.cl", get_build_options(requested_features, "base")); - base_program.add_kernel(ustring("convert_to_byte")); - base_program.add_kernel(ustring("convert_to_half_float")); - base_program.add_kernel(ustring("zero_buffer")); - programs.push_back(&base_program); - - if (requested_features.use_true_displacement) { - displace_program = OpenCLProgram( - this, "displace", "kernel_displace.cl", get_build_options(requested_features, "displace")); - displace_program.add_kernel(ustring("displace")); - programs.push_back(&displace_program); - } - - if (requested_features.use_background_light) { - background_program = OpenCLProgram(this, - "background", - "kernel_background.cl", - get_build_options(requested_features, "background")); - background_program.add_kernel(ustring("background")); - programs.push_back(&background_program); - } - - if (requested_features.use_baking) { - bake_program = OpenCLProgram( - this, "bake", "kernel_bake.cl", get_build_options(requested_features, "bake")); - bake_program.add_kernel(ustring("bake")); - programs.push_back(&bake_program); - } - - foreach (OpenCLProgram *program, programs) { - if (!program->load()) { - load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program)); - } - } -} - -bool OpenCLDevice::wait_for_availability(const DeviceRequestedFeatures &requested_features) -{ - if (requested_features.use_baking) { - /* For baking, kernels have already been loaded in load_required_kernels(). 
*/ - return true; - } - - load_kernel_task_pool.wait_work(); - return split_kernel->load_kernels(requested_features); -} - -OpenCLDevice::OpenCLSplitPrograms *OpenCLDevice::get_split_programs() -{ - return &kernel_programs; -} - -DeviceKernelStatus OpenCLDevice::get_active_kernel_switch_state() -{ - return DEVICE_KERNEL_USING_FEATURE_KERNEL; -} - -void OpenCLDevice::mem_alloc(device_memory &mem) -{ - if (mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - } - - size_t size = mem.memory_size(); - - /* check there is enough memory available for the allocation */ - cl_ulong max_alloc_size = 0; - clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL); - - if (DebugFlags().opencl.mem_limit) { - max_alloc_size = min(max_alloc_size, cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used)); - } - - if (size > max_alloc_size) { - string error = "Scene too complex to fit in available memory."; - if (mem.name != NULL) { - error += string_printf(" (allocating buffer %s failed.)", mem.name); - } - set_error(error); - - return; - } - - cl_mem_flags mem_flag; - void *mem_ptr = NULL; - - if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) - mem_flag = CL_MEM_READ_ONLY; - else - mem_flag = CL_MEM_READ_WRITE; - - /* Zero-size allocation might be invoked by render, but not really - * supported by OpenCL. Using NULL as device pointer also doesn't really - * work for some reason, so for the time being we'll use special case - * will null_mem buffer. 
- */ - if (size != 0) { - mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, mem_flag, size, mem_ptr, &ciErr); - opencl_assert_err(ciErr, "clCreateBuffer"); - } - else { - mem.device_pointer = 0; - } - - stats.mem_alloc(size); - mem.device_size = size; -} - -void OpenCLDevice::mem_copy_to(device_memory &mem) -{ - if (mem.type == MEM_GLOBAL) { - global_free(mem); - global_alloc(mem); - } - else if (mem.type == MEM_TEXTURE) { - tex_free((device_texture &)mem); - tex_alloc((device_texture &)mem); - } - else { - if (!mem.device_pointer) { - mem_alloc(mem); - } - - /* this is blocking */ - size_t size = mem.memory_size(); - if (size != 0) { - opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, - CL_MEM_PTR(mem.device_pointer), - CL_TRUE, - 0, - size, - mem.host_pointer, - 0, - NULL, - NULL)); - } - } -} - -void OpenCLDevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem) -{ - size_t offset = elem * y * w; - size_t size = elem * w * h; - assert(size != 0); - opencl_assert(clEnqueueReadBuffer(cqCommandQueue, - CL_MEM_PTR(mem.device_pointer), - CL_TRUE, - offset, - size, - (uchar *)mem.host_pointer + offset, - 0, - NULL, - NULL)); -} - -void OpenCLDevice::mem_zero_kernel(device_ptr mem, size_t size) -{ - base_program.wait_for_availability(); - cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); - - size_t global_size[] = {1024, 1024}; - size_t num_threads = global_size[0] * global_size[1]; - - cl_mem d_buffer = CL_MEM_PTR(mem); - cl_ulong d_offset = 0; - cl_ulong d_size = 0; - - while (d_offset < size) { - d_size = std::min<cl_ulong>(num_threads * sizeof(float4), size - d_offset); - - kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); - - ciErr = clEnqueueNDRangeKernel( - cqCommandQueue, ckZeroBuffer, 2, NULL, global_size, NULL, 0, NULL, NULL); - opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); - - d_offset += d_size; - } -} - -void OpenCLDevice::mem_zero(device_memory &mem) -{ - if (!mem.device_pointer) { - mem_alloc(mem); 
- } - - if (mem.device_pointer) { - if (base_program.is_loaded()) { - mem_zero_kernel(mem.device_pointer, mem.memory_size()); - } - - if (mem.host_pointer) { - memset(mem.host_pointer, 0, mem.memory_size()); - } - - if (!base_program.is_loaded()) { - void *zero = mem.host_pointer; - - if (!mem.host_pointer) { - zero = util_aligned_malloc(mem.memory_size(), 16); - memset(zero, 0, mem.memory_size()); - } - - opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, - CL_MEM_PTR(mem.device_pointer), - CL_TRUE, - 0, - mem.memory_size(), - zero, - 0, - NULL, - NULL)); - - if (!mem.host_pointer) { - util_aligned_free(zero); - } - } - } -} - -void OpenCLDevice::mem_free(device_memory &mem) -{ - if (mem.type == MEM_GLOBAL) { - global_free(mem); - } - else if (mem.type == MEM_TEXTURE) { - tex_free((device_texture &)mem); - } - else { - if (mem.device_pointer) { - if (mem.device_pointer != 0) { - opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer))); - } - mem.device_pointer = 0; - - stats.mem_free(mem.device_size); - mem.device_size = 0; - } - } -} - -int OpenCLDevice::mem_sub_ptr_alignment() -{ - return OpenCLInfo::mem_sub_ptr_alignment(cdDevice); -} - -device_ptr OpenCLDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int size) -{ - cl_mem_flags mem_flag; - if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) - mem_flag = CL_MEM_READ_ONLY; - else - mem_flag = CL_MEM_READ_WRITE; - - cl_buffer_region info; - info.origin = mem.memory_elements_size(offset); - info.size = mem.memory_elements_size(size); - - device_ptr sub_buf = (device_ptr)clCreateSubBuffer( - CL_MEM_PTR(mem.device_pointer), mem_flag, CL_BUFFER_CREATE_TYPE_REGION, &info, &ciErr); - opencl_assert_err(ciErr, "clCreateSubBuffer"); - return sub_buf; -} - -void OpenCLDevice::mem_free_sub_ptr(device_ptr device_pointer) -{ - if (device_pointer != 0) { - opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer))); - } -} - -void OpenCLDevice::const_copy_to(const char 
*name, void *host, size_t size) -{ - ConstMemMap::iterator i = const_mem_map.find(name); - device_vector<uchar> *data; - - if (i == const_mem_map.end()) { - data = new device_vector<uchar>(this, name, MEM_READ_ONLY); - data->alloc(size); - const_mem_map.insert(ConstMemMap::value_type(name, data)); - } - else { - data = i->second; - } - - memcpy(data->data(), host, size); - data->copy_to_device(); -} - -void OpenCLDevice::global_alloc(device_memory &mem) -{ - VLOG(1) << "Global memory allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - memory_manager.alloc(mem.name, mem); - /* Set the pointer to non-null to keep code that inspects its value from thinking its - * unallocated. */ - mem.device_pointer = 1; - textures[mem.name] = &mem; - textures_need_update = true; -} - -void OpenCLDevice::global_free(device_memory &mem) -{ - if (mem.device_pointer) { - mem.device_pointer = 0; - - if (memory_manager.free(mem)) { - textures_need_update = true; - } - - foreach (TexturesMap::value_type &value, textures) { - if (value.second == &mem) { - textures.erase(value.first); - break; - } - } - } -} - -void OpenCLDevice::tex_alloc(device_texture &mem) -{ - VLOG(1) << "Texture allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - memory_manager.alloc(mem.name, mem); - /* Set the pointer to non-null to keep code that inspects its value from thinking its - * unallocated. */ - mem.device_pointer = 1; - textures[mem.name] = &mem; - textures_need_update = true; -} - -void OpenCLDevice::tex_free(device_texture &mem) -{ - global_free(mem); -} - -size_t OpenCLDevice::global_size_round_up(int group_size, int global_size) -{ - int r = global_size % group_size; - return global_size + ((r == 0) ? 
0 : group_size - r); -} - -void OpenCLDevice::enqueue_kernel( - cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size) -{ - size_t workgroup_size, max_work_items[3]; - - clGetKernelWorkGroupInfo( - kernel, cdDevice, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL); - clGetDeviceInfo( - cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, max_work_items, NULL); - - if (max_workgroup_size > 0 && workgroup_size > max_workgroup_size) { - workgroup_size = max_workgroup_size; - } - - /* Try to divide evenly over 2 dimensions. */ - size_t local_size[2]; - if (x_workgroups) { - local_size[0] = workgroup_size; - local_size[1] = 1; - } - else { - size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); - local_size[0] = local_size[1] = sqrt_workgroup_size; - } - - /* Some implementations have max size 1 on 2nd dimension. */ - if (local_size[1] > max_work_items[1]) { - local_size[0] = workgroup_size / max_work_items[1]; - local_size[1] = max_work_items[1]; - } - - size_t global_size[2] = {global_size_round_up(local_size[0], w), - global_size_round_up(local_size[1], h)}; - - /* Vertical size of 1 is coming from bake/shade kernels where we should - * not round anything up because otherwise we'll either be doing too - * much work per pixel (if we don't check global ID on Y axis) or will - * be checking for global ID to always have Y of 0. 
- */ - if (h == 1) { - global_size[h] = 1; - } - - /* run kernel */ - opencl_assert( - clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL)); - opencl_assert(clFlush(cqCommandQueue)); -} - -void OpenCLDevice::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name) -{ - cl_mem ptr; - - MemMap::iterator i = mem_map.find(name); - if (i != mem_map.end()) { - ptr = CL_MEM_PTR(i->second); - } - else { - ptr = 0; - } - - opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void *)&ptr)); -} - -void OpenCLDevice::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg) -{ - flush_texture_buffers(); - - memory_manager.set_kernel_arg_buffers(kernel, narg); -} - -void OpenCLDevice::flush_texture_buffers() -{ - if (!textures_need_update) { - return; - } - textures_need_update = false; - - /* Setup slots for textures. */ - int num_slots = 0; - - vector<texture_slot_t> texture_slots; - -# define KERNEL_TEX(type, name) \ - if (textures.find(#name) != textures.end()) { \ - texture_slots.push_back(texture_slot_t(#name, num_slots)); \ - } \ - num_slots++; -# include "kernel/kernel_textures.h" - - int num_data_slots = num_slots; - - foreach (TexturesMap::value_type &tex, textures) { - string name = tex.first; - device_memory *mem = tex.second; - - if (mem->type == MEM_TEXTURE) { - const uint id = ((device_texture *)mem)->slot; - texture_slots.push_back(texture_slot_t(name, num_data_slots + id)); - num_slots = max(num_slots, num_data_slots + id + 1); - } - } - - /* Realloc texture descriptors buffer. 
*/ - memory_manager.free(texture_info); - texture_info.resize(num_slots); - memory_manager.alloc("texture_info", texture_info); - - /* Fill in descriptors */ - foreach (texture_slot_t &slot, texture_slots) { - device_memory *mem = textures[slot.name]; - TextureInfo &info = texture_info[slot.slot]; - - MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name); - - if (mem->type == MEM_TEXTURE) { - info = ((device_texture *)mem)->info; - } - else { - memset(&info, 0, sizeof(TextureInfo)); - } - - info.data = desc.offset; - info.cl_buffer = desc.device_buffer; - } - - /* Force write of descriptors. */ - memory_manager.free(texture_info); - memory_manager.alloc("texture_info", texture_info); -} - -void OpenCLDevice::thread_run(DeviceTask &task) -{ - flush_texture_buffers(); - - if (task.type == DeviceTask::RENDER) { - RenderTile tile; - DenoisingTask denoising(this, task); - - /* Allocate buffer for kernel globals */ - device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals"); - kgbuffer.alloc_to_device(1); - - /* Keep rendering tiles until done. */ - while (task.acquire_tile(this, tile, task.tile_types)) { - if (tile.task == RenderTile::PATH_TRACE) { - assert(tile.task == RenderTile::PATH_TRACE); - scoped_timer timer(&tile.buffers->render_time); - - split_kernel->path_trace(task, tile, kgbuffer, *const_mem_map["__data"]); - - /* Complete kernel execution before release tile. */ - /* This helps in multi-device render; - * The device that reaches the critical-section function - * release_tile waits (stalling other devices from entering - * release_tile) for all kernels to complete. If device1 (a - * slow-render device) reaches release_tile first then it would - * stall device2 (a fast-render device) from proceeding to render - * next tile. 
- */ - clFinish(cqCommandQueue); - } - else if (tile.task == RenderTile::BAKE) { - bake(task, tile); - } - else if (tile.task == RenderTile::DENOISE) { - tile.sample = tile.start_sample + tile.num_samples; - denoise(tile, denoising); - task.update_progress(&tile, tile.w * tile.h); - } - - task.release_tile(tile); - } - - kgbuffer.free(); - } - else if (task.type == DeviceTask::SHADER) { - shader(task); - } - else if (task.type == DeviceTask::FILM_CONVERT) { - film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); - } - else if (task.type == DeviceTask::DENOISE_BUFFER) { - RenderTile tile; - tile.x = task.x; - tile.y = task.y; - tile.w = task.w; - tile.h = task.h; - tile.buffer = task.buffer; - tile.sample = task.sample + task.num_samples; - tile.num_samples = task.num_samples; - tile.start_sample = task.sample; - tile.offset = task.offset; - tile.stride = task.stride; - tile.buffers = task.buffers; - - DenoisingTask denoising(this, task); - denoise(tile, denoising); - task.update_progress(&tile, tile.w * tile.h); - } -} - -void OpenCLDevice::film_convert(DeviceTask &task, - device_ptr buffer, - device_ptr rgba_byte, - device_ptr rgba_half) -{ - /* cast arguments to cl types */ - cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); - cl_mem d_rgba = (rgba_byte) ? CL_MEM_PTR(rgba_byte) : CL_MEM_PTR(rgba_half); - cl_mem d_buffer = CL_MEM_PTR(buffer); - cl_int d_x = task.x; - cl_int d_y = task.y; - cl_int d_w = task.w; - cl_int d_h = task.h; - cl_float d_sample_scale = 1.0f / (task.sample + 1); - cl_int d_offset = task.offset; - cl_int d_stride = task.stride; - - cl_kernel ckFilmConvertKernel = (rgba_byte) ? 
base_program(ustring("convert_to_byte")) : - base_program(ustring("convert_to_half_float")); - - cl_uint start_arg_index = kernel_set_args(ckFilmConvertKernel, 0, d_data, d_rgba, d_buffer); - - set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index); - - start_arg_index += kernel_set_args(ckFilmConvertKernel, - start_arg_index, - d_sample_scale, - d_x, - d_y, - d_w, - d_h, - d_offset, - d_stride); - - enqueue_kernel(ckFilmConvertKernel, d_w, d_h); -} - -bool OpenCLDevice::denoising_non_local_means(device_ptr image_ptr, - device_ptr guide_ptr, - device_ptr variance_ptr, - device_ptr out_ptr, - DenoisingTask *task) -{ - int stride = task->buffer.stride; - int w = task->buffer.width; - int h = task->buffer.h; - int r = task->nlm_state.r; - int f = task->nlm_state.f; - float a = task->nlm_state.a; - float k_2 = task->nlm_state.k_2; - - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2 * r + 1) * (2 * r + 1); - int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0; - - device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts); - device_sub_ptr blurDifference( - task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts); - device_sub_ptr weightAccum( - task->buffer.temporary_mem, 2 * pass_stride * num_shifts, pass_stride); - cl_mem weightAccum_mem = CL_MEM_PTR(*weightAccum); - cl_mem difference_mem = CL_MEM_PTR(*difference); - cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); - - cl_mem image_mem = CL_MEM_PTR(image_ptr); - cl_mem guide_mem = CL_MEM_PTR(guide_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - cl_mem out_mem = CL_MEM_PTR(out_ptr); - cl_mem scale_mem = NULL; - - mem_zero_kernel(*weightAccum, sizeof(float) * pass_stride); - mem_zero_kernel(out_ptr, sizeof(float) * pass_stride); - - cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); - cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); - cl_kernel 
ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); - cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output")); - cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize")); - - kernel_set_args(ckNLMCalcDifference, - 0, - guide_mem, - variance_mem, - scale_mem, - difference_mem, - w, - h, - stride, - pass_stride, - r, - channel_offset, - 0, - a, - k_2); - kernel_set_args( - ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, f); - kernel_set_args( - ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, f); - kernel_set_args(ckNLMUpdateOutput, - 0, - blurDifference_mem, - image_mem, - out_mem, - weightAccum_mem, - w, - h, - stride, - pass_stride, - channel_offset, - r, - f); - - enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w * h, num_shifts, true); - enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w * h, num_shifts, true); - enqueue_kernel(ckNLMUpdateOutput, w * h, num_shifts, true); - - kernel_set_args(ckNLMNormalize, 0, out_mem, weightAccum_mem, w, h, stride); - enqueue_kernel(ckNLMNormalize, w, h); - - return true; -} - -bool OpenCLDevice::denoising_construct_transform(DenoisingTask *task) -{ - cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); - cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); - cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); - cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); - - char use_time = task->buffer.use_time ? 
1 : 0; - - cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform")); - - int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0, buffer_mem, tile_info_mem); - cl_mem buffers[9]; - for (int i = 0; i < 9; i++) { - buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); - arg_ofs += kernel_set_args(ckFilterConstructTransform, arg_ofs, buffers[i]); - } - kernel_set_args(ckFilterConstructTransform, - arg_ofs, - transform_mem, - rank_mem, - task->filter_area, - task->rect, - task->buffer.pass_stride, - task->buffer.frame_stride, - use_time, - task->radius, - task->pca_threshold); - - enqueue_kernel(ckFilterConstructTransform, task->storage.w, task->storage.h, 256); - - return true; -} - -bool OpenCLDevice::denoising_accumulate(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task) -{ - cl_mem color_mem = CL_MEM_PTR(color_ptr); - cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr); - cl_mem scale_mem = CL_MEM_PTR(scale_ptr); - - cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); - cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); - cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); - cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); - cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); - - cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); - cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); - cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); - cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian")); - - int w = task->reconstruction_state.source_w; - int h = task->reconstruction_state.source_h; - int stride = task->buffer.stride; - int frame_offset = frame * task->buffer.frame_stride; - int t = task->tile_info->frames[frame]; - char use_time = 
task->buffer.use_time ? 1 : 0; - - int r = task->radius; - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2 * r + 1) * (2 * r + 1); - - device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts); - device_sub_ptr blurDifference( - task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts); - cl_mem difference_mem = CL_MEM_PTR(*difference); - cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); - - kernel_set_args(ckNLMCalcDifference, - 0, - color_mem, - color_variance_mem, - scale_mem, - difference_mem, - w, - h, - stride, - pass_stride, - r, - pass_stride, - frame_offset, - 1.0f, - task->nlm_k_2); - kernel_set_args( - ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, 4); - kernel_set_args( - ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, 4); - kernel_set_args(ckNLMConstructGramian, - 0, - t, - blurDifference_mem, - buffer_mem, - transform_mem, - rank_mem, - XtWX_mem, - XtWY_mem, - task->reconstruction_state.filter_window, - w, - h, - stride, - pass_stride, - r, - 4, - frame_offset, - use_time); - - enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w * h, num_shifts, true); - enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w * h, num_shifts, true); - enqueue_kernel(ckNLMConstructGramian, w * h, num_shifts, true, 256); - - return true; -} - -bool OpenCLDevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task) -{ - cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); - - cl_mem output_mem = CL_MEM_PTR(output_ptr); - cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); - cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); - cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); - - int w = task->reconstruction_state.source_w; - int h = task->reconstruction_state.source_h; - - 
kernel_set_args(ckFinalize, - 0, - output_mem, - rank_mem, - XtWX_mem, - XtWY_mem, - task->filter_area, - task->reconstruction_state.buffer_params, - task->render_buffer.samples); - enqueue_kernel(ckFinalize, w, h); - - return true; -} - -bool OpenCLDevice::denoising_combine_halves(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, - int4 rect, - DenoisingTask *task) -{ - cl_mem a_mem = CL_MEM_PTR(a_ptr); - cl_mem b_mem = CL_MEM_PTR(b_ptr); - cl_mem mean_mem = CL_MEM_PTR(mean_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - - cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves")); - - kernel_set_args(ckFilterCombineHalves, 0, mean_mem, variance_mem, a_mem, b_mem, rect, r); - enqueue_kernel(ckFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - return true; -} - -bool OpenCLDevice::denoising_divide_shadow(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, - DenoisingTask *task) -{ - cl_mem a_mem = CL_MEM_PTR(a_ptr); - cl_mem b_mem = CL_MEM_PTR(b_ptr); - cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr); - cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr); - cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr); - - cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); - - cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow")); - - int arg_ofs = kernel_set_args( - ckFilterDivideShadow, 0, task->render_buffer.samples, tile_info_mem); - cl_mem buffers[9]; - for (int i = 0; i < 9; i++) { - buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); - arg_ofs += kernel_set_args(ckFilterDivideShadow, arg_ofs, buffers[i]); - } - kernel_set_args(ckFilterDivideShadow, - arg_ofs, - a_mem, - b_mem, - sample_variance_mem, - sv_variance_mem, - buffer_variance_mem, - task->rect, - task->render_buffer.pass_stride, - 
task->render_buffer.offset); - enqueue_kernel(ckFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - return true; -} - -bool OpenCLDevice::denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task) -{ - cl_mem mean_mem = CL_MEM_PTR(mean_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - - cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); - - cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature")); - - int arg_ofs = kernel_set_args(ckFilterGetFeature, 0, task->render_buffer.samples, tile_info_mem); - cl_mem buffers[9]; - for (int i = 0; i < 9; i++) { - buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); - arg_ofs += kernel_set_args(ckFilterGetFeature, arg_ofs, buffers[i]); - } - kernel_set_args(ckFilterGetFeature, - arg_ofs, - mean_offset, - variance_offset, - mean_mem, - variance_mem, - scale, - task->rect, - task->render_buffer.pass_stride, - task->render_buffer.offset); - enqueue_kernel(ckFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - return true; -} - -bool OpenCLDevice::denoising_write_feature(int out_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task) -{ - cl_mem from_mem = CL_MEM_PTR(from_ptr); - cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr); - - cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature")); - - kernel_set_args(ckFilterWriteFeature, - 0, - task->render_buffer.samples, - task->reconstruction_state.buffer_params, - task->filter_area, - from_mem, - buffer_mem, - out_offset, - task->rect); - enqueue_kernel(ckFilterWriteFeature, task->filter_area.z, task->filter_area.w); - - return true; -} - -bool OpenCLDevice::denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task) -{ - cl_mem image_mem = 
CL_MEM_PTR(image_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - cl_mem depth_mem = CL_MEM_PTR(depth_ptr); - cl_mem output_mem = CL_MEM_PTR(output_ptr); - - cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers")); - - kernel_set_args(ckFilterDetectOutliers, - 0, - image_mem, - variance_mem, - depth_mem, - output_mem, - task->rect, - task->buffer.pass_stride); - enqueue_kernel(ckFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - return true; -} - -void OpenCLDevice::denoise(RenderTile &rtile, DenoisingTask &denoising) -{ - denoising.functions.construct_transform = function_bind( - &OpenCLDevice::denoising_construct_transform, this, &denoising); - denoising.functions.accumulate = function_bind( - &OpenCLDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); - denoising.functions.solve = function_bind(&OpenCLDevice::denoising_solve, this, _1, &denoising); - denoising.functions.divide_shadow = function_bind( - &OpenCLDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.non_local_means = function_bind( - &OpenCLDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); - denoising.functions.combine_halves = function_bind( - &OpenCLDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - denoising.functions.get_feature = function_bind( - &OpenCLDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.write_feature = function_bind( - &OpenCLDevice::denoising_write_feature, this, _1, _2, _3, &denoising); - denoising.functions.detect_outliers = function_bind( - &OpenCLDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); - - denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); - denoising.render_buffer.samples = rtile.sample; - denoising.buffer.gpu_temporary_mem = true; - - denoising.run_denoising(rtile); -} - -void OpenCLDevice::shader(DeviceTask 
&task) -{ - /* cast arguments to cl types */ - cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); - cl_mem d_input = CL_MEM_PTR(task.shader_input); - cl_mem d_output = CL_MEM_PTR(task.shader_output); - cl_int d_shader_eval_type = task.shader_eval_type; - cl_int d_shader_filter = task.shader_filter; - cl_int d_shader_x = task.shader_x; - cl_int d_shader_w = task.shader_w; - cl_int d_offset = task.offset; - - OpenCLDevice::OpenCLProgram *program = &background_program; - if (task.shader_eval_type == SHADER_EVAL_DISPLACE) { - program = &displace_program; - } - program->wait_for_availability(); - cl_kernel kernel = (*program)(); - - cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_input, d_output); - - set_kernel_arg_buffers(kernel, &start_arg_index); - - start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_eval_type); - if (task.shader_eval_type >= SHADER_EVAL_BAKE) { - start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_filter); - } - start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_x, d_shader_w, d_offset); - - for (int sample = 0; sample < task.num_samples; sample++) { - - if (task.get_cancel()) - break; - - kernel_set_args(kernel, start_arg_index, sample); - - enqueue_kernel(kernel, task.shader_w, 1); - - clFinish(cqCommandQueue); - - task.update_progress(NULL); - } -} - -void OpenCLDevice::bake(DeviceTask &task, RenderTile &rtile) -{ - scoped_timer timer(&rtile.buffers->render_time); - - /* Cast arguments to cl types. 
*/ - cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); - cl_mem d_buffer = CL_MEM_PTR(rtile.buffer); - cl_int d_x = rtile.x; - cl_int d_y = rtile.y; - cl_int d_w = rtile.w; - cl_int d_h = rtile.h; - cl_int d_offset = rtile.offset; - cl_int d_stride = rtile.stride; - - bake_program.wait_for_availability(); - cl_kernel kernel = bake_program(); - - cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_buffer); - - set_kernel_arg_buffers(kernel, &start_arg_index); - - start_arg_index += kernel_set_args( - kernel, start_arg_index, d_x, d_y, d_w, d_h, d_offset, d_stride); - - int start_sample = rtile.start_sample; - int end_sample = rtile.start_sample + rtile.num_samples; - - for (int sample = start_sample; sample < end_sample; sample++) { - if (task.get_cancel()) { - if (task.need_finish_queue == false) - break; - } - - kernel_set_args(kernel, start_arg_index, sample); - - enqueue_kernel(kernel, d_w, d_h); - clFinish(cqCommandQueue); - - rtile.sample = sample + 1; - - task.update_progress(&rtile, rtile.w * rtile.h); - } -} - -static bool kernel_build_opencl_2(cl_device_id cdDevice) -{ - /* Build with OpenCL 2.0 if available, this improves performance - * with AMD OpenCL drivers on Windows and Linux (legacy drivers). - * Note that OpenCL selects the highest 1.x version by default, - * only for 2.0 do we need the explicit compiler flag. */ - int version_major, version_minor; - if (OpenCLInfo::get_device_version(cdDevice, &version_major, &version_minor)) { - if (version_major >= 2) { - /* This appears to trigger a driver bug in Radeon RX cards with certain - * driver version, so don't use OpenCL 2.0 for those. 
*/ - string device_name = OpenCLInfo::get_readable_device_name(cdDevice); - if (string_startswith(device_name, "Radeon RX 4") || - string_startswith(device_name, "Radeon (TM) RX 4") || - string_startswith(device_name, "Radeon RX 5") || - string_startswith(device_name, "Radeon (TM) RX 5")) { - char version[256] = ""; - int driver_major, driver_minor; - clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL); - if (sscanf(version, "OpenCL 2.0 AMD-APP (%d.%d)", &driver_major, &driver_minor) == 2) { - return !(driver_major == 3075 && driver_minor <= 12); - } - } - - return true; - } - } - - return false; -} - -string OpenCLDevice::kernel_build_options(const string *debug_src) -{ - string build_options = "-cl-no-signed-zeros -cl-mad-enable "; - - if (kernel_build_opencl_2(cdDevice)) { - build_options += "-cl-std=CL2.0 "; - } - - if (platform_name == "NVIDIA CUDA") { - build_options += - "-D__KERNEL_OPENCL_NVIDIA__ " - "-cl-nv-maxrregcount=32 " - "-cl-nv-verbose "; - - uint compute_capability_major, compute_capability_minor; - clGetDeviceInfo(cdDevice, - CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, - sizeof(cl_uint), - &compute_capability_major, - NULL); - clGetDeviceInfo(cdDevice, - CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, - sizeof(cl_uint), - &compute_capability_minor, - NULL); - - build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ", - compute_capability_major * 100 + compute_capability_minor * 10); - } - - else if (platform_name == "Apple") - build_options += "-D__KERNEL_OPENCL_APPLE__ "; - - else if (platform_name == "AMD Accelerated Parallel Processing") - build_options += "-D__KERNEL_OPENCL_AMD__ "; - - else if (platform_name == "Intel(R) OpenCL") { - build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ "; - - /* Options for gdb source level kernel debugging. - * this segfaults on linux currently. 
- */ - if (OpenCLInfo::use_debug() && debug_src) - build_options += "-g -s \"" + *debug_src + "\" "; - } - - if (info.has_half_images) { - build_options += "-D__KERNEL_CL_KHR_FP16__ "; - } - - if (OpenCLInfo::use_debug()) { - build_options += "-D__KERNEL_OPENCL_DEBUG__ "; - } - -# ifdef WITH_NANOVDB - if (info.has_nanovdb) { - build_options += "-DWITH_NANOVDB "; - } -# endif - - return build_options; -} - -/* TODO(sergey): In the future we can use variadic templates, once - * C++0x is allowed. Should allow to clean this up a bit. - */ -int OpenCLDevice::kernel_set_args(cl_kernel kernel, - int start_argument_index, - const ArgumentWrapper &arg1, - const ArgumentWrapper &arg2, - const ArgumentWrapper &arg3, - const ArgumentWrapper &arg4, - const ArgumentWrapper &arg5, - const ArgumentWrapper &arg6, - const ArgumentWrapper &arg7, - const ArgumentWrapper &arg8, - const ArgumentWrapper &arg9, - const ArgumentWrapper &arg10, - const ArgumentWrapper &arg11, - const ArgumentWrapper &arg12, - const ArgumentWrapper &arg13, - const ArgumentWrapper &arg14, - const ArgumentWrapper &arg15, - const ArgumentWrapper &arg16, - const ArgumentWrapper &arg17, - const ArgumentWrapper &arg18, - const ArgumentWrapper &arg19, - const ArgumentWrapper &arg20, - const ArgumentWrapper &arg21, - const ArgumentWrapper &arg22, - const ArgumentWrapper &arg23, - const ArgumentWrapper &arg24, - const ArgumentWrapper &arg25, - const ArgumentWrapper &arg26, - const ArgumentWrapper &arg27, - const ArgumentWrapper &arg28, - const ArgumentWrapper &arg29, - const ArgumentWrapper &arg30, - const ArgumentWrapper &arg31, - const ArgumentWrapper &arg32, - const ArgumentWrapper &arg33) -{ - int current_arg_index = 0; -# define FAKE_VARARG_HANDLE_ARG(arg) \ - do { \ - if (arg.pointer != NULL) { \ - opencl_assert(clSetKernelArg( \ - kernel, start_argument_index + current_arg_index, arg.size, arg.pointer)); \ - ++current_arg_index; \ - } \ - else { \ - return current_arg_index; \ - } \ - } while (false) - 
FAKE_VARARG_HANDLE_ARG(arg1); - FAKE_VARARG_HANDLE_ARG(arg2); - FAKE_VARARG_HANDLE_ARG(arg3); - FAKE_VARARG_HANDLE_ARG(arg4); - FAKE_VARARG_HANDLE_ARG(arg5); - FAKE_VARARG_HANDLE_ARG(arg6); - FAKE_VARARG_HANDLE_ARG(arg7); - FAKE_VARARG_HANDLE_ARG(arg8); - FAKE_VARARG_HANDLE_ARG(arg9); - FAKE_VARARG_HANDLE_ARG(arg10); - FAKE_VARARG_HANDLE_ARG(arg11); - FAKE_VARARG_HANDLE_ARG(arg12); - FAKE_VARARG_HANDLE_ARG(arg13); - FAKE_VARARG_HANDLE_ARG(arg14); - FAKE_VARARG_HANDLE_ARG(arg15); - FAKE_VARARG_HANDLE_ARG(arg16); - FAKE_VARARG_HANDLE_ARG(arg17); - FAKE_VARARG_HANDLE_ARG(arg18); - FAKE_VARARG_HANDLE_ARG(arg19); - FAKE_VARARG_HANDLE_ARG(arg20); - FAKE_VARARG_HANDLE_ARG(arg21); - FAKE_VARARG_HANDLE_ARG(arg22); - FAKE_VARARG_HANDLE_ARG(arg23); - FAKE_VARARG_HANDLE_ARG(arg24); - FAKE_VARARG_HANDLE_ARG(arg25); - FAKE_VARARG_HANDLE_ARG(arg26); - FAKE_VARARG_HANDLE_ARG(arg27); - FAKE_VARARG_HANDLE_ARG(arg28); - FAKE_VARARG_HANDLE_ARG(arg29); - FAKE_VARARG_HANDLE_ARG(arg30); - FAKE_VARARG_HANDLE_ARG(arg31); - FAKE_VARARG_HANDLE_ARG(arg32); - FAKE_VARARG_HANDLE_ARG(arg33); -# undef FAKE_VARARG_HANDLE_ARG - return current_arg_index; -} - -void OpenCLDevice::release_kernel_safe(cl_kernel kernel) -{ - if (kernel) { - clReleaseKernel(kernel); - } -} - -void OpenCLDevice::release_mem_object_safe(cl_mem mem) -{ - if (mem != NULL) { - clReleaseMemObject(mem); - } -} - -void OpenCLDevice::release_program_safe(cl_program program) -{ - if (program) { - clReleaseProgram(program); - } -} - -/* ** Those guys are for working around some compiler-specific bugs ** */ - -cl_program OpenCLDevice::load_cached_kernel(ustring key, thread_scoped_lock &cache_locker) -{ - return OpenCLCache::get_program(cpPlatform, cdDevice, key, cache_locker); -} - -void OpenCLDevice::store_cached_kernel(cl_program program, - ustring key, - thread_scoped_lock &cache_locker) -{ - OpenCLCache::store_program(cpPlatform, cdDevice, program, key, cache_locker); -} - -Device *opencl_create_split_device(DeviceInfo &info, - 
Stats &stats, - Profiler &profiler, - bool background) -{ - return new OpenCLDevice(info, stats, profiler, background); -} - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/opencl/memory_manager.cpp b/intern/cycles/device/opencl/memory_manager.cpp deleted file mode 100644 index 4330e07cb37..00000000000 --- a/intern/cycles/device/opencl/memory_manager.cpp +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifdef WITH_OPENCL - -# include "util/util_foreach.h" - -# include "device/opencl/device_opencl.h" -# include "device/opencl/memory_manager.h" - -CCL_NAMESPACE_BEGIN - -void MemoryManager::DeviceBuffer::add_allocation(Allocation &allocation) -{ - allocations.push_back(&allocation); -} - -void MemoryManager::DeviceBuffer::update_device_memory(OpenCLDevice *device) -{ - bool need_realloc = false; - - /* Calculate total size and remove any freed. */ - size_t total_size = 0; - - for (int i = allocations.size() - 1; i >= 0; i--) { - Allocation *allocation = allocations[i]; - - /* Remove allocations that have been freed. */ - if (!allocation->mem || allocation->mem->memory_size() == 0) { - allocation->device_buffer = NULL; - allocation->size = 0; - - allocations.erase(allocations.begin() + i); - - need_realloc = true; - - continue; - } - - /* Get actual size for allocation. 
*/ - size_t alloc_size = align_up(allocation->mem->memory_size(), 16); - - if (allocation->size != alloc_size) { - /* Allocation is either new or resized. */ - allocation->size = alloc_size; - allocation->needs_copy_to_device = true; - - need_realloc = true; - } - - total_size += alloc_size; - } - - /* Always allocate non-empty buffer, NULL pointers cause problems with some drivers. */ - total_size = std::max(total_size, (size_t)16); - - if (need_realloc) { - cl_ulong max_buffer_size; - clGetDeviceInfo( - device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); - - if (total_size > max_buffer_size) { - device->set_error("Scene too complex to fit in available memory."); - return; - } - - device_only_memory<uchar> *new_buffer = new device_only_memory<uchar>(device, - "memory manager buffer"); - - new_buffer->alloc_to_device(total_size); - - size_t offset = 0; - - foreach (Allocation *allocation, allocations) { - if (allocation->needs_copy_to_device) { - /* Copy from host to device. */ - opencl_device_assert(device, - clEnqueueWriteBuffer(device->cqCommandQueue, - CL_MEM_PTR(new_buffer->device_pointer), - CL_FALSE, - offset, - allocation->mem->memory_size(), - allocation->mem->host_pointer, - 0, - NULL, - NULL)); - - allocation->needs_copy_to_device = false; - } - else { - /* Fast copy from memory already on device. */ - opencl_device_assert(device, - clEnqueueCopyBuffer(device->cqCommandQueue, - CL_MEM_PTR(buffer->device_pointer), - CL_MEM_PTR(new_buffer->device_pointer), - allocation->desc.offset, - offset, - allocation->mem->memory_size(), - 0, - NULL, - NULL)); - } - - allocation->desc.offset = offset; - offset += allocation->size; - } - - delete buffer; - - buffer = new_buffer; - } - else { - assert(total_size == buffer->data_size); - - size_t offset = 0; - - foreach (Allocation *allocation, allocations) { - if (allocation->needs_copy_to_device) { - /* Copy from host to device. 
*/ - opencl_device_assert(device, - clEnqueueWriteBuffer(device->cqCommandQueue, - CL_MEM_PTR(buffer->device_pointer), - CL_FALSE, - offset, - allocation->mem->memory_size(), - allocation->mem->host_pointer, - 0, - NULL, - NULL)); - - allocation->needs_copy_to_device = false; - } - - offset += allocation->size; - } - } - - /* Not really necessary, but seems to improve responsiveness for some reason. */ - clFinish(device->cqCommandQueue); -} - -void MemoryManager::DeviceBuffer::free(OpenCLDevice *) -{ - buffer->free(); -} - -MemoryManager::DeviceBuffer *MemoryManager::smallest_device_buffer() -{ - DeviceBuffer *smallest = device_buffers; - - foreach (DeviceBuffer &device_buffer, device_buffers) { - if (device_buffer.size < smallest->size) { - smallest = &device_buffer; - } - } - - return smallest; -} - -MemoryManager::MemoryManager(OpenCLDevice *device) : device(device), need_update(false) -{ - foreach (DeviceBuffer &device_buffer, device_buffers) { - device_buffer.buffer = new device_only_memory<uchar>(device, "memory manager buffer"); - } -} - -void MemoryManager::free() -{ - foreach (DeviceBuffer &device_buffer, device_buffers) { - device_buffer.free(device); - } -} - -void MemoryManager::alloc(const char *name, device_memory &mem) -{ - Allocation &allocation = allocations[name]; - - allocation.mem = &mem; - allocation.needs_copy_to_device = true; - - if (!allocation.device_buffer) { - DeviceBuffer *device_buffer = smallest_device_buffer(); - allocation.device_buffer = device_buffer; - - allocation.desc.device_buffer = device_buffer - device_buffers; - - device_buffer->add_allocation(allocation); - - device_buffer->size += mem.memory_size(); - } - - need_update = true; -} - -bool MemoryManager::free(device_memory &mem) -{ - foreach (AllocationsMap::value_type &value, allocations) { - Allocation &allocation = value.second; - if (allocation.mem == &mem) { - - allocation.device_buffer->size -= mem.memory_size(); - - allocation.mem = NULL; - 
allocation.needs_copy_to_device = false; - - need_update = true; - return true; - } - } - - return false; -} - -MemoryManager::BufferDescriptor MemoryManager::get_descriptor(string name) -{ - update_device_memory(); - - Allocation &allocation = allocations[name]; - return allocation.desc; -} - -void MemoryManager::update_device_memory() -{ - if (!need_update) { - return; - } - - need_update = false; - - foreach (DeviceBuffer &device_buffer, device_buffers) { - device_buffer.update_device_memory(device); - } -} - -void MemoryManager::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg) -{ - update_device_memory(); - - foreach (DeviceBuffer &device_buffer, device_buffers) { - if (device_buffer.buffer->device_pointer) { - device->kernel_set_args(kernel, (*narg)++, *device_buffer.buffer); - } - else { - device->kernel_set_args(kernel, (*narg)++); - } - } -} - -CCL_NAMESPACE_END - -#endif /* WITH_OPENCL */ diff --git a/intern/cycles/device/opencl/memory_manager.h b/intern/cycles/device/opencl/memory_manager.h deleted file mode 100644 index 23624f837a6..00000000000 --- a/intern/cycles/device/opencl/memory_manager.h +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "device/device.h" - -#include "util/util_map.h" -#include "util/util_string.h" -#include "util/util_vector.h" - -#include "clew.h" - -CCL_NAMESPACE_BEGIN - -class OpenCLDevice; - -class MemoryManager { - public: - static const int NUM_DEVICE_BUFFERS = 8; - - struct BufferDescriptor { - uint device_buffer; - cl_ulong offset; - }; - - private: - struct DeviceBuffer; - - struct Allocation { - device_memory *mem; - - DeviceBuffer *device_buffer; - size_t size; /* Size of actual allocation, may be larger than requested. */ - - BufferDescriptor desc; - - bool needs_copy_to_device; - - Allocation() : mem(NULL), device_buffer(NULL), size(0), needs_copy_to_device(false) - { - } - }; - - struct DeviceBuffer { - device_only_memory<uchar> *buffer; - vector<Allocation *> allocations; - size_t size; /* Size of all allocations. */ - - DeviceBuffer() : buffer(NULL), size(0) - { - } - - ~DeviceBuffer() - { - delete buffer; - buffer = NULL; - } - - void add_allocation(Allocation &allocation); - - void update_device_memory(OpenCLDevice *device); - - void free(OpenCLDevice *device); - }; - - OpenCLDevice *device; - - DeviceBuffer device_buffers[NUM_DEVICE_BUFFERS]; - - typedef unordered_map<string, Allocation> AllocationsMap; - AllocationsMap allocations; - - bool need_update; - - DeviceBuffer *smallest_device_buffer(); - - public: - MemoryManager(OpenCLDevice *device); - - void free(); /* Free all memory. 
*/ - - void alloc(const char *name, device_memory &mem); - bool free(device_memory &mem); - - BufferDescriptor get_descriptor(string name); - - void update_device_memory(); - void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg); -}; - -CCL_NAMESPACE_END diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp deleted file mode 100644 index 3929cf77f15..00000000000 --- a/intern/cycles/device/opencl/opencl_util.cpp +++ /dev/null @@ -1,1326 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifdef WITH_OPENCL - -# include "device/device_intern.h" -# include "device/opencl/device_opencl.h" - -# include "util/util_debug.h" -# include "util/util_logging.h" -# include "util/util_md5.h" -# include "util/util_path.h" -# include "util/util_semaphore.h" -# include "util/util_system.h" -# include "util/util_time.h" - -using std::cerr; -using std::endl; - -CCL_NAMESPACE_BEGIN - -OpenCLCache::Slot::ProgramEntry::ProgramEntry() : program(NULL), mutex(NULL) -{ -} - -OpenCLCache::Slot::ProgramEntry::ProgramEntry(const ProgramEntry &rhs) - : program(rhs.program), mutex(NULL) -{ -} - -OpenCLCache::Slot::ProgramEntry::~ProgramEntry() -{ - delete mutex; -} - -OpenCLCache::Slot::Slot() : context_mutex(NULL), context(NULL) -{ -} - -OpenCLCache::Slot::Slot(const Slot &rhs) - : context_mutex(NULL), context(NULL), programs(rhs.programs) -{ -} - -OpenCLCache::Slot::~Slot() -{ - delete context_mutex; -} - -OpenCLCache &OpenCLCache::global_instance() -{ - static OpenCLCache instance; - return instance; -} - -cl_context OpenCLCache::get_context(cl_platform_id platform, - cl_device_id device, - thread_scoped_lock &slot_locker) -{ - assert(platform != NULL); - - OpenCLCache &self = global_instance(); - - thread_scoped_lock cache_lock(self.cache_lock); - - pair<CacheMap::iterator, bool> ins = self.cache.insert( - CacheMap::value_type(PlatformDevicePair(platform, device), Slot())); - - Slot &slot = ins.first->second; - - /* create slot lock only while holding cache lock */ - if (!slot.context_mutex) - slot.context_mutex = new thread_mutex; - - /* need to unlock cache before locking slot, to allow store to complete */ - cache_lock.unlock(); - - /* lock the slot */ - slot_locker = thread_scoped_lock(*slot.context_mutex); - - /* If the thing isn't cached */ - if (slot.context == NULL) { - /* return with the caller's lock holder holding the slot lock */ - return NULL; - } - - /* the item was already cached, release the slot lock */ - slot_locker.unlock(); - - cl_int ciErr = 
clRetainContext(slot.context); - assert(ciErr == CL_SUCCESS); - (void)ciErr; - - return slot.context; -} - -cl_program OpenCLCache::get_program(cl_platform_id platform, - cl_device_id device, - ustring key, - thread_scoped_lock &slot_locker) -{ - assert(platform != NULL); - - OpenCLCache &self = global_instance(); - - thread_scoped_lock cache_lock(self.cache_lock); - - pair<CacheMap::iterator, bool> ins = self.cache.insert( - CacheMap::value_type(PlatformDevicePair(platform, device), Slot())); - - Slot &slot = ins.first->second; - - pair<Slot::EntryMap::iterator, bool> ins2 = slot.programs.insert( - Slot::EntryMap::value_type(key, Slot::ProgramEntry())); - - Slot::ProgramEntry &entry = ins2.first->second; - - /* create slot lock only while holding cache lock */ - if (!entry.mutex) - entry.mutex = new thread_mutex; - - /* need to unlock cache before locking slot, to allow store to complete */ - cache_lock.unlock(); - - /* lock the slot */ - slot_locker = thread_scoped_lock(*entry.mutex); - - /* If the thing isn't cached */ - if (entry.program == NULL) { - /* return with the caller's lock holder holding the slot lock */ - return NULL; - } - - /* the item was already cached, release the slot lock */ - slot_locker.unlock(); - - cl_int ciErr = clRetainProgram(entry.program); - assert(ciErr == CL_SUCCESS); - (void)ciErr; - - return entry.program; -} - -void OpenCLCache::store_context(cl_platform_id platform, - cl_device_id device, - cl_context context, - thread_scoped_lock &slot_locker) -{ - assert(platform != NULL); - assert(device != NULL); - assert(context != NULL); - - OpenCLCache &self = global_instance(); - - thread_scoped_lock cache_lock(self.cache_lock); - CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device)); - cache_lock.unlock(); - - Slot &slot = i->second; - - /* sanity check */ - assert(i != self.cache.end()); - assert(slot.context == NULL); - - slot.context = context; - - /* unlock the slot */ - slot_locker.unlock(); - - /* increment 
reference count in OpenCL. - * The caller is going to release the object when done with it. */ - cl_int ciErr = clRetainContext(context); - assert(ciErr == CL_SUCCESS); - (void)ciErr; -} - -void OpenCLCache::store_program(cl_platform_id platform, - cl_device_id device, - cl_program program, - ustring key, - thread_scoped_lock &slot_locker) -{ - assert(platform != NULL); - assert(device != NULL); - assert(program != NULL); - - OpenCLCache &self = global_instance(); - - thread_scoped_lock cache_lock(self.cache_lock); - - CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device)); - assert(i != self.cache.end()); - Slot &slot = i->second; - - Slot::EntryMap::iterator i2 = slot.programs.find(key); - assert(i2 != slot.programs.end()); - Slot::ProgramEntry &entry = i2->second; - - assert(entry.program == NULL); - - cache_lock.unlock(); - - entry.program = program; - - /* unlock the slot */ - slot_locker.unlock(); - - /* Increment reference count in OpenCL. - * The caller is going to release the object when done with it. - */ - cl_int ciErr = clRetainProgram(program); - assert(ciErr == CL_SUCCESS); - (void)ciErr; -} - -string OpenCLCache::get_kernel_md5() -{ - OpenCLCache &self = global_instance(); - thread_scoped_lock lock(self.kernel_md5_lock); - - if (self.kernel_md5.empty()) { - self.kernel_md5 = path_files_md5_hash(path_get("source")); - } - return self.kernel_md5; -} - -static string get_program_source(const string &kernel_file) -{ - string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n"; - /* We compile kernels consisting of many files. unfortunately OpenCL - * kernel caches do not seem to recognize changes in included files. - * so we force recompile on changes by adding the md5 hash of all files. 
- */ - source = path_source_replace_includes(source, path_get("source")); - source += "\n// " + util_md5_string(source) + "\n"; - return source; -} - -OpenCLDevice::OpenCLProgram::OpenCLProgram(OpenCLDevice *device, - const string &program_name, - const string &kernel_file, - const string &kernel_build_options, - bool use_stdout) - : device(device), - program_name(program_name), - kernel_file(kernel_file), - kernel_build_options(kernel_build_options), - use_stdout(use_stdout) -{ - loaded = false; - needs_compiling = true; - program = NULL; -} - -OpenCLDevice::OpenCLProgram::~OpenCLProgram() -{ - release(); -} - -void OpenCLDevice::OpenCLProgram::release() -{ - for (map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end(); - ++kernel) { - if (kernel->second) { - clReleaseKernel(kernel->second); - kernel->second = NULL; - } - } - if (program) { - clReleaseProgram(program); - program = NULL; - } -} - -void OpenCLDevice::OpenCLProgram::add_log(const string &msg, bool debug) -{ - if (!use_stdout) { - log += msg + "\n"; - } - else if (!debug) { - printf("%s\n", msg.c_str()); - fflush(stdout); - } - else { - VLOG(2) << msg; - } -} - -void OpenCLDevice::OpenCLProgram::add_error(const string &msg) -{ - if (use_stdout) { - fprintf(stderr, "%s\n", msg.c_str()); - } - if (error_msg == "") { - error_msg += "\n"; - } - error_msg += msg; -} - -void OpenCLDevice::OpenCLProgram::add_kernel(ustring name) -{ - if (!kernels.count(name)) { - kernels[name] = NULL; - } -} - -bool OpenCLDevice::OpenCLProgram::build_kernel(const string *debug_src) -{ - string build_options; - build_options = device->kernel_build_options(debug_src) + kernel_build_options; - - VLOG(1) << "Build options passed to clBuildProgram: '" << build_options << "'."; - cl_int ciErr = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL); - - /* show warnings even if build is successful */ - size_t ret_val_size = 0; - - clGetProgramBuildInfo(program, device->cdDevice, 
CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); - - if (ciErr != CL_SUCCESS) { - add_error(string("OpenCL build failed with error ") + clewErrorString(ciErr) + - ", errors in console."); - } - - if (ret_val_size > 1) { - vector<char> build_log(ret_val_size + 1); - clGetProgramBuildInfo( - program, device->cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL); - - build_log[ret_val_size] = '\0'; - /* Skip meaningless empty output from the NVidia compiler. */ - if (!(ret_val_size == 2 && build_log[0] == '\n')) { - add_log(string("OpenCL program ") + program_name + " build output: " + string(&build_log[0]), - ciErr == CL_SUCCESS); - } - } - - return (ciErr == CL_SUCCESS); -} - -bool OpenCLDevice::OpenCLProgram::compile_kernel(const string *debug_src) -{ - string source = get_program_source(kernel_file); - - if (debug_src) { - path_write_text(*debug_src, source); - } - - size_t source_len = source.size(); - const char *source_str = source.c_str(); - cl_int ciErr; - - program = clCreateProgramWithSource(device->cxContext, 1, &source_str, &source_len, &ciErr); - - if (ciErr != CL_SUCCESS) { - add_error(string("OpenCL program creation failed: ") + clewErrorString(ciErr)); - return false; - } - - double starttime = time_dt(); - add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false); - add_log(string("Build flags: ") + kernel_build_options, true); - - if (!build_kernel(debug_src)) - return false; - - double elapsed = time_dt() - starttime; - add_log( - string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed), - false); - - return true; -} - -static void escape_python_string(string &str) -{ - /* Escape string to be passed as a Python raw string with '' quotes'. 
*/ - string_replace(str, "'", "\'"); -} - -static int opencl_compile_process_limit() -{ - /* Limit number of concurrent processes compiling, with a heuristic based - * on total physical RAM and estimate of memory usage needed when compiling - * with all Cycles features enabled. - * - * This is somewhat arbitrary as we don't know the actual available RAM or - * how much the kernel compilation will needed depending on the features, but - * better than not limiting at all. */ - static const int64_t GB = 1024LL * 1024LL * 1024LL; - static const int64_t process_memory = 2 * GB; - static const int64_t base_memory = 2 * GB; - static const int64_t system_memory = system_physical_ram(); - static const int64_t process_limit = (system_memory - base_memory) / process_memory; - - return max((int)process_limit, 1); -} - -bool OpenCLDevice::OpenCLProgram::compile_separate(const string &clbin) -{ - /* Construct arguments. */ - vector<string> args; - args.push_back("--background"); - args.push_back("--factory-startup"); - args.push_back("--python-expr"); - - int device_platform_id = device->device_num; - string device_name = device->device_name; - string platform_name = device->platform_name; - string build_options = device->kernel_build_options(NULL) + kernel_build_options; - string kernel_file_escaped = kernel_file; - string clbin_escaped = clbin; - - escape_python_string(device_name); - escape_python_string(platform_name); - escape_python_string(build_options); - escape_python_string(kernel_file_escaped); - escape_python_string(clbin_escaped); - - args.push_back(string_printf( - "import _cycles; _cycles.opencl_compile(r'%d', r'%s', r'%s', r'%s', r'%s', r'%s')", - device_platform_id, - device_name.c_str(), - platform_name.c_str(), - build_options.c_str(), - kernel_file_escaped.c_str(), - clbin_escaped.c_str())); - - /* Limit number of concurrent processes compiling. 
*/ - static thread_counting_semaphore semaphore(opencl_compile_process_limit()); - semaphore.acquire(); - - /* Compile. */ - const double starttime = time_dt(); - add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false); - add_log(string("Build flags: ") + kernel_build_options, true); - const bool success = system_call_self(args); - const double elapsed = time_dt() - starttime; - - semaphore.release(); - - if (!success || !path_exists(clbin)) { - return false; - } - - add_log( - string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed), - false); - - return load_binary(clbin); -} - -/* Compile opencl kernel. This method is called from the _cycles Python - * module compile kernels. Parameters must match function above. */ -bool device_opencl_compile_kernel(const vector<string> ¶meters) -{ - int device_platform_id = std::stoi(parameters[0]); - const string &device_name = parameters[1]; - const string &platform_name = parameters[2]; - const string &build_options = parameters[3]; - const string &kernel_file = parameters[4]; - const string &binary_path = parameters[5]; - - if (clewInit() != CLEW_SUCCESS) { - return false; - } - - vector<OpenCLPlatformDevice> usable_devices; - OpenCLInfo::get_usable_devices(&usable_devices); - if (device_platform_id >= usable_devices.size()) { - return false; - } - - OpenCLPlatformDevice &platform_device = usable_devices[device_platform_id]; - if (platform_device.platform_name != platform_name || - platform_device.device_name != device_name) { - return false; - } - - cl_platform_id platform = platform_device.platform_id; - cl_device_id device = platform_device.device_id; - const cl_context_properties context_props[] = { - CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0, 0}; - - cl_int err; - cl_context context = clCreateContext(context_props, 1, &device, NULL, NULL, &err); - if (err != CL_SUCCESS) { - return false; - } - - string source = 
get_program_source(kernel_file); - size_t source_len = source.size(); - const char *source_str = source.c_str(); - cl_program program = clCreateProgramWithSource(context, 1, &source_str, &source_len, &err); - bool result = false; - - if (err == CL_SUCCESS) { - err = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL); - - if (err == CL_SUCCESS) { - size_t size = 0; - clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL); - if (size > 0) { - vector<uint8_t> binary(size); - uint8_t *bytes = &binary[0]; - clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &bytes, NULL); - result = path_write_binary(binary_path, binary); - } - } - clReleaseProgram(program); - } - - clReleaseContext(context); - - return result; -} - -bool OpenCLDevice::OpenCLProgram::load_binary(const string &clbin, const string *debug_src) -{ - /* read binary into memory */ - vector<uint8_t> binary; - - if (!path_read_binary(clbin, binary)) { - add_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str())); - return false; - } - - /* create program */ - cl_int status, ciErr; - size_t size = binary.size(); - const uint8_t *bytes = &binary[0]; - - program = clCreateProgramWithBinary( - device->cxContext, 1, &device->cdDevice, &size, &bytes, &status, &ciErr); - - if (status != CL_SUCCESS || ciErr != CL_SUCCESS) { - add_error(string("OpenCL failed create program from cached binary ") + clbin + ": " + - clewErrorString(status) + " " + clewErrorString(ciErr)); - return false; - } - - if (!build_kernel(debug_src)) - return false; - - return true; -} - -bool OpenCLDevice::OpenCLProgram::save_binary(const string &clbin) -{ - size_t size = 0; - clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL); - - if (!size) - return false; - - vector<uint8_t> binary(size); - uint8_t *bytes = &binary[0]; - - clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &bytes, NULL); - - return path_write_binary(clbin, 
binary); -} - -bool OpenCLDevice::OpenCLProgram::load() -{ - loaded = false; - string device_md5 = device->device_md5_hash(kernel_build_options); - - /* Try to use cached kernel. */ - thread_scoped_lock cache_locker; - ustring cache_key(program_name + device_md5); - program = device->load_cached_kernel(cache_key, cache_locker); - if (!program) { - add_log(string("OpenCL program ") + program_name + " not found in cache.", true); - - /* need to create source to get md5 */ - string source = get_program_source(kernel_file); - - string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + - util_md5_string(source); - basename = path_cache_get(path_join("kernels", basename)); - string clbin = basename + ".clbin"; - - /* If binary kernel exists already, try use it. */ - if (path_exists(clbin) && load_binary(clbin)) { - /* Kernel loaded from binary, nothing to do. */ - add_log(string("Loaded program from ") + clbin + ".", true); - - /* Cache the program. */ - device->store_cached_kernel(program, cache_key, cache_locker); - } - else { - add_log(string("OpenCL program ") + program_name + " not found on disk.", true); - cache_locker.unlock(); - } - } - - if (program) { - create_kernels(); - loaded = true; - needs_compiling = false; - } - - return loaded; -} - -void OpenCLDevice::OpenCLProgram::compile() -{ - assert(device); - - string device_md5 = device->device_md5_hash(kernel_build_options); - - /* Try to use cached kernel. 
*/ - thread_scoped_lock cache_locker; - ustring cache_key(program_name + device_md5); - program = device->load_cached_kernel(cache_key, cache_locker); - - if (!program) { - - add_log(string("OpenCL program ") + program_name + " not found in cache.", true); - - /* need to create source to get md5 */ - string source = get_program_source(kernel_file); - - string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + - util_md5_string(source); - basename = path_cache_get(path_join("kernels", basename)); - string clbin = basename + ".clbin"; - - /* path to preprocessed source for debugging */ - string clsrc, *debug_src = NULL; - - if (OpenCLInfo::use_debug()) { - clsrc = basename + ".cl"; - debug_src = &clsrc; - } - - if (DebugFlags().running_inside_blender && compile_separate(clbin)) { - add_log(string("Built and loaded program from ") + clbin + ".", true); - loaded = true; - } - else { - if (DebugFlags().running_inside_blender) { - add_log(string("Separate-process building of ") + clbin + - " failed, will fall back to regular building.", - true); - } - - /* If does not exist or loading binary failed, compile kernel. */ - if (!compile_kernel(debug_src)) { - needs_compiling = false; - return; - } - - /* Save binary for reuse. */ - if (!save_binary(clbin)) { - add_log(string("Saving compiled OpenCL kernel to ") + clbin + " failed!", true); - } - } - - /* Cache the program. 
*/ - device->store_cached_kernel(program, cache_key, cache_locker); - } - - create_kernels(); - needs_compiling = false; - loaded = true; -} - -void OpenCLDevice::OpenCLProgram::create_kernels() -{ - for (map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end(); - ++kernel) { - assert(kernel->second == NULL); - cl_int ciErr; - string name = "kernel_ocl_" + kernel->first.string(); - kernel->second = clCreateKernel(program, name.c_str(), &ciErr); - if (device->opencl_error(ciErr)) { - add_error(string("Error getting kernel ") + name + " from program " + program_name + ": " + - clewErrorString(ciErr)); - return; - } - } -} - -bool OpenCLDevice::OpenCLProgram::wait_for_availability() -{ - add_log(string("Waiting for availability of ") + program_name + ".", true); - while (needs_compiling) { - time_sleep(0.1); - } - return loaded; -} - -void OpenCLDevice::OpenCLProgram::report_error() -{ - /* If loaded is true, there was no error. */ - if (loaded) - return; - /* if use_stdout is true, the error was already reported. 
*/ - if (use_stdout) - return; - - cerr << error_msg << endl; - if (!compile_output.empty()) { - cerr << "OpenCL kernel build output for " << program_name << ":" << endl; - cerr << compile_output << endl; - } -} - -cl_kernel OpenCLDevice::OpenCLProgram::operator()() -{ - assert(kernels.size() == 1); - return kernels.begin()->second; -} - -cl_kernel OpenCLDevice::OpenCLProgram::operator()(ustring name) -{ - assert(kernels.count(name)); - return kernels[name]; -} - -cl_device_type OpenCLInfo::device_type() -{ - switch (DebugFlags().opencl.device_type) { - case DebugFlags::OpenCL::DEVICE_NONE: - return 0; - case DebugFlags::OpenCL::DEVICE_ALL: - return CL_DEVICE_TYPE_ALL; - case DebugFlags::OpenCL::DEVICE_DEFAULT: - return CL_DEVICE_TYPE_DEFAULT; - case DebugFlags::OpenCL::DEVICE_CPU: - return CL_DEVICE_TYPE_CPU; - case DebugFlags::OpenCL::DEVICE_GPU: - return CL_DEVICE_TYPE_GPU; - case DebugFlags::OpenCL::DEVICE_ACCELERATOR: - return CL_DEVICE_TYPE_ACCELERATOR; - default: - return CL_DEVICE_TYPE_ALL; - } -} - -bool OpenCLInfo::use_debug() -{ - return DebugFlags().opencl.debug; -} - -bool OpenCLInfo::device_supported(const string &platform_name, const cl_device_id device_id) -{ - cl_device_type device_type; - if (!get_device_type(device_id, &device_type)) { - return false; - } - string device_name; - if (!get_device_name(device_id, &device_name)) { - return false; - } - - int driver_major = 0; - int driver_minor = 0; - if (!get_driver_version(device_id, &driver_major, &driver_minor)) { - return false; - } - VLOG(3) << "OpenCL driver version " << driver_major << "." << driver_minor; - - if (getenv("CYCLES_OPENCL_TEST")) { - return true; - } - - /* Allow Intel GPUs on Intel OpenCL platform. */ - if (platform_name.find("Intel") != string::npos) { - if (device_type != CL_DEVICE_TYPE_GPU) { - /* OpenCL on Intel CPU is not an officially supported configuration. - * Use hybrid CPU+GPU rendering to utilize both GPU and CPU. 
*/ - return false; - } - -# ifdef __APPLE__ - /* Apple uses own framework, which can also put Iris onto AMD frame-work. - * This isn't supported configuration. */ - return false; -# else - if (device_name.find("Iris") != string::npos || device_name.find("Xe") != string::npos) { - return true; - } -# endif - } - - if (platform_name == "AMD Accelerated Parallel Processing" && - device_type == CL_DEVICE_TYPE_GPU) { - if (driver_major < 2236) { - VLOG(1) << "AMD driver version " << driver_major << "." << driver_minor << " not supported."; - return false; - } - const char *blacklist[] = {/* GCN 1 */ - "Tahiti", - "Pitcairn", - "Capeverde", - "Oland", - "Hainan", - NULL}; - for (int i = 0; blacklist[i] != NULL; i++) { - if (device_name == blacklist[i]) { - VLOG(1) << "AMD device " << device_name << " not supported"; - return false; - } - } - return true; - } - if (platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) { - return false; - } - return false; -} - -bool OpenCLInfo::platform_version_check(cl_platform_id platform, string *error) -{ - const int req_major = 1, req_minor = 1; - int major, minor; - char version[256]; - clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(version), &version, NULL); - if (sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) { - if (error != NULL) { - *error = string_printf("OpenCL: failed to parse platform version string (%s).", version); - } - return false; - } - if (!((major == req_major && minor >= req_minor) || (major > req_major))) { - if (error != NULL) { - *error = string_printf( - "OpenCL: platform version 1.1 or later required, found %d.%d", major, minor); - } - return false; - } - if (error != NULL) { - *error = ""; - } - return true; -} - -bool OpenCLInfo::get_device_version(cl_device_id device, int *r_major, int *r_minor, string *error) -{ - char version[256]; - clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(version), &version, NULL); - if (sscanf(version, "OpenCL C %d.%d", r_major, r_minor) < 2) { 
- if (error != NULL) { - *error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version); - } - return false; - } - if (error != NULL) { - *error = ""; - } - return true; -} - -bool OpenCLInfo::device_version_check(cl_device_id device, string *error) -{ - const int req_major = 1, req_minor = 1; - int major, minor; - if (!get_device_version(device, &major, &minor, error)) { - return false; - } - - if (!((major == req_major && minor >= req_minor) || (major > req_major))) { - if (error != NULL) { - *error = string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor); - } - return false; - } - if (error != NULL) { - *error = ""; - } - return true; -} - -string OpenCLInfo::get_hardware_id(const string &platform_name, cl_device_id device_id) -{ - if (platform_name == "AMD Accelerated Parallel Processing" || platform_name == "Apple") { - /* Use cl_amd_device_topology extension. */ - cl_char topology[24]; - if (clGetDeviceInfo(device_id, 0x4037, sizeof(topology), topology, NULL) == CL_SUCCESS && - topology[0] == 1) { - return string_printf("%02x:%02x.%01x", - (unsigned int)topology[21], - (unsigned int)topology[22], - (unsigned int)topology[23]); - } - } - else if (platform_name == "NVIDIA CUDA") { - /* Use two undocumented options of the cl_nv_device_attribute_query extension. */ - cl_int bus_id, slot_id; - if (clGetDeviceInfo(device_id, 0x4008, sizeof(cl_int), &bus_id, NULL) == CL_SUCCESS && - clGetDeviceInfo(device_id, 0x4009, sizeof(cl_int), &slot_id, NULL) == CL_SUCCESS) { - return string_printf("%02x:%02x.%01x", - (unsigned int)(bus_id), - (unsigned int)(slot_id >> 3), - (unsigned int)(slot_id & 0x7)); - } - } - /* No general way to get a hardware ID from OpenCL => give up. 
*/ - return ""; -} - -void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices) -{ - const cl_device_type device_type = OpenCLInfo::device_type(); - static bool first_time = true; -# define FIRST_VLOG(severity) \ - if (first_time) \ - VLOG(severity) - - usable_devices->clear(); - - if (device_type == 0) { - FIRST_VLOG(2) << "OpenCL devices are forced to be disabled."; - first_time = false; - return; - } - - cl_int error; - vector<cl_device_id> device_ids; - vector<cl_platform_id> platform_ids; - - /* Get platforms. */ - if (!get_platforms(&platform_ids, &error)) { - FIRST_VLOG(2) << "Error fetching platforms:" << string(clewErrorString(error)); - first_time = false; - return; - } - if (platform_ids.size() == 0) { - FIRST_VLOG(2) << "No OpenCL platforms were found."; - first_time = false; - return; - } - /* Devices are numbered consecutively across platforms. */ - for (int platform = 0; platform < platform_ids.size(); platform++) { - cl_platform_id platform_id = platform_ids[platform]; - string platform_name; - if (!get_platform_name(platform_id, &platform_name)) { - FIRST_VLOG(2) << "Failed to get platform name, ignoring."; - continue; - } - FIRST_VLOG(2) << "Enumerating devices for platform " << platform_name << "."; - if (!platform_version_check(platform_id)) { - FIRST_VLOG(2) << "Ignoring platform " << platform_name - << " due to too old compiler version."; - continue; - } - if (!get_platform_devices(platform_id, device_type, &device_ids, &error)) { - FIRST_VLOG(2) << "Ignoring platform " << platform_name - << ", failed to fetch of devices: " << string(clewErrorString(error)); - continue; - } - if (device_ids.size() == 0) { - FIRST_VLOG(2) << "Ignoring platform " << platform_name << ", it has no devices."; - continue; - } - for (int num = 0; num < device_ids.size(); num++) { - const cl_device_id device_id = device_ids[num]; - string device_name; - if (!get_device_name(device_id, &device_name, &error)) { - FIRST_VLOG(2) << "Failed to fetch 
device name: " << string(clewErrorString(error)) - << ", ignoring."; - continue; - } - if (!device_version_check(device_id)) { - FIRST_VLOG(2) << "Ignoring device " << device_name << " due to old compiler version."; - continue; - } - if (device_supported(platform_name, device_id)) { - cl_device_type device_type; - if (!get_device_type(device_id, &device_type, &error)) { - FIRST_VLOG(2) << "Ignoring device " << device_name - << ", failed to fetch device type:" << string(clewErrorString(error)); - continue; - } - string readable_device_name = get_readable_device_name(device_id); - if (readable_device_name != device_name) { - FIRST_VLOG(2) << "Using more readable device name: " << readable_device_name; - } - FIRST_VLOG(2) << "Adding new device " << readable_device_name << "."; - string hardware_id = get_hardware_id(platform_name, device_id); - string device_extensions = get_device_extensions(device_id); - usable_devices->push_back(OpenCLPlatformDevice(platform_id, - platform_name, - device_id, - device_type, - readable_device_name, - hardware_id, - device_extensions)); - } - else { - FIRST_VLOG(2) << "Ignoring device " << device_name << ", not officially supported yet."; - } - } - } - first_time = false; -} - -bool OpenCLInfo::get_platforms(vector<cl_platform_id> *platform_ids, cl_int *error) -{ - /* Reset from possible previous state. */ - platform_ids->resize(0); - cl_uint num_platforms; - if (!get_num_platforms(&num_platforms, error)) { - return false; - } - /* Get actual platforms. 
*/ - cl_int err; - platform_ids->resize(num_platforms); - if ((err = clGetPlatformIDs(num_platforms, &platform_ids->at(0), NULL)) != CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - return true; -} - -vector<cl_platform_id> OpenCLInfo::get_platforms() -{ - vector<cl_platform_id> platform_ids; - get_platforms(&platform_ids); - return platform_ids; -} - -bool OpenCLInfo::get_num_platforms(cl_uint *num_platforms, cl_int *error) -{ - cl_int err; - if ((err = clGetPlatformIDs(0, NULL, num_platforms)) != CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - *num_platforms = 0; - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - return true; -} - -cl_uint OpenCLInfo::get_num_platforms() -{ - cl_uint num_platforms; - if (!get_num_platforms(&num_platforms)) { - return 0; - } - return num_platforms; -} - -bool OpenCLInfo::get_platform_name(cl_platform_id platform_id, string *platform_name) -{ - char buffer[256]; - if (clGetPlatformInfo(platform_id, CL_PLATFORM_NAME, sizeof(buffer), &buffer, NULL) != - CL_SUCCESS) { - *platform_name = ""; - return false; - } - *platform_name = buffer; - return true; -} - -string OpenCLInfo::get_platform_name(cl_platform_id platform_id) -{ - string platform_name; - if (!get_platform_name(platform_id, &platform_name)) { - return ""; - } - return platform_name; -} - -bool OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id, - cl_device_type device_type, - cl_uint *num_devices, - cl_int *error) -{ - cl_int err; - if ((err = clGetDeviceIDs(platform_id, device_type, 0, NULL, num_devices)) != CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - *num_devices = 0; - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - return true; -} - -cl_uint OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id, - cl_device_type device_type) -{ - cl_uint num_devices; - if (!get_num_platform_devices(platform_id, 
device_type, &num_devices)) { - return 0; - } - return num_devices; -} - -bool OpenCLInfo::get_platform_devices(cl_platform_id platform_id, - cl_device_type device_type, - vector<cl_device_id> *device_ids, - cl_int *error) -{ - /* Reset from possible previous state. */ - device_ids->resize(0); - /* Get number of devices to pre-allocate memory. */ - cl_uint num_devices; - if (!get_num_platform_devices(platform_id, device_type, &num_devices, error)) { - return false; - } - /* Get actual device list. */ - device_ids->resize(num_devices); - cl_int err; - if ((err = clGetDeviceIDs(platform_id, device_type, num_devices, &device_ids->at(0), NULL)) != - CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - return true; -} - -vector<cl_device_id> OpenCLInfo::get_platform_devices(cl_platform_id platform_id, - cl_device_type device_type) -{ - vector<cl_device_id> devices; - get_platform_devices(platform_id, device_type, &devices); - return devices; -} - -bool OpenCLInfo::get_device_name(cl_device_id device_id, string *device_name, cl_int *error) -{ - char buffer[1024]; - cl_int err; - if ((err = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(buffer), &buffer, NULL)) != - CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - *device_name = ""; - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - *device_name = buffer; - return true; -} - -string OpenCLInfo::get_device_name(cl_device_id device_id) -{ - string device_name; - if (!get_device_name(device_id, &device_name)) { - return ""; - } - return device_name; -} - -bool OpenCLInfo::get_device_extensions(cl_device_id device_id, - string *device_extensions, - cl_int *error) -{ - size_t extension_length = 0; - cl_int err; - /* Determine the size of the extension string. 
*/ - if ((err = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, 0, 0, &extension_length)) != - CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - *device_extensions = ""; - return false; - } - vector<char> buffer(extension_length); - if ((err = clGetDeviceInfo( - device_id, CL_DEVICE_EXTENSIONS, extension_length, buffer.data(), NULL)) != - CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - *device_extensions = ""; - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - *device_extensions = string(buffer.data()); - return true; -} - -string OpenCLInfo::get_device_extensions(cl_device_id device_id) -{ - string device_extensions; - if (!get_device_extensions(device_id, &device_extensions)) { - return ""; - } - return device_extensions; -} - -bool OpenCLInfo::get_device_type(cl_device_id device_id, - cl_device_type *device_type, - cl_int *error) -{ - cl_int err; - if ((err = clGetDeviceInfo( - device_id, CL_DEVICE_TYPE, sizeof(cl_device_type), device_type, NULL)) != CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - *device_type = 0; - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - return true; -} - -cl_device_type OpenCLInfo::get_device_type(cl_device_id device_id) -{ - cl_device_type device_type; - if (!get_device_type(device_id, &device_type)) { - return 0; - } - return device_type; -} - -string OpenCLInfo::get_readable_device_name(cl_device_id device_id) -{ - string name = ""; - char board_name[1024]; - size_t length = 0; - if (clGetDeviceInfo( - device_id, CL_DEVICE_BOARD_NAME_AMD, sizeof(board_name), &board_name, &length) == - CL_SUCCESS) { - if (length != 0 && board_name[0] != '\0') { - name = board_name; - } - } - - /* Fallback to standard device name API. */ - if (name.empty()) { - name = get_device_name(device_id); - } - - /* Special exception for AMD Vega, need to be able to tell - * Vega 56 from 64 apart. 
- */ - if (name == "Radeon RX Vega") { - cl_int max_compute_units = 0; - if (clGetDeviceInfo(device_id, - CL_DEVICE_MAX_COMPUTE_UNITS, - sizeof(max_compute_units), - &max_compute_units, - NULL) == CL_SUCCESS) { - name += " " + to_string(max_compute_units); - } - } - - /* Distinguish from our native CPU device. */ - if (get_device_type(device_id) & CL_DEVICE_TYPE_CPU) { - name += " (OpenCL)"; - } - - return name; -} - -bool OpenCLInfo::get_driver_version(cl_device_id device_id, int *major, int *minor, cl_int *error) -{ - char buffer[1024]; - cl_int err; - if ((err = clGetDeviceInfo(device_id, CL_DRIVER_VERSION, sizeof(buffer), &buffer, NULL)) != - CL_SUCCESS) { - if (error != NULL) { - *error = err; - } - return false; - } - if (error != NULL) { - *error = CL_SUCCESS; - } - if (sscanf(buffer, "%d.%d", major, minor) < 2) { - VLOG(1) << string_printf("OpenCL: failed to parse driver version string (%s).", buffer); - return false; - } - return true; -} - -int OpenCLInfo::mem_sub_ptr_alignment(cl_device_id device_id) -{ - int base_align_bits; - if (clGetDeviceInfo( - device_id, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(int), &base_align_bits, NULL) == - CL_SUCCESS) { - return base_align_bits / 8; - } - return 1; -} - -CCL_NAMESPACE_END - -#endif diff --git a/intern/cycles/device/optix/device.cpp b/intern/cycles/device/optix/device.cpp new file mode 100644 index 00000000000..13f23bd229a --- /dev/null +++ b/intern/cycles/device/optix/device.cpp @@ -0,0 +1,105 @@ +/* + * Copyright 2019, NVIDIA Corporation. + * Copyright 2019, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/optix/device.h" + +#include "device/cuda/device.h" +#include "device/optix/device_impl.h" +#include "util/util_logging.h" + +#ifdef WITH_OPTIX +# include <optix_function_table_definition.h> +#endif + +CCL_NAMESPACE_BEGIN + +bool device_optix_init() +{ +#ifdef WITH_OPTIX + if (g_optixFunctionTable.optixDeviceContextCreate != NULL) { + /* Already initialized function table. */ + return true; + } + + /* Need to initialize CUDA as well. */ + if (!device_cuda_init()) { + return false; + } + + const OptixResult result = optixInit(); + + if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) { + VLOG(1) << "OptiX initialization failed because the installed NVIDIA driver is too old. " + "Please update to the latest driver first!"; + return false; + } + else if (result != OPTIX_SUCCESS) { + VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result; + return false; + } + + /* Loaded OptiX successfully! */ + return true; +#else + return false; +#endif +} + +void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices) +{ +#ifdef WITH_OPTIX + devices.reserve(cuda_devices.size()); + + /* Simply add all supported CUDA devices as OptiX devices again. */ + for (DeviceInfo info : cuda_devices) { + assert(info.type == DEVICE_CUDA); + + int major; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num); + if (major < 5) { + /* Only Maxwell and up are supported by OptiX. 
*/ + continue; + } + + info.type = DEVICE_OPTIX; + info.id += "_OptiX"; + info.denoisers |= DENOISER_OPTIX; + + devices.push_back(info); + } +#else + (void)cuda_devices; + (void)devices; +#endif +} + +Device *device_optix_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) +{ +#ifdef WITH_OPTIX + return new OptiXDevice(info, stats, profiler); +#else + (void)info; + (void)stats; + (void)profiler; + + LOG(FATAL) << "Request to create OptiX device without compiled-in support. Should never happen."; + + return nullptr; +#endif +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/optix/device.h b/intern/cycles/device/optix/device.h new file mode 100644 index 00000000000..29fa729c2e4 --- /dev/null +++ b/intern/cycles/device/optix/device.h @@ -0,0 +1,35 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "util/util_string.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +class Device; +class DeviceInfo; +class Profiler; +class Stats; + +bool device_optix_init(); + +Device *device_optix_create(const DeviceInfo &info, Stats &stats, Profiler &profiler); + +void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp new file mode 100644 index 00000000000..b54d423a183 --- /dev/null +++ b/intern/cycles/device/optix/device_impl.cpp @@ -0,0 +1,1573 @@ +/* + * Copyright 2019, NVIDIA Corporation. + * Copyright 2019, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef WITH_OPTIX + +# include "device/optix/device_impl.h" + +# include "bvh/bvh.h" +# include "bvh/bvh_optix.h" +# include "integrator/pass_accessor_gpu.h" +# include "render/buffers.h" +# include "render/hair.h" +# include "render/mesh.h" +# include "render/object.h" +# include "render/pass.h" +# include "render/scene.h" + +# include "util/util_debug.h" +# include "util/util_logging.h" +# include "util/util_md5.h" +# include "util/util_path.h" +# include "util/util_progress.h" +# include "util/util_time.h" + +# undef __KERNEL_CPU__ +# define __KERNEL_OPTIX__ +# include "kernel/device/optix/globals.h" + +CCL_NAMESPACE_BEGIN + +OptiXDevice::Denoiser::Denoiser(OptiXDevice *device) + : device(device), queue(device), state(device, "__denoiser_state") +{ +} + +OptiXDevice::Denoiser::~Denoiser() +{ + const CUDAContextScope scope(device); + if (optix_denoiser != nullptr) { + optixDenoiserDestroy(optix_denoiser); + } +} + +OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) + : CUDADevice(info, stats, profiler), + sbt_data(this, "__sbt", MEM_READ_ONLY), + launch_params(this, "__params"), + denoiser_(this) +{ + /* Make the CUDA context current. */ + if (!cuContext) { + /* Do not initialize if CUDA context creation failed already. */ + return; + } + const CUDAContextScope scope(this); + + /* Create OptiX context for this device. */ + OptixDeviceContextOptions options = {}; +# ifdef WITH_CYCLES_LOGGING + options.logCallbackLevel = 4; /* Fatal = 1, Error = 2, Warning = 3, Print = 4. 
*/ + options.logCallbackFunction = [](unsigned int level, const char *, const char *message, void *) { + switch (level) { + case 1: + LOG_IF(FATAL, VLOG_IS_ON(1)) << message; + break; + case 2: + LOG_IF(ERROR, VLOG_IS_ON(1)) << message; + break; + case 3: + LOG_IF(WARNING, VLOG_IS_ON(1)) << message; + break; + case 4: + LOG_IF(INFO, VLOG_IS_ON(1)) << message; + break; + } + }; +# endif + if (DebugFlags().optix.use_debug) { + options.validationMode = OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_ALL; + } + optix_assert(optixDeviceContextCreate(cuContext, &options, &context)); +# ifdef WITH_CYCLES_LOGGING + optix_assert(optixDeviceContextSetLogCallback( + context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel)); +# endif + + /* Fix weird compiler bug that assigns wrong size. */ + launch_params.data_elements = sizeof(KernelParamsOptiX); + + /* Allocate launch parameter buffer memory on device. */ + launch_params.alloc_to_device(1); +} + +OptiXDevice::~OptiXDevice() +{ + /* Make CUDA context current. */ + const CUDAContextScope scope(this); + + free_bvh_memory_delayed(); + + sbt_data.free(); + texture_info.free(); + launch_params.free(); + + /* Unload modules. */ + if (optix_module != NULL) { + optixModuleDestroy(optix_module); + } + for (unsigned int i = 0; i < 2; ++i) { + if (builtin_modules[i] != NULL) { + optixModuleDestroy(builtin_modules[i]); + } + } + for (unsigned int i = 0; i < NUM_PIPELINES; ++i) { + if (pipelines[i] != NULL) { + optixPipelineDestroy(pipelines[i]); + } + } + + optixDeviceContextDestroy(context); +} + +unique_ptr<DeviceQueue> OptiXDevice::gpu_queue_create() +{ + return make_unique<OptiXDeviceQueue>(this); +} + +BVHLayoutMask OptiXDevice::get_bvh_layout_mask() const +{ + /* OptiX has its own internal acceleration structure format. 
*/ + return BVH_LAYOUT_OPTIX; +} + +string OptiXDevice::compile_kernel_get_common_cflags(const uint kernel_features) +{ + string common_cflags = CUDADevice::compile_kernel_get_common_cflags(kernel_features); + + /* Add OptiX SDK include directory to include paths. */ + const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR"); + if (optix_sdk_path) { + common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path); + } + + /* Specialization for shader raytracing. */ + if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) { + common_cflags += " --keep-device-functions"; + } + + return common_cflags; +} + +bool OptiXDevice::load_kernels(const uint kernel_features) +{ + if (have_error()) { + /* Abort early if context creation failed already. */ + return false; + } + + /* Load CUDA modules because we need some of the utility kernels. */ + if (!CUDADevice::load_kernels(kernel_features)) { + return false; + } + + /* Skip creating OptiX module if only doing denoising. */ + if (!(kernel_features & (KERNEL_FEATURE_PATH_TRACING | KERNEL_FEATURE_BAKING))) { + return true; + } + + const CUDAContextScope scope(this); + + /* Unload existing OptiX module and pipelines first. */ + if (optix_module != NULL) { + optixModuleDestroy(optix_module); + optix_module = NULL; + } + for (unsigned int i = 0; i < 2; ++i) { + if (builtin_modules[i] != NULL) { + optixModuleDestroy(builtin_modules[i]); + builtin_modules[i] = NULL; + } + } + for (unsigned int i = 0; i < NUM_PIPELINES; ++i) { + if (pipelines[i] != NULL) { + optixPipelineDestroy(pipelines[i]); + pipelines[i] = NULL; + } + } + + OptixModuleCompileOptions module_options = {}; + module_options.maxRegisterCount = 0; /* Do not set an explicit register limit. 
*/ + + if (DebugFlags().optix.use_debug) { + module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_0; + module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL; + } + else { + module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3; + module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO; + } + + module_options.boundValues = nullptr; + module_options.numBoundValues = 0; + + OptixPipelineCompileOptions pipeline_options = {}; + /* Default to no motion blur and two-level graph, since it is the fastest option. */ + pipeline_options.usesMotionBlur = false; + pipeline_options.traversableGraphFlags = + OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING; + pipeline_options.numPayloadValues = 6; + pipeline_options.numAttributeValues = 2; /* u, v */ + pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE; + pipeline_options.pipelineLaunchParamsVariableName = "__params"; /* See globals.h */ + + pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE; + if (kernel_features & KERNEL_FEATURE_HAIR) { + if (kernel_features & KERNEL_FEATURE_HAIR_THICK) { + pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE; + } + else + pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM; + } + + /* Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds + * This is necessary since objects may be reported to have motion if the Vector pass is + * active, but may still need to be rendered without motion blur if that isn't active as well. */ + motion_blur = (kernel_features & KERNEL_FEATURE_OBJECT_MOTION) != 0; + + if (motion_blur) { + pipeline_options.usesMotionBlur = true; + /* Motion blur can insert motion transforms into the traversal graph. + * It is no longer a two-level graph then, so need to set flags to allow any configuration. 
*/ + pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY; + } + + { /* Load and compile PTX module with OptiX kernels. */ + string ptx_data, ptx_filename = path_get((kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ? + "lib/kernel_optix_shader_raytrace.ptx" : + "lib/kernel_optix.ptx"); + if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) { + if (!getenv("OPTIX_ROOT_DIR")) { + set_error( + "Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to " + "the Optix SDK to be able to compile Optix kernels on demand)."); + return false; + } + ptx_filename = compile_kernel( + kernel_features, + (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ? "kernel_shader_raytrace" : "kernel", + "optix", + true); + } + if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) { + set_error(string_printf("Failed to load OptiX kernel from '%s'", ptx_filename.c_str())); + return false; + } + + const OptixResult result = optixModuleCreateFromPTX(context, + &module_options, + &pipeline_options, + ptx_data.data(), + ptx_data.size(), + nullptr, + 0, + &optix_module); + if (result != OPTIX_SUCCESS) { + set_error(string_printf("Failed to load OptiX kernel from '%s' (%s)", + ptx_filename.c_str(), + optixGetErrorName(result))); + return false; + } + } + + /* Create program groups. */ + OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {}; + OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {}; + OptixProgramGroupOptions group_options = {}; /* There are no options currently. 
*/ + group_descs[PG_RGEN_INTERSECT_CLOSEST].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.module = optix_module; + group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_intersect_closest"; + group_descs[PG_RGEN_INTERSECT_SHADOW].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_INTERSECT_SHADOW].raygen.module = optix_module; + group_descs[PG_RGEN_INTERSECT_SHADOW].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_intersect_shadow"; + group_descs[PG_RGEN_INTERSECT_SUBSURFACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_INTERSECT_SUBSURFACE].raygen.module = optix_module; + group_descs[PG_RGEN_INTERSECT_SUBSURFACE].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_intersect_subsurface"; + group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].raygen.module = optix_module; + group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_intersect_volume_stack"; + group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS; + group_descs[PG_MISS].miss.module = optix_module; + group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss"; + group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; + group_descs[PG_HITD].hitgroup.moduleCH = optix_module; + group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit"; + group_descs[PG_HITD].hitgroup.moduleAH = optix_module; + group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test"; + group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; + group_descs[PG_HITS].hitgroup.moduleAH = optix_module; + group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit"; + + if (kernel_features & KERNEL_FEATURE_HAIR) { + if (kernel_features & 
KERNEL_FEATURE_HAIR_THICK) { + /* Built-in thick curve intersection. */ + OptixBuiltinISOptions builtin_options = {}; + builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE; + builtin_options.usesMotionBlur = false; + + optix_assert(optixBuiltinISModuleGet( + context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0])); + + group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0]; + group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr; + group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0]; + group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr; + + if (motion_blur) { + builtin_options.usesMotionBlur = true; + + optix_assert(optixBuiltinISModuleGet( + context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1])); + + group_descs[PG_HITD_MOTION] = group_descs[PG_HITD]; + group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1]; + group_descs[PG_HITS_MOTION] = group_descs[PG_HITS]; + group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1]; + } + } + else { + /* Custom ribbon intersection. */ + group_descs[PG_HITD].hitgroup.moduleIS = optix_module; + group_descs[PG_HITS].hitgroup.moduleIS = optix_module; + group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon"; + group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon"; + } + } + + if (kernel_features & (KERNEL_FEATURE_SUBSURFACE | KERNEL_FEATURE_NODE_RAYTRACE)) { + /* Add hit group for local intersections. */ + group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; + group_descs[PG_HITL].hitgroup.moduleAH = optix_module; + group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit"; + } + + /* Shader raytracing replaces some functions with direct callables. 
*/ + if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) { + group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.module = optix_module; + group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.entryFunctionName = + "__raygen__kernel_optix_integrator_shade_surface_raytrace"; + group_descs[PG_CALL_SVM_AO].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; + group_descs[PG_CALL_SVM_AO].callables.moduleDC = optix_module; + group_descs[PG_CALL_SVM_AO].callables.entryFunctionNameDC = "__direct_callable__svm_node_ao"; + group_descs[PG_CALL_SVM_BEVEL].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; + group_descs[PG_CALL_SVM_BEVEL].callables.moduleDC = optix_module; + group_descs[PG_CALL_SVM_BEVEL].callables.entryFunctionNameDC = + "__direct_callable__svm_node_bevel"; + group_descs[PG_CALL_AO_PASS].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES; + group_descs[PG_CALL_AO_PASS].callables.moduleDC = optix_module; + group_descs[PG_CALL_AO_PASS].callables.entryFunctionNameDC = "__direct_callable__ao_pass"; + } + + optix_assert(optixProgramGroupCreate( + context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups)); + + /* Get program stack sizes. */ + OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {}; + /* Set up SBT, which in this case is used only to select between different programs. */ + sbt_data.alloc(NUM_PROGRAM_GROUPS); + memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS); + for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { + optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i])); + optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i])); + } + sbt_data.copy_to_device(); /* Upload SBT to device. */ + + /* Calculate maximum trace continuation stack size. */ + unsigned int trace_css = stack_size[PG_HITD].cssCH; + /* This is based on the maximum of closest-hit and any-hit/intersection programs. 
*/ + trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH); + trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH); + trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH); + trace_css = std::max(trace_css, + stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH); + trace_css = std::max(trace_css, + stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH); + + OptixPipelineLinkOptions link_options = {}; + link_options.maxTraceDepth = 1; + + if (DebugFlags().optix.use_debug) { + link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL; + } + else { + link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO; + } + + if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) { + /* Create shader raytracing pipeline. */ + vector<OptixProgramGroup> pipeline_groups; + pipeline_groups.reserve(NUM_PROGRAM_GROUPS); + pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]); + pipeline_groups.push_back(groups[PG_MISS]); + pipeline_groups.push_back(groups[PG_HITD]); + pipeline_groups.push_back(groups[PG_HITS]); + pipeline_groups.push_back(groups[PG_HITL]); + if (motion_blur) { + pipeline_groups.push_back(groups[PG_HITD_MOTION]); + pipeline_groups.push_back(groups[PG_HITS_MOTION]); + } + pipeline_groups.push_back(groups[PG_CALL_SVM_AO]); + pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]); + + optix_assert(optixPipelineCreate(context, + &pipeline_options, + &link_options, + pipeline_groups.data(), + pipeline_groups.size(), + nullptr, + 0, + &pipelines[PIP_SHADE_RAYTRACE])); + + /* Combine ray generation and trace continuation stack size. */ + const unsigned int css = stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG + + link_options.maxTraceDepth * trace_css; + const unsigned int dss = std::max(stack_size[PG_CALL_SVM_AO].dssDC, + stack_size[PG_CALL_SVM_BEVEL].dssDC); + + /* Set stack size depending on pipeline options. 
*/ + optix_assert(optixPipelineSetStackSize( + pipelines[PIP_SHADE_RAYTRACE], 0, dss, css, motion_blur ? 3 : 2)); + } + + { /* Create intersection-only pipeline. */ + vector<OptixProgramGroup> pipeline_groups; + pipeline_groups.reserve(NUM_PROGRAM_GROUPS); + pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_CLOSEST]); + pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SHADOW]); + pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SUBSURFACE]); + pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_VOLUME_STACK]); + pipeline_groups.push_back(groups[PG_MISS]); + pipeline_groups.push_back(groups[PG_HITD]); + pipeline_groups.push_back(groups[PG_HITS]); + pipeline_groups.push_back(groups[PG_HITL]); + if (motion_blur) { + pipeline_groups.push_back(groups[PG_HITD_MOTION]); + pipeline_groups.push_back(groups[PG_HITS_MOTION]); + } + + optix_assert(optixPipelineCreate(context, + &pipeline_options, + &link_options, + pipeline_groups.data(), + pipeline_groups.size(), + nullptr, + 0, + &pipelines[PIP_INTERSECT])); + + /* Calculate continuation stack size based on the maximum of all ray generation stack sizes. */ + const unsigned int css = + std::max(stack_size[PG_RGEN_INTERSECT_CLOSEST].cssRG, + std::max(stack_size[PG_RGEN_INTERSECT_SHADOW].cssRG, + std::max(stack_size[PG_RGEN_INTERSECT_SUBSURFACE].cssRG, + stack_size[PG_RGEN_INTERSECT_VOLUME_STACK].cssRG))) + + link_options.maxTraceDepth * trace_css; + + optix_assert( + optixPipelineSetStackSize(pipelines[PIP_INTERSECT], 0, 0, css, motion_blur ? 3 : 2)); + } + + /* Clean up program group objects. */ + for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { + optixProgramGroupDestroy(groups[i]); + } + + return true; +} + +/* -------------------------------------------------------------------- + * Buffer denoising. 
+ */ + +class OptiXDevice::DenoiseContext { + public: + explicit DenoiseContext(OptiXDevice *device, const DeviceDenoiseTask &task) + : denoise_params(task.params), + render_buffers(task.render_buffers), + buffer_params(task.buffer_params), + guiding_buffer(device, "denoiser guiding passes buffer"), + num_samples(task.num_samples) + { + num_input_passes = 1; + if (denoise_params.use_pass_albedo) { + num_input_passes += 1; + use_pass_albedo = true; + pass_denoising_albedo = buffer_params.get_pass_offset(PASS_DENOISING_ALBEDO); + if (denoise_params.use_pass_normal) { + num_input_passes += 1; + use_pass_normal = true; + pass_denoising_normal = buffer_params.get_pass_offset(PASS_DENOISING_NORMAL); + } + } + + const int num_guiding_passes = num_input_passes - 1; + + if (num_guiding_passes) { + if (task.allow_inplace_modification) { + guiding_params.device_pointer = render_buffers->buffer.device_pointer; + + guiding_params.pass_albedo = pass_denoising_albedo; + guiding_params.pass_normal = pass_denoising_normal; + + guiding_params.stride = buffer_params.stride; + guiding_params.pass_stride = buffer_params.pass_stride; + } + else { + guiding_params.pass_stride = 0; + if (use_pass_albedo) { + guiding_params.pass_albedo = guiding_params.pass_stride; + guiding_params.pass_stride += 3; + } + if (use_pass_normal) { + guiding_params.pass_normal = guiding_params.pass_stride; + guiding_params.pass_stride += 3; + } + + guiding_params.stride = buffer_params.width; + + guiding_buffer.alloc_to_device(buffer_params.width * buffer_params.height * + guiding_params.pass_stride); + guiding_params.device_pointer = guiding_buffer.device_pointer; + } + } + + pass_sample_count = buffer_params.get_pass_offset(PASS_SAMPLE_COUNT); + } + + const DenoiseParams &denoise_params; + + RenderBuffers *render_buffers = nullptr; + const BufferParams &buffer_params; + + /* Device-side storage of the guiding passes. 
*/ + device_only_memory<float> guiding_buffer; + + struct { + device_ptr device_pointer = 0; + + /* NOTE: Are only initialized when the corresponding guiding pass is enabled. */ + int pass_albedo = PASS_UNUSED; + int pass_normal = PASS_UNUSED; + + int stride = -1; + int pass_stride = -1; + } guiding_params; + + /* Number of input passes. Including the color and extra auxiliary passes. */ + int num_input_passes = 0; + bool use_pass_albedo = false; + bool use_pass_normal = false; + + int num_samples = 0; + + int pass_sample_count = PASS_UNUSED; + + /* NOTE: Are only initialized when the corresponding guiding pass is enabled. */ + int pass_denoising_albedo = PASS_UNUSED; + int pass_denoising_normal = PASS_UNUSED; + + /* For passes which don't need albedo channel for denoising we replace the actual albedo with + * the (0.5, 0.5, 0.5). This flag indicates that the real albedo pass has been replaced with + * the fake values and denoising of passes which do need albedo can no longer happen. */ + bool albedo_replaced_with_fake = false; +}; + +class OptiXDevice::DenoisePass { + public: + DenoisePass(const PassType type, const BufferParams &buffer_params) : type(type) + { + noisy_offset = buffer_params.get_pass_offset(type, PassMode::NOISY); + denoised_offset = buffer_params.get_pass_offset(type, PassMode::DENOISED); + + const PassInfo pass_info = Pass::get_info(type); + num_components = pass_info.num_components; + use_compositing = pass_info.use_compositing; + use_denoising_albedo = pass_info.use_denoising_albedo; + } + + PassType type; + + int noisy_offset; + int denoised_offset; + + int num_components; + bool use_compositing; + bool use_denoising_albedo; +}; + +bool OptiXDevice::denoise_buffer(const DeviceDenoiseTask &task) +{ + const CUDAContextScope scope(this); + + DenoiseContext context(this, task); + + if (!denoise_ensure(context)) { + return false; + } + + if (!denoise_filter_guiding_preprocess(context)) { + LOG(ERROR) << "Error preprocessing guiding passes."; + 
return false; + } + + /* Passes which will use real albedo when it is available. */ + denoise_pass(context, PASS_COMBINED); + denoise_pass(context, PASS_SHADOW_CATCHER_MATTE); + + /* Passes which do not need albedo and hence if real is present it needs to become fake. */ + denoise_pass(context, PASS_SHADOW_CATCHER); + + return true; +} + +DeviceQueue *OptiXDevice::get_denoise_queue() +{ + return &denoiser_.queue; +} + +bool OptiXDevice::denoise_filter_guiding_preprocess(DenoiseContext &context) +{ + const BufferParams &buffer_params = context.buffer_params; + + const int work_size = buffer_params.width * buffer_params.height; + + void *args[] = {const_cast<device_ptr *>(&context.guiding_params.device_pointer), + const_cast<int *>(&context.guiding_params.pass_stride), + const_cast<int *>(&context.guiding_params.pass_albedo), + const_cast<int *>(&context.guiding_params.pass_normal), + &context.render_buffers->buffer.device_pointer, + const_cast<int *>(&buffer_params.offset), + const_cast<int *>(&buffer_params.stride), + const_cast<int *>(&buffer_params.pass_stride), + const_cast<int *>(&context.pass_sample_count), + const_cast<int *>(&context.pass_denoising_albedo), + const_cast<int *>(&context.pass_denoising_normal), + const_cast<int *>(&buffer_params.full_x), + const_cast<int *>(&buffer_params.full_y), + const_cast<int *>(&buffer_params.width), + const_cast<int *>(&buffer_params.height), + const_cast<int *>(&context.num_samples)}; + + return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS, work_size, args); +} + +bool OptiXDevice::denoise_filter_guiding_set_fake_albedo(DenoiseContext &context) +{ + const BufferParams &buffer_params = context.buffer_params; + + const int work_size = buffer_params.width * buffer_params.height; + + void *args[] = {const_cast<device_ptr *>(&context.guiding_params.device_pointer), + const_cast<int *>(&context.guiding_params.pass_stride), + const_cast<int *>(&context.guiding_params.pass_albedo), + const_cast<int 
*>(&buffer_params.width), + const_cast<int *>(&buffer_params.height)}; + + return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO, work_size, args); +} + +void OptiXDevice::denoise_pass(DenoiseContext &context, PassType pass_type) +{ + const BufferParams &buffer_params = context.buffer_params; + + const DenoisePass pass(pass_type, buffer_params); + + if (pass.noisy_offset == PASS_UNUSED) { + return; + } + if (pass.denoised_offset == PASS_UNUSED) { + LOG(DFATAL) << "Missing denoised pass " << pass_type_as_string(pass_type); + return; + } + + if (pass.use_denoising_albedo) { + if (context.albedo_replaced_with_fake) { + LOG(ERROR) << "Pass which requires albedo is denoised after fake albedo has been set."; + return; + } + } + else if (!context.albedo_replaced_with_fake) { + context.albedo_replaced_with_fake = true; + if (!denoise_filter_guiding_set_fake_albedo(context)) { + LOG(ERROR) << "Error replacing real albedo with the fake one."; + return; + } + } + + /* Read and preprocess noisy color input pass. */ + denoise_color_read(context, pass); + if (!denoise_filter_color_preprocess(context, pass)) { + LOG(ERROR) << "Error connverting denoising passes to RGB buffer."; + return; + } + + if (!denoise_run(context, pass)) { + LOG(ERROR) << "Error running OptiX denoiser."; + return; + } + + /* Store result in the combined pass of the render buffer. + * + * This will scale the denoiser result up to match the number of, possibly per-pixel, samples. 
*/ + if (!denoise_filter_color_postprocess(context, pass)) { + LOG(ERROR) << "Error copying denoiser result to the denoised pass."; + return; + } + + denoiser_.queue.synchronize(); +} + +void OptiXDevice::denoise_color_read(DenoiseContext &context, const DenoisePass &pass) +{ + PassAccessor::PassAccessInfo pass_access_info; + pass_access_info.type = pass.type; + pass_access_info.mode = PassMode::NOISY; + pass_access_info.offset = pass.noisy_offset; + + /* Denoiser operates on passes which are used to calculate the approximation, and is never used + * on the approximation. The latter is not even possible because OptiX does not support + * denoising of semi-transparent pixels. */ + pass_access_info.use_approximate_shadow_catcher = false; + pass_access_info.use_approximate_shadow_catcher_background = false; + pass_access_info.show_active_pixels = false; + + /* TODO(sergey): Consider adding support of actual exposure, to avoid clamping in extreme cases. + */ + const PassAccessorGPU pass_accessor( + &denoiser_.queue, pass_access_info, 1.0f, context.num_samples); + + PassAccessor::Destination destination(pass_access_info.type); + destination.d_pixels = context.render_buffers->buffer.device_pointer + + pass.denoised_offset * sizeof(float); + destination.num_components = 3; + destination.pixel_stride = context.buffer_params.pass_stride; + + pass_accessor.get_render_tile_pixels(context.render_buffers, context.buffer_params, destination); +} + +bool OptiXDevice::denoise_filter_color_preprocess(DenoiseContext &context, const DenoisePass &pass) +{ + const BufferParams &buffer_params = context.buffer_params; + + const int work_size = buffer_params.width * buffer_params.height; + + void *args[] = {&context.render_buffers->buffer.device_pointer, + const_cast<int *>(&buffer_params.full_x), + const_cast<int *>(&buffer_params.full_y), + const_cast<int *>(&buffer_params.width), + const_cast<int *>(&buffer_params.height), + const_cast<int *>(&buffer_params.offset), + const_cast<int 
*>(&buffer_params.stride), + const_cast<int *>(&buffer_params.pass_stride), + const_cast<int *>(&pass.denoised_offset)}; + + return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_COLOR_PREPROCESS, work_size, args); +} + +bool OptiXDevice::denoise_filter_color_postprocess(DenoiseContext &context, + const DenoisePass &pass) +{ + const BufferParams &buffer_params = context.buffer_params; + + const int work_size = buffer_params.width * buffer_params.height; + + void *args[] = {&context.render_buffers->buffer.device_pointer, + const_cast<int *>(&buffer_params.full_x), + const_cast<int *>(&buffer_params.full_y), + const_cast<int *>(&buffer_params.width), + const_cast<int *>(&buffer_params.height), + const_cast<int *>(&buffer_params.offset), + const_cast<int *>(&buffer_params.stride), + const_cast<int *>(&buffer_params.pass_stride), + const_cast<int *>(&context.num_samples), + const_cast<int *>(&pass.noisy_offset), + const_cast<int *>(&pass.denoised_offset), + const_cast<int *>(&context.pass_sample_count), + const_cast<int *>(&pass.num_components), + const_cast<bool *>(&pass.use_compositing)}; + + return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS, work_size, args); +} + +bool OptiXDevice::denoise_ensure(DenoiseContext &context) +{ + if (!denoise_create_if_needed(context)) { + LOG(ERROR) << "OptiX denoiser creation has failed."; + return false; + } + + if (!denoise_configure_if_needed(context)) { + LOG(ERROR) << "OptiX denoiser configuration has failed."; + return false; + } + + return true; +} + +bool OptiXDevice::denoise_create_if_needed(DenoiseContext &context) +{ + const bool recreate_denoiser = (denoiser_.optix_denoiser == nullptr) || + (denoiser_.use_pass_albedo != context.use_pass_albedo) || + (denoiser_.use_pass_normal != context.use_pass_normal); + if (!recreate_denoiser) { + return true; + } + + /* Destroy existing handle before creating new one. 
*/ + if (denoiser_.optix_denoiser) { + optixDenoiserDestroy(denoiser_.optix_denoiser); + } + + /* Create OptiX denoiser handle on demand when it is first used. */ + OptixDenoiserOptions denoiser_options = {}; + denoiser_options.guideAlbedo = context.use_pass_albedo; + denoiser_options.guideNormal = context.use_pass_normal; + const OptixResult result = optixDenoiserCreate( + this->context, OPTIX_DENOISER_MODEL_KIND_HDR, &denoiser_options, &denoiser_.optix_denoiser); + + if (result != OPTIX_SUCCESS) { + set_error("Failed to create OptiX denoiser"); + return false; + } + + /* OptiX denoiser handle was created with the requested number of input passes. */ + denoiser_.use_pass_albedo = context.use_pass_albedo; + denoiser_.use_pass_normal = context.use_pass_normal; + + /* OptiX denoiser has been created, but it needs configuration. */ + denoiser_.is_configured = false; + + return true; +} + +bool OptiXDevice::denoise_configure_if_needed(DenoiseContext &context) +{ + if (denoiser_.is_configured && (denoiser_.configured_size.x == context.buffer_params.width && + denoiser_.configured_size.y == context.buffer_params.height)) { + return true; + } + + const BufferParams &buffer_params = context.buffer_params; + + OptixDenoiserSizes sizes = {}; + optix_assert(optixDenoiserComputeMemoryResources( + denoiser_.optix_denoiser, buffer_params.width, buffer_params.height, &sizes)); + + denoiser_.scratch_size = sizes.withOverlapScratchSizeInBytes; + denoiser_.scratch_offset = sizes.stateSizeInBytes; + + /* Allocate denoiser state if tile size has changed since last setup. */ + denoiser_.state.alloc_to_device(denoiser_.scratch_offset + denoiser_.scratch_size); + + /* Initialize denoiser state for the current tile size. 
*/ + const OptixResult result = optixDenoiserSetup(denoiser_.optix_denoiser, + denoiser_.queue.stream(), + buffer_params.width, + buffer_params.height, + denoiser_.state.device_pointer, + denoiser_.scratch_offset, + denoiser_.state.device_pointer + + denoiser_.scratch_offset, + denoiser_.scratch_size); + if (result != OPTIX_SUCCESS) { + set_error("Failed to set up OptiX denoiser"); + return false; + } + + denoiser_.is_configured = true; + denoiser_.configured_size.x = buffer_params.width; + denoiser_.configured_size.y = buffer_params.height; + + return true; +} + +bool OptiXDevice::denoise_run(DenoiseContext &context, const DenoisePass &pass) +{ + const BufferParams &buffer_params = context.buffer_params; + const int width = buffer_params.width; + const int height = buffer_params.height; + + /* Set up input and output layer information. */ + OptixImage2D color_layer = {0}; + OptixImage2D albedo_layer = {0}; + OptixImage2D normal_layer = {0}; + + OptixImage2D output_layer = {0}; + + /* Color pass. */ + { + const int pass_denoised = pass.denoised_offset; + const int64_t pass_stride_in_bytes = context.buffer_params.pass_stride * sizeof(float); + + color_layer.data = context.render_buffers->buffer.device_pointer + + pass_denoised * sizeof(float); + color_layer.width = width; + color_layer.height = height; + color_layer.rowStrideInBytes = pass_stride_in_bytes * context.buffer_params.stride; + color_layer.pixelStrideInBytes = pass_stride_in_bytes; + color_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3; + } + + device_vector<float> fake_albedo(this, "fake_albedo", MEM_READ_WRITE); + + /* Optional albedo and color passes. 
*/ + if (context.num_input_passes > 1) { + const device_ptr d_guiding_buffer = context.guiding_params.device_pointer; + const int64_t pixel_stride_in_bytes = context.guiding_params.pass_stride * sizeof(float); + const int64_t row_stride_in_bytes = context.guiding_params.stride * pixel_stride_in_bytes; + + if (context.use_pass_albedo) { + albedo_layer.data = d_guiding_buffer + context.guiding_params.pass_albedo * sizeof(float); + albedo_layer.width = width; + albedo_layer.height = height; + albedo_layer.rowStrideInBytes = row_stride_in_bytes; + albedo_layer.pixelStrideInBytes = pixel_stride_in_bytes; + albedo_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3; + } + + if (context.use_pass_normal) { + normal_layer.data = d_guiding_buffer + context.guiding_params.pass_normal * sizeof(float); + normal_layer.width = width; + normal_layer.height = height; + normal_layer.rowStrideInBytes = row_stride_in_bytes; + normal_layer.pixelStrideInBytes = pixel_stride_in_bytes; + normal_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3; + } + } + + /* Denoise in-place of the noisy input in the render buffers. */ + output_layer = color_layer; + + /* Finally run denoising. */ + OptixDenoiserParams params = {}; /* All parameters are disabled/zero. 
*/ + OptixDenoiserLayer image_layers = {}; + image_layers.input = color_layer; + image_layers.output = output_layer; + + OptixDenoiserGuideLayer guide_layers = {}; + guide_layers.albedo = albedo_layer; + guide_layers.normal = normal_layer; + + optix_assert(optixDenoiserInvoke(denoiser_.optix_denoiser, + denoiser_.queue.stream(), + ¶ms, + denoiser_.state.device_pointer, + denoiser_.scratch_offset, + &guide_layers, + &image_layers, + 1, + 0, + 0, + denoiser_.state.device_pointer + denoiser_.scratch_offset, + denoiser_.scratch_size)); + + return true; +} + +bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh, + OptixBuildOperation operation, + const OptixBuildInput &build_input, + uint16_t num_motion_steps) +{ + const CUDAContextScope scope(this); + + const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC); + + /* Compute memory usage. */ + OptixAccelBufferSizes sizes = {}; + OptixAccelBuildOptions options = {}; + options.operation = operation; + if (use_fast_trace_bvh) { + VLOG(2) << "Using fast to trace OptiX BVH"; + options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION; + } + else { + VLOG(2) << "Using fast to update OptiX BVH"; + options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE; + } + + options.motionOptions.numKeys = num_motion_steps; + options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH; + options.motionOptions.timeBegin = 0.0f; + options.motionOptions.timeEnd = 1.0f; + + optix_assert(optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes)); + + /* Allocate required output buffers. */ + device_only_memory<char> temp_mem(this, "optix temp as build mem"); + temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8); + if (!temp_mem.device_pointer) { + /* Make sure temporary memory allocation succeeded. 
*/ + return false; + } + + device_only_memory<char> &out_data = bvh->as_data; + if (operation == OPTIX_BUILD_OPERATION_BUILD) { + assert(out_data.device == this); + out_data.alloc_to_device(sizes.outputSizeInBytes); + if (!out_data.device_pointer) { + return false; + } + } + else { + assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes); + } + + /* Finally build the acceleration structure. */ + OptixAccelEmitDesc compacted_size_prop = {}; + compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE; + /* A tiny space was allocated for this property at the end of the temporary buffer above. + * Make sure this pointer is 8-byte aligned. */ + compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8); + + OptixTraversableHandle out_handle = 0; + optix_assert(optixAccelBuild(context, + NULL, + &options, + &build_input, + 1, + temp_mem.device_pointer, + sizes.tempSizeInBytes, + out_data.device_pointer, + sizes.outputSizeInBytes, + &out_handle, + use_fast_trace_bvh ? &compacted_size_prop : NULL, + use_fast_trace_bvh ? 1 : 0)); + bvh->traversable_handle = static_cast<uint64_t>(out_handle); + + /* Wait for all operations to finish. */ + cuda_assert(cuStreamSynchronize(NULL)); + + /* Compact acceleration structure to save memory (do not do this in viewport for faster builds). + */ + if (use_fast_trace_bvh) { + uint64_t compacted_size = sizes.outputSizeInBytes; + cuda_assert(cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size))); + + /* Temporary memory is no longer needed, so free it now to make space. */ + temp_mem.free(); + + /* There is no point compacting if the size does not change. */ + if (compacted_size < sizes.outputSizeInBytes) { + device_only_memory<char> compacted_data(this, "optix compacted as"); + compacted_data.alloc_to_device(compacted_size); + if (!compacted_data.device_pointer) + /* Do not compact if memory allocation for compacted acceleration structure fails. 
+ * Can just use the uncompacted one then, so succeed here regardless. */ + return !have_error(); + + optix_assert(optixAccelCompact( + context, NULL, out_handle, compacted_data.device_pointer, compacted_size, &out_handle)); + bvh->traversable_handle = static_cast<uint64_t>(out_handle); + + /* Wait for compaction to finish. */ + cuda_assert(cuStreamSynchronize(NULL)); + + std::swap(out_data.device_size, compacted_data.device_size); + std::swap(out_data.device_pointer, compacted_data.device_pointer); + } + } + + return !have_error(); +} + +void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) +{ + const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC); + + free_bvh_memory_delayed(); + + BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh); + + progress.set_substatus("Building OptiX acceleration structure"); + + if (!bvh->params.top_level) { + assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1); + + /* Refit is only possible in viewport for now (because AS is built with + * OPTIX_BUILD_FLAG_ALLOW_UPDATE only there, see above). */ + OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD; + if (refit && !use_fast_trace_bvh) { + assert(bvh_optix->traversable_handle != 0); + operation = OPTIX_BUILD_OPERATION_UPDATE; + } + else { + bvh_optix->as_data.free(); + bvh_optix->traversable_handle = 0; + } + + /* Build bottom level acceleration structures (BLAS). */ + Geometry *const geom = bvh->geometry[0]; + if (geom->geometry_type == Geometry::HAIR) { + /* Build BLAS for curve primitives. 
*/ + Hair *const hair = static_cast<Hair *const>(geom); + if (hair->num_curves() == 0) { + return; + } + + const size_t num_segments = hair->num_segments(); + + size_t num_motion_steps = 1; + Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + if (motion_blur && hair->get_use_motion_blur() && motion_keys) { + num_motion_steps = hair->get_motion_steps(); + } + + device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY); + device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY); + device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY); + /* Four control points for each curve segment. */ + const size_t num_vertices = num_segments * 4; + if (hair->curve_shape == CURVE_THICK) { + index_data.alloc(num_segments); + vertex_data.alloc(num_vertices * num_motion_steps); + } + else + aabb_data.alloc(num_segments * num_motion_steps); + + /* Get AABBs for each motion step. */ + for (size_t step = 0; step < num_motion_steps; ++step) { + /* The center step for motion vertices is not stored in the attribute. */ + const float3 *keys = hair->get_curve_keys().data(); + size_t center_step = (num_motion_steps - 1) / 2; + if (step != center_step) { + size_t attr_offset = (step > center_step) ? step - 1 : step; + /* Technically this is a float4 array, but sizeof(float3) == sizeof(float4). 
*/ + keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size(); + } + + for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) { + const Hair::Curve curve = hair->get_curve(j); + const array<float> &curve_radius = hair->get_curve_radius(); + + for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) { + if (hair->curve_shape == CURVE_THICK) { + int k0 = curve.first_key + segment; + int k1 = k0 + 1; + int ka = max(k0 - 1, curve.first_key); + int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1); + + const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x); + const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y); + const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z); + const float4 pw = make_float4( + curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]); + + /* Convert Catmull-Rom data to Bezier spline. */ + static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f; + static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f; + static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f; + static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f; + + index_data[i] = i * 4; + float4 *const v = vertex_data.data() + step * num_vertices + index_data[i]; + v[0] = make_float4( + dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw)); + v[1] = make_float4( + dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw)); + v[2] = make_float4( + dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw)); + v[3] = make_float4( + dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw)); + } + else { + BoundBox bounds = BoundBox::empty; + curve.bounds_grow(segment, keys, hair->get_curve_radius().data(), bounds); + + const size_t index = step * num_segments + i; + aabb_data[index].minX = bounds.min.x; + aabb_data[index].minY = bounds.min.y; + aabb_data[index].minZ = 
bounds.min.z; + aabb_data[index].maxX = bounds.max.x; + aabb_data[index].maxY = bounds.max.y; + aabb_data[index].maxZ = bounds.max.z; + } + } + } + } + + /* Upload AABB data to GPU. */ + aabb_data.copy_to_device(); + index_data.copy_to_device(); + vertex_data.copy_to_device(); + + vector<device_ptr> aabb_ptrs; + aabb_ptrs.reserve(num_motion_steps); + vector<device_ptr> width_ptrs; + vector<device_ptr> vertex_ptrs; + width_ptrs.reserve(num_motion_steps); + vertex_ptrs.reserve(num_motion_steps); + for (size_t step = 0; step < num_motion_steps; ++step) { + aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb)); + const device_ptr base_ptr = vertex_data.device_pointer + + step * num_vertices * sizeof(float4); + width_ptrs.push_back(base_ptr + 3 * sizeof(float)); /* Offset by vertex size. */ + vertex_ptrs.push_back(base_ptr); + } + + /* Force a single any-hit call, so shadow record-all behavior works correctly. */ + unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL; + OptixBuildInput build_input = {}; + if (hair->curve_shape == CURVE_THICK) { + build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES; + build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE; + build_input.curveArray.numPrimitives = num_segments; + build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data(); + build_input.curveArray.numVertices = num_vertices; + build_input.curveArray.vertexStrideInBytes = sizeof(float4); + build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data(); + build_input.curveArray.widthStrideInBytes = sizeof(float4); + build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer; + build_input.curveArray.indexStrideInBytes = sizeof(int); + build_input.curveArray.flag = build_flags; + build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset; + } + else { + /* Disable visibility test any-hit program, since it is already checked during + * intersection. 
Those trace calls that require anyhit can force it with a ray flag. */ + build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT; + + build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES; + build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data(); + build_input.customPrimitiveArray.numPrimitives = num_segments; + build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb); + build_input.customPrimitiveArray.flags = &build_flags; + build_input.customPrimitiveArray.numSbtRecords = 1; + build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset; + } + + if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) { + progress.set_error("Failed to build OptiX acceleration structure"); + } + } + else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) { + /* Build BLAS for triangle primitives. */ + Mesh *const mesh = static_cast<Mesh *const>(geom); + if (mesh->num_triangles() == 0) { + return; + } + + const size_t num_verts = mesh->get_verts().size(); + + size_t num_motion_steps = 1; + Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + if (motion_blur && mesh->get_use_motion_blur() && motion_keys) { + num_motion_steps = mesh->get_motion_steps(); + } + + device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY); + index_data.alloc(mesh->get_triangles().size()); + memcpy(index_data.data(), + mesh->get_triangles().data(), + mesh->get_triangles().size() * sizeof(int)); + device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY); + vertex_data.alloc(num_verts * num_motion_steps); + + for (size_t step = 0; step < num_motion_steps; ++step) { + const float3 *verts = mesh->get_verts().data(); + + size_t center_step = (num_motion_steps - 1) / 2; + /* The center step for motion vertices is not stored in the attribute. */ + if (step != center_step) { + verts = motion_keys->data_float3() + (step > center_step ? 
step - 1 : step) * num_verts; + } + + memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3)); + } + + /* Upload triangle data to GPU. */ + index_data.copy_to_device(); + vertex_data.copy_to_device(); + + vector<device_ptr> vertex_ptrs; + vertex_ptrs.reserve(num_motion_steps); + for (size_t step = 0; step < num_motion_steps; ++step) { + vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3)); + } + + /* Force a single any-hit call, so shadow record-all behavior works correctly. */ + unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL; + OptixBuildInput build_input = {}; + build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES; + build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data(); + build_input.triangleArray.numVertices = num_verts; + build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3; + build_input.triangleArray.vertexStrideInBytes = sizeof(float4); + build_input.triangleArray.indexBuffer = index_data.device_pointer; + build_input.triangleArray.numIndexTriplets = mesh->num_triangles(); + build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3; + build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int); + build_input.triangleArray.flags = &build_flags; + /* The SBT does not store per primitive data since Cycles already allocates separate + * buffers for that purpose. OptiX does not allow this to be zero though, so just pass in + * one and rely on that having the same meaning in this case. 
*/ + build_input.triangleArray.numSbtRecords = 1; + build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset; + + if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) { + progress.set_error("Failed to build OptiX acceleration structure"); + } + } + } + else { + unsigned int num_instances = 0; + unsigned int max_num_instances = 0xFFFFFFFF; + + bvh_optix->as_data.free(); + bvh_optix->traversable_handle = 0; + bvh_optix->motion_transform_data.free(); + + optixDeviceContextGetProperty(context, + OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID, + &max_num_instances, + sizeof(max_num_instances)); + /* Do not count first bit, which is used to distinguish instanced and non-instanced objects. */ + max_num_instances >>= 1; + if (bvh->objects.size() > max_num_instances) { + progress.set_error( + "Failed to build OptiX acceleration structure because there are too many instances"); + return; + } + + /* Fill instance descriptions. */ + device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY); + instances.alloc(bvh->objects.size()); + + /* Calculate total motion transform size and allocate memory for them. */ + size_t motion_transform_offset = 0; + if (motion_blur) { + size_t total_motion_transform_size = 0; + for (Object *const ob : bvh->objects) { + if (ob->is_traceable() && ob->use_motion()) { + total_motion_transform_size = align_up(total_motion_transform_size, + OPTIX_TRANSFORM_BYTE_ALIGNMENT); + const size_t motion_keys = max(ob->get_motion().size(), 2) - 2; + total_motion_transform_size = total_motion_transform_size + + sizeof(OptixSRTMotionTransform) + + motion_keys * sizeof(OptixSRTData); + } + } + + assert(bvh_optix->motion_transform_data.device == this); + bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size); + } + + for (Object *ob : bvh->objects) { + /* Skip non-traceable objects. 
*/ + if (!ob->is_traceable()) { + continue; + } + + BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh); + OptixTraversableHandle handle = blas->traversable_handle; + + OptixInstance &instance = instances[num_instances++]; + memset(&instance, 0, sizeof(instance)); + + /* Clear transform to identity matrix. */ + instance.transform[0] = 1.0f; + instance.transform[5] = 1.0f; + instance.transform[10] = 1.0f; + + /* Set user instance ID to object index (but leave low bit blank). */ + instance.instanceId = ob->get_device_index() << 1; + + /* Have to have at least one bit in the mask, or else instance would always be culled. */ + instance.visibilityMask = 1; + + if (ob->get_geometry()->has_volume) { + /* Volumes have a special bit set in the visibility mask so a trace can mask only volumes. + */ + instance.visibilityMask |= 2; + } + + if (ob->get_geometry()->geometry_type == Geometry::HAIR) { + /* Same applies to curves (so they can be skipped in local trace calls). */ + instance.visibilityMask |= 4; + + if (motion_blur && ob->get_geometry()->has_motion_blur() && + static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) { + /* Select between motion blur and non-motion blur built-in intersection module. */ + instance.sbtOffset = PG_HITD_MOTION - PG_HITD; + } + } + + /* Insert motion traversable if object has motion. */ + if (motion_blur && ob->use_motion()) { + size_t motion_keys = max(ob->get_motion().size(), 2) - 2; + size_t motion_transform_size = sizeof(OptixSRTMotionTransform) + + motion_keys * sizeof(OptixSRTData); + + const CUDAContextScope scope(this); + + motion_transform_offset = align_up(motion_transform_offset, + OPTIX_TRANSFORM_BYTE_ALIGNMENT); + CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer + + motion_transform_offset; + motion_transform_offset += motion_transform_size; + + /* Allocate host side memory for motion transform and fill it with transform data. 
*/ + OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>( + new uint8_t[motion_transform_size]); + motion_transform.child = handle; + motion_transform.motionOptions.numKeys = ob->get_motion().size(); + motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE; + motion_transform.motionOptions.timeBegin = 0.0f; + motion_transform.motionOptions.timeEnd = 1.0f; + + OptixSRTData *const srt_data = motion_transform.srtData; + array<DecomposedTransform> decomp(ob->get_motion().size()); + transform_motion_decompose( + decomp.data(), ob->get_motion().data(), ob->get_motion().size()); + + for (size_t i = 0; i < ob->get_motion().size(); ++i) { + /* Scale. */ + srt_data[i].sx = decomp[i].y.w; /* scale.x.x */ + srt_data[i].sy = decomp[i].z.w; /* scale.y.y */ + srt_data[i].sz = decomp[i].w.w; /* scale.z.z */ + + /* Shear. */ + srt_data[i].a = decomp[i].z.x; /* scale.x.y */ + srt_data[i].b = decomp[i].z.y; /* scale.x.z */ + srt_data[i].c = decomp[i].w.x; /* scale.y.z */ + assert(decomp[i].z.z == 0.0f); /* scale.y.x */ + assert(decomp[i].w.y == 0.0f); /* scale.z.x */ + assert(decomp[i].w.z == 0.0f); /* scale.z.y */ + + /* Pivot point. */ + srt_data[i].pvx = 0.0f; + srt_data[i].pvy = 0.0f; + srt_data[i].pvz = 0.0f; + + /* Rotation. */ + srt_data[i].qx = decomp[i].x.x; + srt_data[i].qy = decomp[i].x.y; + srt_data[i].qz = decomp[i].x.z; + srt_data[i].qw = decomp[i].x.w; + + /* Translation. */ + srt_data[i].tx = decomp[i].y.x; + srt_data[i].ty = decomp[i].y.y; + srt_data[i].tz = decomp[i].y.z; + } + + /* Upload motion transform to GPU. */ + cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size); + delete[] reinterpret_cast<uint8_t *>(&motion_transform); + + /* Disable instance transform if object uses motion transform already. */ + instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; + + /* Get traversable handle to motion transform. 
*/ + optixConvertPointerToTraversableHandle(context, + motion_transform_gpu, + OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM, + &instance.traversableHandle); + } + else { + instance.traversableHandle = handle; + + if (ob->get_geometry()->is_instanced()) { + /* Set transform matrix. */ + memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform)); + } + else { + /* Disable instance transform if geometry already has it applied to vertex data. */ + instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; + /* Non-instanced objects read ID from 'prim_object', so distinguish + * them from instanced objects with the low bit set. */ + instance.instanceId |= 1; + } + } + } + + /* Upload instance descriptions. */ + instances.resize(num_instances); + instances.copy_to_device(); + + /* Build top-level acceleration structure (TLAS) */ + OptixBuildInput build_input = {}; + build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES; + build_input.instanceArray.instances = instances.device_pointer; + build_input.instanceArray.numInstances = num_instances; + + if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) { + progress.set_error("Failed to build OptiX acceleration structure"); + } + tlas_handle = bvh_optix->traversable_handle; + } +} + +void OptiXDevice::release_optix_bvh(BVH *bvh) +{ + thread_scoped_lock lock(delayed_free_bvh_mutex); + /* Do delayed free of BVH memory, since geometry holding BVH might be deleted + * while GPU is still rendering. 
*/ + BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh); + + delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->as_data)); + delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->motion_transform_data)); + bvh_optix->traversable_handle = 0; +} + +void OptiXDevice::free_bvh_memory_delayed() +{ + thread_scoped_lock lock(delayed_free_bvh_mutex); + delayed_free_bvh_memory.free_memory(); +} + +void OptiXDevice::const_copy_to(const char *name, void *host, size_t size) +{ + /* Set constant memory for CUDA module. */ + CUDADevice::const_copy_to(name, host, size); + + if (strcmp(name, "__data") == 0) { + assert(size <= sizeof(KernelData)); + + /* Update traversable handle (since it is different for each device on multi devices). */ + KernelData *const data = (KernelData *)host; + *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle; + + update_launch_params(offsetof(KernelParamsOptiX, data), host, size); + return; + } + + /* Update data storage pointers in launch parameters. */ +# define KERNEL_TEX(data_type, tex_name) \ + if (strcmp(name, #tex_name) == 0) { \ + update_launch_params(offsetof(KernelParamsOptiX, tex_name), host, size); \ + return; \ + } + KERNEL_TEX(IntegratorStateGPU, __integrator_state) +# include "kernel/kernel_textures.h" +# undef KERNEL_TEX +} + +void OptiXDevice::update_launch_params(size_t offset, void *data, size_t data_size) +{ + const CUDAContextScope scope(this); + + cuda_assert(cuMemcpyHtoD(launch_params.device_pointer + offset, data, data_size)); +} + +CCL_NAMESPACE_END + +#endif /* WITH_OPTIX */ diff --git a/intern/cycles/device/optix/device_impl.h b/intern/cycles/device/optix/device_impl.h new file mode 100644 index 00000000000..91ef52e0a5a --- /dev/null +++ b/intern/cycles/device/optix/device_impl.h @@ -0,0 +1,186 @@ +/* + * Copyright 2019, NVIDIA Corporation. + * Copyright 2019, Blender Foundation. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_OPTIX + +# include "device/cuda/device_impl.h" +# include "device/optix/queue.h" +# include "device/optix/util.h" +# include "kernel/kernel_types.h" + +CCL_NAMESPACE_BEGIN + +class BVHOptiX; +struct KernelParamsOptiX; + +/* List of OptiX program groups. */ +enum { + PG_RGEN_INTERSECT_CLOSEST, + PG_RGEN_INTERSECT_SHADOW, + PG_RGEN_INTERSECT_SUBSURFACE, + PG_RGEN_INTERSECT_VOLUME_STACK, + PG_RGEN_SHADE_SURFACE_RAYTRACE, + PG_MISS, + PG_HITD, /* Default hit group. */ + PG_HITS, /* __SHADOW_RECORD_ALL__ hit group. */ + PG_HITL, /* __BVH_LOCAL__ hit group (only used for triangles). */ + PG_HITD_MOTION, + PG_HITS_MOTION, + PG_CALL_SVM_AO, + PG_CALL_SVM_BEVEL, + PG_CALL_AO_PASS, + NUM_PROGRAM_GROUPS +}; + +static const int MISS_PROGRAM_GROUP_OFFSET = PG_MISS; +static const int NUM_MIS_PROGRAM_GROUPS = 1; +static const int HIT_PROGAM_GROUP_OFFSET = PG_HITD; +static const int NUM_HIT_PROGRAM_GROUPS = 5; +static const int CALLABLE_PROGRAM_GROUPS_BASE = PG_CALL_SVM_AO; +static const int NUM_CALLABLE_PROGRAM_GROUPS = 3; + +/* List of OptiX pipelines. */ +enum { PIP_SHADE_RAYTRACE, PIP_INTERSECT, NUM_PIPELINES }; + +/* A single shader binding table entry. 
*/ +struct SbtRecord { + char header[OPTIX_SBT_RECORD_HEADER_SIZE]; +}; + +class OptiXDevice : public CUDADevice { + public: + OptixDeviceContext context = NULL; + + OptixModule optix_module = NULL; /* All necessary OptiX kernels are in one module. */ + OptixModule builtin_modules[2] = {}; + OptixPipeline pipelines[NUM_PIPELINES] = {}; + + bool motion_blur = false; + device_vector<SbtRecord> sbt_data; + device_only_memory<KernelParamsOptiX> launch_params; + OptixTraversableHandle tlas_handle = 0; + + vector<device_only_memory<char>> delayed_free_bvh_memory; + thread_mutex delayed_free_bvh_mutex; + + class Denoiser { + public: + explicit Denoiser(OptiXDevice *device); + ~Denoiser(); + + OptiXDevice *device; + OptiXDeviceQueue queue; + + OptixDenoiser optix_denoiser = nullptr; + + /* Configuration size, as provided to `optixDenoiserSetup`. + * If the `optixDenoiserSetup()` was never used on the current `optix_denoiser` the + * `is_configured` will be false. */ + bool is_configured = false; + int2 configured_size = make_int2(0, 0); + + /* OptiX denoiser state and scratch buffers, stored in a single memory buffer. + * The memory layout goes as following: [denoiser state][scratch buffer]. 
 */ + device_only_memory<unsigned char> state; + size_t scratch_offset = 0; + size_t scratch_size = 0; + + bool use_pass_albedo = false; + bool use_pass_normal = false; + }; + Denoiser denoiser_; + + public: + OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler); + ~OptiXDevice(); + + private: + BVHLayoutMask get_bvh_layout_mask() const override; + + string compile_kernel_get_common_cflags(const uint kernel_features) override; + + bool load_kernels(const uint kernel_features) override; + + bool build_optix_bvh(BVHOptiX *bvh, + OptixBuildOperation operation, + const OptixBuildInput &build_input, + uint16_t num_motion_steps); + + void build_bvh(BVH *bvh, Progress &progress, bool refit) override; + + void release_optix_bvh(BVH *bvh) override; + void free_bvh_memory_delayed(); + + void const_copy_to(const char *name, void *host, size_t size) override; + + void update_launch_params(size_t offset, void *data, size_t data_size); + + virtual unique_ptr<DeviceQueue> gpu_queue_create() override; + + /* -------------------------------------------------------------------- + * Denoising. + */ + + class DenoiseContext; + class DenoisePass; + + virtual bool denoise_buffer(const DeviceDenoiseTask &task) override; + virtual DeviceQueue *get_denoise_queue() override; + + /* Read guiding passes from the render buffers, preprocess them in a way which is expected by + * OptiX and store in the guiding passes memory within the given context. + * + * Pre-processing of the guiding passes is to only happen once per context lifetime. Do not + * preprocess them for every pass which is being denoised. */ + bool denoise_filter_guiding_preprocess(DenoiseContext &context); + + /* Set fake albedo pixels in the albedo guiding pass storage. + * After this point only passes which do not need albedo for denoising can be processed. 
*/ + bool denoise_filter_guiding_set_fake_albedo(DenoiseContext &context); + + void denoise_pass(DenoiseContext &context, PassType pass_type); + + /* Read input color pass from the render buffer into the memory which corresponds to the noisy + * input within the given context. Pixels are scaled to the number of samples, but are not + * preprocessed yet. */ + void denoise_color_read(DenoiseContext &context, const DenoisePass &pass); + + /* Run corresponding filter kernels, preparing data for the denoiser or copying data from the + * denoiser result to the render buffer. */ + bool denoise_filter_color_preprocess(DenoiseContext &context, const DenoisePass &pass); + bool denoise_filter_color_postprocess(DenoiseContext &context, const DenoisePass &pass); + + /* Make sure the OptiX denoiser is created and configured. */ + bool denoise_ensure(DenoiseContext &context); + + /* Create OptiX denoiser descriptor if needed. + * Will do nothing if the current OptiX descriptor is usable for the given parameters. + * If the OptiX denoiser descriptor did re-allocate here it is left unconfigured. */ + bool denoise_create_if_needed(DenoiseContext &context); + + /* Configure existing OptiX denoiser descriptor for the use for the given task. */ + bool denoise_configure_if_needed(DenoiseContext &context); + + /* Run configured denoiser. */ + bool denoise_run(DenoiseContext &context, const DenoisePass &pass); +}; + +CCL_NAMESPACE_END + +#endif /* WITH_OPTIX */ diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp new file mode 100644 index 00000000000..458ed70baa8 --- /dev/null +++ b/intern/cycles/device/optix/queue.cpp @@ -0,0 +1,144 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_OPTIX + +# include "device/optix/queue.h" +# include "device/optix/device_impl.h" + +# include "util/util_time.h" + +# undef __KERNEL_CPU__ +# define __KERNEL_OPTIX__ +# include "kernel/device/optix/globals.h" + +CCL_NAMESPACE_BEGIN + +/* OptiXDeviceQueue */ + +OptiXDeviceQueue::OptiXDeviceQueue(OptiXDevice *device) : CUDADeviceQueue(device) +{ +} + +void OptiXDeviceQueue::init_execution() +{ + CUDADeviceQueue::init_execution(); +} + +static bool is_optix_specific_kernel(DeviceKernel kernel) +{ + return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE || + kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || + kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW || + kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE || + kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK); +} + +bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[]) +{ + if (!is_optix_specific_kernel(kernel)) { + return CUDADeviceQueue::enqueue(kernel, work_size, args); + } + + if (cuda_device_->have_error()) { + return false; + } + + debug_enqueue(kernel, work_size); + + const CUDAContextScope scope(cuda_device_); + + OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_); + + const device_ptr sbt_data_ptr = optix_device->sbt_data.device_pointer; + const device_ptr launch_params_ptr = optix_device->launch_params.device_pointer; + + cuda_device_assert( + cuda_device_, + cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, path_index_array), + args[0], // &d_path_index + 
sizeof(device_ptr), + cuda_stream_)); + + if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) { + cuda_device_assert( + cuda_device_, + cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer), + args[1], // &d_render_buffer + sizeof(device_ptr), + cuda_stream_)); + } + + cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_)); + + OptixPipeline pipeline = nullptr; + OptixShaderBindingTable sbt_params = {}; + + switch (kernel) { + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: + pipeline = optix_device->pipelines[PIP_SHADE_RAYTRACE]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_RAYTRACE * sizeof(SbtRecord); + break; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: + pipeline = optix_device->pipelines[PIP_INTERSECT]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_CLOSEST * sizeof(SbtRecord); + break; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: + pipeline = optix_device->pipelines[PIP_INTERSECT]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_SHADOW * sizeof(SbtRecord); + break; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: + pipeline = optix_device->pipelines[PIP_INTERSECT]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_SUBSURFACE * sizeof(SbtRecord); + break; + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: + pipeline = optix_device->pipelines[PIP_INTERSECT]; + sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_VOLUME_STACK * sizeof(SbtRecord); + break; + + default: + LOG(ERROR) << "Invalid kernel " << device_kernel_as_string(kernel) + << " is attempted to be enqueued."; + return false; + } + + sbt_params.missRecordBase = sbt_data_ptr + MISS_PROGRAM_GROUP_OFFSET * sizeof(SbtRecord); + sbt_params.missRecordStrideInBytes = sizeof(SbtRecord); + sbt_params.missRecordCount = NUM_MIS_PROGRAM_GROUPS; + sbt_params.hitgroupRecordBase = sbt_data_ptr + HIT_PROGAM_GROUP_OFFSET * sizeof(SbtRecord); + 
 sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord); + sbt_params.hitgroupRecordCount = NUM_HIT_PROGRAM_GROUPS; + sbt_params.callablesRecordBase = sbt_data_ptr + CALLABLE_PROGRAM_GROUPS_BASE * sizeof(SbtRecord); + sbt_params.callablesRecordCount = NUM_CALLABLE_PROGRAM_GROUPS; + sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord); + + /* Launch the ray generation program. */ + optix_device_assert(optix_device, + optixLaunch(pipeline, + cuda_stream_, + launch_params_ptr, + optix_device->launch_params.data_elements, + &sbt_params, + work_size, + 1, + 1)); + + return !(optix_device->have_error()); +} + +CCL_NAMESPACE_END + +#endif /* WITH_OPTIX */ diff --git a/intern/cycles/device/optix/queue.h b/intern/cycles/device/optix/queue.h new file mode 100644 index 00000000000..0de422ccc71 --- /dev/null +++ b/intern/cycles/device/optix/queue.h @@ -0,0 +1,39 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_OPTIX + +# include "device/cuda/queue.h" + +CCL_NAMESPACE_BEGIN + +class OptiXDevice; + +/* OptiX implementation of the device queue, based on the CUDA queue. 
*/ +class OptiXDeviceQueue : public CUDADeviceQueue { + public: + OptiXDeviceQueue(OptiXDevice *device); + + virtual void init_execution() override; + + virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override; +}; + +CCL_NAMESPACE_END + +#endif /* WITH_OPTIX */ diff --git a/intern/cycles/device/optix/util.h b/intern/cycles/device/optix/util.h new file mode 100644 index 00000000000..34ae5bb5609 --- /dev/null +++ b/intern/cycles/device/optix/util.h @@ -0,0 +1,45 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_OPTIX + +# include "device/cuda/util.h" + +# ifdef WITH_CUDA_DYNLOAD +# include <cuew.h> +// Do not use CUDA SDK headers when using CUEW +# define OPTIX_DONT_INCLUDE_CUDA +# endif + +# include <optix_stubs.h> + +/* Utility for checking return values of OptiX function calls. */ +# define optix_device_assert(optix_device, stmt) \ + { \ + OptixResult result = stmt; \ + if (result != OPTIX_SUCCESS) { \ + const char *name = optixGetErrorName(result); \ + optix_device->set_error( \ + string_printf("%s in %s (%s:%d)", name, #stmt, __FILE__, __LINE__)); \ + } \ + } \ + (void)0 + +# define optix_assert(stmt) optix_device_assert(this, stmt) + +#endif /* WITH_OPTIX */ |