git.blender.org/blender.git
author     Patrick Mours <pmours@nvidia.com>  2020-02-11 20:54:50 +0300
committer  Patrick Mours <pmours@nvidia.com>  2020-02-12 15:11:32 +0300
commit     153e001c743bf0f6bc259966418446441e00e200 (patch)
tree       280bafa4dfcba152fe774864b1636dd281be9713 /intern
parent     cc085e228de70563ed0b6870d23ef3bb531c4798 (diff)
Cleanup: Move common CUDA/OptiX Cycles device code into separate file
This reduces code duplication between the CUDA and OptiX device implementations:
the CUDA device class is now split into a declaration and a definition (similar to
the OpenCL device), and the OptiX device class derives from it, overriding only the
functions it actually has to change while reusing the CUDA implementation for
everything else.

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D6814
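For readers unfamiliar with the pattern, the following is a minimal C++ sketch of the split described above. The class names echo the real ones, but the members and signatures are simplified placeholders for illustration, not the actual Cycles declarations in the diff below.

```cpp
// Minimal sketch (assumed, simplified): the CUDA device exposes its
// implementation through a header, and the OptiX device overrides only
// what it must, inheriting everything else.
#include <iostream>
#include <string>

// Stand-in for the declaration that device_cuda.h now provides.
class CUDADevice {
 public:
  virtual ~CUDADevice() = default;

  // Generic memory management, shared by both devices in the real code.
  virtual void mem_alloc(const std::string &name)
  {
    std::cout << "CUDA alloc: " << name << "\n";
  }

  // Ray-tracing entry point, which the OptiX device replaces.
  virtual void path_trace()
  {
    std::cout << "CUDA path trace\n";
  }
};

// Stand-in for device_optix.cpp: it only overrides what differs.
class OptiXDevice : public CUDADevice {
 public:
  void path_trace() override
  {
    std::cout << "OptiX RTX path trace\n";
  }
  // mem_alloc() is inherited unchanged from CUDADevice.
};

int main()
{
  OptiXDevice device;
  device.mem_alloc("render_buffer");  // shared CUDA implementation
  device.path_trace();                // OptiX override
}
```

In the actual change, the full interface lives in `intern/cycles/device/cuda/device_cuda.h`, its definition moves to `device_cuda_impl.cpp`, and `device_optix.cpp` includes the header and subclasses the real `CUDADevice`.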
Diffstat (limited to 'intern')

 intern/cycles/device/CMakeLists.txt                                                                             |   14
 intern/cycles/device/cuda/device_cuda.h                                                                         |  248
 intern/cycles/device/cuda/device_cuda_impl.cpp                                                                  | 2502
 intern/cycles/device/device_cuda.cpp                                                                            | 2571
 intern/cycles/device/device_opencl.cpp                                                                          |    4
 intern/cycles/device/device_optix.cpp                                                                           | 1273
 intern/cycles/device/opencl/device_opencl.h (renamed from intern/cycles/device/opencl/opencl.h)                 |    0
 intern/cycles/device/opencl/device_opencl_impl.cpp (renamed from intern/cycles/device/opencl/opencl_split.cpp)  |    2
 intern/cycles/device/opencl/memory_manager.cpp                                                                  |    2
 intern/cycles/device/opencl/opencl_util.cpp                                                                     |    2

 10 files changed, 2829 insertions, 3789 deletions
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 35a79356957..aa5b65a2b73 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -34,13 +34,17 @@ set(SRC
device_task.cpp
)
+set(SRC_CUDA
+ cuda/device_cuda.h
+ cuda/device_cuda_impl.cpp
+)
+
set(SRC_OPENCL
- opencl/opencl.h
+ opencl/device_opencl.h
+ opencl/device_opencl_impl.cpp
opencl/memory_manager.h
-
- opencl/opencl_split.cpp
- opencl/opencl_util.cpp
opencl/memory_manager.cpp
+ opencl/opencl_util.cpp
)
if(WITH_CYCLES_NETWORK)
@@ -98,4 +102,4 @@ endif()
include_directories(${INC})
include_directories(SYSTEM ${INC_SYS})
-cycles_add_library(cycles_device "${LIB}" ${SRC} ${SRC_OPENCL} ${SRC_HEADERS})
+cycles_add_library(cycles_device "${LIB}" ${SRC} ${SRC_CUDA} ${SRC_OPENCL} ${SRC_HEADERS})
diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h
new file mode 100644
index 00000000000..3d29d13a781
--- /dev/null
+++ b/intern/cycles/device/cuda/device_cuda.h
@@ -0,0 +1,248 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/device.h"
+# include "device/device_denoising.h"
+# include "device/device_split_kernel.h"
+
+# include "util/util_map.h"
+
+# ifdef WITH_CUDA_DYNLOAD
+# include "cuew.h"
+# else
+# include "util/util_opengl.h"
+# include <cuda.h>
+# include <cudaGL.h>
+# endif
+
+CCL_NAMESPACE_BEGIN
+
+class CUDADevice : public Device {
+
+ friend class CUDASplitKernelFunction;
+ friend class CUDASplitKernel;
+ friend class CUDAContextScope;
+
+ public:
+ DedicatedTaskPool task_pool;
+ CUdevice cuDevice;
+ CUcontext cuContext;
+ CUmodule cuModule, cuFilterModule;
+ size_t device_texture_headroom;
+ size_t device_working_headroom;
+ bool move_texture_to_host;
+ size_t map_host_used;
+ size_t map_host_limit;
+ int can_map_host;
+ int cuDevId;
+ int cuDevArchitecture;
+ bool first_error;
+ CUDASplitKernel *split_kernel;
+
+ struct CUDAMem {
+ CUDAMem() : texobject(0), array(0), use_mapped_host(false)
+ {
+ }
+
+ CUtexObject texobject;
+ CUarray array;
+
+ /* If true, a mapped host memory in shared_pointer is being used. */
+ bool use_mapped_host;
+ };
+ typedef map<device_memory *, CUDAMem> CUDAMemMap;
+ CUDAMemMap cuda_mem_map;
+
+ struct PixelMem {
+ GLuint cuPBO;
+ CUgraphicsResource cuPBOresource;
+ GLuint cuTexId;
+ int w, h;
+ };
+ map<device_ptr, PixelMem> pixel_mem_map;
+
+ /* Bindless Textures */
+ device_vector<TextureInfo> texture_info;
+ bool need_texture_info;
+
+ CUdeviceptr cuda_device_ptr(device_ptr mem)
+ {
+ return (CUdeviceptr)mem;
+ }
+
+ static bool have_precompiled_kernels();
+
+ virtual bool show_samples() const;
+
+ virtual BVHLayoutMask get_bvh_layout_mask() const;
+
+ void cuda_error_documentation();
+
+ bool cuda_error_(CUresult result, const string &stmt);
+
+ void cuda_error_message(const string &message);
+
+ CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_);
+
+ virtual ~CUDADevice();
+
+ bool support_device(const DeviceRequestedFeatures & /*requested_features*/);
+
+ bool use_adaptive_compilation();
+
+ bool use_split_kernel();
+
+ string compile_kernel_get_common_cflags(const DeviceRequestedFeatures &requested_features,
+ bool filter = false,
+ bool split = false);
+
+ bool compile_check_compiler();
+
+ string compile_kernel(const DeviceRequestedFeatures &requested_features,
+ bool filter = false,
+ bool split = false);
+
+ virtual bool load_kernels(const DeviceRequestedFeatures &requested_features);
+
+ void reserve_local_memory(const DeviceRequestedFeatures &requested_features);
+
+ void init_host_memory();
+
+ void load_texture_info();
+
+ void move_textures_to_host(size_t size, bool for_texture);
+
+ CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
+
+ void generic_copy_to(device_memory &mem);
+
+ void generic_free(device_memory &mem);
+
+ void mem_alloc(device_memory &mem);
+
+ void mem_copy_to(device_memory &mem);
+
+ void mem_copy_from(device_memory &mem, int y, int w, int h, int elem);
+
+ void mem_zero(device_memory &mem);
+
+ void mem_free(device_memory &mem);
+
+ device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/);
+
+ virtual void const_copy_to(const char *name, void *host, size_t size);
+
+ void tex_alloc(device_memory &mem);
+
+ void tex_free(device_memory &mem);
+
+ bool denoising_non_local_means(device_ptr image_ptr,
+ device_ptr guide_ptr,
+ device_ptr variance_ptr,
+ device_ptr out_ptr,
+ DenoisingTask *task);
+
+ bool denoising_construct_transform(DenoisingTask *task);
+
+ bool denoising_accumulate(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr scale_ptr,
+ int frame,
+ DenoisingTask *task);
+
+ bool denoising_solve(device_ptr output_ptr, DenoisingTask *task);
+
+ bool denoising_combine_halves(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ int r,
+ int4 rect,
+ DenoisingTask *task);
+
+ bool denoising_divide_shadow(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr sample_variance_ptr,
+ device_ptr sv_variance_ptr,
+ device_ptr buffer_variance_ptr,
+ DenoisingTask *task);
+
+ bool denoising_get_feature(int mean_offset,
+ int variance_offset,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ float scale,
+ DenoisingTask *task);
+
+ bool denoising_write_feature(int out_offset,
+ device_ptr from_ptr,
+ device_ptr buffer_ptr,
+ DenoisingTask *task);
+
+ bool denoising_detect_outliers(device_ptr image_ptr,
+ device_ptr variance_ptr,
+ device_ptr depth_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task);
+
+ void denoise(RenderTile &rtile, DenoisingTask &denoising);
+
+ void path_trace(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles);
+
+ void film_convert(DeviceTask &task,
+ device_ptr buffer,
+ device_ptr rgba_byte,
+ device_ptr rgba_half);
+
+ void shader(DeviceTask &task);
+
+ CUdeviceptr map_pixels(device_ptr mem);
+
+ void unmap_pixels(device_ptr mem);
+
+ void pixels_alloc(device_memory &mem);
+
+ void pixels_copy_from(device_memory &mem, int y, int w, int h);
+
+ void pixels_free(device_memory &mem);
+
+ void draw_pixels(device_memory &mem,
+ int y,
+ int w,
+ int h,
+ int width,
+ int height,
+ int dx,
+ int dy,
+ int dw,
+ int dh,
+ bool transparent,
+ const DeviceDrawParams &draw_params);
+
+ void thread_run(DeviceTask *task);
+
+ virtual void task_add(DeviceTask &task);
+
+ virtual void task_wait();
+
+ virtual void task_cancel();
+};
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp
new file mode 100644
index 00000000000..a4e1c026263
--- /dev/null
+++ b/intern/cycles/device/cuda/device_cuda_impl.cpp
@@ -0,0 +1,2502 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include <climits>
+# include <limits.h>
+# include <stdio.h>
+# include <stdlib.h>
+# include <string.h>
+
+# include "device/cuda/device_cuda.h"
+# include "device/device_intern.h"
+# include "device/device_split_kernel.h"
+
+# include "render/buffers.h"
+
+# include "kernel/filter/filter_defines.h"
+
+# include "util/util_debug.h"
+# include "util/util_foreach.h"
+# include "util/util_logging.h"
+# include "util/util_map.h"
+# include "util/util_md5.h"
+# include "util/util_opengl.h"
+# include "util/util_path.h"
+# include "util/util_string.h"
+# include "util/util_system.h"
+# include "util/util_types.h"
+# include "util/util_time.h"
+# include "util/util_windows.h"
+
+# include "kernel/split/kernel_split_data_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+# ifndef WITH_CUDA_DYNLOAD
+
+/* Transparently implement some functions, so majority of the file does not need
+ * to worry about difference between dynamically loaded and linked CUDA at all.
+ */
+
+namespace {
+
+const char *cuewErrorString(CUresult result)
+{
+ /* We can only give error code here without major code duplication, that
+ * should be enough since dynamic loading is only being disabled by folks
+ * who knows what they're doing anyway.
+ *
+ * NOTE: Avoid call from several threads.
+ */
+ static string error;
+ error = string_printf("%d", result);
+ return error.c_str();
+}
+
+const char *cuewCompilerPath()
+{
+ return CYCLES_CUDA_NVCC_EXECUTABLE;
+}
+
+int cuewCompilerVersion()
+{
+ return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10);
+}
+
+} /* namespace */
+# endif /* WITH_CUDA_DYNLOAD */
+
+class CUDADevice;
+
+class CUDASplitKernel : public DeviceSplitKernel {
+ CUDADevice *device;
+
+ public:
+ explicit CUDASplitKernel(CUDADevice *device);
+
+ virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads);
+
+ virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
+ RenderTile &rtile,
+ int num_global_elements,
+ device_memory &kernel_globals,
+ device_memory &kernel_data_,
+ device_memory &split_data,
+ device_memory &ray_state,
+ device_memory &queue_index,
+ device_memory &use_queues_flag,
+ device_memory &work_pool_wgs);
+
+ virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
+ const DeviceRequestedFeatures &);
+ virtual int2 split_kernel_local_size();
+ virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask *task);
+};
+
+/* Utility to push/pop CUDA context. */
+class CUDAContextScope {
+ public:
+ CUDAContextScope(CUDADevice *device);
+ ~CUDAContextScope();
+
+ private:
+ CUDADevice *device;
+};
+
+bool CUDADevice::have_precompiled_kernels()
+{
+ string cubins_path = path_get("lib");
+ return path_exists(cubins_path);
+}
+
+bool CUDADevice::show_samples() const
+{
+ /* The CUDADevice only processes one tile at a time, so showing samples is fine. */
+ return true;
+}
+
+BVHLayoutMask CUDADevice::get_bvh_layout_mask() const
+{
+ return BVH_LAYOUT_BVH2;
+}
+
+void CUDADevice::cuda_error_documentation()
+{
+ if (first_error) {
+ fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
+ fprintf(stderr,
+ "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
+ first_error = false;
+ }
+}
+
+# define cuda_assert(stmt) \
+ { \
+ CUresult result = stmt; \
+\
+ if (result != CUDA_SUCCESS) { \
+ string message = string_printf( \
+ "CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \
+ if (error_msg == "") \
+ error_msg = message; \
+ fprintf(stderr, "%s\n", message.c_str()); \
+ /*cuda_abort();*/ \
+ cuda_error_documentation(); \
+ } \
+ } \
+ (void)0
+
+bool CUDADevice::cuda_error_(CUresult result, const string &stmt)
+{
+ if (result == CUDA_SUCCESS)
+ return false;
+
+ string message = string_printf("CUDA error at %s: %s", stmt.c_str(), cuewErrorString(result));
+ if (error_msg == "")
+ error_msg = message;
+ fprintf(stderr, "%s\n", message.c_str());
+ cuda_error_documentation();
+ return true;
+}
+
+# define cuda_error(stmt) cuda_error_(stmt, # stmt)
+
+void CUDADevice::cuda_error_message(const string &message)
+{
+ if (error_msg == "")
+ error_msg = message;
+ fprintf(stderr, "%s\n", message.c_str());
+ cuda_error_documentation();
+}
+
+CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
+ : Device(info, stats, profiler, background_), texture_info(this, "__texture_info", MEM_TEXTURE)
+{
+ first_error = true;
+ background = background_;
+
+ cuDevId = info.num;
+ cuDevice = 0;
+ cuContext = 0;
+
+ cuModule = 0;
+ cuFilterModule = 0;
+
+ split_kernel = NULL;
+
+ need_texture_info = false;
+
+ device_texture_headroom = 0;
+ device_working_headroom = 0;
+ move_texture_to_host = false;
+ map_host_limit = 0;
+ map_host_used = 0;
+ can_map_host = 0;
+
+ /* Initialize CUDA. */
+ if (cuda_error(cuInit(0)))
+ return;
+
+ /* Setup device and context. */
+ if (cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
+ return;
+
+ /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
+ * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
+ * so we can predict which memory to map to host. */
+ cuda_assert(
+ cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
+
+ unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
+ if (can_map_host) {
+ ctx_flags |= CU_CTX_MAP_HOST;
+ init_host_memory();
+ }
+
+ /* Create context. */
+ CUresult result;
+
+ if (background) {
+ result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
+ }
+ else {
+ result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice);
+
+ if (result != CUDA_SUCCESS) {
+ result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
+ background = true;
+ }
+ }
+
+ if (cuda_error_(result, "cuCtxCreate"))
+ return;
+
+ int major, minor;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+ cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+ cuDevArchitecture = major * 100 + minor * 10;
+
+ /* Pop context set by cuCtxCreate. */
+ cuCtxPopCurrent(NULL);
+}
+
+CUDADevice::~CUDADevice()
+{
+ task_pool.stop();
+
+ delete split_kernel;
+
+ texture_info.free();
+
+ cuda_assert(cuCtxDestroy(cuContext));
+}
+
+bool CUDADevice::support_device(const DeviceRequestedFeatures & /*requested_features*/)
+{
+ int major, minor;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+ cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+
+ /* We only support sm_30 and above */
+ if (major < 3) {
+ cuda_error_message(
+ string_printf("CUDA device supported only with compute capability 3.0 or up, found %d.%d.",
+ major,
+ minor));
+ return false;
+ }
+
+ return true;
+}
+
+bool CUDADevice::use_adaptive_compilation()
+{
+ return DebugFlags().cuda.adaptive_compile;
+}
+
+bool CUDADevice::use_split_kernel()
+{
+ return DebugFlags().cuda.split_kernel;
+}
+
+/* Common NVCC flags which stays the same regardless of shading model,
+ * kernel sources md5 and only depends on compiler or compilation settings.
+ */
+string CUDADevice::compile_kernel_get_common_cflags(
+ const DeviceRequestedFeatures &requested_features, bool filter, bool split)
+{
+ const int machine = system_cpu_bits();
+ const string source_path = path_get("source");
+ const string include_path = source_path;
+ string cflags = string_printf(
+ "-m%d "
+ "--ptxas-options=\"-v\" "
+ "--use_fast_math "
+ "-DNVCC "
+ "-I\"%s\"",
+ machine,
+ include_path.c_str());
+ if (!filter && use_adaptive_compilation()) {
+ cflags += " " + requested_features.get_build_options();
+ }
+ const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
+ if (extra_cflags) {
+ cflags += string(" ") + string(extra_cflags);
+ }
+# ifdef WITH_CYCLES_DEBUG
+ cflags += " -D__KERNEL_DEBUG__";
+# endif
+
+ if (split) {
+ cflags += " -D__SPLIT__";
+ }
+
+ return cflags;
+}
+
+bool CUDADevice::compile_check_compiler()
+{
+ const char *nvcc = cuewCompilerPath();
+ if (nvcc == NULL) {
+ cuda_error_message(
+ "CUDA nvcc compiler not found. "
+ "Install CUDA toolkit in default location.");
+ return false;
+ }
+ const int cuda_version = cuewCompilerVersion();
+ VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << cuda_version << ".";
+ const int major = cuda_version / 10, minor = cuda_version % 10;
+ if (cuda_version == 0) {
+ cuda_error_message("CUDA nvcc compiler version could not be parsed.");
+ return false;
+ }
+ if (cuda_version < 80) {
+ printf(
+ "Unsupported CUDA version %d.%d detected, "
+ "you need CUDA 8.0 or newer.\n",
+ major,
+ minor);
+ return false;
+ }
+ else if (cuda_version != 101) {
+ printf(
+ "CUDA version %d.%d detected, build may succeed but only "
+ "CUDA 10.1 is officially supported.\n",
+ major,
+ minor);
+ }
+ return true;
+}
+
+string CUDADevice::compile_kernel(const DeviceRequestedFeatures &requested_features,
+ bool filter,
+ bool split)
+{
+ const char *name, *source;
+ if (filter) {
+ name = "filter";
+ source = "filter.cu";
+ }
+ else if (split) {
+ name = "kernel_split";
+ source = "kernel_split.cu";
+ }
+ else {
+ name = "kernel";
+ source = "kernel.cu";
+ }
+ /* Compute cubin name. */
+ int major, minor;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+ cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+
+ /* Attempt to use kernel provided with Blender. */
+ if (!use_adaptive_compilation()) {
+ const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
+ VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
+ if (path_exists(cubin)) {
+ VLOG(1) << "Using precompiled kernel.";
+ return cubin;
+ }
+ const string ptx = path_get(string_printf("lib/%s_compute_%d%d.ptx", name, major, minor));
+ VLOG(1) << "Testing for pre-compiled kernel " << ptx << ".";
+ if (path_exists(ptx)) {
+ VLOG(1) << "Using precompiled kernel.";
+ return ptx;
+ }
+ }
+
+ const string common_cflags = compile_kernel_get_common_cflags(requested_features, filter, split);
+
+ /* Try to use locally compiled kernel. */
+ const string source_path = path_get("source");
+ const string kernel_md5 = path_files_md5_hash(source_path);
+
+ /* We include cflags into md5 so changing cuda toolkit or changing other
+ * compiler command line arguments makes sure cubin gets re-built.
+ */
+ const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
+
+ const string cubin_file = string_printf(
+ "cycles_%s_sm%d%d_%s.cubin", name, major, minor, cubin_md5.c_str());
+ const string cubin = path_cache_get(path_join("kernels", cubin_file));
+ VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
+ if (path_exists(cubin)) {
+ VLOG(1) << "Using locally compiled kernel.";
+ return cubin;
+ }
+
+# ifdef _WIN32
+ if (have_precompiled_kernels()) {
+ if (major < 3) {
+ cuda_error_message(
+ string_printf("CUDA device requires compute capability 3.0 or up, "
+ "found %d.%d. Your GPU is not supported.",
+ major,
+ minor));
+ }
+ else {
+ cuda_error_message(
+ string_printf("CUDA binary kernel for this graphics card compute "
+ "capability (%d.%d) not found.",
+ major,
+ minor));
+ }
+ return "";
+ }
+# endif
+
+ /* Compile. */
+ if (!compile_check_compiler()) {
+ return "";
+ }
+ const char *nvcc = cuewCompilerPath();
+ const string kernel = path_join(path_join(source_path, "kernel"),
+ path_join("kernels", path_join("cuda", source)));
+ double starttime = time_dt();
+ printf("Compiling CUDA kernel ...\n");
+
+ path_create_directories(cubin);
+
+ string command = string_printf(
+ "\"%s\" "
+ "-arch=sm_%d%d "
+ "--cubin \"%s\" "
+ "-o \"%s\" "
+ "%s ",
+ nvcc,
+ major,
+ minor,
+ kernel.c_str(),
+ cubin.c_str(),
+ common_cflags.c_str());
+
+ printf("%s\n", command.c_str());
+
+ if (system(command.c_str()) == -1) {
+ cuda_error_message(
+ "Failed to execute compilation command, "
+ "see console for details.");
+ return "";
+ }
+
+ /* Verify if compilation succeeded */
+ if (!path_exists(cubin)) {
+ cuda_error_message(
+ "CUDA kernel compilation failed, "
+ "see console for details.");
+ return "";
+ }
+
+ printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
+
+ return cubin;
+}
+
+bool CUDADevice::load_kernels(const DeviceRequestedFeatures &requested_features)
+{
+ /* TODO(sergey): Support kernels re-load for CUDA devices.
+ *
+ * Currently re-loading kernel will invalidate memory pointers,
+ * causing problems in cuCtxSynchronize.
+ */
+ if (cuFilterModule && cuModule) {
+ VLOG(1) << "Skipping kernel reload, not currently supported.";
+ return true;
+ }
+
+ /* check if cuda init succeeded */
+ if (cuContext == 0)
+ return false;
+
+ /* check if GPU is supported */
+ if (!support_device(requested_features))
+ return false;
+
+ /* get kernel */
+ string cubin = compile_kernel(requested_features, false, use_split_kernel());
+ if (cubin == "")
+ return false;
+
+ string filter_cubin = compile_kernel(requested_features, true, false);
+ if (filter_cubin == "")
+ return false;
+
+ /* open module */
+ CUDAContextScope scope(this);
+
+ string cubin_data;
+ CUresult result;
+
+ if (path_read_text(cubin, cubin_data))
+ result = cuModuleLoadData(&cuModule, cubin_data.c_str());
+ else
+ result = CUDA_ERROR_FILE_NOT_FOUND;
+
+ if (cuda_error_(result, "cuModuleLoad"))
+ cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));
+
+ if (path_read_text(filter_cubin, cubin_data))
+ result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
+ else
+ result = CUDA_ERROR_FILE_NOT_FOUND;
+
+ if (cuda_error_(result, "cuModuleLoad"))
+ cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str()));
+
+ if (result == CUDA_SUCCESS) {
+ reserve_local_memory(requested_features);
+ }
+
+ return (result == CUDA_SUCCESS);
+}
+
+void CUDADevice::reserve_local_memory(const DeviceRequestedFeatures &requested_features)
+{
+ if (use_split_kernel()) {
+ /* Split kernel mostly uses global memory and adaptive compilation,
+ * difficult to predict how much is needed currently. */
+ return;
+ }
+
+ /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
+ * needed for kernel launches, so that we can reliably figure out when
+ * to allocate scene data in mapped host memory. */
+ CUDAContextScope scope(this);
+
+ size_t total = 0, free_before = 0, free_after = 0;
+ cuMemGetInfo(&free_before, &total);
+
+ /* Get kernel function. */
+ CUfunction cuPathTrace;
+
+ if (requested_features.use_integrator_branched) {
+ cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
+ }
+ else {
+ cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
+ }
+
+ cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
+
+ int min_blocks, num_threads_per_block;
+ cuda_assert(cuOccupancyMaxPotentialBlockSize(
+ &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
+
+ /* Launch kernel, using just 1 block appears sufficient to reserve
+ * memory for all multiprocessors. It would be good to do this in
+ * parallel for the multi GPU case still to make it faster. */
+ CUdeviceptr d_work_tiles = 0;
+ uint total_work_size = 0;
+
+ void *args[] = {&d_work_tiles, &total_work_size};
+
+ cuda_assert(cuLaunchKernel(cuPathTrace, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
+
+ cuda_assert(cuCtxSynchronize());
+
+ cuMemGetInfo(&free_after, &total);
+ VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after)
+ << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
+
+# if 0
+ /* For testing mapped host memory, fill up device memory. */
+ const size_t keep_mb = 1024;
+
+ while (free_after > keep_mb * 1024 * 1024LL) {
+ CUdeviceptr tmp;
+ cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
+ cuMemGetInfo(&free_after, &total);
+ }
+# endif
+}
+
+void CUDADevice::init_host_memory()
+{
+ /* Limit amount of host mapped memory, because allocating too much can
+ * cause system instability. Leave at least half or 4 GB of system
+ * memory free, whichever is smaller. */
+ size_t default_limit = 4 * 1024 * 1024 * 1024LL;
+ size_t system_ram = system_physical_ram();
+
+ if (system_ram > 0) {
+ if (system_ram / 2 > default_limit) {
+ map_host_limit = system_ram - default_limit;
+ }
+ else {
+ map_host_limit = system_ram / 2;
+ }
+ }
+ else {
+ VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
+ map_host_limit = 0;
+ }
+
+ /* Amount of device memory to keep free after texture memory
+ * and working memory allocations respectively. We set the working
+ * memory limit headroom lower so that some space is left after all
+ * texture memory allocations. */
+ device_working_headroom = 32 * 1024 * 1024LL; // 32MB
+ device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
+
+ VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
+ << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
+}
+
+void CUDADevice::load_texture_info()
+{
+ if (need_texture_info) {
+ texture_info.copy_to_device();
+ need_texture_info = false;
+ }
+}
+
+void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
+{
+ /* Signal to reallocate textures in host memory only. */
+ move_texture_to_host = true;
+
+ while (size > 0) {
+ /* Find suitable memory allocation to move. */
+ device_memory *max_mem = NULL;
+ size_t max_size = 0;
+ bool max_is_image = false;
+
+ foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
+ device_memory &mem = *pair.first;
+ CUDAMem *cmem = &pair.second;
+
+ bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
+ bool is_image = is_texture && (mem.data_height > 1);
+
+ /* Can't move this type of memory. */
+ if (!is_texture || cmem->array) {
+ continue;
+ }
+
+ /* Already in host memory. */
+ if (cmem->use_mapped_host) {
+ continue;
+ }
+
+ /* For other textures, only move image textures. */
+ if (for_texture && !is_image) {
+ continue;
+ }
+
+ /* Try to move largest allocation, prefer moving images. */
+ if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
+ max_is_image = is_image;
+ max_size = mem.device_size;
+ max_mem = &mem;
+ }
+ }
+
+ /* Move to host memory. This part is mutex protected since
+ * multiple CUDA devices could be moving the memory. The
+ * first one will do it, and the rest will adopt the pointer. */
+ if (max_mem) {
+ VLOG(1) << "Move memory from device to host: " << max_mem->name;
+
+ static thread_mutex move_mutex;
+ thread_scoped_lock lock(move_mutex);
+
+ /* Preserve the original device pointer, in case of multi device
+ * we can't change it because the pointer mapping would break. */
+ device_ptr prev_pointer = max_mem->device_pointer;
+ size_t prev_size = max_mem->device_size;
+
+ tex_free(*max_mem);
+ tex_alloc(*max_mem);
+ size = (max_size >= size) ? 0 : size - max_size;
+
+ max_mem->device_pointer = prev_pointer;
+ max_mem->device_size = prev_size;
+ }
+ else {
+ break;
+ }
+ }
+
+ /* Update texture info array with new pointers. */
+ load_texture_info();
+
+ move_texture_to_host = false;
+}
+
+CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
+{
+ CUDAContextScope scope(this);
+
+ CUdeviceptr device_pointer = 0;
+ size_t size = mem.memory_size() + pitch_padding;
+
+ CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
+ const char *status = "";
+
+ /* First try allocating in device memory, respecting headroom. We make
+ * an exception for texture info. It is small and frequently accessed,
+ * so treat it as working memory.
+ *
+ * If there is not enough room for working memory, we will try to move
+ * textures to host memory, assuming the performance impact would have
+ * been worse for working memory. */
+ bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
+ bool is_image = is_texture && (mem.data_height > 1);
+
+ size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
+
+ size_t total = 0, free = 0;
+ cuMemGetInfo(&free, &total);
+
+ /* Move textures to host memory if needed. */
+ if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
+ move_textures_to_host(size + headroom - free, is_texture);
+ cuMemGetInfo(&free, &total);
+ }
+
+ /* Allocate in device memory. */
+ if (!move_texture_to_host && (size + headroom) < free) {
+ mem_alloc_result = cuMemAlloc(&device_pointer, size);
+ if (mem_alloc_result == CUDA_SUCCESS) {
+ status = " in device memory";
+ }
+ }
+
+ /* Fall back to mapped host memory if needed and possible. */
+
+ void *shared_pointer = 0;
+
+ if (mem_alloc_result != CUDA_SUCCESS && can_map_host) {
+ if (mem.shared_pointer) {
+ /* Another device already allocated host memory. */
+ mem_alloc_result = CUDA_SUCCESS;
+ shared_pointer = mem.shared_pointer;
+ }
+ else if (map_host_used + size < map_host_limit) {
+ /* Allocate host memory ourselves. */
+ mem_alloc_result = cuMemHostAlloc(
+ &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
+
+ assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
+ (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
+ }
+
+ if (mem_alloc_result == CUDA_SUCCESS) {
+ cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
+ map_host_used += size;
+ status = " in host memory";
+ }
+ else {
+ status = " failed, out of host memory";
+ }
+ }
+
+ if (mem_alloc_result != CUDA_SUCCESS) {
+ status = " failed, out of device and host memory";
+ cuda_assert(mem_alloc_result);
+ }
+
+ if (mem.name) {
+ VLOG(1) << "Buffer allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")" << status;
+ }
+
+ mem.device_pointer = (device_ptr)device_pointer;
+ mem.device_size = size;
+ stats.mem_alloc(size);
+
+ if (!mem.device_pointer) {
+ return NULL;
+ }
+
+ /* Insert into map of allocations. */
+ CUDAMem *cmem = &cuda_mem_map[&mem];
+ if (shared_pointer != 0) {
+ /* Replace host pointer with our host allocation. Only works if
+ * CUDA memory layout is the same and has no pitch padding. Also
+ * does not work if we move textures to host during a render,
+ * since other devices might be using the memory. */
+
+ if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
+ mem.host_pointer != shared_pointer) {
+ memcpy(shared_pointer, mem.host_pointer, size);
+
+ /* A Call to device_memory::host_free() should be preceded by
+ * a call to device_memory::device_free() for host memory
+ * allocated by a device to be handled properly. Two exceptions
+ * are here and a call in OptiXDevice::generic_alloc(), where
+ * the current host memory can be assumed to be allocated by
+ * device_memory::host_alloc(), not by a device */
+
+ mem.host_free();
+ mem.host_pointer = shared_pointer;
+ }
+ mem.shared_pointer = shared_pointer;
+ mem.shared_counter++;
+ cmem->use_mapped_host = true;
+ }
+ else {
+ cmem->use_mapped_host = false;
+ }
+
+ return cmem;
+}
+
+void CUDADevice::generic_copy_to(device_memory &mem)
+{
+ if (mem.host_pointer && mem.device_pointer) {
+ CUDAContextScope scope(this);
+
+ /* If use_mapped_host of mem is false, the current device only
+ * uses device memory allocated by cuMemAlloc regardless of
+ * mem.host_pointer and mem.shared_pointer, and should copy
+ * data from mem.host_pointer. */
+
+ if (cuda_mem_map[&mem].use_mapped_host == false || mem.host_pointer != mem.shared_pointer) {
+ cuda_assert(
+ cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), mem.host_pointer, mem.memory_size()));
+ }
+ }
+}
+
+void CUDADevice::generic_free(device_memory &mem)
+{
+ if (mem.device_pointer) {
+ CUDAContextScope scope(this);
+ const CUDAMem &cmem = cuda_mem_map[&mem];
+
+ /* If cmem.use_mapped_host is true, reference counting is used
+ * to safely free a mapped host memory. */
+
+ if (cmem.use_mapped_host) {
+ assert(mem.shared_pointer);
+ if (mem.shared_pointer) {
+ assert(mem.shared_counter > 0);
+ if (--mem.shared_counter == 0) {
+ if (mem.host_pointer == mem.shared_pointer) {
+ mem.host_pointer = 0;
+ }
+ cuMemFreeHost(mem.shared_pointer);
+ mem.shared_pointer = 0;
+ }
+ }
+ map_host_used -= mem.device_size;
+ }
+ else {
+ /* Free device memory. */
+ cuMemFree(mem.device_pointer);
+ }
+
+ stats.mem_free(mem.device_size);
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+
+ cuda_mem_map.erase(cuda_mem_map.find(&mem));
+ }
+}
+
+void CUDADevice::mem_alloc(device_memory &mem)
+{
+ if (mem.type == MEM_PIXELS && !background) {
+ pixels_alloc(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ assert(!"mem_alloc not supported for textures.");
+ }
+ else {
+ generic_alloc(mem);
+ }
+}
+
+void CUDADevice::mem_copy_to(device_memory &mem)
+{
+ if (mem.type == MEM_PIXELS) {
+ assert(!"mem_copy_to not supported for pixels.");
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free(mem);
+ tex_alloc(mem);
+ }
+ else {
+ if (!mem.device_pointer) {
+ generic_alloc(mem);
+ }
+
+ generic_copy_to(mem);
+ }
+}
+
+void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
+{
+ if (mem.type == MEM_PIXELS && !background) {
+ pixels_copy_from(mem, y, w, h);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ assert(!"mem_copy_from not supported for textures.");
+ }
+ else if (mem.host_pointer) {
+ const size_t size = elem * w * h;
+ const size_t offset = elem * y * w;
+
+ if (mem.device_pointer) {
+ const CUDAContextScope scope(this);
+ cuda_assert(cuMemcpyDtoH(
+ (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
+ }
+ else {
+ memset((char *)mem.host_pointer + offset, 0, size);
+ }
+ }
+}
+
+void CUDADevice::mem_zero(device_memory &mem)
+{
+ if (!mem.device_pointer) {
+ mem_alloc(mem);
+ }
+ if (!mem.device_pointer) {
+ return;
+ }
+
+ /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
+ * regardless of mem.host_pointer and mem.shared_pointer. */
+ if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
+ const CUDAContextScope scope(this);
+ cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()));
+ }
+ else if (mem.host_pointer) {
+ memset(mem.host_pointer, 0, mem.memory_size());
+ }
+}
+
+void CUDADevice::mem_free(device_memory &mem)
+{
+ if (mem.type == MEM_PIXELS && !background) {
+ pixels_free(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free(mem);
+ }
+ else {
+ generic_free(mem);
+ }
+}
+
+device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
+{
+ return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
+}
+
+void CUDADevice::const_copy_to(const char *name, void *host, size_t size)
+{
+ CUDAContextScope scope(this);
+ CUdeviceptr mem;
+ size_t bytes;
+
+ cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
+ // assert(bytes == size);
+ cuda_assert(cuMemcpyHtoD(mem, host, size));
+}
+
+void CUDADevice::tex_alloc(device_memory &mem)
+{
+ CUDAContextScope scope(this);
+
+ /* General variables for both architectures */
+ string bind_name = mem.name;
+ size_t dsize = datatype_size(mem.data_type);
+ size_t size = mem.memory_size();
+
+ CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
+ switch (mem.extension) {
+ case EXTENSION_REPEAT:
+ address_mode = CU_TR_ADDRESS_MODE_WRAP;
+ break;
+ case EXTENSION_EXTEND:
+ address_mode = CU_TR_ADDRESS_MODE_CLAMP;
+ break;
+ case EXTENSION_CLIP:
+ address_mode = CU_TR_ADDRESS_MODE_BORDER;
+ break;
+ default:
+ assert(0);
+ break;
+ }
+
+ CUfilter_mode filter_mode;
+ if (mem.interpolation == INTERPOLATION_CLOSEST) {
+ filter_mode = CU_TR_FILTER_MODE_POINT;
+ }
+ else {
+ filter_mode = CU_TR_FILTER_MODE_LINEAR;
+ }
+
+ /* Data Storage */
+ if (mem.interpolation == INTERPOLATION_NONE) {
+ generic_alloc(mem);
+ generic_copy_to(mem);
+
+ const_copy_to(bind_name.c_str(), &mem.device_pointer, sizeof(mem.device_pointer));
+ return;
+ }
+
+ /* Image Texture Storage */
+ CUarray_format_enum format;
+ switch (mem.data_type) {
+ case TYPE_UCHAR:
+ format = CU_AD_FORMAT_UNSIGNED_INT8;
+ break;
+ case TYPE_UINT16:
+ format = CU_AD_FORMAT_UNSIGNED_INT16;
+ break;
+ case TYPE_UINT:
+ format = CU_AD_FORMAT_UNSIGNED_INT32;
+ break;
+ case TYPE_INT:
+ format = CU_AD_FORMAT_SIGNED_INT32;
+ break;
+ case TYPE_FLOAT:
+ format = CU_AD_FORMAT_FLOAT;
+ break;
+ case TYPE_HALF:
+ format = CU_AD_FORMAT_HALF;
+ break;
+ default:
+ assert(0);
+ return;
+ }
+
+ CUDAMem *cmem = NULL;
+ CUarray array_3d = NULL;
+ size_t src_pitch = mem.data_width * dsize * mem.data_elements;
+ size_t dst_pitch = src_pitch;
+
+ if (mem.data_depth > 1) {
+ /* 3D texture using array, there is no API for linear memory. */
+ CUDA_ARRAY3D_DESCRIPTOR desc;
+
+ desc.Width = mem.data_width;
+ desc.Height = mem.data_height;
+ desc.Depth = mem.data_depth;
+ desc.Format = format;
+ desc.NumChannels = mem.data_elements;
+ desc.Flags = 0;
+
+ VLOG(1) << "Array 3D allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+
+ cuda_assert(cuArray3DCreate(&array_3d, &desc));
+
+ if (!array_3d) {
+ return;
+ }
+
+ CUDA_MEMCPY3D param;
+ memset(&param, 0, sizeof(param));
+ param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+ param.dstArray = array_3d;
+ param.srcMemoryType = CU_MEMORYTYPE_HOST;
+ param.srcHost = mem.host_pointer;
+ param.srcPitch = src_pitch;
+ param.WidthInBytes = param.srcPitch;
+ param.Height = mem.data_height;
+ param.Depth = mem.data_depth;
+
+ cuda_assert(cuMemcpy3D(&param));
+
+ mem.device_pointer = (device_ptr)array_3d;
+ mem.device_size = size;
+ stats.mem_alloc(size);
+
+ cmem = &cuda_mem_map[&mem];
+ cmem->texobject = 0;
+ cmem->array = array_3d;
+ }
+ else if (mem.data_height > 0) {
+ /* 2D texture, using pitch aligned linear memory. */
+ int alignment = 0;
+ cuda_assert(
+ cuDeviceGetAttribute(&alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
+ dst_pitch = align_up(src_pitch, alignment);
+ size_t dst_size = dst_pitch * mem.data_height;
+
+ cmem = generic_alloc(mem, dst_size - mem.memory_size());
+ if (!cmem) {
+ return;
+ }
+
+ CUDA_MEMCPY2D param;
+ memset(&param, 0, sizeof(param));
+ param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+ param.dstDevice = mem.device_pointer;
+ param.dstPitch = dst_pitch;
+ param.srcMemoryType = CU_MEMORYTYPE_HOST;
+ param.srcHost = mem.host_pointer;
+ param.srcPitch = src_pitch;
+ param.WidthInBytes = param.srcPitch;
+ param.Height = mem.data_height;
+
+ cuda_assert(cuMemcpy2DUnaligned(&param));
+ }
+ else {
+ /* 1D texture, using linear memory. */
+ cmem = generic_alloc(mem);
+ if (!cmem) {
+ return;
+ }
+
+ cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
+ }
+
+ /* Kepler+, bindless textures. */
+ int flat_slot = 0;
+ if (string_startswith(mem.name, "__tex_image")) {
+ int pos = string(mem.name).rfind("_");
+ flat_slot = atoi(mem.name + pos + 1);
+ }
+ else {
+ assert(0);
+ }
+
+ CUDA_RESOURCE_DESC resDesc;
+ memset(&resDesc, 0, sizeof(resDesc));
+
+ if (array_3d) {
+ resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+ resDesc.res.array.hArray = array_3d;
+ resDesc.flags = 0;
+ }
+ else if (mem.data_height > 0) {
+ resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
+ resDesc.res.pitch2D.devPtr = mem.device_pointer;
+ resDesc.res.pitch2D.format = format;
+ resDesc.res.pitch2D.numChannels = mem.data_elements;
+ resDesc.res.pitch2D.height = mem.data_height;
+ resDesc.res.pitch2D.width = mem.data_width;
+ resDesc.res.pitch2D.pitchInBytes = dst_pitch;
+ }
+ else {
+ resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
+ resDesc.res.linear.devPtr = mem.device_pointer;
+ resDesc.res.linear.format = format;
+ resDesc.res.linear.numChannels = mem.data_elements;
+ resDesc.res.linear.sizeInBytes = mem.device_size;
+ }
+
+ CUDA_TEXTURE_DESC texDesc;
+ memset(&texDesc, 0, sizeof(texDesc));
+ texDesc.addressMode[0] = address_mode;
+ texDesc.addressMode[1] = address_mode;
+ texDesc.addressMode[2] = address_mode;
+ texDesc.filterMode = filter_mode;
+ texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+
+ cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
+
+ /* Resize once */
+ if (flat_slot >= texture_info.size()) {
+ /* Allocate some slots in advance, to reduce amount
+ * of re-allocations. */
+ texture_info.resize(flat_slot + 128);
+ }
+
+ /* Set Mapping and tag that we need to (re-)upload to device */
+ TextureInfo &info = texture_info[flat_slot];
+ info.data = (uint64_t)cmem->texobject;
+ info.cl_buffer = 0;
+ info.interpolation = mem.interpolation;
+ info.extension = mem.extension;
+ info.width = mem.data_width;
+ info.height = mem.data_height;
+ info.depth = mem.data_depth;
+ need_texture_info = true;
+}
+
+void CUDADevice::tex_free(device_memory &mem)
+{
+ if (mem.device_pointer) {
+ CUDAContextScope scope(this);
+ const CUDAMem &cmem = cuda_mem_map[&mem];
+
+ if (cmem.texobject) {
+ /* Free bindless texture. */
+ cuTexObjectDestroy(cmem.texobject);
+ }
+
+ if (cmem.array) {
+ /* Free array. */
+ cuArrayDestroy(cmem.array);
+ stats.mem_free(mem.device_size);
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+
+ cuda_mem_map.erase(cuda_mem_map.find(&mem));
+ }
+ else {
+ generic_free(mem);
+ }
+ }
+}
+
+# define CUDA_GET_BLOCKSIZE(func, w, h) \
+ int threads_per_block; \
+ cuda_assert( \
+ cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+ int threads = (int)sqrt((float)threads_per_block); \
+ int xblocks = ((w) + threads - 1) / threads; \
+ int yblocks = ((h) + threads - 1) / threads;
+
+# define CUDA_LAUNCH_KERNEL(func, args) \
+ cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0));
+
+/* Similar as above, but for 1-dimensional blocks. */
+# define CUDA_GET_BLOCKSIZE_1D(func, w, h) \
+ int threads_per_block; \
+ cuda_assert( \
+ cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+ int xblocks = ((w) + threads_per_block - 1) / threads_per_block; \
+ int yblocks = h;
+
+# define CUDA_LAUNCH_KERNEL_1D(func, args) \
+ cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads_per_block, 1, 1, 0, 0, args, 0));
+
+bool CUDADevice::denoising_non_local_means(device_ptr image_ptr,
+ device_ptr guide_ptr,
+ device_ptr variance_ptr,
+ device_ptr out_ptr,
+ DenoisingTask *task)
+{
+ if (have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ int stride = task->buffer.stride;
+ int w = task->buffer.width;
+ int h = task->buffer.h;
+ int r = task->nlm_state.r;
+ int f = task->nlm_state.f;
+ float a = task->nlm_state.a;
+ float k_2 = task->nlm_state.k_2;
+
+ int pass_stride = task->buffer.pass_stride;
+ int num_shifts = (2 * r + 1) * (2 * r + 1);
+ int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
+ int frame_offset = 0;
+
+ if (have_error())
+ return false;
+
+ CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
+ CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
+ CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts;
+ CUdeviceptr scale_ptr = 0;
+
+ cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float) * pass_stride));
+ cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float) * pass_stride));
+
+ {
+ CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
+ cuda_assert(cuModuleGetFunction(
+ &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+ cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+ cuda_assert(cuModuleGetFunction(
+ &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+ cuda_assert(cuModuleGetFunction(
+ &cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
+
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
+
+ CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts);
+
+ void *calc_difference_args[] = {&guide_ptr,
+ &variance_ptr,
+ &scale_ptr,
+ &difference,
+ &w,
+ &h,
+ &stride,
+ &pass_stride,
+ &r,
+ &channel_offset,
+ &frame_offset,
+ &a,
+ &k_2};
+ void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
+ void *calc_weight_args[] = {
+ &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
+ void *update_output_args[] = {&blurDifference,
+ &image_ptr,
+ &out_ptr,
+ &weightAccum,
+ &w,
+ &h,
+ &stride,
+ &pass_stride,
+ &channel_offset,
+ &r,
+ &f};
+
+ CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
+ }
+
+ {
+ CUfunction cuNLMNormalize;
+ cuda_assert(
+ cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
+ void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
+ CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
+ CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
+ cuda_assert(cuCtxSynchronize());
+ }
+
+ return !have_error();
+}
+
+bool CUDADevice::denoising_construct_transform(DenoisingTask *task)
+{
+ if (have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilterConstructTransform;
+ cuda_assert(cuModuleGetFunction(
+ &cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
+ CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h);
+
+ void *args[] = {&task->buffer.mem.device_pointer,
+ &task->tile_info_mem.device_pointer,
+ &task->storage.transform.device_pointer,
+ &task->storage.rank.device_pointer,
+ &task->filter_area,
+ &task->rect,
+ &task->radius,
+ &task->pca_threshold,
+ &task->buffer.pass_stride,
+ &task->buffer.frame_stride,
+ &task->buffer.use_time};
+ CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+}
+
+bool CUDADevice::denoising_accumulate(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr scale_ptr,
+ int frame,
+ DenoisingTask *task)
+{
+ if (have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ int r = task->radius;
+ int f = 4;
+ float a = 1.0f;
+ float k_2 = task->nlm_k_2;
+
+ int w = task->reconstruction_state.source_w;
+ int h = task->reconstruction_state.source_h;
+ int stride = task->buffer.stride;
+ int frame_offset = frame * task->buffer.frame_stride;
+ int t = task->tile_info->frames[frame];
+
+ int pass_stride = task->buffer.pass_stride;
+ int num_shifts = (2 * r + 1) * (2 * r + 1);
+
+ if (have_error())
+ return false;
+
+ CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
+ CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
+
+ CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
+ cuda_assert(cuModuleGetFunction(
+ &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+ cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+ cuda_assert(
+ cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+ cuda_assert(cuModuleGetFunction(
+ &cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
+
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
+
+ CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
+ task->reconstruction_state.source_w * task->reconstruction_state.source_h,
+ num_shifts);
+
+ void *calc_difference_args[] = {&color_ptr,
+ &color_variance_ptr,
+ &scale_ptr,
+ &difference,
+ &w,
+ &h,
+ &stride,
+ &pass_stride,
+ &r,
+ &pass_stride,
+ &frame_offset,
+ &a,
+ &k_2};
+ void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
+ void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
+ void *construct_gramian_args[] = {&t,
+ &blurDifference,
+ &task->buffer.mem.device_pointer,
+ &task->storage.transform.device_pointer,
+ &task->storage.rank.device_pointer,
+ &task->storage.XtWX.device_pointer,
+ &task->storage.XtWY.device_pointer,
+ &task->reconstruction_state.filter_window,
+ &w,
+ &h,
+ &stride,
+ &pass_stride,
+ &r,
+ &f,
+ &frame_offset,
+ &task->buffer.use_time};
+
+ CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+}
+
+bool CUDADevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task)
+{
+ CUfunction cuFinalize;
+ cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
+ cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
+ void *finalize_args[] = {&output_ptr,
+ &task->storage.rank.device_pointer,
+ &task->storage.XtWX.device_pointer,
+ &task->storage.XtWY.device_pointer,
+ &task->filter_area,
+ &task->reconstruction_state.buffer_params.x,
+ &task->render_buffer.samples};
+ CUDA_GET_BLOCKSIZE(
+ cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h);
+ CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+}
+
+bool CUDADevice::denoising_combine_halves(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ int r,
+ int4 rect,
+ DenoisingTask *task)
+{
+ if (have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilterCombineHalves;
+ cuda_assert(cuModuleGetFunction(
+ &cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(
+ cuFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
+
+ void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r};
+ CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+}
+
+bool CUDADevice::denoising_divide_shadow(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr sample_variance_ptr,
+ device_ptr sv_variance_ptr,
+ device_ptr buffer_variance_ptr,
+ DenoisingTask *task)
+{
+ if (have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilterDivideShadow;
+ cuda_assert(cuModuleGetFunction(
+ &cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(
+ cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
+
+ void *args[] = {&task->render_buffer.samples,
+ &task->tile_info_mem.device_pointer,
+ &a_ptr,
+ &b_ptr,
+ &sample_variance_ptr,
+ &sv_variance_ptr,
+ &buffer_variance_ptr,
+ &task->rect,
+ &task->render_buffer.pass_stride,
+ &task->render_buffer.offset};
+ CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+}
+
+bool CUDADevice::denoising_get_feature(int mean_offset,
+ int variance_offset,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ float scale,
+ DenoisingTask *task)
+{
+ if (have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilterGetFeature;
+ cuda_assert(
+ cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
+
+ void *args[] = {&task->render_buffer.samples,
+ &task->tile_info_mem.device_pointer,
+ &mean_offset,
+ &variance_offset,
+ &mean_ptr,
+ &variance_ptr,
+ &scale,
+ &task->rect,
+ &task->render_buffer.pass_stride,
+ &task->render_buffer.offset};
+ CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+}
+
+bool CUDADevice::denoising_write_feature(int out_offset,
+ device_ptr from_ptr,
+ device_ptr buffer_ptr,
+ DenoisingTask *task)
+{
+ if (have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilterWriteFeature;
+ cuda_assert(cuModuleGetFunction(
+ &cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w);
+
+ void *args[] = {&task->render_buffer.samples,
+ &task->reconstruction_state.buffer_params,
+ &task->filter_area,
+ &from_ptr,
+ &buffer_ptr,
+ &out_offset,
+ &task->rect};
+ CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+}
+
+bool CUDADevice::denoising_detect_outliers(device_ptr image_ptr,
+ device_ptr variance_ptr,
+ device_ptr depth_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task)
+{
+ if (have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilterDetectOutliers;
+ cuda_assert(cuModuleGetFunction(
+ &cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(
+ cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
+
+ void *args[] = {
+ &image_ptr, &variance_ptr, &depth_ptr, &output_ptr, &task->rect, &task->buffer.pass_stride};
+
+ CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+}
+
+void CUDADevice::denoise(RenderTile &rtile, DenoisingTask &denoising)
+{
+ denoising.functions.construct_transform = function_bind(
+ &CUDADevice::denoising_construct_transform, this, &denoising);
+ denoising.functions.accumulate = function_bind(
+ &CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising);
+ denoising.functions.divide_shadow = function_bind(
+ &CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.non_local_means = function_bind(
+ &CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.combine_halves = function_bind(
+ &CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+ denoising.functions.get_feature = function_bind(
+ &CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.write_feature = function_bind(
+ &CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising);
+ denoising.functions.detect_outliers = function_bind(
+ &CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
+
+ denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
+ denoising.render_buffer.samples = rtile.sample;
+ denoising.buffer.gpu_temporary_mem = true;
+
+ denoising.run_denoising(&rtile);
+}
+
+void CUDADevice::path_trace(DeviceTask &task,
+ RenderTile &rtile,
+ device_vector<WorkTile> &work_tiles)
+{
+ scoped_timer timer(&rtile.buffers->render_time);
+
+ if (have_error())
+ return;
+
+ CUDAContextScope scope(this);
+ CUfunction cuPathTrace;
+
+ /* Get kernel function. */
+ if (task.integrator_branched) {
+ cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
+ }
+ else {
+ cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
+ }
+
+ if (have_error()) {
+ return;
+ }
+
+ cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
+
+ /* Allocate work tile. */
+ work_tiles.alloc(1);
+
+ WorkTile *wtile = work_tiles.data();
+ wtile->x = rtile.x;
+ wtile->y = rtile.y;
+ wtile->w = rtile.w;
+ wtile->h = rtile.h;
+ wtile->offset = rtile.offset;
+ wtile->stride = rtile.stride;
+ wtile->buffer = (float *)cuda_device_ptr(rtile.buffer);
+
+ /* Prepare work size. More step samples render faster, but for now we
+ * remain conservative for GPUs connected to a display to avoid driver
+ * timeouts and display freezing. */
+ int min_blocks, num_threads_per_block;
+ cuda_assert(cuOccupancyMaxPotentialBlockSize(
+ &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
+ if (!info.display_device) {
+ min_blocks *= 8;
+ }
+
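+  /* Pick enough samples per launch to provide at least min_blocks blocks of
+   * num_threads_per_block threads worth of work for this tile. */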
+ uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
+
+ /* Render all samples. */
+ int start_sample = rtile.start_sample;
+ int end_sample = rtile.start_sample + rtile.num_samples;
+
+ for (int sample = start_sample; sample < end_sample; sample += step_samples) {
+ /* Setup and copy work tile to device. */
+ wtile->start_sample = sample;
+ wtile->num_samples = min(step_samples, end_sample - sample);
+ work_tiles.copy_to_device();
+
+ CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer);
+ uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
+ uint num_blocks = divide_up(total_work_size, num_threads_per_block);
+
+ /* Launch kernel. */
+ void *args[] = {&d_work_tiles, &total_work_size};
+
+ cuda_assert(
+ cuLaunchKernel(cuPathTrace, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
+
+ cuda_assert(cuCtxSynchronize());
+
+ /* Update progress. */
+ rtile.sample = sample + wtile->num_samples;
+ task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
+
+ if (task.get_cancel()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+ }
+}
+
+void CUDADevice::film_convert(DeviceTask &task,
+ device_ptr buffer,
+ device_ptr rgba_byte,
+ device_ptr rgba_half)
+{
+ if (have_error())
+ return;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilmConvert;
+ CUdeviceptr d_rgba = map_pixels((rgba_byte) ? rgba_byte : rgba_half);
+ CUdeviceptr d_buffer = cuda_device_ptr(buffer);
+
+ /* get kernel function */
+ if (rgba_half) {
+ cuda_assert(
+ cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float"));
+ }
+ else {
+ cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"));
+ }
+
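+  /* Scale factor to average the accumulated samples when converting to display pixels. */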
+ float sample_scale = 1.0f / (task.sample + 1);
+
+ /* pass in parameters */
+ void *args[] = {&d_rgba,
+ &d_buffer,
+ &sample_scale,
+ &task.x,
+ &task.y,
+ &task.w,
+ &task.h,
+ &task.offset,
+ &task.stride};
+
+ /* launch kernel */
+ int threads_per_block;
+ cuda_assert(cuFuncGetAttribute(
+ &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert));
+
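+  /* Use square thread blocks and a 2D grid that covers the full tile. */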
+ int xthreads = (int)sqrt(threads_per_block);
+ int ythreads = (int)sqrt(threads_per_block);
+ int xblocks = (task.w + xthreads - 1) / xthreads;
+ int yblocks = (task.h + ythreads - 1) / ythreads;
+
+ cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1));
+
+ cuda_assert(cuLaunchKernel(cuFilmConvert,
+ xblocks,
+ yblocks,
+ 1, /* blocks */
+ xthreads,
+ ythreads,
+ 1, /* threads */
+ 0,
+ 0,
+ args,
+ 0));
+
+ unmap_pixels((rgba_byte) ? rgba_byte : rgba_half);
+
+ cuda_assert(cuCtxSynchronize());
+}
+
+void CUDADevice::shader(DeviceTask &task)
+{
+ if (have_error())
+ return;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuShader;
+ CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
+ CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
+
+ /* get kernel function */
+ if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
+ cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake"));
+ }
+ else if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
+ cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace"));
+ }
+ else {
+ cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background"));
+ }
+
+  /* Do the work in smaller chunks, so the task can be canceled in between. */
+ const int shader_chunk_size = 65536;
+ const int start = task.shader_x;
+ const int end = task.shader_x + task.shader_w;
+ int offset = task.offset;
+
+ bool canceled = false;
+ for (int sample = 0; sample < task.num_samples && !canceled; sample++) {
+ for (int shader_x = start; shader_x < end; shader_x += shader_chunk_size) {
+ int shader_w = min(shader_chunk_size, end - shader_x);
+
+ /* pass in parameters */
+ void *args[8];
+ int arg = 0;
+ args[arg++] = &d_input;
+ args[arg++] = &d_output;
+ args[arg++] = &task.shader_eval_type;
+ if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
+ args[arg++] = &task.shader_filter;
+ }
+ args[arg++] = &shader_x;
+ args[arg++] = &shader_w;
+ args[arg++] = &offset;
+ args[arg++] = &sample;
+
+ /* launch kernel */
+ int threads_per_block;
+ cuda_assert(cuFuncGetAttribute(
+ &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
+
+ int xblocks = (shader_w + threads_per_block - 1) / threads_per_block;
+
+ cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuLaunchKernel(cuShader,
+ xblocks,
+ 1,
+ 1, /* blocks */
+ threads_per_block,
+ 1,
+ 1, /* threads */
+ 0,
+ 0,
+ args,
+ 0));
+
+ cuda_assert(cuCtxSynchronize());
+
+ if (task.get_cancel()) {
+ canceled = true;
+ break;
+ }
+ }
+
+ task.update_progress(NULL);
+ }
+}
+
+CUdeviceptr CUDADevice::map_pixels(device_ptr mem)
+{
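+  /* In interactive mode the pixels live in an OpenGL PBO, which has to be
+   * mapped through the CUDA graphics interop before the device can write to it. */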
+ if (!background) {
+ PixelMem pmem = pixel_mem_map[mem];
+ CUdeviceptr buffer;
+
+ size_t bytes;
+ cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0));
+ cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource));
+
+ return buffer;
+ }
+
+ return cuda_device_ptr(mem);
+}
+
+void CUDADevice::unmap_pixels(device_ptr mem)
+{
+ if (!background) {
+ PixelMem pmem = pixel_mem_map[mem];
+
+ cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0));
+ }
+}
+
+void CUDADevice::pixels_alloc(device_memory &mem)
+{
+ PixelMem pmem;
+
+ pmem.w = mem.data_width;
+ pmem.h = mem.data_height;
+
+ CUDAContextScope scope(this);
+
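+  /* Create an OpenGL PBO to hold the pixels and a texture to display them,
+   * then register the PBO with CUDA below for interop access. */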
+ glGenBuffers(1, &pmem.cuPBO);
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
+ if (mem.data_type == TYPE_HALF)
+ glBufferData(
+ GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(GLhalf) * 4, NULL, GL_DYNAMIC_DRAW);
+ else
+ glBufferData(
+ GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(uint8_t) * 4, NULL, GL_DYNAMIC_DRAW);
+
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+
+ glActiveTexture(GL_TEXTURE0);
+ glGenTextures(1, &pmem.cuTexId);
+ glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
+ if (mem.data_type == TYPE_HALF)
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
+ else
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+ glBindTexture(GL_TEXTURE_2D, 0);
+
+ CUresult result = cuGraphicsGLRegisterBuffer(
+ &pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
+
+ if (result == CUDA_SUCCESS) {
+ mem.device_pointer = pmem.cuTexId;
+ pixel_mem_map[mem.device_pointer] = pmem;
+
+ mem.device_size = mem.memory_size();
+ stats.mem_alloc(mem.device_size);
+
+ return;
+ }
+ else {
+ /* failed to register buffer, fallback to no interop */
+ glDeleteBuffers(1, &pmem.cuPBO);
+ glDeleteTextures(1, &pmem.cuTexId);
+
+ background = true;
+ }
+}
+
+void CUDADevice::pixels_copy_from(device_memory &mem, int y, int w, int h)
+{
+ PixelMem pmem = pixel_mem_map[mem.device_pointer];
+
+ CUDAContextScope scope(this);
+
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
+ uchar *pixels = (uchar *)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
+ size_t offset = sizeof(uchar) * 4 * y * w;
+ memcpy((uchar *)mem.host_pointer + offset, pixels + offset, sizeof(uchar) * 4 * w * h);
+ glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+}
+
+void CUDADevice::pixels_free(device_memory &mem)
+{
+ if (mem.device_pointer) {
+ PixelMem pmem = pixel_mem_map[mem.device_pointer];
+
+ CUDAContextScope scope(this);
+
+ cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
+ glDeleteBuffers(1, &pmem.cuPBO);
+ glDeleteTextures(1, &pmem.cuTexId);
+
+ pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
+ mem.device_pointer = 0;
+
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
+ }
+}
+
+void CUDADevice::draw_pixels(device_memory &mem,
+ int y,
+ int w,
+ int h,
+ int width,
+ int height,
+ int dx,
+ int dy,
+ int dw,
+ int dh,
+ bool transparent,
+ const DeviceDrawParams &draw_params)
+{
+ assert(mem.type == MEM_PIXELS);
+
+ if (!background) {
+ const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL);
+ PixelMem pmem = pixel_mem_map[mem.device_pointer];
+ float *vpointer;
+
+ CUDAContextScope scope(this);
+
+    /* For multi devices this assumes the inefficient approach of allocating
+     * all pixels on the device, even though we only render to a subset. */
+ size_t offset = 4 * y * w;
+
+ if (mem.data_type == TYPE_HALF)
+ offset *= sizeof(GLhalf);
+ else
+ offset *= sizeof(uint8_t);
+
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
+ glActiveTexture(GL_TEXTURE0);
+ glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
+ if (mem.data_type == TYPE_HALF) {
+ glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void *)offset);
+ }
+ else {
+ glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void *)offset);
+ }
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+
+ if (transparent) {
+ glEnable(GL_BLEND);
+ glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
+ }
+
+ GLint shader_program;
+ if (use_fallback_shader) {
+ if (!bind_fallback_display_space_shader(dw, dh)) {
+ return;
+ }
+ shader_program = fallback_shader_program;
+ }
+ else {
+ draw_params.bind_display_space_shader_cb();
+ glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program);
+ }
+
+ if (!vertex_buffer) {
+ glGenBuffers(1, &vertex_buffer);
+ }
+
+ glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
+    /* Invalidate the old contents, to avoid stalling if the buffer is still
+     * waiting in the queue to be rendered. */
+ glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
+
+ vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
+
+ if (vpointer) {
+ /* texture coordinate - vertex pair */
+ vpointer[0] = 0.0f;
+ vpointer[1] = 0.0f;
+ vpointer[2] = dx;
+ vpointer[3] = dy;
+
+ vpointer[4] = (float)w / (float)pmem.w;
+ vpointer[5] = 0.0f;
+ vpointer[6] = (float)width + dx;
+ vpointer[7] = dy;
+
+ vpointer[8] = (float)w / (float)pmem.w;
+ vpointer[9] = (float)h / (float)pmem.h;
+ vpointer[10] = (float)width + dx;
+ vpointer[11] = (float)height + dy;
+
+ vpointer[12] = 0.0f;
+ vpointer[13] = (float)h / (float)pmem.h;
+ vpointer[14] = dx;
+ vpointer[15] = (float)height + dy;
+
+ glUnmapBuffer(GL_ARRAY_BUFFER);
+ }
+
+ GLuint vertex_array_object;
+ GLuint position_attribute, texcoord_attribute;
+
+ glGenVertexArrays(1, &vertex_array_object);
+ glBindVertexArray(vertex_array_object);
+
+ texcoord_attribute = glGetAttribLocation(shader_program, "texCoord");
+ position_attribute = glGetAttribLocation(shader_program, "pos");
+
+ glEnableVertexAttribArray(texcoord_attribute);
+ glEnableVertexAttribArray(position_attribute);
+
+ glVertexAttribPointer(
+ texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
+ glVertexAttribPointer(position_attribute,
+ 2,
+ GL_FLOAT,
+ GL_FALSE,
+ 4 * sizeof(float),
+ (const GLvoid *)(sizeof(float) * 2));
+
+ glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
+
+ if (use_fallback_shader) {
+ glUseProgram(0);
+ }
+ else {
+ draw_params.unbind_display_space_shader_cb();
+ }
+
+ if (transparent) {
+ glDisable(GL_BLEND);
+ }
+
+ glBindTexture(GL_TEXTURE_2D, 0);
+
+ return;
+ }
+
+ Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params);
+}
+
+void CUDADevice::thread_run(DeviceTask *task)
+{
+ CUDAContextScope scope(this);
+
+ if (task->type == DeviceTask::RENDER || task->type == DeviceTask::DENOISE) {
+ DeviceRequestedFeatures requested_features;
+ if (use_split_kernel()) {
+ if (split_kernel == NULL) {
+ split_kernel = new CUDASplitKernel(this);
+ split_kernel->load_kernels(requested_features);
+ }
+ }
+
+ device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
+
+ /* keep rendering tiles until done */
+ RenderTile tile;
+ DenoisingTask denoising(this, *task);
+
+ while (task->acquire_tile(this, tile)) {
+ if (tile.task == RenderTile::PATH_TRACE) {
+ if (use_split_kernel()) {
+ device_only_memory<uchar> void_buffer(this, "void_buffer");
+ split_kernel->path_trace(task, tile, void_buffer, void_buffer);
+ }
+ else {
+ path_trace(*task, tile, work_tiles);
+ }
+ }
+ else if (tile.task == RenderTile::DENOISE) {
+ tile.sample = tile.start_sample + tile.num_samples;
+
+ denoise(tile, denoising);
+
+ task->update_progress(&tile, tile.w * tile.h);
+ }
+
+ task->release_tile(tile);
+
+ if (task->get_cancel()) {
+ if (task->need_finish_queue == false)
+ break;
+ }
+ }
+
+ work_tiles.free();
+ }
+ else if (task->type == DeviceTask::SHADER) {
+ shader(*task);
+
+ cuda_assert(cuCtxSynchronize());
+ }
+ else if (task->type == DeviceTask::DENOISE_BUFFER) {
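+    /* Build a RenderTile covering the full task area, so the denoiser can run
+     * on the given buffer. */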
+ RenderTile tile;
+ tile.x = task->x;
+ tile.y = task->y;
+ tile.w = task->w;
+ tile.h = task->h;
+ tile.buffer = task->buffer;
+ tile.sample = task->sample + task->num_samples;
+ tile.num_samples = task->num_samples;
+ tile.start_sample = task->sample;
+ tile.offset = task->offset;
+ tile.stride = task->stride;
+ tile.buffers = task->buffers;
+
+ DenoisingTask denoising(this, *task);
+ denoise(tile, denoising);
+ task->update_progress(&tile, tile.w * tile.h);
+ }
+}
+
+class CUDADeviceTask : public DeviceTask {
+ public:
+ CUDADeviceTask(CUDADevice *device, DeviceTask &task) : DeviceTask(task)
+ {
+ run = function_bind(&CUDADevice::thread_run, device, this);
+ }
+};
+
+void CUDADevice::task_add(DeviceTask &task)
+{
+ CUDAContextScope scope(this);
+
+ /* Load texture info. */
+ load_texture_info();
+
+ /* Synchronize all memory copies before executing task. */
+ cuda_assert(cuCtxSynchronize());
+
+ if (task.type == DeviceTask::FILM_CONVERT) {
+    /* Must be done in the main thread due to OpenGL access. */
+ film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
+ }
+ else {
+ task_pool.push(new CUDADeviceTask(this, task));
+ }
+}
+
+void CUDADevice::task_wait()
+{
+ task_pool.wait();
+}
+
+void CUDADevice::task_cancel()
+{
+ task_pool.cancel();
+}
+
+/* Redefine the cuda_assert macro so it can be used outside of the CUDADevice class,
+ * now that the definition of that class is complete. */
+# undef cuda_assert
+# define cuda_assert(stmt) \
+ { \
+ CUresult result = stmt; \
+\
+ if (result != CUDA_SUCCESS) { \
+ string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
+ if (device->error_msg == "") \
+ device->error_msg = message; \
+ fprintf(stderr, "%s\n", message.c_str()); \
+ /*cuda_abort();*/ \
+ device->cuda_error_documentation(); \
+ } \
+ } \
+ (void)0
+
+/* CUDA context scope. */
+
+CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device)
+{
+ cuda_assert(cuCtxPushCurrent(device->cuContext));
+}
+
+CUDAContextScope::~CUDAContextScope()
+{
+ cuda_assert(cuCtxPopCurrent(NULL));
+}
+
+/* split kernel */
+
+class CUDASplitKernelFunction : public SplitKernelFunction {
+ CUDADevice *device;
+ CUfunction func;
+
+ public:
+ CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func)
+ {
+ }
+
+  /* Enqueue the kernel; returns false if there is an error. */
+ bool enqueue(const KernelDimensions &dim, device_memory & /*kg*/, device_memory & /*data*/)
+ {
+ return enqueue(dim, NULL);
+ }
+
+  /* Enqueue the kernel; returns false if there is an error. */
+ bool enqueue(const KernelDimensions &dim, void *args[])
+ {
+ if (device->have_error())
+ return false;
+
+ CUDAContextScope scope(device);
+
+ /* we ignore dim.local_size for now, as this is faster */
+ int threads_per_block;
+ cuda_assert(
+ cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
+
+ int xblocks = (dim.global_size[0] * dim.global_size[1] + threads_per_block - 1) /
+ threads_per_block;
+
+ cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
+
+ cuda_assert(cuLaunchKernel(func,
+ xblocks,
+ 1,
+ 1, /* blocks */
+ threads_per_block,
+ 1,
+ 1, /* threads */
+ 0,
+ 0,
+ args,
+ 0));
+
+ return !device->have_error();
+ }
+};
+
+CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
+{
+}
+
+uint64_t CUDASplitKernel::state_buffer_size(device_memory & /*kg*/,
+ device_memory & /*data*/,
+ size_t num_threads)
+{
+ CUDAContextScope scope(device);
+
+ device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
+ size_buffer.alloc(1);
+ size_buffer.zero_to_device();
+
+ uint threads = num_threads;
+ CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer);
+
+ struct args_t {
+ uint *num_threads;
+ CUdeviceptr *size;
+ };
+
+ args_t args = {&threads, &d_size};
+
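+  /* The kernel writes the required split state size for the given thread count
+   * into size_buffer, which is copied back to the host below. */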
+ CUfunction state_buffer_size;
+ cuda_assert(
+ cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
+
+ cuda_assert(cuLaunchKernel(state_buffer_size, 1, 1, 1, 1, 1, 1, 0, 0, (void **)&args, 0));
+
+ size_buffer.copy_from_device(0, 1, 1);
+ size_t size = size_buffer[0];
+ size_buffer.free();
+
+ return size;
+}
+
+bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim,
+ RenderTile &rtile,
+ int num_global_elements,
+ device_memory & /*kernel_globals*/,
+ device_memory & /*kernel_data*/,
+ device_memory &split_data,
+ device_memory &ray_state,
+ device_memory &queue_index,
+ device_memory &use_queues_flag,
+ device_memory &work_pool_wgs)
+{
+ CUDAContextScope scope(device);
+
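+  /* Gather device pointers for all split kernel buffers, to pass them to the
+   * data init kernel below. */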
+ CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer);
+ CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer);
+ CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer);
+ CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer);
+ CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer);
+
+ CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer);
+
+ int end_sample = rtile.start_sample + rtile.num_samples;
+ int queue_size = dim.global_size[0] * dim.global_size[1];
+
+ struct args_t {
+ CUdeviceptr *split_data_buffer;
+ int *num_elements;
+ CUdeviceptr *ray_state;
+ int *start_sample;
+ int *end_sample;
+ int *sx;
+ int *sy;
+ int *sw;
+ int *sh;
+ int *offset;
+ int *stride;
+ CUdeviceptr *queue_index;
+ int *queuesize;
+ CUdeviceptr *use_queues_flag;
+ CUdeviceptr *work_pool_wgs;
+ int *num_samples;
+ CUdeviceptr *buffer;
+ };
+
+ args_t args = {&d_split_data,
+ &num_global_elements,
+ &d_ray_state,
+ &rtile.start_sample,
+ &end_sample,
+ &rtile.x,
+ &rtile.y,
+ &rtile.w,
+ &rtile.h,
+ &rtile.offset,
+ &rtile.stride,
+ &d_queue_index,
+ &queue_size,
+ &d_use_queues_flag,
+ &d_work_pool_wgs,
+ &rtile.num_samples,
+ &d_buffer};
+
+ CUfunction data_init;
+ cuda_assert(
+ cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
+ if (device->have_error()) {
+ return false;
+ }
+
+ CUDASplitKernelFunction(device, data_init).enqueue(dim, (void **)&args);
+
+ return !device->have_error();
+}
+
+SplitKernelFunction *CUDASplitKernel::get_split_kernel_function(const string &kernel_name,
+ const DeviceRequestedFeatures &)
+{
+ CUDAContextScope scope(device);
+ CUfunction func;
+
+ cuda_assert(
+ cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()));
+ if (device->have_error()) {
+ device->cuda_error_message(
+ string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data()));
+ return NULL;
+ }
+
+ return new CUDASplitKernelFunction(device, func);
+}
+
+int2 CUDASplitKernel::split_kernel_local_size()
+{
+ return make_int2(32, 1);
+}
+
+int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg,
+ device_memory &data,
+ DeviceTask * /*task*/)
+{
+ CUDAContextScope scope(device);
+ size_t free;
+ size_t total;
+
+ cuda_assert(cuMemGetInfo(&free, &total));
+
+ VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(free)
+ << " bytes. (" << string_human_readable_size(free) << ").";
+
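+  /* Budget half of the free device memory for the split kernel state and round
+   * the global size down to warp-friendly multiples. */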
+ size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2);
+ size_t side = round_down((int)sqrt(num_elements), 32);
+ int2 global_size = make_int2(side, round_down(num_elements / side, 16));
+ VLOG(1) << "Global size: " << global_size << ".";
+ return global_size;
+}
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index e3c737cc2e7..b24b3ab6567 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -14,2576 +14,15 @@
* limitations under the License.
*/
-#include <climits>
-#include <limits.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
+#ifdef WITH_CUDA
-#include "device/device.h"
-#include "device/device_denoising.h"
-#include "device/device_intern.h"
-#include "device/device_split_kernel.h"
+# include "device/cuda/device_cuda.h"
-#include "render/buffers.h"
-
-#include "kernel/filter/filter_defines.h"
-
-#ifdef WITH_CUDA_DYNLOAD
-# include "cuew.h"
-#else
-# include "util/util_opengl.h"
-# include <cuda.h>
-# include <cudaGL.h>
-#endif
-#include "util/util_debug.h"
-#include "util/util_foreach.h"
-#include "util/util_logging.h"
-#include "util/util_map.h"
-#include "util/util_md5.h"
-#include "util/util_opengl.h"
-#include "util/util_path.h"
-#include "util/util_string.h"
-#include "util/util_system.h"
-#include "util/util_types.h"
-#include "util/util_time.h"
-#include "util/util_windows.h"
-
-#include "kernel/split/kernel_split_data_types.h"
+# include "util/util_logging.h"
+# include "util/util_string.h"
CCL_NAMESPACE_BEGIN
-#ifndef WITH_CUDA_DYNLOAD
-
-/* Transparently implement some functions, so majority of the file does not need
- * to worry about difference between dynamically loaded and linked CUDA at all.
- */
-
-namespace {
-
-const char *cuewErrorString(CUresult result)
-{
- /* We can only give error code here without major code duplication, that
- * should be enough since dynamic loading is only being disabled by folks
- * who knows what they're doing anyway.
- *
- * NOTE: Avoid call from several threads.
- */
- static string error;
- error = string_printf("%d", result);
- return error.c_str();
-}
-
-const char *cuewCompilerPath()
-{
- return CYCLES_CUDA_NVCC_EXECUTABLE;
-}
-
-int cuewCompilerVersion()
-{
- return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10);
-}
-
-} /* namespace */
-#endif /* WITH_CUDA_DYNLOAD */
-
-class CUDADevice;
-
-class CUDASplitKernel : public DeviceSplitKernel {
- CUDADevice *device;
-
- public:
- explicit CUDASplitKernel(CUDADevice *device);
-
- virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads);
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory &kernel_globals,
- device_memory &kernel_data_,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs);
-
- virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &);
- virtual int2 split_kernel_local_size();
- virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask *task);
-};
-
-/* Utility to push/pop CUDA context. */
-class CUDAContextScope {
- public:
- CUDAContextScope(CUDADevice *device);
- ~CUDAContextScope();
-
- private:
- CUDADevice *device;
-};
-
-class CUDADevice : public Device {
- public:
- DedicatedTaskPool task_pool;
- CUdevice cuDevice;
- CUcontext cuContext;
- CUmodule cuModule, cuFilterModule;
- size_t device_texture_headroom;
- size_t device_working_headroom;
- bool move_texture_to_host;
- size_t map_host_used;
- size_t map_host_limit;
- int can_map_host;
- int cuDevId;
- int cuDevArchitecture;
- bool first_error;
- CUDASplitKernel *split_kernel;
-
- struct CUDAMem {
- CUDAMem() : texobject(0), array(0), use_mapped_host(false)
- {
- }
-
- CUtexObject texobject;
- CUarray array;
-
- /* If true, a mapped host memory in shared_pointer is being used. */
- bool use_mapped_host;
- };
- typedef map<device_memory *, CUDAMem> CUDAMemMap;
- CUDAMemMap cuda_mem_map;
-
- struct PixelMem {
- GLuint cuPBO;
- CUgraphicsResource cuPBOresource;
- GLuint cuTexId;
- int w, h;
- };
- map<device_ptr, PixelMem> pixel_mem_map;
-
- /* Bindless Textures */
- device_vector<TextureInfo> texture_info;
- bool need_texture_info;
-
- CUdeviceptr cuda_device_ptr(device_ptr mem)
- {
- return (CUdeviceptr)mem;
- }
-
- static bool have_precompiled_kernels()
- {
- string cubins_path = path_get("lib");
- return path_exists(cubins_path);
- }
-
- virtual bool show_samples() const
- {
- /* The CUDADevice only processes one tile at a time, so showing samples is fine. */
- return true;
- }
-
- virtual BVHLayoutMask get_bvh_layout_mask() const
- {
- return BVH_LAYOUT_BVH2;
- }
-
- /*#ifdef NDEBUG
-#define cuda_abort()
-#else
-#define cuda_abort() abort()
-#endif*/
- void cuda_error_documentation()
- {
- if (first_error) {
- fprintf(stderr,
- "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
- fprintf(stderr,
- "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
- first_error = false;
- }
- }
-
-#define cuda_assert(stmt) \
- { \
- CUresult result = stmt; \
-\
- if (result != CUDA_SUCCESS) { \
- string message = string_printf( \
- "CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \
- if (error_msg == "") \
- error_msg = message; \
- fprintf(stderr, "%s\n", message.c_str()); \
- /*cuda_abort();*/ \
- cuda_error_documentation(); \
- } \
- } \
- (void)0
-
- bool cuda_error_(CUresult result, const string &stmt)
- {
- if (result == CUDA_SUCCESS)
- return false;
-
- string message = string_printf("CUDA error at %s: %s", stmt.c_str(), cuewErrorString(result));
- if (error_msg == "")
- error_msg = message;
- fprintf(stderr, "%s\n", message.c_str());
- cuda_error_documentation();
- return true;
- }
-
-#define cuda_error(stmt) cuda_error_(stmt, #stmt)
-
- void cuda_error_message(const string &message)
- {
- if (error_msg == "")
- error_msg = message;
- fprintf(stderr, "%s\n", message.c_str());
- cuda_error_documentation();
- }
-
- CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
- : Device(info, stats, profiler, background_),
- texture_info(this, "__texture_info", MEM_TEXTURE)
- {
- first_error = true;
- background = background_;
-
- cuDevId = info.num;
- cuDevice = 0;
- cuContext = 0;
-
- cuModule = 0;
- cuFilterModule = 0;
-
- split_kernel = NULL;
-
- need_texture_info = false;
-
- device_texture_headroom = 0;
- device_working_headroom = 0;
- move_texture_to_host = false;
- map_host_limit = 0;
- map_host_used = 0;
- can_map_host = 0;
-
- /* Intialize CUDA. */
- if (cuda_error(cuInit(0)))
- return;
-
- /* Setup device and context. */
- if (cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
- return;
-
- /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
- * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
- * so we can predict which memory to map to host. */
- cuda_assert(
- cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
-
- unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
- if (can_map_host) {
- ctx_flags |= CU_CTX_MAP_HOST;
- init_host_memory();
- }
-
- /* Create context. */
- CUresult result;
-
- if (background) {
- result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
- }
- else {
- result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice);
-
- if (result != CUDA_SUCCESS) {
- result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
- background = true;
- }
- }
-
- if (cuda_error_(result, "cuCtxCreate"))
- return;
-
- int major, minor;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
- cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
- cuDevArchitecture = major * 100 + minor * 10;
-
- /* Pop context set by cuCtxCreate. */
- cuCtxPopCurrent(NULL);
- }
-
- ~CUDADevice()
- {
- task_pool.stop();
-
- delete split_kernel;
-
- texture_info.free();
-
- cuda_assert(cuCtxDestroy(cuContext));
- }
-
- bool support_device(const DeviceRequestedFeatures & /*requested_features*/)
- {
- int major, minor;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
- cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
-
- /* We only support sm_30 and above */
- if (major < 3) {
- cuda_error_message(string_printf(
- "CUDA device supported only with compute capability 3.0 or up, found %d.%d.",
- major,
- minor));
- return false;
- }
-
- return true;
- }
-
- bool use_adaptive_compilation()
- {
- return DebugFlags().cuda.adaptive_compile;
- }
-
- bool use_split_kernel()
- {
- return DebugFlags().cuda.split_kernel;
- }
-
- /* Common NVCC flags which stays the same regardless of shading model,
- * kernel sources md5 and only depends on compiler or compilation settings.
- */
- string compile_kernel_get_common_cflags(const DeviceRequestedFeatures &requested_features,
- bool filter = false,
- bool split = false)
- {
- const int machine = system_cpu_bits();
- const string source_path = path_get("source");
- const string include_path = source_path;
- string cflags = string_printf(
- "-m%d "
- "--ptxas-options=\"-v\" "
- "--use_fast_math "
- "-DNVCC "
- "-I\"%s\"",
- machine,
- include_path.c_str());
- if (!filter && use_adaptive_compilation()) {
- cflags += " " + requested_features.get_build_options();
- }
- const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
- if (extra_cflags) {
- cflags += string(" ") + string(extra_cflags);
- }
-#ifdef WITH_CYCLES_DEBUG
- cflags += " -D__KERNEL_DEBUG__";
-#endif
-
- if (split) {
- cflags += " -D__SPLIT__";
- }
-
- return cflags;
- }
-
- bool compile_check_compiler()
- {
- const char *nvcc = cuewCompilerPath();
- if (nvcc == NULL) {
- cuda_error_message(
- "CUDA nvcc compiler not found. "
- "Install CUDA toolkit in default location.");
- return false;
- }
- const int cuda_version = cuewCompilerVersion();
- VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << cuda_version << ".";
- const int major = cuda_version / 10, minor = cuda_version % 10;
- if (cuda_version == 0) {
- cuda_error_message("CUDA nvcc compiler version could not be parsed.");
- return false;
- }
- if (cuda_version < 80) {
- printf(
- "Unsupported CUDA version %d.%d detected, "
- "you need CUDA 8.0 or newer.\n",
- major,
- minor);
- return false;
- }
- else if (cuda_version != 101) {
- printf(
- "CUDA version %d.%d detected, build may succeed but only "
- "CUDA 10.1 is officially supported.\n",
- major,
- minor);
- }
- return true;
- }
-
- string compile_kernel(const DeviceRequestedFeatures &requested_features,
- bool filter = false,
- bool split = false)
- {
- const char *name, *source;
- if (filter) {
- name = "filter";
- source = "filter.cu";
- }
- else if (split) {
- name = "kernel_split";
- source = "kernel_split.cu";
- }
- else {
- name = "kernel";
- source = "kernel.cu";
- }
- /* Compute cubin name. */
- int major, minor;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
- cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
-
- /* Attempt to use kernel provided with Blender. */
- if (!use_adaptive_compilation()) {
- const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
- VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
- if (path_exists(cubin)) {
- VLOG(1) << "Using precompiled kernel.";
- return cubin;
- }
- const string ptx = path_get(string_printf("lib/%s_compute_%d%d.ptx", name, major, minor));
- VLOG(1) << "Testing for pre-compiled kernel " << ptx << ".";
- if (path_exists(ptx)) {
- VLOG(1) << "Using precompiled kernel.";
- return ptx;
- }
- }
-
- const string common_cflags = compile_kernel_get_common_cflags(
- requested_features, filter, split);
-
- /* Try to use locally compiled kernel. */
- const string source_path = path_get("source");
- const string kernel_md5 = path_files_md5_hash(source_path);
-
- /* We include cflags into md5 so changing cuda toolkit or changing other
- * compiler command line arguments makes sure cubin gets re-built.
- */
- const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
-
- const string cubin_file = string_printf(
- "cycles_%s_sm%d%d_%s.cubin", name, major, minor, cubin_md5.c_str());
- const string cubin = path_cache_get(path_join("kernels", cubin_file));
- VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
- if (path_exists(cubin)) {
- VLOG(1) << "Using locally compiled kernel.";
- return cubin;
- }
-
-#ifdef _WIN32
- if (have_precompiled_kernels()) {
- if (major < 3) {
- cuda_error_message(
- string_printf("CUDA device requires compute capability 3.0 or up, "
- "found %d.%d. Your GPU is not supported.",
- major,
- minor));
- }
- else {
- cuda_error_message(
- string_printf("CUDA binary kernel for this graphics card compute "
- "capability (%d.%d) not found.",
- major,
- minor));
- }
- return "";
- }
-#endif
-
- /* Compile. */
- if (!compile_check_compiler()) {
- return "";
- }
- const char *nvcc = cuewCompilerPath();
- const string kernel = path_join(path_join(source_path, "kernel"),
- path_join("kernels", path_join("cuda", source)));
- double starttime = time_dt();
- printf("Compiling CUDA kernel ...\n");
-
- path_create_directories(cubin);
-
- string command = string_printf(
- "\"%s\" "
- "-arch=sm_%d%d "
- "--cubin \"%s\" "
- "-o \"%s\" "
- "%s ",
- nvcc,
- major,
- minor,
- kernel.c_str(),
- cubin.c_str(),
- common_cflags.c_str());
-
- printf("%s\n", command.c_str());
-
- if (system(command.c_str()) == -1) {
- cuda_error_message(
- "Failed to execute compilation command, "
- "see console for details.");
- return "";
- }
-
- /* Verify if compilation succeeded */
- if (!path_exists(cubin)) {
- cuda_error_message(
- "CUDA kernel compilation failed, "
- "see console for details.");
- return "";
- }
-
- printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
-
- return cubin;
- }
-
- bool load_kernels(const DeviceRequestedFeatures &requested_features)
- {
- /* TODO(sergey): Support kernels re-load for CUDA devices.
- *
- * Currently re-loading kernel will invalidate memory pointers,
- * causing problems in cuCtxSynchronize.
- */
- if (cuFilterModule && cuModule) {
- VLOG(1) << "Skipping kernel reload, not currently supported.";
- return true;
- }
-
- /* check if cuda init succeeded */
- if (cuContext == 0)
- return false;
-
- /* check if GPU is supported */
- if (!support_device(requested_features))
- return false;
-
- /* get kernel */
- string cubin = compile_kernel(requested_features, false, use_split_kernel());
- if (cubin == "")
- return false;
-
- string filter_cubin = compile_kernel(requested_features, true, false);
- if (filter_cubin == "")
- return false;
-
- /* open module */
- CUDAContextScope scope(this);
-
- string cubin_data;
- CUresult result;
-
- if (path_read_text(cubin, cubin_data))
- result = cuModuleLoadData(&cuModule, cubin_data.c_str());
- else
- result = CUDA_ERROR_FILE_NOT_FOUND;
-
- if (cuda_error_(result, "cuModuleLoad"))
- cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));
-
- if (path_read_text(filter_cubin, cubin_data))
- result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
- else
- result = CUDA_ERROR_FILE_NOT_FOUND;
-
- if (cuda_error_(result, "cuModuleLoad"))
- cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str()));
-
- if (result == CUDA_SUCCESS) {
- reserve_local_memory(requested_features);
- }
-
- return (result == CUDA_SUCCESS);
- }
-
- void reserve_local_memory(const DeviceRequestedFeatures &requested_features)
- {
- if (use_split_kernel()) {
- /* Split kernel mostly uses global memory and adaptive compilation,
- * difficult to predict how much is needed currently. */
- return;
- }
-
- /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
- * needed for kernel launches, so that we can reliably figure out when
- * to allocate scene data in mapped host memory. */
- CUDAContextScope scope(this);
-
- size_t total = 0, free_before = 0, free_after = 0;
- cuMemGetInfo(&free_before, &total);
-
- /* Get kernel function. */
- CUfunction cuPathTrace;
-
- if (requested_features.use_integrator_branched) {
- cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
- }
-
- cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
-
- int min_blocks, num_threads_per_block;
- cuda_assert(cuOccupancyMaxPotentialBlockSize(
- &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
-
- /* Launch kernel, using just 1 block appears sufficient to reserve
- * memory for all multiprocessors. It would be good to do this in
- * parallel for the multi GPU case still to make it faster. */
- CUdeviceptr d_work_tiles = 0;
- uint total_work_size = 0;
-
- void *args[] = {&d_work_tiles, &total_work_size};
-
- cuda_assert(cuLaunchKernel(cuPathTrace, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
-
- cuda_assert(cuCtxSynchronize());
-
- cuMemGetInfo(&free_after, &total);
- VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after)
- << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
-
-#if 0
- /* For testing mapped host memory, fill up device memory. */
- const size_t keep_mb = 1024;
-
- while (free_after > keep_mb * 1024 * 1024LL) {
- CUdeviceptr tmp;
- cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
- cuMemGetInfo(&free_after, &total);
- }
-#endif
- }
-
- void init_host_memory()
- {
- /* Limit amount of host mapped memory, because allocating too much can
- * cause system instability. Leave at least half or 4 GB of system
- * memory free, whichever is smaller. */
- size_t default_limit = 4 * 1024 * 1024 * 1024LL;
- size_t system_ram = system_physical_ram();
-
- if (system_ram > 0) {
- if (system_ram / 2 > default_limit) {
- map_host_limit = system_ram - default_limit;
- }
- else {
- map_host_limit = system_ram / 2;
- }
- }
- else {
- VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
- map_host_limit = 0;
- }
-
- /* Amount of device memory to keep is free after texture memory
- * and working memory allocations respectively. We set the working
- * memory limit headroom lower so that some space is left after all
- * texture memory allocations. */
- device_working_headroom = 32 * 1024 * 1024LL; // 32MB
- device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
-
- VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
- << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
- }
-
- void load_texture_info()
- {
- if (need_texture_info) {
- texture_info.copy_to_device();
- need_texture_info = false;
- }
- }
-
- void move_textures_to_host(size_t size, bool for_texture)
- {
- /* Signal to reallocate textures in host memory only. */
- move_texture_to_host = true;
-
- while (size > 0) {
- /* Find suitable memory allocation to move. */
- device_memory *max_mem = NULL;
- size_t max_size = 0;
- bool max_is_image = false;
-
- foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
- device_memory &mem = *pair.first;
- CUDAMem *cmem = &pair.second;
-
- bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
- bool is_image = is_texture && (mem.data_height > 1);
-
- /* Can't move this type of memory. */
- if (!is_texture || cmem->array) {
- continue;
- }
-
- /* Already in host memory. */
- if (cmem->use_mapped_host) {
- continue;
- }
-
- /* For other textures, only move image textures. */
- if (for_texture && !is_image) {
- continue;
- }
-
- /* Try to move largest allocation, prefer moving images. */
- if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
- max_is_image = is_image;
- max_size = mem.device_size;
- max_mem = &mem;
- }
- }
-
- /* Move to host memory. This part is mutex protected since
- * multiple CUDA devices could be moving the memory. The
- * first one will do it, and the rest will adopt the pointer. */
- if (max_mem) {
- VLOG(1) << "Move memory from device to host: " << max_mem->name;
-
- static thread_mutex move_mutex;
- thread_scoped_lock lock(move_mutex);
-
- /* Preserve the original device pointer, in case of multi device
- * we can't change it because the pointer mapping would break. */
- device_ptr prev_pointer = max_mem->device_pointer;
- size_t prev_size = max_mem->device_size;
-
- tex_free(*max_mem);
- tex_alloc(*max_mem);
- size = (max_size >= size) ? 0 : size - max_size;
-
- max_mem->device_pointer = prev_pointer;
- max_mem->device_size = prev_size;
- }
- else {
- break;
- }
- }
-
- /* Update texture info array with new pointers. */
- load_texture_info();
-
- move_texture_to_host = false;
- }
-
- CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0)
- {
- CUDAContextScope scope(this);
-
- CUdeviceptr device_pointer = 0;
- size_t size = mem.memory_size() + pitch_padding;
-
- CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
- const char *status = "";
-
- /* First try allocating in device memory, respecting headroom. We make
- * an exception for texture info. It is small and frequently accessed,
- * so treat it as working memory.
- *
- * If there is not enough room for working memory, we will try to move
- * textures to host memory, assuming the performance impact would have
- * been worse for working memory. */
- bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
- bool is_image = is_texture && (mem.data_height > 1);
-
- size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
-
- size_t total = 0, free = 0;
- cuMemGetInfo(&free, &total);
-
- /* Move textures to host memory if needed. */
- if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
- move_textures_to_host(size + headroom - free, is_texture);
- cuMemGetInfo(&free, &total);
- }
-
- /* Allocate in device memory. */
- if (!move_texture_to_host && (size + headroom) < free) {
- mem_alloc_result = cuMemAlloc(&device_pointer, size);
- if (mem_alloc_result == CUDA_SUCCESS) {
- status = " in device memory";
- }
- }
-
- /* Fall back to mapped host memory if needed and possible. */
-
- void *shared_pointer = 0;
-
- if (mem_alloc_result != CUDA_SUCCESS && can_map_host) {
- if (mem.shared_pointer) {
- /* Another device already allocated host memory. */
- mem_alloc_result = CUDA_SUCCESS;
- shared_pointer = mem.shared_pointer;
- }
- else if (map_host_used + size < map_host_limit) {
- /* Allocate host memory ourselves. */
- mem_alloc_result = cuMemHostAlloc(
- &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
-
- assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
- (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
- }
-
- if (mem_alloc_result == CUDA_SUCCESS) {
- cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
- map_host_used += size;
- status = " in host memory";
- }
- else {
- status = " failed, out of host memory";
- }
- }
-
- if (mem_alloc_result != CUDA_SUCCESS) {
- status = " failed, out of device and host memory";
- cuda_assert(mem_alloc_result);
- }
-
- if (mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")" << status;
- }
-
- mem.device_pointer = (device_ptr)device_pointer;
- mem.device_size = size;
- stats.mem_alloc(size);
-
- if (!mem.device_pointer) {
- return NULL;
- }
-
- /* Insert into map of allocations. */
- CUDAMem *cmem = &cuda_mem_map[&mem];
- if (shared_pointer != 0) {
- /* Replace host pointer with our host allocation. Only works if
- * CUDA memory layout is the same and has no pitch padding. Also
- * does not work if we move textures to host during a render,
- * since other devices might be using the memory. */
-
- if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
- mem.host_pointer != shared_pointer) {
- memcpy(shared_pointer, mem.host_pointer, size);
-
- /* A Call to device_memory::host_free() should be preceded by
- * a call to device_memory::device_free() for host memory
- * allocated by a device to be handled properly. Two exceptions
- * are here and a call in OptiXDevice::generic_alloc(), where
- * the current host memory can be assumed to be allocated by
- * device_memory::host_alloc(), not by a device */
-
- mem.host_free();
- mem.host_pointer = shared_pointer;
- }
- mem.shared_pointer = shared_pointer;
- mem.shared_counter++;
- cmem->use_mapped_host = true;
- }
- else {
- cmem->use_mapped_host = false;
- }
-
- return cmem;
- }
-
- void generic_copy_to(device_memory &mem)
- {
- if (mem.host_pointer && mem.device_pointer) {
- CUDAContextScope scope(this);
-
- /* If use_mapped_host of mem is false, the current device only
- * uses device memory allocated by cuMemAlloc regardless of
- * mem.host_pointer and mem.shared_pointer, and should copy
- * data from mem.host_pointer. */
-
- if (cuda_mem_map[&mem].use_mapped_host == false || mem.host_pointer != mem.shared_pointer) {
- cuda_assert(cuMemcpyHtoD(
- cuda_device_ptr(mem.device_pointer), mem.host_pointer, mem.memory_size()));
- }
- }
- }
-
- void generic_free(device_memory &mem)
- {
- if (mem.device_pointer) {
- CUDAContextScope scope(this);
- const CUDAMem &cmem = cuda_mem_map[&mem];
-
- /* If cmem.use_mapped_host is true, reference counting is used
- * to safely free a mapped host memory. */
-
- if (cmem.use_mapped_host) {
- assert(mem.shared_pointer);
- if (mem.shared_pointer) {
- assert(mem.shared_counter > 0);
- if (--mem.shared_counter == 0) {
- if (mem.host_pointer == mem.shared_pointer) {
- mem.host_pointer = 0;
- }
- cuMemFreeHost(mem.shared_pointer);
- mem.shared_pointer = 0;
- }
- }
- map_host_used -= mem.device_size;
- }
- else {
- /* Free device memory. */
- cuMemFree(mem.device_pointer);
- }
-
- stats.mem_free(mem.device_size);
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- cuda_mem_map.erase(cuda_mem_map.find(&mem));
- }
- }
-
- void mem_alloc(device_memory &mem)
- {
- if (mem.type == MEM_PIXELS && !background) {
- pixels_alloc(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- assert(!"mem_alloc not supported for textures.");
- }
- else {
- generic_alloc(mem);
- }
- }
-
- void mem_copy_to(device_memory &mem)
- {
- if (mem.type == MEM_PIXELS) {
- assert(!"mem_copy_to not supported for pixels.");
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free(mem);
- tex_alloc(mem);
- }
- else {
- if (!mem.device_pointer) {
- generic_alloc(mem);
- }
-
- generic_copy_to(mem);
- }
- }
-
- void mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
- {
- if (mem.type == MEM_PIXELS && !background) {
- pixels_copy_from(mem, y, w, h);
- }
- else if (mem.type == MEM_TEXTURE) {
- assert(!"mem_copy_from not supported for textures.");
- }
- else if (mem.host_pointer) {
- const size_t size = elem * w * h;
- const size_t offset = elem * y * w;
-
- if (mem.device_pointer) {
- const CUDAContextScope scope(this);
- cuda_assert(cuMemcpyDtoH(
- (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
- }
- else {
- memset((char *)mem.host_pointer + offset, 0, size);
- }
- }
- }
-
- void mem_zero(device_memory &mem)
- {
- if (!mem.device_pointer) {
- mem_alloc(mem);
- }
- if (!mem.device_pointer) {
- return;
- }
-
- /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
- * regardless of mem.host_pointer and mem.shared_pointer. */
- if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
- const CUDAContextScope scope(this);
- cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()));
- }
- else if (mem.host_pointer) {
- memset(mem.host_pointer, 0, mem.memory_size());
- }
- }
-
- void mem_free(device_memory &mem)
- {
- if (mem.type == MEM_PIXELS && !background) {
- pixels_free(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free(mem);
- }
- else {
- generic_free(mem);
- }
- }
-
- virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
- {
- return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
- }
-
- void const_copy_to(const char *name, void *host, size_t size)
- {
- CUDAContextScope scope(this);
- CUdeviceptr mem;
- size_t bytes;
-
- cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
- // assert(bytes == size);
- cuda_assert(cuMemcpyHtoD(mem, host, size));
- }
-
- void tex_alloc(device_memory &mem)
- {
- CUDAContextScope scope(this);
-
- /* General variables for both architectures */
- string bind_name = mem.name;
- size_t dsize = datatype_size(mem.data_type);
- size_t size = mem.memory_size();
-
- CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
- switch (mem.extension) {
- case EXTENSION_REPEAT:
- address_mode = CU_TR_ADDRESS_MODE_WRAP;
- break;
- case EXTENSION_EXTEND:
- address_mode = CU_TR_ADDRESS_MODE_CLAMP;
- break;
- case EXTENSION_CLIP:
- address_mode = CU_TR_ADDRESS_MODE_BORDER;
- break;
- default:
- assert(0);
- break;
- }
-
- CUfilter_mode filter_mode;
- if (mem.interpolation == INTERPOLATION_CLOSEST) {
- filter_mode = CU_TR_FILTER_MODE_POINT;
- }
- else {
- filter_mode = CU_TR_FILTER_MODE_LINEAR;
- }
-
- /* Data Storage */
- if (mem.interpolation == INTERPOLATION_NONE) {
- generic_alloc(mem);
- generic_copy_to(mem);
-
- CUdeviceptr cumem;
- size_t cubytes;
-
- cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
-
- if (cubytes == 8) {
- /* 64 bit device pointer */
- uint64_t ptr = mem.device_pointer;
- cuda_assert(cuMemcpyHtoD(cumem, (void *)&ptr, cubytes));
- }
- else {
- /* 32 bit device pointer */
- uint32_t ptr = (uint32_t)mem.device_pointer;
- cuda_assert(cuMemcpyHtoD(cumem, (void *)&ptr, cubytes));
- }
- return;
- }
-
- /* Image Texture Storage */
- CUarray_format_enum format;
- switch (mem.data_type) {
- case TYPE_UCHAR:
- format = CU_AD_FORMAT_UNSIGNED_INT8;
- break;
- case TYPE_UINT16:
- format = CU_AD_FORMAT_UNSIGNED_INT16;
- break;
- case TYPE_UINT:
- format = CU_AD_FORMAT_UNSIGNED_INT32;
- break;
- case TYPE_INT:
- format = CU_AD_FORMAT_SIGNED_INT32;
- break;
- case TYPE_FLOAT:
- format = CU_AD_FORMAT_FLOAT;
- break;
- case TYPE_HALF:
- format = CU_AD_FORMAT_HALF;
- break;
- default:
- assert(0);
- return;
- }
-
- CUDAMem *cmem = NULL;
- CUarray array_3d = NULL;
- size_t src_pitch = mem.data_width * dsize * mem.data_elements;
- size_t dst_pitch = src_pitch;
-
- if (mem.data_depth > 1) {
- /* 3D texture using array, there is no API for linear memory. */
- CUDA_ARRAY3D_DESCRIPTOR desc;
-
- desc.Width = mem.data_width;
- desc.Height = mem.data_height;
- desc.Depth = mem.data_depth;
- desc.Format = format;
- desc.NumChannels = mem.data_elements;
- desc.Flags = 0;
-
- VLOG(1) << "Array 3D allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- cuda_assert(cuArray3DCreate(&array_3d, &desc));
-
- if (!array_3d) {
- return;
- }
-
- CUDA_MEMCPY3D param;
- memset(&param, 0, sizeof(param));
- param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
- param.dstArray = array_3d;
- param.srcMemoryType = CU_MEMORYTYPE_HOST;
- param.srcHost = mem.host_pointer;
- param.srcPitch = src_pitch;
- param.WidthInBytes = param.srcPitch;
- param.Height = mem.data_height;
- param.Depth = mem.data_depth;
-
- cuda_assert(cuMemcpy3D(&param));
-
- mem.device_pointer = (device_ptr)array_3d;
- mem.device_size = size;
- stats.mem_alloc(size);
-
- cmem = &cuda_mem_map[&mem];
- cmem->texobject = 0;
- cmem->array = array_3d;
- }
- else if (mem.data_height > 0) {
- /* 2D texture, using pitch aligned linear memory. */
- int alignment = 0;
- cuda_assert(
- cuDeviceGetAttribute(&alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
- dst_pitch = align_up(src_pitch, alignment);
- size_t dst_size = dst_pitch * mem.data_height;
-
- cmem = generic_alloc(mem, dst_size - mem.memory_size());
- if (!cmem) {
- return;
- }
-
- CUDA_MEMCPY2D param;
- memset(&param, 0, sizeof(param));
- param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
- param.dstDevice = mem.device_pointer;
- param.dstPitch = dst_pitch;
- param.srcMemoryType = CU_MEMORYTYPE_HOST;
- param.srcHost = mem.host_pointer;
- param.srcPitch = src_pitch;
- param.WidthInBytes = param.srcPitch;
- param.Height = mem.data_height;
-
- cuda_assert(cuMemcpy2DUnaligned(&param));
- }
- else {
- /* 1D texture, using linear memory. */
- cmem = generic_alloc(mem);
- if (!cmem) {
- return;
- }
-
- cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
- }
-
- /* Kepler+, bindless textures. */
- int flat_slot = 0;
- if (string_startswith(mem.name, "__tex_image")) {
- int pos = string(mem.name).rfind("_");
- flat_slot = atoi(mem.name + pos + 1);
- }
- else {
- assert(0);
- }
-
- CUDA_RESOURCE_DESC resDesc;
- memset(&resDesc, 0, sizeof(resDesc));
-
- if (array_3d) {
- resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
- resDesc.res.array.hArray = array_3d;
- resDesc.flags = 0;
- }
- else if (mem.data_height > 0) {
- resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
- resDesc.res.pitch2D.devPtr = mem.device_pointer;
- resDesc.res.pitch2D.format = format;
- resDesc.res.pitch2D.numChannels = mem.data_elements;
- resDesc.res.pitch2D.height = mem.data_height;
- resDesc.res.pitch2D.width = mem.data_width;
- resDesc.res.pitch2D.pitchInBytes = dst_pitch;
- }
- else {
- resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
- resDesc.res.linear.devPtr = mem.device_pointer;
- resDesc.res.linear.format = format;
- resDesc.res.linear.numChannels = mem.data_elements;
- resDesc.res.linear.sizeInBytes = mem.device_size;
- }
-
- CUDA_TEXTURE_DESC texDesc;
- memset(&texDesc, 0, sizeof(texDesc));
- texDesc.addressMode[0] = address_mode;
- texDesc.addressMode[1] = address_mode;
- texDesc.addressMode[2] = address_mode;
- texDesc.filterMode = filter_mode;
- texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
-
- cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
-
- /* Resize once */
- if (flat_slot >= texture_info.size()) {
- /* Allocate some slots in advance, to reduce amount
- * of re-allocations. */
- texture_info.resize(flat_slot + 128);
- }
-
- /* Set Mapping and tag that we need to (re-)upload to device */
- TextureInfo &info = texture_info[flat_slot];
- info.data = (uint64_t)cmem->texobject;
- info.cl_buffer = 0;
- info.interpolation = mem.interpolation;
- info.extension = mem.extension;
- info.width = mem.data_width;
- info.height = mem.data_height;
- info.depth = mem.data_depth;
- need_texture_info = true;
- }
-
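The bindless-texture path above gets its slot from the memory name: image textures carry a trailing "_<slot>" suffix, so the code takes everything after the last underscore as the slot index and then grows texture_info in 128-slot steps to keep reallocations rare. A standalone sketch of that lookup, with a hypothetical helper name and an example name of the assumed form:

#include <cstdlib>
#include <cstring>

/* Parse the flat slot index from a texture name such as "__tex_image_float4_012".
 * Returns 0 when no trailing "_<number>" is present. */
static int parse_flat_slot(const char *name)
{
  const char *underscore = strrchr(name, '_');
  return underscore ? atoi(underscore + 1) : 0;
}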
- void tex_free(device_memory &mem)
- {
- if (mem.device_pointer) {
- CUDAContextScope scope(this);
- const CUDAMem &cmem = cuda_mem_map[&mem];
-
- if (cmem.texobject) {
- /* Free bindless texture. */
- cuTexObjectDestroy(cmem.texobject);
- }
-
- if (cmem.array) {
- /* Free array. */
- cuArrayDestroy(cmem.array);
- stats.mem_free(mem.device_size);
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- cuda_mem_map.erase(cuda_mem_map.find(&mem));
- }
- else {
- generic_free(mem);
- }
- }
- }
-
-#define CUDA_GET_BLOCKSIZE(func, w, h) \
- int threads_per_block; \
- cuda_assert( \
- cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
- int threads = (int)sqrt((float)threads_per_block); \
- int xblocks = ((w) + threads - 1) / threads; \
- int yblocks = ((h) + threads - 1) / threads;
-
-#define CUDA_LAUNCH_KERNEL(func, args) \
- cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0));
-
-/* Similar to the above, but for 1-dimensional blocks. */
-#define CUDA_GET_BLOCKSIZE_1D(func, w, h) \
- int threads_per_block; \
- cuda_assert( \
- cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
- int xblocks = ((w) + threads_per_block - 1) / threads_per_block; \
- int yblocks = h;
-
-#define CUDA_LAUNCH_KERNEL_1D(func, args) \
- cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads_per_block, 1, 1, 0, 0, args, 0));
-
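CUDA_GET_BLOCKSIZE builds a square thread block from the kernel's maximum threads per block (its integer square root per axis) and rounds the work size up to whole blocks, while the 1D variants spend the whole maximum along x and use h, here the number of NLM shifts, as the y block count. A plain C++ sketch of that arithmetic, assuming a limit of 1024 threads per block:

#include <cmath>
#include <cstdio>

int main()
{
  const int threads_per_block = 1024; /* as reported by CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK */
  const int w = 1920, h = 1080;

  /* 2D case: square thread block, grid rounded up to cover the image. */
  const int threads = (int)sqrt((float)threads_per_block);  /* 32 */
  const int xblocks_2d = (w + threads - 1) / threads;       /* 60 */
  const int yblocks_2d = (h + threads - 1) / threads;       /* 34 */
  printf("2D: %d x %d blocks of %d x %d threads\n", xblocks_2d, yblocks_2d, threads, threads);

  /* 1D case: flatten the pixels along x, one row of blocks per shift. */
  const int num_shifts = 25; /* e.g. (2 * 2 + 1)^2 for radius 2 */
  const int xblocks_1d = (w * h + threads_per_block - 1) / threads_per_block;
  printf("1D: %d x %d blocks of %d threads\n", xblocks_1d, num_shifts, threads_per_block);
  return 0;
}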
- bool denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task)
- {
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- int stride = task->buffer.stride;
- int w = task->buffer.width;
- int h = task->buffer.h;
- int r = task->nlm_state.r;
- int f = task->nlm_state.f;
- float a = task->nlm_state.a;
- float k_2 = task->nlm_state.k_2;
-
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2 * r + 1) * (2 * r + 1);
- int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
- int frame_offset = 0;
-
- if (have_error())
- return false;
-
- CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
- CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
- CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts;
- CUdeviceptr scale_ptr = 0;
-
- cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float) * pass_stride));
- cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float) * pass_stride));
-
- {
- CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
- cuda_assert(cuModuleGetFunction(
- &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
- cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
- cuda_assert(cuModuleGetFunction(
- &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
- cuda_assert(cuModuleGetFunction(
- &cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
-
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
-
- CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts);
-
- void *calc_difference_args[] = {&guide_ptr,
- &variance_ptr,
- &scale_ptr,
- &difference,
- &w,
- &h,
- &stride,
- &pass_stride,
- &r,
- &channel_offset,
- &frame_offset,
- &a,
- &k_2};
- void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
- void *calc_weight_args[] = {
- &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
- void *update_output_args[] = {&blurDifference,
- &image_ptr,
- &out_ptr,
- &weightAccum,
- &w,
- &h,
- &stride,
- &pass_stride,
- &channel_offset,
- &r,
- &f};
-
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
- }
-
- {
- CUfunction cuNLMNormalize;
- cuda_assert(cuModuleGetFunction(
- &cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
- cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
- void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
- CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
- CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
- cuda_assert(cuCtxSynchronize());
- }
-
- return !have_error();
- }
-
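denoising_non_local_means() above splits task->buffer.temporary_mem into three consecutive regions: the per-shift differences, their blurred copies, and the weight accumulator, where the first two each hold pass_stride * num_shifts floats and num_shifts = (2r + 1)^2 covers the full search window. The offsets reduce to the following sketch (hypothetical struct and function names):

#include <cstddef>

struct NLMScratchLayout {
  size_t difference;      /* byte offset of the difference region */
  size_t blur_difference; /* follows one pass_stride * num_shifts float region */
  size_t weight_accum;    /* follows a second such region */
};

static NLMScratchLayout nlm_scratch_layout(int pass_stride, int radius)
{
  const int num_shifts = (2 * radius + 1) * (2 * radius + 1);
  const size_t region = sizeof(float) * (size_t)pass_stride * (size_t)num_shifts;
  return {0, region, 2 * region};
}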
- bool denoising_construct_transform(DenoisingTask *task)
- {
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterConstructTransform;
- cuda_assert(cuModuleGetFunction(
- &cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
- CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h);
-
- void *args[] = {&task->buffer.mem.device_pointer,
- &task->tile_info_mem.device_pointer,
- &task->storage.transform.device_pointer,
- &task->storage.rank.device_pointer,
- &task->filter_area,
- &task->rect,
- &task->radius,
- &task->pca_threshold,
- &task->buffer.pass_stride,
- &task->buffer.frame_stride,
- &task->buffer.use_time};
- CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
- }
-
- bool denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task)
- {
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- int r = task->radius;
- int f = 4;
- float a = 1.0f;
- float k_2 = task->nlm_k_2;
-
- int w = task->reconstruction_state.source_w;
- int h = task->reconstruction_state.source_h;
- int stride = task->buffer.stride;
- int frame_offset = frame * task->buffer.frame_stride;
- int t = task->tile_info->frames[frame];
-
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2 * r + 1) * (2 * r + 1);
-
- if (have_error())
- return false;
-
- CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
- CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
-
- CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
- cuda_assert(cuModuleGetFunction(
- &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
- cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
- cuda_assert(cuModuleGetFunction(
- &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
- cuda_assert(cuModuleGetFunction(
- &cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
-
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
-
- CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
- task->reconstruction_state.source_w *
- task->reconstruction_state.source_h,
- num_shifts);
-
- void *calc_difference_args[] = {&color_ptr,
- &color_variance_ptr,
- &scale_ptr,
- &difference,
- &w,
- &h,
- &stride,
- &pass_stride,
- &r,
- &pass_stride,
- &frame_offset,
- &a,
- &k_2};
- void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
- void *calc_weight_args[] = {
- &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
- void *construct_gramian_args[] = {&t,
- &blurDifference,
- &task->buffer.mem.device_pointer,
- &task->storage.transform.device_pointer,
- &task->storage.rank.device_pointer,
- &task->storage.XtWX.device_pointer,
- &task->storage.XtWY.device_pointer,
- &task->reconstruction_state.filter_window,
- &w,
- &h,
- &stride,
- &pass_stride,
- &r,
- &f,
- &frame_offset,
- &task->buffer.use_time};
-
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
- }
-
- bool denoising_solve(device_ptr output_ptr, DenoisingTask *task)
- {
- CUfunction cuFinalize;
- cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
- cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
- void *finalize_args[] = {&output_ptr,
- &task->storage.rank.device_pointer,
- &task->storage.XtWX.device_pointer,
- &task->storage.XtWY.device_pointer,
- &task->filter_area,
- &task->reconstruction_state.buffer_params.x,
- &task->render_buffer.samples};
- CUDA_GET_BLOCKSIZE(
- cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h);
- CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
- }
-
- bool denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect,
- DenoisingTask *task)
- {
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterCombineHalves;
- cuda_assert(cuModuleGetFunction(
- &cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(
- cuFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r};
- CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
- }
-
- bool denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task)
- {
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterDivideShadow;
- cuda_assert(cuModuleGetFunction(
- &cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(
- cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {&task->render_buffer.samples,
- &task->tile_info_mem.device_pointer,
- &a_ptr,
- &b_ptr,
- &sample_variance_ptr,
- &sv_variance_ptr,
- &buffer_variance_ptr,
- &task->rect,
- &task->render_buffer.pass_stride,
- &task->render_buffer.offset};
- CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
- }
-
- bool denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task)
- {
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterGetFeature;
- cuda_assert(cuModuleGetFunction(
- &cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(
- cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {&task->render_buffer.samples,
- &task->tile_info_mem.device_pointer,
- &mean_offset,
- &variance_offset,
- &mean_ptr,
- &variance_ptr,
- &scale,
- &task->rect,
- &task->render_buffer.pass_stride,
- &task->render_buffer.offset};
- CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
- }
-
- bool denoising_write_feature(int out_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task)
- {
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterWriteFeature;
- cuda_assert(cuModuleGetFunction(
- &cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w);
-
- void *args[] = {&task->render_buffer.samples,
- &task->reconstruction_state.buffer_params,
- &task->filter_area,
- &from_ptr,
- &buffer_ptr,
- &out_offset,
- &task->rect};
- CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
- }
-
- bool denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task)
- {
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterDetectOutliers;
- cuda_assert(cuModuleGetFunction(
- &cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(
- cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {&image_ptr,
- &variance_ptr,
- &depth_ptr,
- &output_ptr,
- &task->rect,
- &task->buffer.pass_stride};
-
- CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
- }
-
- void denoise(RenderTile &rtile, DenoisingTask &denoising)
- {
- denoising.functions.construct_transform = function_bind(
- &CUDADevice::denoising_construct_transform, this, &denoising);
- denoising.functions.accumulate = function_bind(
- &CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
- denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising);
- denoising.functions.divide_shadow = function_bind(
- &CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.non_local_means = function_bind(
- &CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
- denoising.functions.combine_halves = function_bind(
- &CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(
- &CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.write_feature = function_bind(
- &CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising);
- denoising.functions.detect_outliers = function_bind(
- &CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
-
- denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
- denoising.render_buffer.samples = rtile.sample;
- denoising.buffer.gpu_temporary_mem = true;
-
- denoising.run_denoising(&rtile);
- }
-
- void path_trace(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles)
- {
- scoped_timer timer(&rtile.buffers->render_time);
-
- if (have_error())
- return;
-
- CUDAContextScope scope(this);
- CUfunction cuPathTrace;
-
- /* Get kernel function. */
- if (task.integrator_branched) {
- cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
- }
-
- if (have_error()) {
- return;
- }
-
- cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
-
- /* Allocate work tile. */
- work_tiles.alloc(1);
-
- WorkTile *wtile = work_tiles.data();
- wtile->x = rtile.x;
- wtile->y = rtile.y;
- wtile->w = rtile.w;
- wtile->h = rtile.h;
- wtile->offset = rtile.offset;
- wtile->stride = rtile.stride;
- wtile->buffer = (float *)cuda_device_ptr(rtile.buffer);
-
- /* Prepare work size. More step samples render faster, but for now we
- * remain conservative for GPUs connected to a display to avoid driver
- * timeouts and display freezing. */
- int min_blocks, num_threads_per_block;
- cuda_assert(cuOccupancyMaxPotentialBlockSize(
- &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
- if (!info.display_device) {
- min_blocks *= 8;
- }
-
- uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
-
- /* Render all samples. */
- int start_sample = rtile.start_sample;
- int end_sample = rtile.start_sample + rtile.num_samples;
-
- for (int sample = start_sample; sample < end_sample; sample += step_samples) {
- /* Setup and copy work tile to device. */
- wtile->start_sample = sample;
- wtile->num_samples = min(step_samples, end_sample - sample);
- work_tiles.copy_to_device();
-
- CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer);
- uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
- uint num_blocks = divide_up(total_work_size, num_threads_per_block);
-
- /* Launch kernel. */
- void *args[] = {&d_work_tiles, &total_work_size};
-
- cuda_assert(cuLaunchKernel(
- cuPathTrace, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
-
- cuda_assert(cuCtxSynchronize());
-
- /* Update progress. */
- rtile.sample = sample + wtile->num_samples;
- task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
-
- if (task.get_cancel()) {
- if (task.need_finish_queue == false)
- break;
- }
- }
- }
-
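The sample batching in path_trace() asks cuOccupancyMaxPotentialBlockSize() for the block count and block size needed to saturate the device, scales the block count by eight when no display is attached, and divides the resulting thread count by the tile's pixel count to get the number of samples per launch. A plain C++ sketch of that heuristic (hypothetical function name):

static unsigned int step_samples_for_tile(int min_blocks,
                                          int num_threads_per_block,
                                          int tile_w,
                                          int tile_h,
                                          bool display_device)
{
  if (!display_device) {
    min_blocks *= 8; /* longer launches are acceptable without a display attached */
  }
  const unsigned int work = (unsigned int)(min_blocks * num_threads_per_block);
  const unsigned int pixels = (unsigned int)(tile_w * tile_h);
  return (work + pixels - 1) / pixels; /* divide_up(): at least one sample per launch */
}

For example, an occupancy result of 40 blocks of 1024 threads on a non-display device and a 256x256 tile would give 40 * 8 * 1024 / 65536 = 5 samples per launch.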
- void film_convert(DeviceTask &task,
- device_ptr buffer,
- device_ptr rgba_byte,
- device_ptr rgba_half)
- {
- if (have_error())
- return;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilmConvert;
- CUdeviceptr d_rgba = map_pixels((rgba_byte) ? rgba_byte : rgba_half);
- CUdeviceptr d_buffer = cuda_device_ptr(buffer);
-
- /* get kernel function */
- if (rgba_half) {
- cuda_assert(
- cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"));
- }
-
- float sample_scale = 1.0f / (task.sample + 1);
-
- /* pass in parameters */
- void *args[] = {&d_rgba,
- &d_buffer,
- &sample_scale,
- &task.x,
- &task.y,
- &task.w,
- &task.h,
- &task.offset,
- &task.stride};
-
- /* launch kernel */
- int threads_per_block;
- cuda_assert(cuFuncGetAttribute(
- &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert));
-
- int xthreads = (int)sqrt(threads_per_block);
- int ythreads = (int)sqrt(threads_per_block);
- int xblocks = (task.w + xthreads - 1) / xthreads;
- int yblocks = (task.h + ythreads - 1) / ythreads;
-
- cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1));
-
- cuda_assert(cuLaunchKernel(cuFilmConvert,
- xblocks,
- yblocks,
- 1, /* blocks */
- xthreads,
- ythreads,
- 1, /* threads */
- 0,
- 0,
- args,
- 0));
-
- unmap_pixels((rgba_byte) ? rgba_byte : rgba_half);
-
- cuda_assert(cuCtxSynchronize());
- }
-
- void shader(DeviceTask &task)
- {
- if (have_error())
- return;
-
- CUDAContextScope scope(this);
-
- CUfunction cuShader;
- CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
- CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
-
- /* get kernel function */
- if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
- cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake"));
- }
- else if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
- cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background"));
- }
-
- /* do tasks in smaller chunks, so we can cancel them */
- const int shader_chunk_size = 65536;
- const int start = task.shader_x;
- const int end = task.shader_x + task.shader_w;
- int offset = task.offset;
-
- bool canceled = false;
- for (int sample = 0; sample < task.num_samples && !canceled; sample++) {
- for (int shader_x = start; shader_x < end; shader_x += shader_chunk_size) {
- int shader_w = min(shader_chunk_size, end - shader_x);
-
- /* pass in parameters */
- void *args[8];
- int arg = 0;
- args[arg++] = &d_input;
- args[arg++] = &d_output;
- args[arg++] = &task.shader_eval_type;
- if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
- args[arg++] = &task.shader_filter;
- }
- args[arg++] = &shader_x;
- args[arg++] = &shader_w;
- args[arg++] = &offset;
- args[arg++] = &sample;
-
- /* launch kernel */
- int threads_per_block;
- cuda_assert(cuFuncGetAttribute(
- &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
-
- int xblocks = (shader_w + threads_per_block - 1) / threads_per_block;
-
- cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuLaunchKernel(cuShader,
- xblocks,
- 1,
- 1, /* blocks */
- threads_per_block,
- 1,
- 1, /* threads */
- 0,
- 0,
- args,
- 0));
-
- cuda_assert(cuCtxSynchronize());
-
- if (task.get_cancel()) {
- canceled = true;
- break;
- }
- }
-
- task.update_progress(NULL);
- }
- }
-
- CUdeviceptr map_pixels(device_ptr mem)
- {
- if (!background) {
- PixelMem pmem = pixel_mem_map[mem];
- CUdeviceptr buffer;
-
- size_t bytes;
- cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0));
- cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource));
-
- return buffer;
- }
-
- return cuda_device_ptr(mem);
- }
-
- void unmap_pixels(device_ptr mem)
- {
- if (!background) {
- PixelMem pmem = pixel_mem_map[mem];
-
- cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0));
- }
- }
-
- void pixels_alloc(device_memory &mem)
- {
- PixelMem pmem;
-
- pmem.w = mem.data_width;
- pmem.h = mem.data_height;
-
- CUDAContextScope scope(this);
-
- glGenBuffers(1, &pmem.cuPBO);
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
- if (mem.data_type == TYPE_HALF)
- glBufferData(
- GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(GLhalf) * 4, NULL, GL_DYNAMIC_DRAW);
- else
- glBufferData(
- GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(uint8_t) * 4, NULL, GL_DYNAMIC_DRAW);
-
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
- glActiveTexture(GL_TEXTURE0);
- glGenTextures(1, &pmem.cuTexId);
- glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
- if (mem.data_type == TYPE_HALF)
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
- else
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
- glBindTexture(GL_TEXTURE_2D, 0);
-
- CUresult result = cuGraphicsGLRegisterBuffer(
- &pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
-
- if (result == CUDA_SUCCESS) {
- mem.device_pointer = pmem.cuTexId;
- pixel_mem_map[mem.device_pointer] = pmem;
-
- mem.device_size = mem.memory_size();
- stats.mem_alloc(mem.device_size);
-
- return;
- }
- else {
- /* failed to register buffer, fall back to no interop */
- glDeleteBuffers(1, &pmem.cuPBO);
- glDeleteTextures(1, &pmem.cuTexId);
-
- background = true;
- }
- }
-
- void pixels_copy_from(device_memory &mem, int y, int w, int h)
- {
- PixelMem pmem = pixel_mem_map[mem.device_pointer];
-
- CUDAContextScope scope(this);
-
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
- uchar *pixels = (uchar *)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
- size_t offset = sizeof(uchar) * 4 * y * w;
- memcpy((uchar *)mem.host_pointer + offset, pixels + offset, sizeof(uchar) * 4 * w * h);
- glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
- }
-
- void pixels_free(device_memory &mem)
- {
- if (mem.device_pointer) {
- PixelMem pmem = pixel_mem_map[mem.device_pointer];
-
- CUDAContextScope scope(this);
-
- cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
- glDeleteBuffers(1, &pmem.cuPBO);
- glDeleteTextures(1, &pmem.cuTexId);
-
- pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
- mem.device_pointer = 0;
-
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- }
- }
-
- void draw_pixels(device_memory &mem,
- int y,
- int w,
- int h,
- int width,
- int height,
- int dx,
- int dy,
- int dw,
- int dh,
- bool transparent,
- const DeviceDrawParams &draw_params)
- {
- assert(mem.type == MEM_PIXELS);
-
- if (!background) {
- const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL);
- PixelMem pmem = pixel_mem_map[mem.device_pointer];
- float *vpointer;
-
- CUDAContextScope scope(this);
-
- /* for multiple devices, this assumes the inefficient approach of allocating
- * all pixels on the device even though we only render to a subset */
- size_t offset = 4 * y * w;
-
- if (mem.data_type == TYPE_HALF)
- offset *= sizeof(GLhalf);
- else
- offset *= sizeof(uint8_t);
-
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
- glActiveTexture(GL_TEXTURE0);
- glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
- if (mem.data_type == TYPE_HALF) {
- glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void *)offset);
- }
- else {
- glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void *)offset);
- }
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
- if (transparent) {
- glEnable(GL_BLEND);
- glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
- }
-
- GLint shader_program;
- if (use_fallback_shader) {
- if (!bind_fallback_display_space_shader(dw, dh)) {
- return;
- }
- shader_program = fallback_shader_program;
- }
- else {
- draw_params.bind_display_space_shader_cb();
- glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program);
- }
-
- if (!vertex_buffer) {
- glGenBuffers(1, &vertex_buffer);
- }
-
- glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
- /* invalidate old contents -
- * avoids stalling if buffer is still waiting in queue to be rendered */
- glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
-
- vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
-
- if (vpointer) {
- /* texture coordinate - vertex pair */
- vpointer[0] = 0.0f;
- vpointer[1] = 0.0f;
- vpointer[2] = dx;
- vpointer[3] = dy;
-
- vpointer[4] = (float)w / (float)pmem.w;
- vpointer[5] = 0.0f;
- vpointer[6] = (float)width + dx;
- vpointer[7] = dy;
-
- vpointer[8] = (float)w / (float)pmem.w;
- vpointer[9] = (float)h / (float)pmem.h;
- vpointer[10] = (float)width + dx;
- vpointer[11] = (float)height + dy;
-
- vpointer[12] = 0.0f;
- vpointer[13] = (float)h / (float)pmem.h;
- vpointer[14] = dx;
- vpointer[15] = (float)height + dy;
-
- glUnmapBuffer(GL_ARRAY_BUFFER);
- }
-
- GLuint vertex_array_object;
- GLuint position_attribute, texcoord_attribute;
-
- glGenVertexArrays(1, &vertex_array_object);
- glBindVertexArray(vertex_array_object);
-
- texcoord_attribute = glGetAttribLocation(shader_program, "texCoord");
- position_attribute = glGetAttribLocation(shader_program, "pos");
-
- glEnableVertexAttribArray(texcoord_attribute);
- glEnableVertexAttribArray(position_attribute);
-
- glVertexAttribPointer(
- texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
- glVertexAttribPointer(position_attribute,
- 2,
- GL_FLOAT,
- GL_FALSE,
- 4 * sizeof(float),
- (const GLvoid *)(sizeof(float) * 2));
-
- glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
-
- if (use_fallback_shader) {
- glUseProgram(0);
- }
- else {
- draw_params.unbind_display_space_shader_cb();
- }
-
- if (transparent) {
- glDisable(GL_BLEND);
- }
-
- glBindTexture(GL_TEXTURE_2D, 0);
-
- return;
- }
-
- Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params);
- }
-
- void thread_run(DeviceTask *task)
- {
- CUDAContextScope scope(this);
-
- if (task->type == DeviceTask::RENDER || task->type == DeviceTask::DENOISE) {
- DeviceRequestedFeatures requested_features;
- if (use_split_kernel()) {
- if (split_kernel == NULL) {
- split_kernel = new CUDASplitKernel(this);
- split_kernel->load_kernels(requested_features);
- }
- }
-
- device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
-
- /* keep rendering tiles until done */
- RenderTile tile;
- DenoisingTask denoising(this, *task);
-
- while (task->acquire_tile(this, tile)) {
- if (tile.task == RenderTile::PATH_TRACE) {
- if (use_split_kernel()) {
- device_only_memory<uchar> void_buffer(this, "void_buffer");
- split_kernel->path_trace(task, tile, void_buffer, void_buffer);
- }
- else {
- path_trace(*task, tile, work_tiles);
- }
- }
- else if (tile.task == RenderTile::DENOISE) {
- tile.sample = tile.start_sample + tile.num_samples;
-
- denoise(tile, denoising);
-
- task->update_progress(&tile, tile.w * tile.h);
- }
-
- task->release_tile(tile);
-
- if (task->get_cancel()) {
- if (task->need_finish_queue == false)
- break;
- }
- }
-
- work_tiles.free();
- }
- else if (task->type == DeviceTask::SHADER) {
- shader(*task);
-
- cuda_assert(cuCtxSynchronize());
- }
- else if (task->type == DeviceTask::DENOISE_BUFFER) {
- RenderTile tile;
- tile.x = task->x;
- tile.y = task->y;
- tile.w = task->w;
- tile.h = task->h;
- tile.buffer = task->buffer;
- tile.sample = task->sample + task->num_samples;
- tile.num_samples = task->num_samples;
- tile.start_sample = task->sample;
- tile.offset = task->offset;
- tile.stride = task->stride;
- tile.buffers = task->buffers;
-
- DenoisingTask denoising(this, *task);
- denoise(tile, denoising);
- task->update_progress(&tile, tile.w * tile.h);
- }
- }
-
- class CUDADeviceTask : public DeviceTask {
- public:
- CUDADeviceTask(CUDADevice *device, DeviceTask &task) : DeviceTask(task)
- {
- run = function_bind(&CUDADevice::thread_run, device, this);
- }
- };
-
- void task_add(DeviceTask &task)
- {
- CUDAContextScope scope(this);
-
- /* Load texture info. */
- load_texture_info();
-
- /* Synchronize all memory copies before executing task. */
- cuda_assert(cuCtxSynchronize());
-
- if (task.type == DeviceTask::FILM_CONVERT) {
- /* must be done in the main thread due to OpenGL access */
- film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
- }
- else {
- task_pool.push(new CUDADeviceTask(this, task));
- }
- }
-
- void task_wait()
- {
- task_pool.wait();
- }
-
- void task_cancel()
- {
- task_pool.cancel();
- }
-
- friend class CUDASplitKernelFunction;
- friend class CUDASplitKernel;
- friend class CUDAContextScope;
-};
-
-/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class
- * now that the definition of that class is complete
- */
-#undef cuda_assert
-#define cuda_assert(stmt) \
- { \
- CUresult result = stmt; \
-\
- if (result != CUDA_SUCCESS) { \
- string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
- if (device->error_msg == "") \
- device->error_msg = message; \
- fprintf(stderr, "%s\n", message.c_str()); \
- /*cuda_abort();*/ \
- device->cuda_error_documentation(); \
- } \
- } \
- (void)0
-
-/* CUDA context scope. */
-
-CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device)
-{
- cuda_assert(cuCtxPushCurrent(device->cuContext));
-}
-
-CUDAContextScope::~CUDAContextScope()
-{
- cuda_assert(cuCtxPopCurrent(NULL));
-}
-
-/* split kernel */
-
-class CUDASplitKernelFunction : public SplitKernelFunction {
- CUDADevice *device;
- CUfunction func;
-
- public:
- CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func)
- {
- }
-
- /* enqueue the kernel, returns false if there is an error */
- bool enqueue(const KernelDimensions &dim, device_memory & /*kg*/, device_memory & /*data*/)
- {
- return enqueue(dim, NULL);
- }
-
- /* enqueue the kernel, returns false if there is an error */
- bool enqueue(const KernelDimensions &dim, void *args[])
- {
- if (device->have_error())
- return false;
-
- CUDAContextScope scope(device);
-
- /* we ignore dim.local_size for now, as this is faster */
- int threads_per_block;
- cuda_assert(
- cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
-
- int xblocks = (dim.global_size[0] * dim.global_size[1] + threads_per_block - 1) /
- threads_per_block;
-
- cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
-
- cuda_assert(cuLaunchKernel(func,
- xblocks,
- 1,
- 1, /* blocks */
- threads_per_block,
- 1,
- 1, /* threads */
- 0,
- 0,
- args,
- 0));
-
- return !device->have_error();
- }
-};
-
-CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
-{
-}
-
-uint64_t CUDASplitKernel::state_buffer_size(device_memory & /*kg*/,
- device_memory & /*data*/,
- size_t num_threads)
-{
- CUDAContextScope scope(device);
-
- device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
- size_buffer.alloc(1);
- size_buffer.zero_to_device();
-
- uint threads = num_threads;
- CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer);
-
- struct args_t {
- uint *num_threads;
- CUdeviceptr *size;
- };
-
- args_t args = {&threads, &d_size};
-
- CUfunction state_buffer_size;
- cuda_assert(
- cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
-
- cuda_assert(cuLaunchKernel(state_buffer_size, 1, 1, 1, 1, 1, 1, 0, 0, (void **)&args, 0));
-
- size_buffer.copy_from_device(0, 1, 1);
- size_t size = size_buffer[0];
- size_buffer.free();
-
- return size;
-}
-
-bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory & /*kernel_globals*/,
- device_memory & /*kernel_data*/,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs)
-{
- CUDAContextScope scope(device);
-
- CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer);
- CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer);
- CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer);
- CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer);
- CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer);
-
- CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer);
-
- int end_sample = rtile.start_sample + rtile.num_samples;
- int queue_size = dim.global_size[0] * dim.global_size[1];
-
- struct args_t {
- CUdeviceptr *split_data_buffer;
- int *num_elements;
- CUdeviceptr *ray_state;
- int *start_sample;
- int *end_sample;
- int *sx;
- int *sy;
- int *sw;
- int *sh;
- int *offset;
- int *stride;
- CUdeviceptr *queue_index;
- int *queuesize;
- CUdeviceptr *use_queues_flag;
- CUdeviceptr *work_pool_wgs;
- int *num_samples;
- CUdeviceptr *buffer;
- };
-
- args_t args = {&d_split_data,
- &num_global_elements,
- &d_ray_state,
- &rtile.start_sample,
- &end_sample,
- &rtile.x,
- &rtile.y,
- &rtile.w,
- &rtile.h,
- &rtile.offset,
- &rtile.stride,
- &d_queue_index,
- &queue_size,
- &d_use_queues_flag,
- &d_work_pool_wgs,
- &rtile.num_samples,
- &d_buffer};
-
- CUfunction data_init;
- cuda_assert(
- cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
- if (device->have_error()) {
- return false;
- }
-
- CUDASplitKernelFunction(device, data_init).enqueue(dim, (void **)&args);
-
- return !device->have_error();
-}
-
-SplitKernelFunction *CUDASplitKernel::get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &)
-{
- CUDAContextScope scope(device);
- CUfunction func;
-
- cuda_assert(
- cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()));
- if (device->have_error()) {
- device->cuda_error_message(
- string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data()));
- return NULL;
- }
-
- return new CUDASplitKernelFunction(device, func);
-}
-
-int2 CUDASplitKernel::split_kernel_local_size()
-{
- return make_int2(32, 1);
-}
-
-int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg,
- device_memory &data,
- DeviceTask * /*task*/)
-{
- CUDAContextScope scope(device);
- size_t free;
- size_t total;
-
- cuda_assert(cuMemGetInfo(&free, &total));
-
- VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(free)
- << " bytes. (" << string_human_readable_size(free) << ").";
-
- size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2);
- size_t side = round_down((int)sqrt(num_elements), 32);
- int2 global_size = make_int2(side, round_down(num_elements / side, 16));
- VLOG(1) << "Global size: " << global_size << ".";
- return global_size;
-}
-
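split_kernel_global_size() sizes the split-kernel work pool from free device memory: it budgets half of the free bytes for path state, converts that into a number of elements, then shapes the grid as a square whose side is rounded down to the 32-wide local size and whose height is rounded down to a multiple of 16. A plain C++ sketch of the rounding (hypothetical names, assumes enough memory for at least a 32x16 grid):

#include <cmath>
#include <cstddef>

struct GridSize {
  int x, y;
};

static int round_down_to(int value, int multiple)
{
  return (value / multiple) * multiple;
}

static GridSize split_kernel_grid(size_t num_elements)
{
  /* num_elements corresponds to max_elements_for_max_buffer_size(kg, data, free / 2)
   * in the code above; assumed to be at least 32 * 32 so that side is non-zero. */
  const int side = round_down_to((int)sqrt((double)num_elements), 32);
  return {side, round_down_to((int)(num_elements / (size_t)side), 16)};
}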
bool device_cuda_init()
{
#ifdef WITH_CUDA_DYNLOAD
@@ -2861,3 +300,5 @@ string device_cuda_capabilities()
}
CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index b07596c60ff..83f3598df3c 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -16,9 +16,7 @@
#ifdef WITH_OPENCL
-# include "device/opencl/opencl.h"
-
-# include "device/device_intern.h"
+# include "device/opencl/device_opencl.h"
# include "util/util_foreach.h"
# include "util/util_logging.h"
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
index ac119a723e3..4a711c50731 100644
--- a/intern/cycles/device/device_optix.cpp
+++ b/intern/cycles/device/device_optix.cpp
@@ -17,7 +17,7 @@
#ifdef WITH_OPTIX
-# include "device/device.h"
+# include "device/cuda/device_cuda.h"
# include "device/device_intern.h"
# include "device/device_denoising.h"
# include "bvh/bvh.h"
@@ -120,19 +120,7 @@ struct KernelParams {
check_result_cuda_ret(cuLaunchKernel( \
func, xblocks, yblocks, 1, threads, threads, 1, 0, cuda_stream[thread_index], args, 0));
-/* Similar to the above, but for 1-dimensional blocks. */
-# define CUDA_GET_BLOCKSIZE_1D(func, w, h) \
- int threads; \
- check_result_cuda_ret( \
- cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
- int xblocks = ((w) + threads - 1) / threads; \
- int yblocks = h;
-
-# define CUDA_LAUNCH_KERNEL_1D(func, args) \
- check_result_cuda_ret(cuLaunchKernel( \
- func, xblocks, yblocks, 1, threads, 1, 1, 0, cuda_stream[thread_index], args, 0));
-
-class OptiXDevice : public Device {
+class OptiXDevice : public CUDADevice {
// List of OptiX program groups
enum {
@@ -181,78 +169,36 @@ class OptiXDevice : public Device {
// Use a pool with multiple threads to support launches with multiple CUDA streams
TaskPool task_pool;
- // CUDA/OptiX context handles
- CUdevice cuda_device = 0;
- CUcontext cuda_context = NULL;
vector<CUstream> cuda_stream;
OptixDeviceContext context = NULL;
- // Need CUDA kernel module for some utility functions
- CUmodule cuda_module = NULL;
- CUmodule cuda_filter_module = NULL;
- // All necessary OptiX kernels are in one module
- OptixModule optix_module = NULL;
+ OptixModule optix_module = NULL; // All necessary OptiX kernels are in one module
OptixPipeline pipelines[NUM_PIPELINES] = {};
bool motion_blur = false;
- bool need_texture_info = false;
device_vector<SbtRecord> sbt_data;
- device_vector<TextureInfo> texture_info;
device_only_memory<KernelParams> launch_params;
vector<CUdeviceptr> as_mem;
OptixTraversableHandle tlas_handle = 0;
- // TODO(pmours): This is copied from device_cuda.cpp, so move to common code eventually
- int can_map_host = 0;
- size_t map_host_used = 0;
- size_t map_host_limit = 0;
- size_t device_working_headroom = 32 * 1024 * 1024LL; // 32MB
- size_t device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
- map<device_memory *, CUDAMem> cuda_mem_map;
- bool move_texture_to_host = false;
-
OptixDenoiser denoiser = NULL;
vector<pair<int2, CUdeviceptr>> denoiser_state;
int denoiser_input_passes = 0;
public:
OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
- : Device(info_, stats_, profiler_, background_),
+ : CUDADevice(info_, stats_, profiler_, background_),
sbt_data(this, "__sbt", MEM_READ_ONLY),
- texture_info(this, "__texture_info", MEM_TEXTURE),
launch_params(this, "__params")
{
// Store number of CUDA streams in device info
info.cpu_threads = DebugFlags().optix.cuda_streams;
- // Initialize CUDA driver API
- check_result_cuda(cuInit(0));
-
- // Retrieve the primary CUDA context for this device
- check_result_cuda(cuDeviceGet(&cuda_device, info.num));
- check_result_cuda(cuDevicePrimaryCtxRetain(&cuda_context, cuda_device));
-
- // Make that CUDA context current
- const CUDAContextScope scope(cuda_context);
-
- // Limit amount of host mapped memory (see init_host_memory in device_cuda.cpp)
- size_t default_limit = 4 * 1024 * 1024 * 1024LL;
- size_t system_ram = system_physical_ram();
- if (system_ram > 0) {
- if (system_ram / 2 > default_limit) {
- map_host_limit = system_ram - default_limit;
- }
- else {
- map_host_limit = system_ram / 2;
- }
- }
- else {
- VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
+ // Make the CUDA context current
+ if (!cuContext) {
+ return;
}
-
- // Check device support for pinned host memory
- check_result_cuda(
- cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuda_device));
+ const CUDAContextScope scope(cuContext);
// Create OptiX context for this device
OptixDeviceContextOptions options = {};
@@ -276,7 +222,7 @@ class OptiXDevice : public Device {
}
};
# endif
- check_result_optix(optixDeviceContextCreate(cuda_context, &options, &context));
+ check_result_optix(optixDeviceContextCreate(cuContext, &options, &context));
# ifdef WITH_CYCLES_LOGGING
check_result_optix(optixDeviceContextSetLogCallback(
context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel));
@@ -300,6 +246,9 @@ class OptiXDevice : public Device {
// Stop processing any more tasks
task_pool.stop();
+ // Make CUDA context current
+ const CUDAContextScope scope(cuContext);
+
// Free all acceleration structures
for (CUdeviceptr mem : as_mem) {
cuMemFree(mem);
@@ -314,14 +263,7 @@ class OptiXDevice : public Device {
texture_info.free();
launch_params.free();
- // Make CUDA context current
- const CUDAContextScope scope(cuda_context);
-
// Unload modules
- if (cuda_module != NULL)
- cuModuleUnload(cuda_module);
- if (cuda_filter_module != NULL)
- cuModuleUnload(cuda_filter_module);
if (optix_module != NULL)
optixModuleDestroy(optix_module);
for (unsigned int i = 0; i < NUM_PIPELINES; ++i)
@@ -335,9 +277,7 @@ class OptiXDevice : public Device {
if (denoiser != NULL)
optixDenoiserDestroy(denoiser);
- // Destroy OptiX and CUDA context
optixDeviceContextDestroy(context);
- cuDevicePrimaryCtxRelease(cuda_device);
}
private:
@@ -355,8 +295,15 @@ class OptiXDevice : public Device {
bool load_kernels(const DeviceRequestedFeatures &requested_features) override
{
- if (have_error())
- return false; // Abort early if context creation failed already
+ if (have_error()) {
+ // Abort early if context creation failed already
+ return false;
+ }
+
+ // Load CUDA modules because we need some of the utility kernels
+ if (!CUDADevice::load_kernels(requested_features)) {
+ return false;
+ }
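With OptiXDevice now deriving from CUDADevice, loading the precompiled CUDA kernel and filter modules happens in the base class, and this override only adds the OptiX-specific steps on top of it. The shape of that pattern, reduced to a sketch with hypothetical class names (not the real Cycles classes):

class CUDADeviceSketch {
 public:
  virtual ~CUDADeviceSketch() = default;

  virtual bool load_kernels()
  {
    /* Load the precompiled CUDA kernel and filter modules here. */
    return true;
  }
};

class OptiXDeviceSketch : public CUDADeviceSketch {
 public:
  bool load_kernels() override
  {
    /* Reuse the CUDA implementation for the utility kernels first. */
    if (!CUDADeviceSketch::load_kernels()) {
      return false;
    }
    /* Then create the OptiX module, program groups and pipelines. */
    return true;
  }
};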
// Disable baking for now, since its kernel is not well-suited for inlining and is very slow
if (requested_features.use_baking) {
@@ -369,7 +316,7 @@ class OptiXDevice : public Device {
return false;
}
- const CUDAContextScope scope(cuda_context);
+ const CUDAContextScope scope(cuContext);
// Unload existing OptiX module and pipelines first
if (optix_module != NULL) {
@@ -437,34 +384,6 @@ class OptiXDevice : public Device {
&optix_module));
}
- { // Load CUDA modules because we need some of the utility kernels
- int major, minor;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num);
- cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, info.num);
-
- if (cuda_module == NULL) { // Avoid reloading module if it was already loaded
- string cubin_data;
- const string cubin_filename = string_printf("lib/kernel_sm_%d%d.cubin", major, minor);
- if (!path_read_text(path_get(cubin_filename), cubin_data)) {
- set_error("Failed loading pre-compiled CUDA kernel " + cubin_filename + ".");
- return false;
- }
-
- check_result_cuda_ret(cuModuleLoadData(&cuda_module, cubin_data.data()));
- }
-
- if (requested_features.use_denoising && cuda_filter_module == NULL) {
- string filter_data;
- const string filter_filename = string_printf("lib/filter_sm_%d%d.cubin", major, minor);
- if (!path_read_text(path_get(filter_filename), filter_data)) {
- set_error("Failed loading pre-compiled CUDA filter kernel " + filter_filename + ".");
- return false;
- }
-
- check_result_cuda_ret(cuModuleLoadData(&cuda_filter_module, filter_data.data()));
- }
- }
-
// Create program groups
OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
@@ -650,9 +569,6 @@ class OptiXDevice : public Device {
else if (task.type == DeviceTask::SHADER) {
launch_shader_eval(task, thread_index);
}
- else if (task.type == DeviceTask::FILM_CONVERT) {
- launch_film_convert(task, thread_index);
- }
else if (task.type == DeviceTask::DENOISE_BUFFER) {
// Set up a single tile that covers the whole task and denoise it
RenderTile tile;
@@ -694,7 +610,7 @@ class OptiXDevice : public Device {
device_ptr launch_params_ptr = launch_params.device_pointer +
thread_index * launch_params.data_elements;
- const CUDAContextScope scope(cuda_context);
+ const CUDAContextScope scope(cuContext);
for (int sample = rtile.start_sample; sample < end_sample; sample += step_samples) {
// Copy work tile information to device
@@ -745,7 +661,7 @@ class OptiXDevice : public Device {
{
int total_samples = rtile.start_sample + rtile.num_samples;
- const CUDAContextScope scope(cuda_context);
+ const CUDAContextScope scope(cuContext);
// Choose between OptiX and NLM denoising
if (task.denoising_use_optix) {
@@ -826,7 +742,7 @@ class OptiXDevice : public Device {
CUfunction filter_copy_func;
check_result_cuda_ret(cuModuleGetFunction(
- &filter_copy_func, cuda_filter_module, "kernel_cuda_filter_copy_input"));
+ &filter_copy_func, cuFilterModule, "kernel_cuda_filter_copy_input"));
check_result_cuda_ret(cuFuncSetCacheConfig(filter_copy_func, CU_FUNC_CACHE_PREFER_L1));
void *args[] = {
@@ -843,7 +759,7 @@ class OptiXDevice : public Device {
CUfunction convert_to_rgb_func;
check_result_cuda_ret(cuModuleGetFunction(
- &convert_to_rgb_func, cuda_filter_module, "kernel_cuda_filter_convert_to_rgb"));
+ &convert_to_rgb_func, cuFilterModule, "kernel_cuda_filter_convert_to_rgb"));
check_result_cuda_ret(cuFuncSetCacheConfig(convert_to_rgb_func, CU_FUNC_CACHE_PREFER_L1));
void *args[] = {&input_rgb.device_pointer,
@@ -971,7 +887,7 @@ class OptiXDevice : public Device {
{
CUfunction convert_from_rgb_func;
check_result_cuda_ret(cuModuleGetFunction(
- &convert_from_rgb_func, cuda_filter_module, "kernel_cuda_filter_convert_from_rgb"));
+ &convert_from_rgb_func, cuFilterModule, "kernel_cuda_filter_convert_from_rgb"));
check_result_cuda_ret(
cuFuncSetCacheConfig(convert_from_rgb_func, CU_FUNC_CACHE_PREFER_L1));
@@ -998,47 +914,10 @@ class OptiXDevice : public Device {
task.unmap_neighbor_tiles(rtiles, this);
}
else {
+ assert(thread_index == 0);
// Run CUDA denoising kernels
DenoisingTask denoising(this, task);
- denoising.functions.construct_transform = function_bind(
- &OptiXDevice::denoising_construct_transform, this, &denoising, thread_index);
- denoising.functions.accumulate = function_bind(
- &OptiXDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising, thread_index);
- denoising.functions.solve = function_bind(
- &OptiXDevice::denoising_solve, this, _1, &denoising, thread_index);
- denoising.functions.divide_shadow = function_bind(&OptiXDevice::denoising_divide_shadow,
- this,
- _1,
- _2,
- _3,
- _4,
- _5,
- &denoising,
- thread_index);
- denoising.functions.non_local_means = function_bind(
- &OptiXDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising, thread_index);
- denoising.functions.combine_halves = function_bind(&OptiXDevice::denoising_combine_halves,
- this,
- _1,
- _2,
- _3,
- _4,
- _5,
- _6,
- &denoising,
- thread_index);
- denoising.functions.get_feature = function_bind(
- &OptiXDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising, thread_index);
- denoising.functions.write_feature = function_bind(
- &OptiXDevice::denoising_write_feature, this, _1, _2, _3, &denoising, thread_index);
- denoising.functions.detect_outliers = function_bind(
- &OptiXDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising, thread_index);
-
- denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
- denoising.render_buffer.samples = total_samples;
- denoising.buffer.gpu_temporary_mem = true;
-
- denoising.run_denoising(&rtile);
+ CUDADevice::denoise(rtile, denoising);
}
// Update current sample, so it is displayed correctly
@@ -1057,7 +936,7 @@ class OptiXDevice : public Device {
if (task.shader_eval_type == SHADER_EVAL_DISPLACE)
rgen_index = PG_DISP;
- const CUDAContextScope scope(cuda_context);
+ const CUDAContextScope scope(cuContext);
device_ptr launch_params_ptr = launch_params.device_pointer +
thread_index * launch_params.data_elements;
@@ -1104,62 +983,13 @@ class OptiXDevice : public Device {
}
}
- void launch_film_convert(DeviceTask &task, int thread_index)
- {
- const CUDAContextScope scope(cuda_context);
-
- CUfunction film_convert_func;
- check_result_cuda(cuModuleGetFunction(&film_convert_func,
- cuda_module,
- task.rgba_byte ? "kernel_cuda_convert_to_byte" :
- "kernel_cuda_convert_to_half_float"));
-
- float sample_scale = 1.0f / (task.sample + 1);
- CUdeviceptr rgba = (task.rgba_byte ? task.rgba_byte : task.rgba_half);
-
- void *args[] = {&rgba,
- &task.buffer,
- &sample_scale,
- &task.x,
- &task.y,
- &task.w,
- &task.h,
- &task.offset,
- &task.stride};
-
- int threads_per_block;
- check_result_cuda(cuFuncGetAttribute(
- &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, film_convert_func));
-
- const int num_threads_x = (int)sqrt(threads_per_block);
- const int num_blocks_x = (task.w + num_threads_x - 1) / num_threads_x;
- const int num_threads_y = (int)sqrt(threads_per_block);
- const int num_blocks_y = (task.h + num_threads_y - 1) / num_threads_y;
-
- check_result_cuda(cuLaunchKernel(film_convert_func,
- num_blocks_x,
- num_blocks_y,
- 1, /* blocks */
- num_threads_x,
- num_threads_y,
- 1, /* threads */
- 0,
- cuda_stream[thread_index],
- args,
- 0));
-
- check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
-
- task.update_progress(NULL);
- }
-
bool build_optix_bvh(const OptixBuildInput &build_input,
uint16_t num_motion_steps,
OptixTraversableHandle &out_handle)
{
out_handle = 0;
- const CUDAContextScope scope(cuda_context);
+ const CUDAContextScope scope(cuContext);
// Compute memory usage
OptixAccelBufferSizes sizes = {};
@@ -1477,7 +1307,7 @@ class OptiXDevice : public Device {
size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
motion_keys * sizeof(OptixSRTData);
- const CUDAContextScope scope(cuda_context);
+ const CUDAContextScope scope(cuContext);
CUdeviceptr motion_transform_gpu = 0;
check_result_cuda_ret(cuMemAlloc(&motion_transform_gpu, motion_transform_size));
@@ -1574,621 +1404,13 @@ class OptiXDevice : public Device {
return build_optix_bvh(build_input, 0, tlas_handle);
}
- void update_texture_info()
- {
- if (need_texture_info) {
- texture_info.copy_to_device();
- need_texture_info = false;
- }
- }
-
- void update_launch_params(const char *name, size_t offset, void *data, size_t data_size)
+ void const_copy_to(const char *name, void *host, size_t size) override
{
- const CUDAContextScope scope(cuda_context);
-
- for (int i = 0; i < info.cpu_threads; ++i)
- check_result_cuda(
- cuMemcpyHtoD(launch_params.device_pointer + i * launch_params.data_elements + offset,
- data,
- data_size));
-
// Set constant memory for CUDA module
- // TODO(pmours): This is only used for tonemapping (see 'launch_film_convert').
+ // TODO(pmours): This is only used for tonemapping (see 'film_convert').
// Could be removed by moving those functions to filter CUDA module.
- size_t bytes = 0;
- CUdeviceptr mem = 0;
- check_result_cuda(cuModuleGetGlobal(&mem, &bytes, cuda_module, name));
- assert(mem != 0 && bytes == data_size);
- check_result_cuda(cuMemcpyHtoD(mem, data, data_size));
- }
-
- void mem_alloc(device_memory &mem) override
- {
- if (mem.type == MEM_PIXELS && !background) {
- // Always fall back to no interop for now
- // TODO(pmours): Support OpenGL interop when moving CUDA memory management to common code
- background = true;
- }
- else if (mem.type == MEM_TEXTURE) {
- assert(!"mem_alloc not supported for textures.");
- return;
- }
-
- generic_alloc(mem);
- }
-
- CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0)
- {
- CUDAContextScope scope(cuda_context);
-
- CUdeviceptr device_pointer = 0;
- size_t size = mem.memory_size() + pitch_padding;
-
- CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
- const char *status = "";
-
- /* First try allocating in device memory, respecting headroom. We make
- * an exception for texture info. It is small and frequently accessed,
- * so treat it as working memory.
- *
- * If there is not enough room for working memory, we will try to move
- * textures to host memory, assuming the performance impact would have
- * been worse for working memory. */
- bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
- bool is_image = is_texture && (mem.data_height > 1);
-
- size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
-
- size_t total = 0, free = 0;
- cuMemGetInfo(&free, &total);
-
- /* Move textures to host memory if needed. */
- if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
- move_textures_to_host(size + headroom - free, is_texture);
- cuMemGetInfo(&free, &total);
- }
-
- /* Allocate in device memory. */
- if (!move_texture_to_host && (size + headroom) < free) {
- mem_alloc_result = cuMemAlloc(&device_pointer, size);
- if (mem_alloc_result == CUDA_SUCCESS) {
- status = " in device memory";
- }
- }
-
- /* Fall back to mapped host memory if needed and possible. */
- void *shared_pointer = 0;
-
- if (mem_alloc_result != CUDA_SUCCESS && can_map_host) {
- if (mem.shared_pointer) {
- /* Another device already allocated host memory. */
- mem_alloc_result = CUDA_SUCCESS;
- shared_pointer = mem.shared_pointer;
- }
- else if (map_host_used + size < map_host_limit) {
- /* Allocate host memory ourselves. */
- mem_alloc_result = cuMemHostAlloc(
- &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
-
- assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
- (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
- }
-
- if (mem_alloc_result == CUDA_SUCCESS) {
- cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0);
- map_host_used += size;
- status = " in host memory";
- }
- else {
- status = " failed, out of host memory";
- }
- }
- else if (mem_alloc_result != CUDA_SUCCESS) {
- status = " failed, out of device and host memory";
- }
-
- if (mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")" << status;
- }
-
- if (mem_alloc_result != CUDA_SUCCESS) {
- set_error(string_printf("Buffer allocate %s", status));
- return NULL;
- }
-
- mem.device_pointer = (device_ptr)device_pointer;
- mem.device_size = size;
- stats.mem_alloc(size);
-
- if (!mem.device_pointer) {
- return NULL;
- }
-
- /* Insert into map of allocations. */
- CUDAMem *cmem = &cuda_mem_map[&mem];
- if (shared_pointer != 0) {
- /* Replace host pointer with our host allocation. Only works if
- * CUDA memory layout is the same and has no pitch padding. Also
- * does not work if we move textures to host during a render,
- * since other devices might be using the memory. */
-
- if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
- mem.host_pointer != shared_pointer) {
- memcpy(shared_pointer, mem.host_pointer, size);
-
- /* A call to device_memory::host_free() should be preceded by
- * a call to device_memory::device_free() for host memory
- * allocated by a device to be handled properly. Two exceptions
- * are here and a call in CUDADevice::generic_alloc(), where
- * the current host memory can be assumed to be allocated by
- * device_memory::host_alloc(), not by a device */
-
- mem.host_free();
- mem.host_pointer = shared_pointer;
- }
- mem.shared_pointer = shared_pointer;
- mem.shared_counter++;
- cmem->use_mapped_host = true;
- }
- else {
- cmem->use_mapped_host = false;
- }
-
- return cmem;
- }
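
The removed generic_alloc() above (now provided by CUDADevice) first tries a device allocation that leaves the configured headroom free, then falls back to pinned, device-mapped host memory. A minimal standalone sketch of that fallback using only the CUDA driver API (the helper name and the headroom parameter are illustrative, not Cycles code; a CUDA context is assumed to be current on the calling thread):

    /* Sketch only: allocate on the device if it fits below a headroom
     * threshold, otherwise fall back to pinned, device-mapped host memory. */
    #include <cuda.h>

    static CUdeviceptr try_alloc_with_host_fallback(size_t size, size_t headroom, void **host_out)
    {
      size_t free_mem = 0, total_mem = 0;
      cuMemGetInfo(&free_mem, &total_mem);

      CUdeviceptr ptr = 0;
      if (size + headroom < free_mem && cuMemAlloc(&ptr, size) == CUDA_SUCCESS) {
        *host_out = NULL; /* Regular device allocation. */
        return ptr;
      }

      /* Pinned host memory that the GPU can address directly over the bus. */
      void *host = NULL;
      if (cuMemHostAlloc(&host, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED) !=
          CUDA_SUCCESS) {
        return 0; /* Out of both device and host memory. */
      }
      cuMemHostGetDevicePointer(&ptr, host, 0);
      *host_out = host;
      return ptr;
    }
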
-
- void tex_alloc(device_memory &mem)
- {
- CUDAContextScope scope(cuda_context);
-
- /* General variables for both architectures */
- string bind_name = mem.name;
- size_t dsize = datatype_size(mem.data_type);
- size_t size = mem.memory_size();
-
- CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
- switch (mem.extension) {
- case EXTENSION_REPEAT:
- address_mode = CU_TR_ADDRESS_MODE_WRAP;
- break;
- case EXTENSION_EXTEND:
- address_mode = CU_TR_ADDRESS_MODE_CLAMP;
- break;
- case EXTENSION_CLIP:
- address_mode = CU_TR_ADDRESS_MODE_BORDER;
- break;
- default:
- assert(0);
- break;
- }
-
- CUfilter_mode filter_mode;
- if (mem.interpolation == INTERPOLATION_CLOSEST) {
- filter_mode = CU_TR_FILTER_MODE_POINT;
- }
- else {
- filter_mode = CU_TR_FILTER_MODE_LINEAR;
- }
-
- /* Data Storage */
- if (mem.interpolation == INTERPOLATION_NONE) {
- generic_alloc(mem);
- generic_copy_to(mem);
-
- // Update data storage pointers in launch parameters
-# define KERNEL_TEX(data_type, tex_name) \
- if (strcmp(mem.name, #tex_name) == 0) \
- update_launch_params( \
- mem.name, offsetof(KernelParams, tex_name), &mem.device_pointer, sizeof(device_ptr));
-# include "kernel/kernel_textures.h"
-# undef KERNEL_TEX
- return;
- }
-
- /* Image Texture Storage */
- CUarray_format_enum format;
- switch (mem.data_type) {
- case TYPE_UCHAR:
- format = CU_AD_FORMAT_UNSIGNED_INT8;
- break;
- case TYPE_UINT16:
- format = CU_AD_FORMAT_UNSIGNED_INT16;
- break;
- case TYPE_UINT:
- format = CU_AD_FORMAT_UNSIGNED_INT32;
- break;
- case TYPE_INT:
- format = CU_AD_FORMAT_SIGNED_INT32;
- break;
- case TYPE_FLOAT:
- format = CU_AD_FORMAT_FLOAT;
- break;
- case TYPE_HALF:
- format = CU_AD_FORMAT_HALF;
- break;
- default:
- assert(0);
- return;
- }
-
- CUDAMem *cmem = NULL;
- CUarray array_3d = NULL;
- size_t src_pitch = mem.data_width * dsize * mem.data_elements;
- size_t dst_pitch = src_pitch;
-
- if (mem.data_depth > 1) {
-      /* 3D texture using an array, since there is no API for 3D textures in linear memory. */
- CUDA_ARRAY3D_DESCRIPTOR desc;
-
- desc.Width = mem.data_width;
- desc.Height = mem.data_height;
- desc.Depth = mem.data_depth;
- desc.Format = format;
- desc.NumChannels = mem.data_elements;
- desc.Flags = 0;
-
- VLOG(1) << "Array 3D allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- check_result_cuda(cuArray3DCreate(&array_3d, &desc));
-
- if (!array_3d) {
- return;
- }
-
- CUDA_MEMCPY3D param;
- memset(&param, 0, sizeof(param));
- param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
- param.dstArray = array_3d;
- param.srcMemoryType = CU_MEMORYTYPE_HOST;
- param.srcHost = mem.host_pointer;
- param.srcPitch = src_pitch;
- param.WidthInBytes = param.srcPitch;
- param.Height = mem.data_height;
- param.Depth = mem.data_depth;
-
- check_result_cuda(cuMemcpy3D(&param));
-
- mem.device_pointer = (device_ptr)array_3d;
- mem.device_size = size;
- stats.mem_alloc(size);
-
- cmem = &cuda_mem_map[&mem];
- cmem->texobject = 0;
- cmem->array = array_3d;
- }
- else if (mem.data_height > 0) {
- /* 2D texture, using pitch aligned linear memory. */
- int alignment = 0;
- check_result_cuda(cuDeviceGetAttribute(
- &alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuda_device));
- dst_pitch = align_up(src_pitch, alignment);
- size_t dst_size = dst_pitch * mem.data_height;
-
- cmem = generic_alloc(mem, dst_size - mem.memory_size());
- if (!cmem) {
- return;
- }
-
- CUDA_MEMCPY2D param;
- memset(&param, 0, sizeof(param));
- param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
- param.dstDevice = mem.device_pointer;
- param.dstPitch = dst_pitch;
- param.srcMemoryType = CU_MEMORYTYPE_HOST;
- param.srcHost = mem.host_pointer;
- param.srcPitch = src_pitch;
- param.WidthInBytes = param.srcPitch;
- param.Height = mem.data_height;
-
- check_result_cuda(cuMemcpy2DUnaligned(&param));
- }
- else {
- /* 1D texture, using linear memory. */
- cmem = generic_alloc(mem);
- if (!cmem) {
- return;
- }
-
- check_result_cuda(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
- }
-
- /* Kepler+, bindless textures. */
- int flat_slot = 0;
- if (string_startswith(mem.name, "__tex_image")) {
- int pos = string(mem.name).rfind("_");
- flat_slot = atoi(mem.name + pos + 1);
- }
- else {
- assert(0);
- }
-
- CUDA_RESOURCE_DESC resDesc;
- memset(&resDesc, 0, sizeof(resDesc));
-
- if (array_3d) {
- resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
- resDesc.res.array.hArray = array_3d;
- resDesc.flags = 0;
- }
- else if (mem.data_height > 0) {
- resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
- resDesc.res.pitch2D.devPtr = mem.device_pointer;
- resDesc.res.pitch2D.format = format;
- resDesc.res.pitch2D.numChannels = mem.data_elements;
- resDesc.res.pitch2D.height = mem.data_height;
- resDesc.res.pitch2D.width = mem.data_width;
- resDesc.res.pitch2D.pitchInBytes = dst_pitch;
- }
- else {
- resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
- resDesc.res.linear.devPtr = mem.device_pointer;
- resDesc.res.linear.format = format;
- resDesc.res.linear.numChannels = mem.data_elements;
- resDesc.res.linear.sizeInBytes = mem.device_size;
- }
-
- CUDA_TEXTURE_DESC texDesc;
- memset(&texDesc, 0, sizeof(texDesc));
- texDesc.addressMode[0] = address_mode;
- texDesc.addressMode[1] = address_mode;
- texDesc.addressMode[2] = address_mode;
- texDesc.filterMode = filter_mode;
- texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
-
- check_result_cuda(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
-
- /* Resize once */
- if (flat_slot >= texture_info.size()) {
-      /* Allocate some slots in advance, to reduce the number
-       * of re-allocations. */
- texture_info.resize(flat_slot + 128);
- }
-
-    /* Set mapping and tag that we need to (re-)upload to the device. */
- TextureInfo &info = texture_info[flat_slot];
- info.data = (uint64_t)cmem->texobject;
- info.cl_buffer = 0;
- info.interpolation = mem.interpolation;
- info.extension = mem.extension;
- info.width = mem.data_width;
- info.height = mem.data_height;
- info.depth = mem.data_depth;
- need_texture_info = true;
- }
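
In the 2D path of tex_alloc() above, each image row is padded to the device's texture pitch alignment before the pitched host-to-device copy. A self-contained sketch of that computation (align_up is written out here for illustration and is not the Cycles helper; a current CUDA context is assumed):

    /* Sketch only: pad the row pitch to the device's texture pitch alignment
     * and copy a 2D image into a device allocation of at least
     * aligned pitch * height bytes. */
    #include <cuda.h>

    static size_t align_up(size_t value, size_t alignment)
    {
      return ((value + alignment - 1) / alignment) * alignment;
    }

    static CUresult copy_pitched_2d(CUdevice dev, CUdeviceptr dst, const void *src,
                                    size_t width_in_bytes, size_t height)
    {
      int alignment = 1;
      cuDeviceGetAttribute(&alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, dev);

      CUDA_MEMCPY2D param = {};
      param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
      param.dstDevice = dst;
      param.dstPitch = align_up(width_in_bytes, (size_t)alignment);
      param.srcMemoryType = CU_MEMORYTYPE_HOST;
      param.srcHost = src;
      param.srcPitch = width_in_bytes;
      param.WidthInBytes = width_in_bytes;
      param.Height = height;
      return cuMemcpy2DUnaligned(&param);
    }
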
-
- void mem_copy_to(device_memory &mem) override
- {
- if (mem.type == MEM_PIXELS) {
- assert(!"mem_copy_to not supported for pixels.");
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free(mem);
- tex_alloc(mem);
- }
- else {
- if (!mem.device_pointer) {
- generic_alloc(mem);
- }
-
- generic_copy_to(mem);
- }
- }
-
- void generic_copy_to(device_memory &mem)
- {
- if (mem.host_pointer && mem.device_pointer) {
- CUDAContextScope scope(cuda_context);
-
- /* If use_mapped_host of mem is false, the current device only
- * uses device memory allocated by cuMemAlloc regardless of
- * mem.host_pointer and mem.shared_pointer, and should copy
- * data from mem.host_pointer. */
-
- if (cuda_mem_map[&mem].use_mapped_host == false || mem.host_pointer != mem.shared_pointer) {
- check_result_cuda(
- cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
- }
- }
- }
-
- void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override
- {
- if (mem.type == MEM_PIXELS && !background) {
- assert(!"mem_copy_from not supported for pixels.");
- }
- else if (mem.type == MEM_TEXTURE) {
- assert(!"mem_copy_from not supported for textures.");
- }
- else if (mem.host_pointer) {
- // Calculate linear memory offset and size
- const size_t size = elem * w * h;
- const size_t offset = elem * y * w;
-
- if (mem.device_pointer) {
- const CUDAContextScope scope(cuda_context);
- check_result_cuda(cuMemcpyDtoH(
- (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
- }
- else {
- memset((char *)mem.host_pointer + offset, 0, size);
- }
- }
- }
-
- void mem_zero(device_memory &mem) override
- {
- if (!mem.device_pointer) {
- mem_alloc(mem); // Need to allocate memory first if it does not exist yet
- }
- if (!mem.device_pointer) {
- return;
- }
-
- /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
- * regardless of mem.host_pointer and mem.shared_pointer. */
- if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
- const CUDAContextScope scope(cuda_context);
- check_result_cuda(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
- }
- else if (mem.host_pointer) {
- memset(mem.host_pointer, 0, mem.memory_size());
- }
- }
-
- void mem_free(device_memory &mem) override
- {
- if (mem.type == MEM_PIXELS && !background) {
- assert(!"mem_free not supported for pixels.");
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free(mem);
- }
- else {
- generic_free(mem);
- }
- }
+ CUDADevice::const_copy_to(name, host, size);
- void generic_free(device_memory &mem)
- {
- if (mem.device_pointer) {
- CUDAContextScope scope(cuda_context);
- const CUDAMem &cmem = cuda_mem_map[&mem];
-
-      /* If cmem.use_mapped_host is true, reference counting is used
-       * to safely free mapped host memory. */
-
- if (cmem.use_mapped_host) {
- assert(mem.shared_pointer);
- if (mem.shared_pointer) {
- assert(mem.shared_counter > 0);
- if (--mem.shared_counter == 0) {
- if (mem.host_pointer == mem.shared_pointer) {
- mem.host_pointer = 0;
- }
- cuMemFreeHost(mem.shared_pointer);
- mem.shared_pointer = 0;
- }
- }
- map_host_used -= mem.device_size;
- }
- else {
- /* Free device memory. */
- cuMemFree(mem.device_pointer);
- }
-
- stats.mem_free(mem.device_size);
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- cuda_mem_map.erase(cuda_mem_map.find(&mem));
- }
- }
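
Because a mapped host allocation can be shared by several CUDA/OptiX devices, generic_free() only releases it when the last reference goes away. A stripped-down sketch of that reference-counted release (SharedHostAlloc is a made-up type standing in for the Cycles device_memory/CUDAMem pair):

    /* Sketch only: the last device holding a reference frees the pinned allocation. */
    #include <cuda.h>

    struct SharedHostAlloc {
      void *shared_pointer = nullptr;
      int shared_counter = 0;
    };

    static void release_shared_host(SharedHostAlloc &alloc)
    {
      if (alloc.shared_pointer && --alloc.shared_counter == 0) {
        cuMemFreeHost(alloc.shared_pointer);
        alloc.shared_pointer = nullptr;
      }
    }
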
-
- void tex_free(device_memory &mem)
- {
- if (mem.device_pointer) {
- CUDAContextScope scope(cuda_context);
- const CUDAMem &cmem = cuda_mem_map[&mem];
-
- if (cmem.texobject) {
- /* Free bindless texture. */
- cuTexObjectDestroy(cmem.texobject);
- }
-
- if (cmem.array) {
- /* Free array. */
- cuArrayDestroy(cmem.array);
- stats.mem_free(mem.device_size);
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- cuda_mem_map.erase(cuda_mem_map.find(&mem));
- }
- else {
- generic_free(mem);
- }
- }
- }
-
- void move_textures_to_host(size_t size, bool for_texture)
- {
- /* Signal to reallocate textures in host memory only. */
- move_texture_to_host = true;
-
- while (size > 0) {
- /* Find suitable memory allocation to move. */
- device_memory *max_mem = NULL;
- size_t max_size = 0;
- bool max_is_image = false;
-
- foreach (auto &pair, cuda_mem_map) {
- device_memory &mem = *pair.first;
- CUDAMem *cmem = &pair.second;
-
- bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
- bool is_image = is_texture && (mem.data_height > 1);
-
- /* Can't move this type of memory. */
- if (!is_texture || cmem->array) {
- continue;
- }
-
- /* Already in host memory. */
- if (cmem->use_mapped_host) {
- continue;
- }
-
-        /* When making room for another texture, only move image textures. */
- if (for_texture && !is_image) {
- continue;
- }
-
-        /* Try to move the largest allocation, preferring to move images. */
- if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
- max_is_image = is_image;
- max_size = mem.device_size;
- max_mem = &mem;
- }
- }
-
- /* Move to host memory. This part is mutex protected since
- * multiple CUDA devices could be moving the memory. The
- * first one will do it, and the rest will adopt the pointer. */
- if (max_mem) {
- VLOG(1) << "Move memory from device to host: " << max_mem->name;
-
- static thread_mutex move_mutex;
- thread_scoped_lock lock(move_mutex);
-
-        /* Preserve the original device pointer; in the multi-device case we
-         * can't change it, because the pointer mapping would break. */
- device_ptr prev_pointer = max_mem->device_pointer;
- size_t prev_size = max_mem->device_size;
-
- tex_free(*max_mem);
- tex_alloc(*max_mem);
- size = (max_size >= size) ? 0 : size - max_size;
-
- max_mem->device_pointer = prev_pointer;
- max_mem->device_size = prev_size;
- }
- else {
- break;
- }
- }
-
- /* Update texture info array with new pointers. */
- update_texture_info();
-
- move_texture_to_host = false;
- }
-
- void const_copy_to(const char *name, void *host, size_t size) override
- {
if (strcmp(name, "__data") == 0) {
assert(size <= sizeof(KernelData));
@@ -2197,18 +1419,40 @@ class OptiXDevice : public Device {
*(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
update_launch_params(name, offsetof(KernelParams, data), host, size);
+ return;
}
+
+ // Update data storage pointers in launch parameters
+# define KERNEL_TEX(data_type, tex_name) \
+ if (strcmp(name, #tex_name) == 0) { \
+ update_launch_params(name, offsetof(KernelParams, tex_name), host, size); \
+ return; \
+ }
+# include "kernel/kernel_textures.h"
+# undef KERNEL_TEX
}
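
The re-included kernel/kernel_textures.h acts as an X-macro list: each KERNEL_TEX entry expands into a name comparison plus a launch-parameter update. A standalone illustration of the same pattern (using an inline KERNEL_TEX_LIST instead of re-including a header, with made-up field names; this is not Cycles code):

    /* The list is expanded twice: once to declare struct fields and once to
     * generate the name-based lookup, like const_copy_to() above. */
    #include <cstdio>
    #include <cstring>

    #define KERNEL_TEX_LIST \
      KERNEL_TEX(int, tex_a) \
      KERNEL_TEX(float, tex_b)

    struct Params {
    #define KERNEL_TEX(type, name) type name;
      KERNEL_TEX_LIST
    #undef KERNEL_TEX
    };

    static bool set_param(Params &params, const char *param_name, double value)
    {
      /* Each entry becomes: if (strcmp(param_name, "tex_a") == 0) { ... } */
    #define KERNEL_TEX(type, name) \
      if (strcmp(param_name, #name) == 0) { \
        params.name = (type)value; \
        return true; \
      }
      KERNEL_TEX_LIST
    #undef KERNEL_TEX
      return false;
    }

    int main()
    {
      Params params = {};
      set_param(params, "tex_b", 2.5);
      printf("tex_b = %f\n", params.tex_b);
      return 0;
    }
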
- device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override
+ void update_launch_params(const char *name, size_t offset, void *data, size_t data_size)
{
- return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
+ const CUDAContextScope scope(cuContext);
+
+ for (int i = 0; i < info.cpu_threads; ++i)
+ check_result_cuda(
+ cuMemcpyHtoD(launch_params.device_pointer + i * launch_params.data_elements + offset,
+ data,
+ data_size));
}
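
update_launch_params() assumes the launch_params buffer holds one copy of the kernel parameters per worker thread, so a single field update is replicated at a fixed stride. A hedged sketch of that replication (the function name and the explicit params_size argument are illustrative, not the Cycles API):

    /* Sketch only: write `value` at the same field offset inside each of the
     * num_copies per-thread parameter structs stored back to back on the device. */
    #include <cuda.h>
    #include <cstddef>

    static void replicate_field_update(CUdeviceptr params_base, size_t params_size,
                                       int num_copies, size_t field_offset,
                                       const void *value, size_t value_size)
    {
      for (int i = 0; i < num_copies; ++i) {
        /* Copy i starts at params_base + i * params_size. */
        cuMemcpyHtoD(params_base + i * params_size + field_offset, value, value_size);
      }
    }
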
void task_add(DeviceTask &task) override
{
// Upload texture information to device if it has changed since last launch
- update_texture_info();
+ load_texture_info();
+
+ if (task.type == DeviceTask::FILM_CONVERT) {
+ // Execute in main thread because of OpenGL access
+ film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
+ return;
+ }
// Split task into smaller ones
list<DeviceTask> tasks;
@@ -2240,403 +1484,6 @@ class OptiXDevice : public Device {
// Cancel any remaining tasks in the internal pool
task_pool.cancel();
}
-
- bool denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task,
- int thread_index)
- {
- if (have_error())
- return false;
-
- int stride = task->buffer.stride;
- int w = task->buffer.width;
- int h = task->buffer.h;
- int r = task->nlm_state.r;
- int f = task->nlm_state.f;
- float a = task->nlm_state.a;
- float k_2 = task->nlm_state.k_2;
-
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2 * r + 1) * (2 * r + 1);
- int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
- int frame_offset = 0;
-
- CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
- CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
- CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts;
- CUdeviceptr scale_ptr = 0;
-
- check_result_cuda_ret(
- cuMemsetD8Async(weightAccum, 0, sizeof(float) * pass_stride, cuda_stream[thread_index]));
- check_result_cuda_ret(
- cuMemsetD8Async(out_ptr, 0, sizeof(float) * pass_stride, cuda_stream[thread_index]));
-
- {
- CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
- check_result_cuda_ret(cuModuleGetFunction(
- &cuNLMCalcDifference, cuda_filter_module, "kernel_cuda_filter_nlm_calc_difference"));
- check_result_cuda_ret(
- cuModuleGetFunction(&cuNLMBlur, cuda_filter_module, "kernel_cuda_filter_nlm_blur"));
- check_result_cuda_ret(cuModuleGetFunction(
- &cuNLMCalcWeight, cuda_filter_module, "kernel_cuda_filter_nlm_calc_weight"));
- check_result_cuda_ret(cuModuleGetFunction(
- &cuNLMUpdateOutput, cuda_filter_module, "kernel_cuda_filter_nlm_update_output"));
-
- check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
- check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
- check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
- check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
-
- CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts);
-
- void *calc_difference_args[] = {&guide_ptr,
- &variance_ptr,
- &scale_ptr,
- &difference,
- &w,
- &h,
- &stride,
- &pass_stride,
- &r,
- &channel_offset,
- &frame_offset,
- &a,
- &k_2};
- void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
- void *calc_weight_args[] = {
- &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
- void *update_output_args[] = {&blurDifference,
- &image_ptr,
- &out_ptr,
- &weightAccum,
- &w,
- &h,
- &stride,
- &pass_stride,
- &channel_offset,
- &r,
- &f};
-
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
- }
-
- {
- CUfunction cuNLMNormalize;
- check_result_cuda_ret(cuModuleGetFunction(
- &cuNLMNormalize, cuda_filter_module, "kernel_cuda_filter_nlm_normalize"));
- check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
- void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
- CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
- CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
- check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
- }
-
- return !have_error();
- }
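
The CUDA_GET_BLOCKSIZE_1D/CUDA_LAUNCH_KERNEL_1D macros used above are defined elsewhere in this file; conceptually they launch one thread per pixel times num_shifts on the per-thread stream. A rough driver-API equivalent with a fixed block size (a simplification for illustration, not the actual macro definition):

    /* Sketch only: launch total_threads x num_shifts threads of a driver-API
     * kernel on the given stream. */
    #include <cuda.h>

    static CUresult launch_1d(CUfunction func, CUstream stream,
                              int total_threads, int num_shifts, void **args)
    {
      const int threads_per_block = 256;
      const int xblocks = (total_threads + threads_per_block - 1) / threads_per_block;

      return cuLaunchKernel(func,
                            xblocks, num_shifts, 1,  /* grid dimensions */
                            threads_per_block, 1, 1, /* block dimensions */
                            0,                       /* shared memory */
                            stream,
                            args, NULL);
    }
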
-
- bool denoising_construct_transform(DenoisingTask *task, int thread_index)
- {
- if (have_error())
- return false;
-
- CUfunction cuFilterConstructTransform;
- check_result_cuda_ret(cuModuleGetFunction(&cuFilterConstructTransform,
- cuda_filter_module,
- "kernel_cuda_filter_construct_transform"));
- check_result_cuda_ret(
- cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
- CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h);
-
- void *args[] = {&task->buffer.mem.device_pointer,
- &task->tile_info_mem.device_pointer,
- &task->storage.transform.device_pointer,
- &task->storage.rank.device_pointer,
- &task->filter_area,
- &task->rect,
- &task->radius,
- &task->pca_threshold,
- &task->buffer.pass_stride,
- &task->buffer.frame_stride,
- &task->buffer.use_time};
- CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
- check_result_cuda_ret(cuCtxSynchronize());
-
- return !have_error();
- }
-
- bool denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task,
- int thread_index)
- {
- if (have_error())
- return false;
-
- int r = task->radius;
- int f = 4;
- float a = 1.0f;
- float k_2 = task->nlm_k_2;
-
- int w = task->reconstruction_state.source_w;
- int h = task->reconstruction_state.source_h;
- int stride = task->buffer.stride;
- int frame_offset = frame * task->buffer.frame_stride;
- int t = task->tile_info->frames[frame];
-
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2 * r + 1) * (2 * r + 1);
-
- CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
- CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
-
- CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
- check_result_cuda_ret(cuModuleGetFunction(
- &cuNLMCalcDifference, cuda_filter_module, "kernel_cuda_filter_nlm_calc_difference"));
- check_result_cuda_ret(
- cuModuleGetFunction(&cuNLMBlur, cuda_filter_module, "kernel_cuda_filter_nlm_blur"));
- check_result_cuda_ret(cuModuleGetFunction(
- &cuNLMCalcWeight, cuda_filter_module, "kernel_cuda_filter_nlm_calc_weight"));
- check_result_cuda_ret(cuModuleGetFunction(
- &cuNLMConstructGramian, cuda_filter_module, "kernel_cuda_filter_nlm_construct_gramian"));
-
- check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
- check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
- check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
- check_result_cuda_ret(
- cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
-
- CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
- task->reconstruction_state.source_w *
- task->reconstruction_state.source_h,
- num_shifts);
-
- void *calc_difference_args[] = {&color_ptr,
- &color_variance_ptr,
- &scale_ptr,
- &difference,
- &w,
- &h,
- &stride,
- &pass_stride,
- &r,
- &pass_stride,
- &frame_offset,
- &a,
- &k_2};
- void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
- void *calc_weight_args[] = {
- &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
- void *construct_gramian_args[] = {&t,
- &blurDifference,
- &task->buffer.mem.device_pointer,
- &task->storage.transform.device_pointer,
- &task->storage.rank.device_pointer,
- &task->storage.XtWX.device_pointer,
- &task->storage.XtWY.device_pointer,
- &task->reconstruction_state.filter_window,
- &w,
- &h,
- &stride,
- &pass_stride,
- &r,
- &f,
- &frame_offset,
- &task->buffer.use_time};
-
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
- check_result_cuda_ret(cuCtxSynchronize());
-
- return !have_error();
- }
-
- bool denoising_solve(device_ptr output_ptr, DenoisingTask *task, int thread_index)
- {
- if (have_error())
- return false;
-
- CUfunction cuFinalize;
- check_result_cuda_ret(
- cuModuleGetFunction(&cuFinalize, cuda_filter_module, "kernel_cuda_filter_finalize"));
- check_result_cuda_ret(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
- void *finalize_args[] = {&output_ptr,
- &task->storage.rank.device_pointer,
- &task->storage.XtWX.device_pointer,
- &task->storage.XtWY.device_pointer,
- &task->filter_area,
- &task->reconstruction_state.buffer_params.x,
- &task->render_buffer.samples};
- CUDA_GET_BLOCKSIZE(
- cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h);
- CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
- check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
-
- return !have_error();
- }
-
- bool denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect,
- DenoisingTask *task,
- int thread_index)
- {
- if (have_error())
- return false;
-
- CUfunction cuFilterCombineHalves;
- check_result_cuda_ret(cuModuleGetFunction(
- &cuFilterCombineHalves, cuda_filter_module, "kernel_cuda_filter_combine_halves"));
- check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(
- cuFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r};
- CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
- check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
-
- return !have_error();
- }
-
- bool denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task,
- int thread_index)
- {
- if (have_error())
- return false;
-
- CUfunction cuFilterDivideShadow;
- check_result_cuda_ret(cuModuleGetFunction(
- &cuFilterDivideShadow, cuda_filter_module, "kernel_cuda_filter_divide_shadow"));
- check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(
- cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {&task->render_buffer.samples,
- &task->tile_info_mem.device_pointer,
- &a_ptr,
- &b_ptr,
- &sample_variance_ptr,
- &sv_variance_ptr,
- &buffer_variance_ptr,
- &task->rect,
- &task->render_buffer.pass_stride,
- &task->render_buffer.offset};
- CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
- check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
-
- return !have_error();
- }
-
- bool denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task,
- int thread_index)
- {
- if (have_error())
- return false;
-
- CUfunction cuFilterGetFeature;
- check_result_cuda_ret(cuModuleGetFunction(
- &cuFilterGetFeature, cuda_filter_module, "kernel_cuda_filter_get_feature"));
- check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(
- cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {&task->render_buffer.samples,
- &task->tile_info_mem.device_pointer,
- &mean_offset,
- &variance_offset,
- &mean_ptr,
- &variance_ptr,
- &scale,
- &task->rect,
- &task->render_buffer.pass_stride,
- &task->render_buffer.offset};
- CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
- check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
-
- return !have_error();
- }
-
- bool denoising_write_feature(int out_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task,
- int thread_index)
- {
- if (have_error())
- return false;
-
- CUfunction cuFilterWriteFeature;
- check_result_cuda_ret(cuModuleGetFunction(
- &cuFilterWriteFeature, cuda_filter_module, "kernel_cuda_filter_write_feature"));
- check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w);
-
- void *args[] = {&task->render_buffer.samples,
- &task->reconstruction_state.buffer_params,
- &task->filter_area,
- &from_ptr,
- &buffer_ptr,
- &out_offset,
- &task->rect};
- CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
- check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
-
- return !have_error();
- }
-
- bool denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task,
- int thread_index)
- {
- if (have_error())
- return false;
-
- CUfunction cuFilterDetectOutliers;
- check_result_cuda_ret(cuModuleGetFunction(
- &cuFilterDetectOutliers, cuda_filter_module, "kernel_cuda_filter_detect_outliers"));
- check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(
- cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {&image_ptr,
- &variance_ptr,
- &depth_ptr,
- &output_ptr,
- &task->rect,
- &task->buffer.pass_stride};
-
- CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
- check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
-
- return !have_error();
- }
};
bool device_optix_init()
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/device_opencl.h
index 61b1e3e3b6b..61b1e3e3b6b 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/device_opencl.h
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/device_opencl_impl.cpp
index af40aa89db4..012f6dbe114 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/device_opencl_impl.cpp
@@ -16,7 +16,7 @@
#ifdef WITH_OPENCL
-# include "device/opencl/opencl.h"
+# include "device/opencl/device_opencl.h"
# include "kernel/kernel_types.h"
# include "kernel/split/kernel_split_data_types.h"
diff --git a/intern/cycles/device/opencl/memory_manager.cpp b/intern/cycles/device/opencl/memory_manager.cpp
index 06d4746a86e..fedb3ea8c6a 100644
--- a/intern/cycles/device/opencl/memory_manager.cpp
+++ b/intern/cycles/device/opencl/memory_manager.cpp
@@ -18,7 +18,7 @@
# include "util/util_foreach.h"
-# include "device/opencl/opencl.h"
+# include "device/opencl/device_opencl.h"
# include "device/opencl/memory_manager.h"
CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp
index 3eeff31f8c2..f59f109db70 100644
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ b/intern/cycles/device/opencl/opencl_util.cpp
@@ -16,7 +16,7 @@
#ifdef WITH_OPENCL
-# include "device/opencl/opencl.h"
+# include "device/opencl/device_opencl.h"
# include "device/device_intern.h"
# include "util/util_debug.h"