From a2b52dc5716a97e5413acbd6eefc9ce3788b6456 Mon Sep 17 00:00:00 2001 From: Patrick Mours Date: Thu, 12 Sep 2019 14:50:06 +0200 Subject: Cycles: add Optix device backend This uses hardware-accelerated raytracing on NVIDIA RTX graphics cards. It is still currently experimental. Most features are supported, but a few are still missing like baking, branched path tracing and using CPU memory. https://wiki.blender.org/wiki/Reference/Release_Notes/2.81/Cycles#NVIDIA_RTX For building with Optix support, the Optix SDK must be installed. See here for build instructions: https://wiki.blender.org/wiki/Building_Blender/CUDA Differential Revision: https://developer.blender.org/D5363 --- intern/cycles/CMakeLists.txt | 18 + intern/cycles/blender/addon/properties.py | 46 +- intern/cycles/blender/addon/ui.py | 23 +- intern/cycles/blender/blender_device.cpp | 7 +- intern/cycles/blender/blender_python.cpp | 10 +- intern/cycles/blender/blender_sync.cpp | 2 +- intern/cycles/bvh/CMakeLists.txt | 2 + intern/cycles/bvh/bvh.cpp | 11 + intern/cycles/bvh/bvh_optix.cpp | 215 ++++ intern/cycles/bvh/bvh_optix.h | 53 + intern/cycles/device/CMakeLists.txt | 4 + intern/cycles/device/device.cpp | 30 + intern/cycles/device/device.h | 18 +- intern/cycles/device/device_cuda.cpp | 5 - intern/cycles/device/device_intern.h | 4 + intern/cycles/device/device_multi.cpp | 18 + intern/cycles/device/device_optix.cpp | 1969 +++++++++++++++++++++++++++++ intern/cycles/kernel/CMakeLists.txt | 76 +- intern/cycles/render/mesh.cpp | 4 +- intern/cycles/util/util_debug.cpp | 16 +- intern/cycles/util/util_debug.h | 14 + 21 files changed, 2511 insertions(+), 34 deletions(-) create mode 100644 intern/cycles/bvh/bvh_optix.cpp create mode 100644 intern/cycles/bvh/bvh_optix.h create mode 100644 intern/cycles/device/device_optix.cpp (limited to 'intern') diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index 6a3ebd85378..25e8e124885 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -219,6 +219,24 @@ if(WITH_CYCLES_OSL) ) endif() +if(WITH_CYCLES_DEVICE_OPTIX) + find_package(OptiX) + + if(OPTIX_FOUND) + add_definitions(-DWITH_OPTIX) + include_directories( + SYSTEM + ${OPTIX_INCLUDE_DIR} + ) + + # Need pre-compiled CUDA binaries in the OptiX device + set(WITH_CYCLES_CUDA_BINARIES ON) + else() + message(STATUS "Optix not found, disabling it from Cycles") + set(WITH_CYCLES_DEVICE_OPTIX OFF) + endif() +endif() + if(WITH_CYCLES_EMBREE) add_definitions(-DWITH_EMBREE) add_definitions(-DEMBREE_STATIC_LIB) diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 93f8f76cd6a..8623b38a271 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -137,6 +137,7 @@ enum_world_mis = ( enum_device_type = ( ('CPU', "CPU", "CPU", 0), ('CUDA', "CUDA", "CUDA", 1), + ('OPTIX', "OptiX", "OptiX", 3), ('OPENCL', "OpenCL", "OpenCL", 2) ) @@ -740,6 +741,8 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): debug_use_cuda_adaptive_compile: BoolProperty(name="Adaptive Compile", default=False) debug_use_cuda_split_kernel: BoolProperty(name="Split Kernel", default=False) + debug_optix_cuda_streams: IntProperty(name="CUDA Streams", default=1, min=1) + debug_opencl_kernel_type: EnumProperty( name="OpenCL Kernel Type", default='DEFAULT', @@ -1400,10 +1403,12 @@ class CyclesPreferences(bpy.types.AddonPreferences): def get_device_types(self, context): import _cycles - has_cuda, has_opencl = _cycles.get_device_types() + has_cuda, 
has_optix, has_opencl = _cycles.get_device_types() list = [('NONE', "None", "Don't use compute device", 0)] if has_cuda: list.append(('CUDA', "CUDA", "Use CUDA for GPU acceleration", 1)) + if has_optix: + list.append(('OPTIX', "OptiX", "Use OptiX for GPU acceleration", 3)) if has_opencl: list.append(('OPENCL', "OpenCL", "Use OpenCL for GPU acceleration", 2)) return list @@ -1424,7 +1429,7 @@ class CyclesPreferences(bpy.types.AddonPreferences): def update_device_entries(self, device_list): for device in device_list: - if not device[1] in {'CUDA', 'OPENCL', 'CPU'}: + if not device[1] in {'CUDA', 'OPTIX', 'OPENCL', 'CPU'}: continue # Try to find existing Device entry entry = self.find_existing_device_entry(device) @@ -1439,8 +1444,8 @@ class CyclesPreferences(bpy.types.AddonPreferences): # Update name in case it changed entry.name = device[0] - # Gets all devices types by default. - def get_devices(self, compute_device_type=''): + # Gets all devices types for a compute device type. + def get_devices_for_type(self, compute_device_type): import _cycles # Layout of the device tuples: (Name, Type, Persistent ID) device_list = _cycles.available_devices(compute_device_type) @@ -1449,20 +1454,23 @@ class CyclesPreferences(bpy.types.AddonPreferences): # hold pointers to a resized array. self.update_device_entries(device_list) # Sort entries into lists - cuda_devices = [] - opencl_devices = [] + devices = [] cpu_devices = [] for device in device_list: entry = self.find_existing_device_entry(device) - if entry.type == 'CUDA': - cuda_devices.append(entry) - elif entry.type == 'OPENCL': - opencl_devices.append(entry) + if entry.type == compute_device_type: + devices.append(entry) elif entry.type == 'CPU': cpu_devices.append(entry) # Extend all GPU devices with CPU. - cuda_devices.extend(cpu_devices) - opencl_devices.extend(cpu_devices) + if compute_device_type in ('CUDA', 'OPENCL'): + devices.extend(cpu_devices) + return devices + + # For backwards compatibility, only has CUDA and OpenCL. 
+ def get_devices(self, compute_device_type=''): + cuda_devices = self.get_devices_for_type('CUDA') + opencl_devices = self.get_devices_for_type('OPENCL') return cuda_devices, opencl_devices def get_num_gpu_devices(self): @@ -1498,16 +1506,24 @@ class CyclesPreferences(bpy.types.AddonPreferences): for device in devices: box.prop(device, "use", text=device.name) + if device_type == 'OPTIX': + col = box.column(align=True) + col.label(text="OptiX support is experimental", icon='INFO') + col.label(text="Not all Cycles features are supported yet", icon='BLANK1') + + def draw_impl(self, layout, context): row = layout.row() row.prop(self, "compute_device_type", expand=True) - cuda_devices, opencl_devices = self.get_devices(self.compute_device_type) + devices = self.get_devices_for_type(self.compute_device_type) row = layout.row() if self.compute_device_type == 'CUDA': - self._draw_devices(row, 'CUDA', cuda_devices) + self._draw_devices(row, 'CUDA', devices) + elif self.compute_device_type == 'OPTIX': + self._draw_devices(row, 'OPTIX', devices) elif self.compute_device_type == 'OPENCL': - self._draw_devices(row, 'OPENCL', opencl_devices) + self._draw_devices(row, 'OPENCL', devices) def draw(self, context): self.draw_impl(self.layout, context) diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 200b08f93cb..44ed28e9e02 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -88,10 +88,16 @@ def use_cuda(context): return (get_device_type(context) == 'CUDA' and cscene.device == 'GPU') +def use_optix(context): + cscene = context.scene.cycles + + return (get_device_type(context) == 'OPTIX' and cscene.device == 'GPU') + + def use_branched_path(context): cscene = context.scene.cycles - return (cscene.progressive == 'BRANCHED_PATH') + return (cscene.progressive == 'BRANCHED_PATH' and not use_optix(context)) def use_sample_all_lights(context): @@ -168,7 +174,8 @@ class CYCLES_RENDER_PT_sampling(CyclesButtonsPanel, Panel): layout.use_property_split = True layout.use_property_decorate = False - layout.prop(cscene, "progressive") + if not use_optix(context): + layout.prop(cscene, "progressive") if cscene.progressive == 'PATH' or use_branched_path(context) is False: col = layout.column(align=True) @@ -1763,6 +1770,10 @@ class CYCLES_RENDER_PT_bake(CyclesButtonsPanel, Panel): bl_options = {'DEFAULT_CLOSED'} COMPAT_ENGINES = {'CYCLES'} + @classmethod + def poll(cls, context): + return not use_optix(context) + def draw(self, context): layout = self.layout layout.use_property_split = True @@ -1947,7 +1958,13 @@ class CYCLES_RENDER_PT_debug(CyclesButtonsPanel, Panel): col.separator() col = layout.column() - col.label(text='OpenCL Flags:') + col.label(text="OptiX Flags:") + col.prop(cscene, "debug_optix_cuda_streams") + + col.separator() + + col = layout.column() + col.label(text="OpenCL Flags:") col.prop(cscene, "debug_opencl_device_type", text="Device") col.prop(cscene, "debug_use_opencl_debug", text="Debug") col.prop(cscene, "debug_opencl_mem_limit") diff --git a/intern/cycles/blender/blender_device.cpp b/intern/cycles/blender/blender_device.cpp index 98fc0c6dec4..111fc8d5192 100644 --- a/intern/cycles/blender/blender_device.cpp +++ b/intern/cycles/blender/blender_device.cpp @@ -61,7 +61,8 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen COMPUTE_DEVICE_CPU = 0, COMPUTE_DEVICE_CUDA = 1, COMPUTE_DEVICE_OPENCL = 2, - COMPUTE_DEVICE_NUM = 3, + COMPUTE_DEVICE_OPTIX = 3, + COMPUTE_DEVICE_NUM = 4, }; ComputeDevice 
compute_device = (ComputeDevice)get_enum( @@ -73,6 +74,10 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen if (compute_device == COMPUTE_DEVICE_CUDA) { mask |= DEVICE_MASK_CUDA; } + else if (compute_device == COMPUTE_DEVICE_OPTIX) { + /* Cannot use CPU and OptiX device at the same time right now, so replace mask. */ + mask = DEVICE_MASK_OPTIX; + } else if (compute_device == COMPUTE_DEVICE_OPENCL) { mask |= DEVICE_MASK_OPENCL; } diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp index 2bea6b34772..335d4daf09c 100644 --- a/intern/cycles/blender/blender_python.cpp +++ b/intern/cycles/blender/blender_python.cpp @@ -81,6 +81,8 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene) /* Synchronize CUDA flags. */ flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile"); flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel"); + /* Synchronize OptiX flags. */ + flags.optix.cuda_streams = get_int(cscene, "debug_optix_cuda_streams"); /* Synchronize OpenCL device type. */ switch (get_enum(cscene, "debug_opencl_device_type")) { case 0: @@ -960,14 +962,16 @@ static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args* static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/) { vector device_types = Device::available_types(); - bool has_cuda = false, has_opencl = false; + bool has_cuda = false, has_optix = false, has_opencl = false; foreach (DeviceType device_type, device_types) { has_cuda |= (device_type == DEVICE_CUDA); + has_optix |= (device_type == DEVICE_OPTIX); has_opencl |= (device_type == DEVICE_OPENCL); } - PyObject *list = PyTuple_New(2); + PyObject *list = PyTuple_New(3); PyTuple_SET_ITEM(list, 0, PyBool_FromLong(has_cuda)); - PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_opencl)); + PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_optix)); + PyTuple_SET_ITEM(list, 2, PyBool_FromLong(has_opencl)); return list; } diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp index 8b7c66363d9..1a166d171bc 100644 --- a/intern/cycles/blender/blender_sync.cpp +++ b/intern/cycles/blender/blender_sync.cpp @@ -758,7 +758,7 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine, preview_samples = preview_samples * preview_samples; } - if (get_enum(cscene, "progressive") == 0) { + if (get_enum(cscene, "progressive") == 0 && (params.device.type != DEVICE_OPTIX)) { if (background) { params.samples = aa_samples; } diff --git a/intern/cycles/bvh/CMakeLists.txt b/intern/cycles/bvh/CMakeLists.txt index 36bbd937e1a..27a7f604e1c 100644 --- a/intern/cycles/bvh/CMakeLists.txt +++ b/intern/cycles/bvh/CMakeLists.txt @@ -15,6 +15,7 @@ set(SRC bvh_build.cpp bvh_embree.cpp bvh_node.cpp + bvh_optix.cpp bvh_sort.cpp bvh_split.cpp bvh_unaligned.cpp @@ -29,6 +30,7 @@ set(SRC_HEADERS bvh_build.h bvh_embree.h bvh_node.h + bvh_optix.h bvh_params.h bvh_sort.h bvh_split.h diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp index b6a4aba74b5..16c721da06a 100644 --- a/intern/cycles/bvh/bvh.cpp +++ b/intern/cycles/bvh/bvh.cpp @@ -26,6 +26,9 @@ #include "bvh/bvh_build.h" #include "bvh/bvh_node.h" +#ifdef WITH_OPTIX +# include "bvh/bvh_optix.h" +#endif #ifdef WITH_EMBREE # include "bvh/bvh_embree.h" #endif @@ -51,6 +54,8 @@ const char *bvh_layout_name(BVHLayout layout) return "NONE"; case BVH_LAYOUT_EMBREE: return "EMBREE"; + case BVH_LAYOUT_OPTIX: + return "OPTIX"; case BVH_LAYOUT_ALL: return 
"ALL"; } @@ -115,6 +120,12 @@ BVH *BVH::create(const BVHParams ¶ms, return new BVHEmbree(params, meshes, objects); #else break; +#endif + case BVH_LAYOUT_OPTIX: +#ifdef WITH_OPTIX + return new BVHOptiX(params, meshes, objects); +#else + break; #endif case BVH_LAYOUT_NONE: case BVH_LAYOUT_ALL: diff --git a/intern/cycles/bvh/bvh_optix.cpp b/intern/cycles/bvh/bvh_optix.cpp new file mode 100644 index 00000000000..b3a9aab3266 --- /dev/null +++ b/intern/cycles/bvh/bvh_optix.cpp @@ -0,0 +1,215 @@ +/* + * Copyright 2019, NVIDIA Corporation. + * Copyright 2019, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_OPTIX + +# include "bvh/bvh_optix.h" +# include "render/mesh.h" +# include "render/object.h" +# include "util/util_logging.h" +# include "util/util_progress.h" + +CCL_NAMESPACE_BEGIN + +BVHOptiX::BVHOptiX(const BVHParams ¶ms_, + const vector &meshes_, + const vector &objects_) + : BVH(params_, meshes_, objects_) +{ +} + +BVHOptiX::~BVHOptiX() +{ +} + +void BVHOptiX::build(Progress &, Stats *) +{ + if (params.top_level) + pack_tlas(); + else + pack_blas(); +} + +void BVHOptiX::copy_to_device(Progress &progress, DeviceScene *dscene) +{ + progress.set_status("Updating Scene BVH", "Building OptiX acceleration structure"); + + Device *const device = dscene->bvh_nodes.device; + if (!device->build_optix_bvh(this, dscene->bvh_nodes)) + progress.set_error("Failed to build OptiX acceleration structure"); +} + +void BVHOptiX::pack_blas() +{ + // Bottom-level BVH can contain multiple primitive types, so merge them: + assert(meshes.size() == 1 && objects.size() == 1); // These are build per-mesh + Mesh *const mesh = meshes[0]; + + if (params.primitive_mask & PRIMITIVE_ALL_CURVE && mesh->num_curves() > 0) { + const size_t num_curves = mesh->num_curves(); + const size_t num_segments = mesh->num_segments(); + pack.prim_type.reserve(pack.prim_type.size() + num_segments); + pack.prim_index.reserve(pack.prim_index.size() + num_segments); + pack.prim_object.reserve(pack.prim_object.size() + num_segments); + // 'pack.prim_time' is only used in geom_curve_intersect.h + // It is not needed because of OPTIX_MOTION_FLAG_[START|END]_VANISH + + uint type = PRIMITIVE_CURVE; + if (mesh->use_motion_blur && mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)) + type = PRIMITIVE_MOTION_CURVE; + + for (size_t j = 0; j < num_curves; ++j) { + const Mesh::Curve curve = mesh->get_curve(j); + for (size_t k = 0; k < curve.num_segments(); ++k) { + pack.prim_type.push_back_reserved(PRIMITIVE_PACK_SEGMENT(type, k)); + // Each curve segment points back to its curve index + pack.prim_index.push_back_reserved(j); + pack.prim_object.push_back_reserved(0); + } + } + } + + if (params.primitive_mask & PRIMITIVE_ALL_TRIANGLE && mesh->num_triangles() > 0) { + const size_t num_triangles = mesh->num_triangles(); + pack.prim_type.reserve(pack.prim_type.size() + num_triangles); + pack.prim_index.reserve(pack.prim_index.size() + num_triangles); + 
pack.prim_object.reserve(pack.prim_object.size() + num_triangles); + + uint type = PRIMITIVE_TRIANGLE; + if (mesh->use_motion_blur && mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)) + type = PRIMITIVE_MOTION_TRIANGLE; + + for (size_t k = 0; k < num_triangles; ++k) { + pack.prim_type.push_back_reserved(type); + pack.prim_index.push_back_reserved(k); + pack.prim_object.push_back_reserved(0); + } + } + + // Initialize visibility to zero and later update it during top-level build + uint prev_visibility = objects[0]->visibility; + objects[0]->visibility = 0; + + // Update 'pack.prim_tri_index', 'pack.prim_tri_verts' and 'pack.prim_visibility' + pack_primitives(); + + // Reset visibility after packing + objects[0]->visibility = prev_visibility; +} + +void BVHOptiX::pack_tlas() +{ + // Calculate total packed size + size_t prim_index_size = 0; + size_t prim_tri_verts_size = 0; + foreach (Mesh *mesh, meshes) { + BVH *const bvh = mesh->bvh; + prim_index_size += bvh->pack.prim_index.size(); + prim_tri_verts_size += bvh->pack.prim_tri_verts.size(); + } + + if (prim_index_size == 0) + return; // Abort right away if this is an empty BVH + + size_t pack_offset = 0; + size_t pack_verts_offset = 0; + + pack.prim_type.resize(prim_index_size); + int *pack_prim_type = pack.prim_type.data(); + pack.prim_index.resize(prim_index_size); + int *pack_prim_index = pack.prim_index.data(); + pack.prim_object.resize(prim_index_size); + int *pack_prim_object = pack.prim_object.data(); + pack.prim_visibility.resize(prim_index_size); + uint *pack_prim_visibility = pack.prim_visibility.data(); + pack.prim_tri_index.resize(prim_index_size); + uint *pack_prim_tri_index = pack.prim_tri_index.data(); + pack.prim_tri_verts.resize(prim_tri_verts_size); + float4 *pack_prim_tri_verts = pack.prim_tri_verts.data(); + + // Top-level BVH should only contain instances, see 'Mesh::need_build_bvh' + // Iterate over scene mesh list instead of objects, since the 'prim_offset' is calculated based + // on that list, which may be ordered differently from the object list. 
+ foreach (Mesh *mesh, meshes) { + PackedBVH &bvh_pack = mesh->bvh->pack; + int mesh_tri_offset = mesh->tri_offset; + int mesh_curve_offset = mesh->curve_offset; + + // Merge primitive, object and triangle indexes + if (!bvh_pack.prim_index.empty()) { + int *bvh_prim_type = &bvh_pack.prim_type[0]; + int *bvh_prim_index = &bvh_pack.prim_index[0]; + uint *bvh_prim_tri_index = &bvh_pack.prim_tri_index[0]; + uint *bvh_prim_visibility = &bvh_pack.prim_visibility[0]; + + for (size_t i = 0; i < bvh_pack.prim_index.size(); i++, pack_offset++) { + if (bvh_pack.prim_type[i] & PRIMITIVE_ALL_CURVE) { + pack_prim_index[pack_offset] = bvh_prim_index[i] + mesh_curve_offset; + pack_prim_tri_index[pack_offset] = -1; + } + else { + pack_prim_index[pack_offset] = bvh_prim_index[i] + mesh_tri_offset; + pack_prim_tri_index[pack_offset] = bvh_prim_tri_index[i] + pack_verts_offset; + } + + pack_prim_type[pack_offset] = bvh_prim_type[i]; + pack_prim_object[pack_offset] = 0; // Unused for instanced meshes + pack_prim_visibility[pack_offset] = bvh_prim_visibility[i]; + } + } + + // Merge triangle vertex data + if (!bvh_pack.prim_tri_verts.empty()) { + const size_t prim_tri_size = bvh_pack.prim_tri_verts.size(); + memcpy(pack_prim_tri_verts + pack_verts_offset, + bvh_pack.prim_tri_verts.data(), + prim_tri_size * sizeof(float4)); + pack_verts_offset += prim_tri_size; + } + } + + // Merge visibility flags of all objects and fix object indices for non-instanced meshes + foreach (Object *ob, objects) { + Mesh *const mesh = ob->mesh; + for (size_t i = 0; i < mesh->num_primitives(); ++i) { + if (!ob->mesh->is_instanced()) { + assert(pack.prim_object[mesh->prim_offset + i] == 0); + pack.prim_object[mesh->prim_offset + i] = ob->get_device_index(); + } + pack.prim_visibility[mesh->prim_offset + i] |= ob->visibility_for_tracing(); + } + } +} + +void BVHOptiX::pack_nodes(const BVHNode *) +{ +} + +void BVHOptiX::refit_nodes() +{ + // TODO(pmours): Implement? + VLOG(1) << "Refit is not yet implemented for OptiX BVH."; +} + +BVHNode *BVHOptiX::widen_children_nodes(const BVHNode *) +{ + return NULL; +} + +CCL_NAMESPACE_END + +#endif /* WITH_OPTIX */ diff --git a/intern/cycles/bvh/bvh_optix.h b/intern/cycles/bvh/bvh_optix.h new file mode 100644 index 00000000000..35033fe635f --- /dev/null +++ b/intern/cycles/bvh/bvh_optix.h @@ -0,0 +1,53 @@ +/* + * Copyright 2019, NVIDIA Corporation. + * Copyright 2019, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __BVH_OPTIX_H__ +#define __BVH_OPTIX_H__ + +#ifdef WITH_OPTIX + +# include "bvh/bvh.h" +# include "bvh/bvh_params.h" +# include "device/device_memory.h" + +CCL_NAMESPACE_BEGIN + +class BVHOptiX : public BVH { + friend class BVH; + + public: + BVHOptiX(const BVHParams ¶ms, const vector &meshes, const vector &objects); + virtual ~BVHOptiX(); + + virtual void build(Progress &progress, Stats *) override; + virtual void copy_to_device(Progress &progress, DeviceScene *dscene) override; + + private: + void pack_blas(); + void pack_tlas(); + + virtual void pack_nodes(const BVHNode *) override; + virtual void refit_nodes() override; + + virtual BVHNode *widen_children_nodes(const BVHNode *) override; +}; + +CCL_NAMESPACE_END + +#endif /* WITH_OPTIX */ + +#endif /* __BVH_OPTIX_H__ */ diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index 3e14480e2ad..a8c4949ad07 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -29,6 +29,7 @@ set(SRC device_memory.cpp device_multi.cpp device_opencl.cpp + device_optix.cpp device_split_kernel.cpp device_task.cpp ) @@ -85,6 +86,9 @@ endif() if(WITH_CYCLES_DEVICE_CUDA) add_definitions(-DWITH_CUDA) endif() +if(WITH_CYCLES_DEVICE_OPTIX) + add_definitions(-DWITH_OPTIX) +endif() if(WITH_CYCLES_DEVICE_MULTI) add_definitions(-DWITH_MULTI) endif() diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 47d111802cd..fe8a814cd14 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -38,6 +38,7 @@ bool Device::need_devices_update = true; thread_mutex Device::device_mutex; vector Device::opencl_devices; vector Device::cuda_devices; +vector Device::optix_devices; vector Device::cpu_devices; vector Device::network_devices; uint Device::devices_initialized_mask = 0; @@ -379,6 +380,14 @@ Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool device = NULL; break; #endif +#ifdef WITH_OPTIX + case DEVICE_OPTIX: + if (device_optix_init()) + device = device_optix_create(info, stats, profiler, background); + else + device = NULL; + break; +#endif #ifdef WITH_MULTI case DEVICE_MULTI: device = device_multi_create(info, stats, profiler, background); @@ -410,6 +419,8 @@ DeviceType Device::type_from_string(const char *name) return DEVICE_CPU; else if (strcmp(name, "CUDA") == 0) return DEVICE_CUDA; + else if (strcmp(name, "OPTIX") == 0) + return DEVICE_OPTIX; else if (strcmp(name, "OPENCL") == 0) return DEVICE_OPENCL; else if (strcmp(name, "NETWORK") == 0) @@ -426,6 +437,8 @@ string Device::string_from_type(DeviceType type) return "CPU"; else if (type == DEVICE_CUDA) return "CUDA"; + else if (type == DEVICE_OPTIX) + return "OPTIX"; else if (type == DEVICE_OPENCL) return "OPENCL"; else if (type == DEVICE_NETWORK) @@ -443,6 +456,9 @@ vector Device::available_types() #ifdef WITH_CUDA types.push_back(DEVICE_CUDA); #endif +#ifdef WITH_OPTIX + types.push_back(DEVICE_OPTIX); +#endif #ifdef WITH_OPENCL types.push_back(DEVICE_OPENCL); #endif @@ -488,6 +504,20 @@ vector Device::available_devices(uint mask) } #endif +#ifdef WITH_OPTIX + if (mask & DEVICE_MASK_OPTIX) { + if (!(devices_initialized_mask & DEVICE_MASK_OPTIX)) { + if (device_optix_init()) { + device_optix_info(optix_devices); + } + devices_initialized_mask |= DEVICE_MASK_OPTIX; + } + foreach (DeviceInfo &info, optix_devices) { + devices.push_back(info); + } + } +#endif + if (mask & DEVICE_MASK_CPU) { if (!(devices_initialized_mask & DEVICE_MASK_CPU)) { 
device_cpu_info(cpu_devices); diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index 15a0ceb4a19..672d93c2581 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -34,6 +34,7 @@ CCL_NAMESPACE_BEGIN +class BVH; class Progress; class RenderTile; @@ -45,13 +46,15 @@ enum DeviceType { DEVICE_OPENCL, DEVICE_CUDA, DEVICE_NETWORK, - DEVICE_MULTI + DEVICE_MULTI, + DEVICE_OPTIX, }; enum DeviceTypeMask { DEVICE_MASK_CPU = (1 << DEVICE_CPU), DEVICE_MASK_OPENCL = (1 << DEVICE_OPENCL), DEVICE_MASK_CUDA = (1 << DEVICE_CUDA), + DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX), DEVICE_MASK_NETWORK = (1 << DEVICE_NETWORK), DEVICE_MASK_ALL = ~0 }; @@ -380,7 +383,11 @@ class Device { } /* tasks */ - virtual int get_split_task_count(DeviceTask &task) = 0; + virtual int get_split_task_count(DeviceTask &) + { + return 1; + } + virtual void task_add(DeviceTask &task) = 0; virtual void task_wait() = 0; virtual void task_cancel() = 0; @@ -399,6 +406,12 @@ class Device { bool transparent, const DeviceDrawParams &draw_params); + /* acceleration structure building */ + virtual bool build_optix_bvh(BVH *, device_memory &) + { + return false; + } + #ifdef WITH_NETWORK /* networking */ void server_run(); @@ -456,6 +469,7 @@ class Device { static bool need_types_update, need_devices_update; static thread_mutex device_mutex; static vector cuda_devices; + static vector optix_devices; static vector opencl_devices; static vector cpu_devices; static vector network_devices; diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 4d7d87828c2..00dd37f089c 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -2263,11 +2263,6 @@ class CUDADevice : public Device { } }; - int get_split_task_count(DeviceTask & /*task*/) - { - return 1; - } - void task_add(DeviceTask &task) { CUDAContextScope scope(this); diff --git a/intern/cycles/device/device_intern.h b/intern/cycles/device/device_intern.h index c393a3f9cda..5b8b86886c4 100644 --- a/intern/cycles/device/device_intern.h +++ b/intern/cycles/device/device_intern.h @@ -27,6 +27,9 @@ Device *device_opencl_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool device_opencl_compile_kernel(const vector ¶meters); bool device_cuda_init(); Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); +bool device_optix_init(); +Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); + Device *device_network_create(DeviceInfo &info, Stats &stats, Profiler &profiler, @@ -36,6 +39,7 @@ Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, void device_cpu_info(vector &devices); void device_opencl_info(vector &devices); void device_cuda_info(vector &devices); +void device_optix_info(vector &devices); void device_network_info(vector &devices); string device_cpu_capabilities(); diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp index 4a40e106115..ac71be9dbea 100644 --- a/intern/cycles/device/device_multi.cpp +++ b/intern/cycles/device/device_multi.cpp @@ -153,6 +153,24 @@ class MultiDevice : public Device { return result; } + bool build_optix_bvh(BVH *bvh, device_memory &mem) + { + device_ptr key = unique_key++; + + // Broadcast acceleration structure build to all devices + foreach (SubDevice &sub, devices) { + mem.device = sub.device; + if (!sub.device->build_optix_bvh(bvh, mem)) + return false; + sub.ptr_map[key] = 
mem.device_pointer; + } + + mem.device = this; + mem.device_pointer = key; + stats.mem_alloc(mem.device_size); + return true; + } + void mem_alloc(device_memory &mem) { device_ptr key = unique_key++; diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp new file mode 100644 index 00000000000..84d7ecf6934 --- /dev/null +++ b/intern/cycles/device/device_optix.cpp @@ -0,0 +1,1969 @@ +/* + * Copyright 2019, NVIDIA Corporation. + * Copyright 2019, Blender Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_OPTIX + +# include "device/device.h" +# include "device/device_intern.h" +# include "device/device_denoising.h" +# include "bvh/bvh.h" +# include "render/scene.h" +# include "render/mesh.h" +# include "render/object.h" +# include "render/buffers.h" +# include "util/util_md5.h" +# include "util/util_path.h" +# include "util/util_time.h" +# include "util/util_debug.h" +# include "util/util_logging.h" + +# undef _WIN32_WINNT // Need minimum API support for Windows 7 +# define _WIN32_WINNT _WIN32_WINNT_WIN7 + +# ifdef WITH_CUDA_DYNLOAD +# include +// Do not use CUDA SDK headers when using CUEW +# define OPTIX_DONT_INCLUDE_CUDA +# endif +# include +# include + +CCL_NAMESPACE_BEGIN + +/* Make sure this stays in sync with kernel_globals.h */ +struct ShaderParams { + uint4 *input; + float4 *output; + int type; + int filter; + int sx; + int offset; + int sample; +}; +struct KernelParams { + WorkTile tile; + KernelData data; + ShaderParams shader; +# define KERNEL_TEX(type, name) const type *name; +# include "kernel/kernel_textures.h" +# undef KERNEL_TEX +}; + +# define check_result_cuda(stmt) \ + { \ + CUresult res = stmt; \ + if (res != CUDA_SUCCESS) { \ + const char *name; \ + cuGetErrorName(res, &name); \ + set_error(string_printf("OptiX CUDA error %s in %s, line %d", name, #stmt, __LINE__)); \ + return; \ + } \ + } \ + (void)0 +# define check_result_cuda_ret(stmt) \ + { \ + CUresult res = stmt; \ + if (res != CUDA_SUCCESS) { \ + const char *name; \ + cuGetErrorName(res, &name); \ + set_error(string_printf("OptiX CUDA error %s in %s, line %d", name, #stmt, __LINE__)); \ + return false; \ + } \ + } \ + (void)0 + +# define check_result_optix(stmt) \ + { \ + enum OptixResult res = stmt; \ + if (res != OPTIX_SUCCESS) { \ + const char *name = optixGetErrorName(res); \ + set_error(string_printf("OptiX error %s in %s, line %d", name, #stmt, __LINE__)); \ + return; \ + } \ + } \ + (void)0 +# define check_result_optix_ret(stmt) \ + { \ + enum OptixResult res = stmt; \ + if (res != OPTIX_SUCCESS) { \ + const char *name = optixGetErrorName(res); \ + set_error(string_printf("OptiX error %s in %s, line %d", name, #stmt, __LINE__)); \ + return false; \ + } \ + } \ + (void)0 + +class OptiXDevice : public Device { + + // List of OptiX program groups + enum { + PG_RGEN, + PG_MISS, + PG_HITD, // Default hit group + PG_HITL, // __BVH_LOCAL__ hit group + PG_HITS, // __SHADOW_RECORD_ALL__ hit group +# ifdef WITH_CYCLES_DEBUG + PG_EXCP, +# endif + 
PG_BAKE, // kernel_bake_evaluate + PG_DISP, // kernel_displace_evaluate + PG_BACK, // kernel_background_evaluate + NUM_PROGRAM_GROUPS + }; + + // List of OptiX pipelines + enum { PIP_PATH_TRACE, PIP_SHADER_EVAL, NUM_PIPELINES }; + + // A single shader binding table entry + struct SbtRecord { + char header[OPTIX_SBT_RECORD_HEADER_SIZE]; + }; + + // Information stored about CUDA memory allocations + struct CUDAMem { + bool free_map_host = false; + CUarray array = NULL; + CUtexObject texobject = 0; + void *map_host_pointer = nullptr; + }; + + // Helper class to manage current CUDA context + struct CUDAContextScope { + CUDAContextScope(CUcontext ctx) + { + cuCtxPushCurrent(ctx); + } + ~CUDAContextScope() + { + cuCtxPopCurrent(NULL); + } + }; + + // Use a pool with multiple threads to support launches with multiple CUDA streams + TaskPool task_pool; + + // CUDA/OptiX context handles + CUdevice cuda_device = 0; + CUcontext cuda_context = NULL; + vector cuda_stream; + OptixDeviceContext context = NULL; + + // Need CUDA kernel module for some utility functions + CUmodule cuda_module = NULL; + CUmodule cuda_filter_module = NULL; + // All necessary OptiX kernels are in one module + OptixModule optix_module = NULL; + OptixPipeline pipelines[NUM_PIPELINES] = {}; + + bool need_texture_info = false; + device_vector sbt_data; + device_vector texture_info; + device_only_memory launch_params; + vector> blas; + OptixTraversableHandle tlas_handle = 0; + + map cuda_mem_map; + + public: + OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_) + : Device(info_, stats_, profiler_, background_), + sbt_data(this, "__sbt", MEM_READ_ONLY), + texture_info(this, "__texture_info", MEM_TEXTURE), + launch_params(this, "__params") + { + // Store number of CUDA streams in device info + info.cpu_threads = DebugFlags().optix.cuda_streams; + + // Initialize CUDA driver API + check_result_cuda(cuInit(0)); + + // Retrieve the primary CUDA context for this device + check_result_cuda(cuDeviceGet(&cuda_device, info.num)); + check_result_cuda(cuDevicePrimaryCtxRetain(&cuda_context, cuda_device)); + + // Make that CUDA context current + const CUDAContextScope scope(cuda_context); + + // Create OptiX context for this device + OptixDeviceContextOptions options = {}; +# ifdef WITH_CYCLES_LOGGING + options.logCallbackLevel = 4; // Fatal = 1, Error = 2, Warning = 3, Print = 4 + options.logCallbackFunction = + [](unsigned int level, const char *, const char *message, void *) { + switch (level) { + case 1: + LOG_IF(FATAL, VLOG_IS_ON(1)) << message; + break; + case 2: + LOG_IF(ERROR, VLOG_IS_ON(1)) << message; + break; + case 3: + LOG_IF(WARNING, VLOG_IS_ON(1)) << message; + break; + case 4: + LOG_IF(INFO, VLOG_IS_ON(1)) << message; + break; + } + }; +# endif + check_result_optix(optixDeviceContextCreate(cuda_context, &options, &context)); +# ifdef WITH_CYCLES_LOGGING + check_result_optix(optixDeviceContextSetLogCallback( + context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel)); +# endif + + // Create launch streams + cuda_stream.resize(info.cpu_threads); + for (int i = 0; i < info.cpu_threads; ++i) + check_result_cuda(cuStreamCreate(&cuda_stream[i], CU_STREAM_NON_BLOCKING)); + + // Fix weird compiler bug that assigns wrong size + launch_params.data_elements = sizeof(KernelParams); + // Allocate launch parameter buffer memory on device + launch_params.alloc_to_device(info.cpu_threads); + } + ~OptiXDevice() + { + // Stop processing any more tasks + task_pool.stop(); + + // 
Clean up all memory before destroying context + blas.clear(); + + sbt_data.free(); + texture_info.free(); + launch_params.free(); + + // Make CUDA context current + const CUDAContextScope scope(cuda_context); + + // Unload modules + if (cuda_module != NULL) + cuModuleUnload(cuda_module); + if (cuda_filter_module != NULL) + cuModuleUnload(cuda_filter_module); + if (optix_module != NULL) + optixModuleDestroy(optix_module); + for (unsigned int i = 0; i < NUM_PIPELINES; ++i) + if (pipelines[i] != NULL) + optixPipelineDestroy(pipelines[i]); + + // Destroy launch streams + for (int i = 0; i < info.cpu_threads; ++i) + cuStreamDestroy(cuda_stream[i]); + + // Destroy OptiX and CUDA context + optixDeviceContextDestroy(context); + cuDevicePrimaryCtxRelease(cuda_device); + } + + private: + bool show_samples() const override + { + // Only show samples if not rendering multiple tiles in parallel + return info.cpu_threads == 1; + } + + BVHLayoutMask get_bvh_layout_mask() const override + { + // OptiX has its own internal acceleration structure format + return BVH_LAYOUT_OPTIX; + } + + bool load_kernels(const DeviceRequestedFeatures &requested_features) override + { + if (have_error()) + return false; // Abort early if context creation failed already + + // Disable baking for now, since its kernel is not well-suited for inlining and is very slow + if (requested_features.use_baking) { + set_error("OptiX implementation does not support baking yet"); + return false; + } + // Disable shader raytracing support for now, since continuation callables are slow + if (requested_features.use_shader_raytrace) { + set_error("OptiX implementation does not support shader raytracing yet"); + return false; + } + + const CUDAContextScope scope(cuda_context); + + // Unload any existing modules first + if (cuda_module != NULL) + cuModuleUnload(cuda_module); + if (cuda_filter_module != NULL) + cuModuleUnload(cuda_filter_module); + if (optix_module != NULL) + optixModuleDestroy(optix_module); + for (unsigned int i = 0; i < NUM_PIPELINES; ++i) + if (pipelines[i] != NULL) + optixPipelineDestroy(pipelines[i]); + + OptixModuleCompileOptions module_options; + module_options.maxRegisterCount = 0; // Do not set an explicit register limit +# ifdef WITH_CYCLES_DEBUG + module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_0; + module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL; +# else + module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3; + module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO; +# endif + OptixPipelineCompileOptions pipeline_options; + // Default to no motion blur and two-level graph, since it is the fastest option + pipeline_options.usesMotionBlur = false; + pipeline_options.traversableGraphFlags = + OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING; + pipeline_options.numPayloadValues = 6; + pipeline_options.numAttributeValues = 2; // u, v +# ifdef WITH_CYCLES_DEBUG + pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_STACK_OVERFLOW | + OPTIX_EXCEPTION_FLAG_TRACE_DEPTH; +# else + pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE; +# endif + pipeline_options.pipelineLaunchParamsVariableName = "__params"; // See kernel_globals.h + + if (requested_features.use_object_motion) { + pipeline_options.usesMotionBlur = true; + // Motion blur can insert motion transforms into the traversal graph + // It is no longer a two-level graph then, so need to set flags to allow any configuration + pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY; + } + + 
{ // Load and compile PTX module with OptiX kernels + string ptx_data; + const string ptx_filename = "lib/kernel_optix.ptx"; + if (!path_read_text(path_get(ptx_filename), ptx_data)) { + set_error("Failed loading OptiX kernel " + ptx_filename + "."); + return false; + } + + check_result_optix_ret(optixModuleCreateFromPTX(context, + &module_options, + &pipeline_options, + ptx_data.data(), + ptx_data.size(), + nullptr, + 0, + &optix_module)); + } + + { // Load CUDA modules because we need some of the utility kernels + int major, minor; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, info.num); + + string cubin_data; + const string cubin_filename = string_printf("lib/kernel_sm_%d%d.cubin", major, minor); + if (!path_read_text(path_get(cubin_filename), cubin_data)) { + set_error("Failed loading pre-compiled CUDA kernel " + cubin_filename + "."); + return false; + } + + check_result_cuda_ret(cuModuleLoadData(&cuda_module, cubin_data.data())); + + if (requested_features.use_denoising) { + string filter_data; + const string filter_filename = string_printf("lib/filter_sm_%d%d.cubin", major, minor); + if (!path_read_text(path_get(filter_filename), filter_data)) { + set_error("Failed loading pre-compiled CUDA filter kernel " + filter_filename + "."); + return false; + } + + check_result_cuda_ret(cuModuleLoadData(&cuda_filter_module, filter_data.data())); + } + } + + // Create program groups + OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {}; + OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {}; + OptixProgramGroupOptions group_options = {}; // There are no options currently + group_descs[PG_RGEN].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_RGEN].raygen.module = optix_module; + // Ignore branched integrator for now (see "requested_features.use_integrator_branched") + group_descs[PG_RGEN].raygen.entryFunctionName = "__raygen__kernel_optix_path_trace"; + group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS; + group_descs[PG_MISS].miss.module = optix_module; + group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss"; + group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; + group_descs[PG_HITD].hitgroup.moduleCH = optix_module; + group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit"; + group_descs[PG_HITD].hitgroup.moduleAH = optix_module; + group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test"; + group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; + group_descs[PG_HITS].hitgroup.moduleAH = optix_module; + group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit"; + + if (requested_features.use_hair) { + // Add curve intersection programs + group_descs[PG_HITD].hitgroup.moduleIS = optix_module; + group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve"; + group_descs[PG_HITS].hitgroup.moduleIS = optix_module; + group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve"; + } + + if (requested_features.use_subsurface || requested_features.use_shader_raytrace) { + // Add hit group for local intersections + group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; + group_descs[PG_HITL].hitgroup.moduleAH = optix_module; + group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit"; + } + +# ifdef WITH_CYCLES_DEBUG + group_descs[PG_EXCP].kind = 
OPTIX_PROGRAM_GROUP_KIND_EXCEPTION; + group_descs[PG_EXCP].exception.module = optix_module; + group_descs[PG_EXCP].exception.entryFunctionName = "__exception__kernel_optix_exception"; +# endif + + if (requested_features.use_baking) { + group_descs[PG_BAKE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_BAKE].raygen.module = optix_module; + group_descs[PG_BAKE].raygen.entryFunctionName = "__raygen__kernel_optix_bake"; + } + + if (requested_features.use_true_displacement) { + group_descs[PG_DISP].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_DISP].raygen.module = optix_module; + group_descs[PG_DISP].raygen.entryFunctionName = "__raygen__kernel_optix_displace"; + } + + if (requested_features.use_background_light) { + group_descs[PG_BACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; + group_descs[PG_BACK].raygen.module = optix_module; + group_descs[PG_BACK].raygen.entryFunctionName = "__raygen__kernel_optix_background"; + } + + check_result_optix_ret(optixProgramGroupCreate( + context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups)); + + // Get program stack sizes + OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {}; + // Set up SBT, which in this case is used only to select between different programs + sbt_data.alloc(NUM_PROGRAM_GROUPS); + memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS); + for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { + check_result_optix_ret(optixSbtRecordPackHeader(groups[i], &sbt_data[i])); + check_result_optix_ret(optixProgramGroupGetStackSize(groups[i], &stack_size[i])); + } + sbt_data.copy_to_device(); // Upload SBT to device + + // Calculate maximum trace continuation stack size + unsigned int trace_css = stack_size[PG_HITD].cssCH; + // This is based on the maximum of closest-hit and any-hit/intersection programs + trace_css = max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH); + trace_css = max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH); + trace_css = max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH); + + OptixPipelineLinkOptions link_options; + link_options.maxTraceDepth = 1; +# ifdef WITH_CYCLES_DEBUG + link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL; +# else + link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO; +# endif + link_options.overrideUsesMotionBlur = pipeline_options.usesMotionBlur; + + { // Create path tracing pipeline + OptixProgramGroup pipeline_groups[] = { + groups[PG_RGEN], + groups[PG_MISS], + groups[PG_HITD], + groups[PG_HITS], + groups[PG_HITL], +# ifdef WITH_CYCLES_DEBUG + groups[PG_EXCP], +# endif + }; + check_result_optix_ret( + optixPipelineCreate(context, + &pipeline_options, + &link_options, + pipeline_groups, + (sizeof(pipeline_groups) / sizeof(pipeline_groups[0])), + nullptr, + 0, + &pipelines[PIP_PATH_TRACE])); + + // Combine ray generation and trace continuation stack size + const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css; + + // Set stack size depending on pipeline options + check_result_optix_ret(optixPipelineSetStackSize( + pipelines[PIP_PATH_TRACE], 0, 0, css, (pipeline_options.usesMotionBlur ? 
3 : 2))); + } + + // Only need to create shader evaluation pipeline if one of these features is used: + const bool use_shader_eval_pipeline = requested_features.use_baking || + requested_features.use_background_light || + requested_features.use_true_displacement; + + if (use_shader_eval_pipeline) { // Create shader evaluation pipeline + OptixProgramGroup pipeline_groups[] = { + groups[PG_BAKE], + groups[PG_DISP], + groups[PG_BACK], + groups[PG_MISS], + groups[PG_HITD], + groups[PG_HITS], + groups[PG_HITL], +# ifdef WITH_CYCLES_DEBUG + groups[PG_EXCP], +# endif + }; + check_result_optix_ret( + optixPipelineCreate(context, + &pipeline_options, + &link_options, + pipeline_groups, + (sizeof(pipeline_groups) / sizeof(pipeline_groups[0])), + nullptr, + 0, + &pipelines[PIP_SHADER_EVAL])); + + // Calculate continuation stack size based on the maximum of all ray generation stack sizes + const unsigned int css = max(stack_size[PG_BAKE].cssRG, + max(stack_size[PG_DISP].cssRG, stack_size[PG_BACK].cssRG)) + + link_options.maxTraceDepth * trace_css; + + check_result_optix_ret(optixPipelineSetStackSize( + pipelines[PIP_SHADER_EVAL], 0, 0, css, (pipeline_options.usesMotionBlur ? 3 : 2))); + } + + // Clean up program group objects + for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) { + optixProgramGroupDestroy(groups[i]); + } + + return true; + } + + void thread_run(DeviceTask &task, int thread_index) // Main task entry point + { + if (have_error()) + return; // Abort early if there was an error previously + + if (task.type == DeviceTask::RENDER) { + RenderTile tile; + while (task.acquire_tile(this, tile)) { + if (tile.task == RenderTile::PATH_TRACE) + launch_render(task, tile, thread_index); + else if (tile.task == RenderTile::DENOISE) + launch_denoise(task, tile, thread_index); + task.release_tile(tile); + if (task.get_cancel() && !task.need_finish_queue) + break; // User requested cancellation + else if (have_error()) + break; // Abort rendering when encountering an error + } + } + else if (task.type == DeviceTask::SHADER) { + launch_shader_eval(task, thread_index); + } + else if (task.type == DeviceTask::FILM_CONVERT) { + launch_film_convert(task, thread_index); + } + } + + void launch_render(DeviceTask &task, RenderTile &rtile, int thread_index) + { + assert(thread_index < launch_params.data_size); + + // Keep track of total render time of this tile + const scoped_timer timer(&rtile.buffers->render_time); + + WorkTile wtile; + wtile.x = rtile.x; + wtile.y = rtile.y; + wtile.w = rtile.w; + wtile.h = rtile.h; + wtile.offset = rtile.offset; + wtile.stride = rtile.stride; + wtile.buffer = (float *)rtile.buffer; + + const int end_sample = rtile.start_sample + rtile.num_samples; + // Keep this number reasonable to avoid running into TDRs + const int step_samples = (info.display_device ? 
8 : 32); + // Offset into launch params buffer so that streams use separate data + device_ptr launch_params_ptr = launch_params.device_pointer + + thread_index * launch_params.data_elements; + + const CUDAContextScope scope(cuda_context); + + for (int sample = rtile.start_sample; sample < end_sample; sample += step_samples) { + // Copy work tile information to device + wtile.num_samples = min(step_samples, end_sample - sample); + wtile.start_sample = sample; + check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, tile), + &wtile, + sizeof(wtile), + cuda_stream[thread_index])); + + OptixShaderBindingTable sbt_params = {}; + sbt_params.raygenRecord = sbt_data.device_pointer + PG_RGEN * sizeof(SbtRecord); +# ifdef WITH_CYCLES_DEBUG + sbt_params.exceptionRecord = sbt_data.device_pointer + PG_EXCP * sizeof(SbtRecord); +# endif + sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord); + sbt_params.missRecordStrideInBytes = sizeof(SbtRecord); + sbt_params.missRecordCount = 1; + sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord); + sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord); + sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITL, PG_HITS + + // Launch the ray generation program + check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE], + cuda_stream[thread_index], + launch_params_ptr, + launch_params.data_elements, + &sbt_params, + // Launch with samples close to each other for better locality + wtile.w * wtile.num_samples, + wtile.h, + 1)); + + // Wait for launch to finish + check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); + + // Update current sample, so it is displayed correctly + rtile.sample = wtile.start_sample + wtile.num_samples; + // Update task progress after the kernel completed rendering + task.update_progress(&rtile, wtile.w * wtile.h * wtile.num_samples); + + if (task.get_cancel() && !task.need_finish_queue) + return; // Cancel rendering + } + } + + void launch_denoise(DeviceTask &task, RenderTile &rtile, int thread_index) + { + const CUDAContextScope scope(cuda_context); + + // Run CUDA denoising kernels + DenoisingTask denoising(this, task); + denoising.functions.construct_transform = function_bind( + &OptiXDevice::denoising_construct_transform, this, &denoising, thread_index); + denoising.functions.accumulate = function_bind( + &OptiXDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising, thread_index); + denoising.functions.solve = function_bind( + &OptiXDevice::denoising_solve, this, _1, &denoising, thread_index); + denoising.functions.divide_shadow = function_bind( + &OptiXDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising, thread_index); + denoising.functions.non_local_means = function_bind( + &OptiXDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising, thread_index); + denoising.functions.combine_halves = function_bind(&OptiXDevice::denoising_combine_halves, + this, + _1, + _2, + _3, + _4, + _5, + _6, + &denoising, + thread_index); + denoising.functions.get_feature = function_bind( + &OptiXDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising, thread_index); + denoising.functions.write_feature = function_bind( + &OptiXDevice::denoising_write_feature, this, _1, _2, _3, &denoising, thread_index); + denoising.functions.detect_outliers = function_bind( + &OptiXDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising, thread_index); + + denoising.filter_area = make_int4(rtile.x, rtile.y, 
rtile.w, rtile.h); + denoising.render_buffer.samples = rtile.sample = rtile.start_sample + rtile.num_samples; + denoising.buffer.gpu_temporary_mem = true; + + denoising.run_denoising(&rtile); + + task.update_progress(&rtile, rtile.w * rtile.h); + } + + void launch_shader_eval(DeviceTask &task, int thread_index) + { + unsigned int rgen_index = PG_BACK; + if (task.shader_eval_type >= SHADER_EVAL_BAKE) + rgen_index = PG_BAKE; + if (task.shader_eval_type == SHADER_EVAL_DISPLACE) + rgen_index = PG_DISP; + + const CUDAContextScope scope(cuda_context); + + device_ptr launch_params_ptr = launch_params.device_pointer + + thread_index * launch_params.data_elements; + + for (int sample = 0; sample < task.num_samples; ++sample) { + ShaderParams params; + params.input = (uint4 *)task.shader_input; + params.output = (float4 *)task.shader_output; + params.type = task.shader_eval_type; + params.filter = task.shader_filter; + params.sx = task.shader_x; + params.offset = task.offset; + params.sample = sample; + + check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, shader), + ¶ms, + sizeof(params), + cuda_stream[thread_index])); + + OptixShaderBindingTable sbt_params = {}; + sbt_params.raygenRecord = sbt_data.device_pointer + rgen_index * sizeof(SbtRecord); +# ifdef WITH_CYCLES_DEBUG + sbt_params.exceptionRecord = sbt_data.device_pointer + PG_EXCP * sizeof(SbtRecord); +# endif + sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord); + sbt_params.missRecordStrideInBytes = sizeof(SbtRecord); + sbt_params.missRecordCount = 1; + sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord); + sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord); + sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITL, PG_HITS + + check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL], + cuda_stream[thread_index], + launch_params_ptr, + launch_params.data_elements, + &sbt_params, + task.shader_w, + 1, + 1)); + + check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); + + task.update_progress(NULL); + } + } + + void launch_film_convert(DeviceTask &task, int thread_index) + { + const CUDAContextScope scope(cuda_context); + + CUfunction film_convert_func; + check_result_cuda(cuModuleGetFunction(&film_convert_func, + cuda_module, + task.rgba_byte ? "kernel_cuda_convert_to_byte" : + "kernel_cuda_convert_to_half_float")); + + float sample_scale = 1.0f / (task.sample + 1); + CUdeviceptr rgba = (task.rgba_byte ? 
task.rgba_byte : task.rgba_half); + + void *args[] = {&rgba, + &task.buffer, + &sample_scale, + &task.x, + &task.y, + &task.w, + &task.h, + &task.offset, + &task.stride}; + + int threads_per_block; + check_result_cuda(cuFuncGetAttribute( + &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, film_convert_func)); + + const int num_threads_x = (int)sqrt(threads_per_block); + const int num_blocks_x = (task.w + num_threads_x - 1) / num_threads_x; + const int num_threads_y = (int)sqrt(threads_per_block); + const int num_blocks_y = (task.h + num_threads_y - 1) / num_threads_y; + + check_result_cuda(cuLaunchKernel(film_convert_func, + num_blocks_x, + num_blocks_y, + 1, /* blocks */ + num_threads_x, + num_threads_y, + 1, /* threads */ + 0, + cuda_stream[thread_index], + args, + 0)); + + check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); + + task.update_progress(NULL); + } + + bool build_optix_bvh(const OptixBuildInput &build_input, + uint16_t num_motion_steps, + device_memory &out_data, + OptixTraversableHandle &out_handle) + { + out_handle = 0; + + const CUDAContextScope scope(cuda_context); + + // Compute memory usage + OptixAccelBufferSizes sizes = {}; + OptixAccelBuildOptions options; + options.operation = OPTIX_BUILD_OPERATION_BUILD; + options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE; + options.motionOptions.numKeys = num_motion_steps; + options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH; + options.motionOptions.timeBegin = 0.0f; + options.motionOptions.timeEnd = 1.0f; + + check_result_optix_ret( + optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes)); + + // Allocate required output buffers + device_only_memory temp_mem(this, "temp_build_mem"); + temp_mem.alloc_to_device(sizes.tempSizeInBytes); + + out_data.data_type = TYPE_UNKNOWN; + out_data.data_elements = 1; + out_data.data_size = sizes.outputSizeInBytes; + mem_alloc(out_data); + + // Finally build the acceleration structure + check_result_optix_ret(optixAccelBuild(context, + NULL, + &options, + &build_input, + 1, + temp_mem.device_pointer, + sizes.tempSizeInBytes, + out_data.device_pointer, + sizes.outputSizeInBytes, + &out_handle, + NULL, + 0)); + + // Wait for all operations to finish + check_result_cuda_ret(cuStreamSynchronize(NULL)); + + return true; + } + + bool build_optix_bvh(BVH *bvh, device_memory &out_data) override + { + assert(bvh->params.top_level); + + unsigned int num_instances = 0; + unordered_map> meshes; + + // Clear all previous AS + blas.clear(); + + // Build bottom level acceleration structures (BLAS) + // Note: Always keep this logic in sync with bvh_optix.cpp! 
+ for (Object *ob : bvh->objects) { + // Skip meshes for which acceleration structure already exists + if (meshes.find(ob->mesh) != meshes.end()) + continue; + + Mesh *const mesh = ob->mesh; + vector handles; + + // Build BLAS for curve primitives + if (bvh->params.primitive_mask & PRIMITIVE_ALL_CURVE && mesh->num_curves() > 0) { + const size_t num_curves = mesh->num_curves(); + const size_t num_segments = mesh->num_segments(); + + size_t num_motion_steps = 1; + Attribute *motion_keys = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + if (mesh->use_motion_blur && motion_keys) { + num_motion_steps = mesh->motion_steps; + } + + device_vector aabb_data(this, "temp_aabb_data", MEM_READ_ONLY); + aabb_data.alloc(num_segments * num_motion_steps); + + // Get AABBs for each motion step + for (size_t step = 0; step < num_motion_steps; ++step) { + const float3 *keys = mesh->curve_keys.data(); + + size_t center_step = (num_motion_steps - 1) / 2; + // The center step for motion vertices is not stored in the attribute + if (step != center_step) { + keys = motion_keys->data_float3() + + (step > center_step ? step - 1 : step) * num_segments; + } + + for (size_t i = step * num_segments, j = 0; j < num_curves; ++j) { + const Mesh::Curve c = mesh->get_curve(j); + for (size_t k = 0; k < c.num_segments(); ++i, ++k) { + BoundBox bounds = BoundBox::empty; + c.bounds_grow(k, keys, mesh->curve_radius.data(), bounds); + aabb_data[i].minX = bounds.min.x; + aabb_data[i].minY = bounds.min.y; + aabb_data[i].minZ = bounds.min.z; + aabb_data[i].maxX = bounds.max.x; + aabb_data[i].maxY = bounds.max.y; + aabb_data[i].maxZ = bounds.max.z; + } + } + } + + // Upload AABB data to GPU + aabb_data.copy_to_device(); + + vector aabb_ptrs; + aabb_ptrs.reserve(num_motion_steps); + for (size_t step = 0; step < num_motion_steps; ++step) { + aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb)); + } + + // Disable visibility test anyhit program, since it is already checked during intersection + // Those trace calls that require anyhit can force it with OPTIX_RAY_FLAG_ENFORCE_ANYHIT + unsigned int build_flags = OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT; + + OptixBuildInput build_input = {}; + build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES; + build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data(); + build_input.aabbArray.numPrimitives = num_segments; + build_input.aabbArray.strideInBytes = sizeof(OptixAabb); + build_input.aabbArray.flags = &build_flags; + build_input.aabbArray.numSbtRecords = 1; + build_input.aabbArray.primitiveIndexOffset = mesh->prim_offset; + + // Allocate memory for new BLAS and build it + blas.emplace_back(this, "blas"); + handles.emplace_back(); + if (!build_optix_bvh(build_input, num_motion_steps, blas.back(), handles.back())) + return false; + } + + // Build BLAS for triangle primitives + if (bvh->params.primitive_mask & PRIMITIVE_ALL_TRIANGLE && mesh->num_triangles() > 0) { + const size_t num_verts = mesh->verts.size(); + + size_t num_motion_steps = 1; + Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION); + if (mesh->use_motion_blur && motion_keys) { + num_motion_steps = mesh->motion_steps; + } + + device_vector index_data(this, "temp_index_data", MEM_READ_ONLY); + index_data.alloc(mesh->triangles.size()); + memcpy(index_data.data(), mesh->triangles.data(), mesh->triangles.size() * sizeof(int)); + device_vector vertex_data(this, "temp_vertex_data", MEM_READ_ONLY); + vertex_data.alloc(num_verts * num_motion_steps); + 
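+        // The motion attribute stores only the non-center steps (the center step is the regular
+        // vertex array), so indices past the center step are shifted down by one when reading
+        // from the attribute below.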
+ for (size_t step = 0; step < num_motion_steps; ++step) { + const float3 *verts = mesh->verts.data(); + + size_t center_step = (num_motion_steps - 1) / 2; + // The center step for motion vertices is not stored in the attribute + if (step != center_step) { + verts = motion_keys->data_float3() + + (step > center_step ? step - 1 : step) * num_verts; + } + + memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3)); + } + + // Upload triangle data to GPU + index_data.copy_to_device(); + vertex_data.copy_to_device(); + + vector vertex_ptrs; + vertex_ptrs.reserve(num_motion_steps); + for (size_t step = 0; step < num_motion_steps; ++step) { + vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3)); + } + + // No special build flags for triangle primitives + unsigned int build_flags = OPTIX_GEOMETRY_FLAG_NONE; + + OptixBuildInput build_input = {}; + build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES; + build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data(); + build_input.triangleArray.numVertices = num_verts; + build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3; + build_input.triangleArray.vertexStrideInBytes = sizeof(float3); + build_input.triangleArray.indexBuffer = index_data.device_pointer; + build_input.triangleArray.numIndexTriplets = mesh->num_triangles(); + build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3; + build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int); + build_input.triangleArray.flags = &build_flags; + // The SBT does not store per primitive data since Cycles already allocates separate + // buffers for that purpose. OptiX does not allow this to be zero though, so just pass in + // one and rely on that having the same meaning in this case. + build_input.triangleArray.numSbtRecords = 1; + // Triangle primitives are packed right after the curve primitives of this mesh + build_input.triangleArray.primitiveIndexOffset = mesh->prim_offset + mesh->num_segments(); + + // Allocate memory for new BLAS and build it + blas.emplace_back(this, "blas"); + handles.emplace_back(); + if (!build_optix_bvh(build_input, num_motion_steps, blas.back(), handles.back())) + return false; + } + + meshes.insert({mesh, handles}); + } + + // Fill instance descriptions + device_vector aabbs(this, "tlas_aabbs", MEM_READ_ONLY); + aabbs.alloc(bvh->objects.size() * 2); + device_vector instances(this, "tlas_instances", MEM_READ_ONLY); + instances.alloc(bvh->objects.size() * 2); + + for (Object *ob : bvh->objects) { + // Skip non-traceable objects + if (!ob->is_traceable()) + continue; + // Create separate instance for triangle/curve meshes of an object + for (OptixTraversableHandle handle : meshes[ob->mesh]) { + OptixAabb &aabb = aabbs[num_instances]; + aabb.minX = ob->bounds.min.x; + aabb.minY = ob->bounds.min.y; + aabb.minZ = ob->bounds.min.z; + aabb.maxX = ob->bounds.max.x; + aabb.maxY = ob->bounds.max.y; + aabb.maxZ = ob->bounds.max.z; + + OptixInstance &instance = instances[num_instances++]; + memset(&instance, 0, sizeof(instance)); + + // Clear transform to identity matrix + instance.transform[0] = 1.0f; + instance.transform[5] = 1.0f; + instance.transform[10] = 1.0f; + + // Set user instance ID to object index + instance.instanceId = ob->get_device_index(); + + // Volumes have a special bit set in the visibility mask so a trace can mask only volumes + // See 'scene_intersect_volume' in bvh.h + instance.visibilityMask = (ob->mesh->has_volume ? 
3 : 1); + + // Insert motion traversable if object has motion + if (ob->use_motion()) { + blas.emplace_back(this, "motion_transform"); + device_only_memory &motion_transform_gpu = blas.back(); + motion_transform_gpu.alloc_to_device(sizeof(OptixSRTMotionTransform) + + (max(ob->motion.size(), 2) - 2) * + sizeof(OptixSRTData)); + + // Allocate host side memory for motion transform and fill it with transform data + OptixSRTMotionTransform &motion_transform = *reinterpret_cast( + motion_transform_gpu.host_pointer = new uint8_t[motion_transform_gpu.memory_size()]); + motion_transform.child = handle; + motion_transform.motionOptions.numKeys = ob->motion.size(); + motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE; + motion_transform.motionOptions.timeBegin = 0.0f; + motion_transform.motionOptions.timeEnd = 1.0f; + + OptixSRTData *const srt_data = motion_transform.srtData; + array decomp(ob->motion.size()); + transform_motion_decompose(decomp.data(), ob->motion.data(), ob->motion.size()); + + for (size_t i = 0; i < ob->motion.size(); ++i) { + // scaling + srt_data[i].a = decomp[i].z.x; // scale.x.y + srt_data[i].b = decomp[i].z.y; // scale.x.z + srt_data[i].c = decomp[i].w.x; // scale.y.z + srt_data[i].sx = decomp[i].y.w; // scale.x.x + srt_data[i].sy = decomp[i].z.w; // scale.y.y + srt_data[i].sz = decomp[i].w.w; // scale.z.z + srt_data[i].pvx = 0; + srt_data[i].pvy = 0; + srt_data[i].pvz = 0; + // rotation + srt_data[i].qx = decomp[i].x.x; + srt_data[i].qy = decomp[i].x.y; + srt_data[i].qz = decomp[i].x.z; + srt_data[i].qw = decomp[i].x.w; + // transform + srt_data[i].tx = decomp[i].y.x; + srt_data[i].ty = decomp[i].y.y; + srt_data[i].tz = decomp[i].y.z; + } + + // Upload motion transform to GPU + mem_copy_to(motion_transform_gpu); + delete[] reinterpret_cast(motion_transform_gpu.host_pointer); + motion_transform_gpu.host_pointer = 0; + + // Disable instance transform if object uses motion transform already + instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; + + // Get traversable handle to motion transform + optixConvertPointerToTraversableHandle(context, + motion_transform_gpu.device_pointer, + OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM, + &instance.traversableHandle); + } + else { + instance.traversableHandle = handle; + + if (ob->mesh->is_instanced()) { + // Set transform matrix + memcpy(instance.transform, &ob->tfm, sizeof(instance.transform)); + } + else { + // Disable instance transform if mesh already has it applied to vertex data + instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; + // Non-instanced objects read ID from prim_object, so + // distinguish them from instanced objects with high bit set + instance.instanceId |= 0x800000; + } + } + } + } + + // Upload instance descriptions + aabbs.resize(num_instances); + aabbs.copy_to_device(); + instances.resize(num_instances); + instances.copy_to_device(); + + // Build top-level acceleration structure + OptixBuildInput build_input = {}; + build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES; + build_input.instanceArray.instances = instances.device_pointer; + build_input.instanceArray.numInstances = num_instances; + build_input.instanceArray.aabbs = aabbs.device_pointer; + build_input.instanceArray.numAabbs = num_instances; + + return build_optix_bvh(build_input, 0 /* TLAS has no motion itself */, out_data, tlas_handle); + } + + void update_texture_info() + { + if (need_texture_info) { + texture_info.copy_to_device(); + need_texture_info = false; + } + } + + void update_launch_params(const char *name, size_t offset, 
void *data, size_t data_size) + { + const CUDAContextScope scope(cuda_context); + + for (int i = 0; i < info.cpu_threads; ++i) + check_result_cuda( + cuMemcpyHtoD(launch_params.device_pointer + i * launch_params.data_elements + offset, + data, + data_size)); + + // Set constant memory for CUDA module + // TODO(pmours): This is only used for tonemapping (see 'launch_film_convert'). + // Could be removed by moving those functions to filter CUDA module. + size_t bytes = 0; + CUdeviceptr mem = 0; + check_result_cuda(cuModuleGetGlobal(&mem, &bytes, cuda_module, name)); + assert(mem != NULL && bytes == data_size); + check_result_cuda(cuMemcpyHtoD(mem, data, data_size)); + } + + void mem_alloc(device_memory &mem) override + { + const CUDAContextScope scope(cuda_context); + + mem.device_size = mem.memory_size(); + + if (mem.type == MEM_TEXTURE && mem.interpolation != INTERPOLATION_NONE) { + CUDAMem &cmem = cuda_mem_map[&mem]; // Lock and get associated memory information + + CUDA_TEXTURE_DESC tex_desc = {}; + tex_desc.flags = CU_TRSF_NORMALIZED_COORDINATES; + CUDA_RESOURCE_DESC res_desc = {}; + + switch (mem.extension) { + default: + assert(0); + case EXTENSION_REPEAT: + tex_desc.addressMode[0] = tex_desc.addressMode[1] = tex_desc.addressMode[2] = + CU_TR_ADDRESS_MODE_WRAP; + break; + case EXTENSION_EXTEND: + tex_desc.addressMode[0] = tex_desc.addressMode[1] = tex_desc.addressMode[2] = + CU_TR_ADDRESS_MODE_CLAMP; + break; + case EXTENSION_CLIP: + tex_desc.addressMode[0] = tex_desc.addressMode[1] = tex_desc.addressMode[2] = + CU_TR_ADDRESS_MODE_BORDER; + break; + } + + switch (mem.interpolation) { + default: // Default to linear for unsupported interpolation types + case INTERPOLATION_LINEAR: + tex_desc.filterMode = CU_TR_FILTER_MODE_LINEAR; + break; + case INTERPOLATION_CLOSEST: + tex_desc.filterMode = CU_TR_FILTER_MODE_POINT; + break; + } + + CUarray_format format; + switch (mem.data_type) { + default: + assert(0); + case TYPE_UCHAR: + format = CU_AD_FORMAT_UNSIGNED_INT8; + break; + case TYPE_UINT16: + format = CU_AD_FORMAT_UNSIGNED_INT16; + break; + case TYPE_UINT: + format = CU_AD_FORMAT_UNSIGNED_INT32; + break; + case TYPE_INT: + format = CU_AD_FORMAT_SIGNED_INT32; + break; + case TYPE_FLOAT: + format = CU_AD_FORMAT_FLOAT; + break; + case TYPE_HALF: + format = CU_AD_FORMAT_HALF; + break; + } + + if (mem.data_depth > 1) { /* 3D texture using array. */ + CUDA_ARRAY3D_DESCRIPTOR desc; + desc.Width = mem.data_width; + desc.Height = mem.data_height; + desc.Depth = mem.data_depth; + desc.Format = format; + desc.NumChannels = mem.data_elements; + desc.Flags = 0; + + check_result_cuda(cuArray3DCreate(&cmem.array, &desc)); + mem.device_pointer = (device_ptr)cmem.array; + + res_desc.resType = CU_RESOURCE_TYPE_ARRAY; + res_desc.res.array.hArray = cmem.array; + } + else if (mem.data_height > 0) { /* 2D texture using array. 
*/ + CUDA_ARRAY_DESCRIPTOR desc; + desc.Width = mem.data_width; + desc.Height = mem.data_height; + desc.Format = format; + desc.NumChannels = mem.data_elements; + + check_result_cuda(cuArrayCreate(&cmem.array, &desc)); + mem.device_pointer = (device_ptr)cmem.array; + + res_desc.resType = CU_RESOURCE_TYPE_ARRAY; + res_desc.res.array.hArray = cmem.array; + } + else { + check_result_cuda(cuMemAlloc((CUdeviceptr *)&mem.device_pointer, mem.device_size)); + + res_desc.resType = CU_RESOURCE_TYPE_LINEAR; + res_desc.res.linear.devPtr = (CUdeviceptr)mem.device_pointer; + res_desc.res.linear.format = format; + res_desc.res.linear.numChannels = mem.data_elements; + res_desc.res.linear.sizeInBytes = mem.device_size; + } + + check_result_cuda(cuTexObjectCreate(&cmem.texobject, &res_desc, &tex_desc, NULL)); + + int flat_slot = 0; + if (string_startswith(mem.name, "__tex_image")) { + flat_slot = atoi(mem.name + string(mem.name).rfind("_") + 1); + } + + if (flat_slot >= texture_info.size()) + texture_info.resize(flat_slot + 128); + + TextureInfo &info = texture_info[flat_slot]; + info.data = (uint64_t)cmem.texobject; + info.cl_buffer = 0; + info.interpolation = mem.interpolation; + info.extension = mem.extension; + info.width = mem.data_width; + info.height = mem.data_height; + info.depth = mem.data_depth; + + // Texture information has changed and needs an update, delay this to next launch + need_texture_info = true; + } + else { + // This is not a texture but simple linear memory + check_result_cuda(cuMemAlloc((CUdeviceptr *)&mem.device_pointer, mem.device_size)); + + // Update data storage pointers in launch parameters +# define KERNEL_TEX(data_type, tex_name) \ + if (strcmp(mem.name, #tex_name) == 0) \ + update_launch_params( \ + mem.name, offsetof(KernelParams, tex_name), &mem.device_pointer, sizeof(device_ptr)); +# include "kernel/kernel_textures.h" +# undef KERNEL_TEX + } + + stats.mem_alloc(mem.device_size); + } + + void mem_copy_to(device_memory &mem) override + { + if (!mem.host_pointer || mem.host_pointer == mem.shared_pointer) + return; + if (!mem.device_pointer) + mem_alloc(mem); // Need to allocate memory first if it does not exist yet + + const CUDAContextScope scope(cuda_context); + + if (mem.type == MEM_TEXTURE && mem.interpolation != INTERPOLATION_NONE) { + const CUDAMem &cmem = cuda_mem_map[&mem]; // Lock and get associated memory information + + size_t src_pitch = mem.data_width * datatype_size(mem.data_type) * mem.data_elements; + + if (mem.data_depth > 1) { + CUDA_MEMCPY3D param; + memset(&param, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_ARRAY; + param.dstArray = cmem.array; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + param.Depth = mem.data_depth; + + check_result_cuda(cuMemcpy3D(&param)); + } + else if (mem.data_height > 0) { + CUDA_MEMCPY2D param; + memset(&param, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_ARRAY; + param.dstArray = cmem.array; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + + check_result_cuda(cuMemcpy2D(&param)); + } + else { + check_result_cuda( + cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.device_size)); + } + } + else { + // This is not a texture but simple linear memory + check_result_cuda( + cuMemcpyHtoD((CUdeviceptr)mem.device_pointer,
mem.host_pointer, mem.device_size)); + } + } + + void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override + { + // Calculate linear memory offset and size + const size_t size = elem * w * h; + const size_t offset = elem * y * w; + + if (mem.host_pointer && mem.device_pointer) { + const CUDAContextScope scope(cuda_context); + check_result_cuda(cuMemcpyDtoH( + (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size)); + } + else if (mem.host_pointer) { + memset((char *)mem.host_pointer + offset, 0, size); + } + } + + void mem_zero(device_memory &mem) override + { + if (mem.host_pointer) + memset(mem.host_pointer, 0, mem.memory_size()); + if (mem.host_pointer && mem.host_pointer == mem.shared_pointer) + return; // This is shared host memory, so no device memory to update + + if (!mem.device_pointer) + mem_alloc(mem); // Need to allocate memory first if it does not exist yet + + const CUDAContextScope scope(cuda_context); + check_result_cuda(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size())); + } + + void mem_free(device_memory &mem) override + { + assert(mem.device_pointer); + + const CUDAContextScope scope(cuda_context); + + if (mem.type == MEM_TEXTURE && mem.interpolation != INTERPOLATION_NONE) { + CUDAMem &cmem = cuda_mem_map[&mem]; // Lock and get associated memory information + + if (cmem.array) + cuArrayDestroy(cmem.array); + else + cuMemFree((CUdeviceptr)mem.device_pointer); + + if (cmem.texobject) + cuTexObjectDestroy(cmem.texobject); + } + else { + // This is not a texture but simple linear memory + cuMemFree((CUdeviceptr)mem.device_pointer); + } + + stats.mem_free(mem.device_size); + + mem.device_size = 0; + mem.device_pointer = 0; + } + + void const_copy_to(const char *name, void *host, size_t size) override + { + if (strcmp(name, "__data") == 0) { + assert(size <= sizeof(KernelData)); + + // Fix traversable handle on multi devices + KernelData *const data = (KernelData *)host; + *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle; + + update_launch_params(name, offsetof(KernelParams, data), host, size); + } + } + + device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override + { + return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); + } + + void task_add(DeviceTask &task) override + { + // Upload texture information to device if it has changed since last launch + update_texture_info(); + + // Split task into smaller ones + list tasks; + task.split(tasks, info.cpu_threads); + + // Queue tasks in internal task pool + struct OptiXDeviceTask : public DeviceTask { + OptiXDeviceTask(OptiXDevice *device, DeviceTask &task, int task_index) : DeviceTask(task) + { + // Using task index parameter instead of thread index, since number of CUDA streams may + // differ from number of threads + run = function_bind(&OptiXDevice::thread_run, device, *this, task_index); + } + }; + + int task_index = 0; + for (DeviceTask &task : tasks) + task_pool.push(new OptiXDeviceTask(this, task, task_index++)); + } + + void task_wait() override + { + // Wait for all queued tasks to finish + task_pool.wait_work(); + } + + void task_cancel() override + { + // Cancel any remaining tasks in the internal pool + task_pool.cancel(); + } + +# define CUDA_GET_BLOCKSIZE(func, w, h) \ + int threads; \ + check_result_cuda_ret( \ + cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ + threads = (int)sqrt((float)threads); \ + int xblocks = ((w) + threads - 1) / threads; \ + 
int yblocks = ((h) + threads - 1) / threads; + +# define CUDA_LAUNCH_KERNEL(func, args) \ + check_result_cuda_ret(cuLaunchKernel( \ + func, xblocks, yblocks, 1, threads, threads, 1, 0, cuda_stream[thread_index], args, 0)); + + /* Similar as above, but for 1-dimensional blocks. */ +# define CUDA_GET_BLOCKSIZE_1D(func, w, h) \ + int threads; \ + check_result_cuda_ret( \ + cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ + int xblocks = ((w) + threads - 1) / threads; \ + int yblocks = h; + +# define CUDA_LAUNCH_KERNEL_1D(func, args) \ + check_result_cuda_ret(cuLaunchKernel( \ + func, xblocks, yblocks, 1, threads, 1, 1, 0, cuda_stream[thread_index], args, 0)); + + bool denoising_non_local_means(device_ptr image_ptr, + device_ptr guide_ptr, + device_ptr variance_ptr, + device_ptr out_ptr, + DenoisingTask *task, + int thread_index) + { + if (have_error()) + return false; + + int stride = task->buffer.stride; + int w = task->buffer.width; + int h = task->buffer.h; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + int pass_stride = task->buffer.pass_stride; + int num_shifts = (2 * r + 1) * (2 * r + 1); + int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0; + int frame_offset = 0; + + CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer; + CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts; + CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts; + CUdeviceptr scale_ptr = 0; + + check_result_cuda_ret( + cuMemsetD8Async(weightAccum, 0, sizeof(float) * pass_stride, cuda_stream[thread_index])); + check_result_cuda_ret( + cuMemsetD8Async(out_ptr, 0, sizeof(float) * pass_stride, cuda_stream[thread_index])); + + { + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput; + check_result_cuda_ret(cuModuleGetFunction( + &cuNLMCalcDifference, cuda_filter_module, "kernel_cuda_filter_nlm_calc_difference")); + check_result_cuda_ret( + cuModuleGetFunction(&cuNLMBlur, cuda_filter_module, "kernel_cuda_filter_nlm_blur")); + check_result_cuda_ret(cuModuleGetFunction( + &cuNLMCalcWeight, cuda_filter_module, "kernel_cuda_filter_nlm_calc_weight")); + check_result_cuda_ret(cuModuleGetFunction( + &cuNLMUpdateOutput, cuda_filter_module, "kernel_cuda_filter_nlm_update_output")); + + check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1)); + + CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts); + + void *calc_difference_args[] = {&guide_ptr, + &variance_ptr, + &scale_ptr, + &difference, + &w, + &h, + &stride, + &pass_stride, + &r, + &channel_offset, + &frame_offset, + &a, + &k_2}; + void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; + void *calc_weight_args[] = { + &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; + void *update_output_args[] = {&blurDifference, + &image_ptr, + &out_ptr, + &weightAccum, + &w, + &h, + &stride, + &pass_stride, + &channel_offset, + &r, + &f}; + + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + 
CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args); + } + + { + CUfunction cuNLMNormalize; + check_result_cuda_ret(cuModuleGetFunction( + &cuNLMNormalize, cuda_filter_module, "kernel_cuda_filter_nlm_normalize")); + check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1)); + void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride}; + CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h); + CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args); + check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index])); + } + + return !have_error(); + } + + bool denoising_construct_transform(DenoisingTask *task, int thread_index) + { + if (have_error()) + return false; + + CUfunction cuFilterConstructTransform; + check_result_cuda_ret(cuModuleGetFunction(&cuFilterConstructTransform, + cuda_filter_module, + "kernel_cuda_filter_construct_transform")); + check_result_cuda_ret( + cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED)); + CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h); + + void *args[] = {&task->buffer.mem.device_pointer, + &task->tile_info_mem.device_pointer, + &task->storage.transform.device_pointer, + &task->storage.rank.device_pointer, + &task->filter_area, + &task->rect, + &task->radius, + &task->pca_threshold, + &task->buffer.pass_stride, + &task->buffer.frame_stride, + &task->buffer.use_time}; + CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args); + check_result_cuda_ret(cuCtxSynchronize()); + + return !have_error(); + } + + bool denoising_accumulate(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr scale_ptr, + int frame, + DenoisingTask *task, + int thread_index) + { + if (have_error()) + return false; + + int r = task->radius; + int f = 4; + float a = 1.0f; + float k_2 = task->nlm_k_2; + + int w = task->reconstruction_state.source_w; + int h = task->reconstruction_state.source_h; + int stride = task->buffer.stride; + int frame_offset = frame * task->buffer.frame_stride; + int t = task->tile_info->frames[frame]; + + int pass_stride = task->buffer.pass_stride; + int num_shifts = (2 * r + 1) * (2 * r + 1); + + CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer; + CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts; + + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian; + check_result_cuda_ret(cuModuleGetFunction( + &cuNLMCalcDifference, cuda_filter_module, "kernel_cuda_filter_nlm_calc_difference")); + check_result_cuda_ret( + cuModuleGetFunction(&cuNLMBlur, cuda_filter_module, "kernel_cuda_filter_nlm_blur")); + check_result_cuda_ret(cuModuleGetFunction( + &cuNLMCalcWeight, cuda_filter_module, "kernel_cuda_filter_nlm_calc_weight")); + check_result_cuda_ret(cuModuleGetFunction( + &cuNLMConstructGramian, cuda_filter_module, "kernel_cuda_filter_nlm_construct_gramian")); + + check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + check_result_cuda_ret( + cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); + + CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, + task->reconstruction_state.source_w * + task->reconstruction_state.source_h, + num_shifts); + + void 
*calc_difference_args[] = {&color_ptr, + &color_variance_ptr, + &scale_ptr, + &difference, + &w, + &h, + &stride, + &pass_stride, + &r, + &pass_stride, + &frame_offset, + &a, + &k_2}; + void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; + void *calc_weight_args[] = { + &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; + void *construct_gramian_args[] = {&t, + &blurDifference, + &task->buffer.mem.device_pointer, + &task->storage.transform.device_pointer, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &task->reconstruction_state.filter_window, + &w, + &h, + &stride, + &pass_stride, + &r, + &f, + &frame_offset, + &task->buffer.use_time}; + + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args); + check_result_cuda_ret(cuCtxSynchronize()); + + return !have_error(); + } + + bool denoising_solve(device_ptr output_ptr, DenoisingTask *task, int thread_index) + { + if (have_error()) + return false; + + CUfunction cuFinalize; + check_result_cuda_ret( + cuModuleGetFunction(&cuFinalize, cuda_filter_module, "kernel_cuda_filter_finalize")); + check_result_cuda_ret(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); + void *finalize_args[] = {&output_ptr, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &task->filter_area, + &task->reconstruction_state.buffer_params.x, + &task->render_buffer.samples}; + CUDA_GET_BLOCKSIZE( + cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h); + CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); + check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index])); + + return !have_error(); + } + + bool denoising_combine_halves(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, + int4 rect, + DenoisingTask *task, + int thread_index) + { + if (have_error()) + return false; + + CUfunction cuFilterCombineHalves; + check_result_cuda_ret(cuModuleGetFunction( + &cuFilterCombineHalves, cuda_filter_module, "kernel_cuda_filter_combine_halves")); + check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE( + cuFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r}; + CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args); + check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index])); + + return !have_error(); + } + + bool denoising_divide_shadow(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, + DenoisingTask *task, + int thread_index) + { + if (have_error()) + return false; + + CUfunction cuFilterDivideShadow; + check_result_cuda_ret(cuModuleGetFunction( + &cuFilterDivideShadow, cuda_filter_module, "kernel_cuda_filter_divide_shadow")); + check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE( + cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + void *args[] = {&task->render_buffer.samples, + &task->tile_info_mem.device_pointer, + &a_ptr, + 
&b_ptr, + &sample_variance_ptr, + &sv_variance_ptr, + &buffer_variance_ptr, + &task->rect, + &task->render_buffer.pass_stride, + &task->render_buffer.offset}; + CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args); + check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index])); + + return !have_error(); + } + + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + float scale, + DenoisingTask *task, + int thread_index) + { + if (have_error()) + return false; + + CUfunction cuFilterGetFeature; + check_result_cuda_ret(cuModuleGetFunction( + &cuFilterGetFeature, cuda_filter_module, "kernel_cuda_filter_get_feature")); + check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE( + cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + void *args[] = {&task->render_buffer.samples, + &task->tile_info_mem.device_pointer, + &mean_offset, + &variance_offset, + &mean_ptr, + &variance_ptr, + &scale, + &task->rect, + &task->render_buffer.pass_stride, + &task->render_buffer.offset}; + CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args); + check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index])); + + return !have_error(); + } + + bool denoising_write_feature(int out_offset, + device_ptr from_ptr, + device_ptr buffer_ptr, + DenoisingTask *task, + int thread_index) + { + if (have_error()) + return false; + + CUfunction cuFilterWriteFeature; + check_result_cuda_ret(cuModuleGetFunction( + &cuFilterWriteFeature, cuda_filter_module, "kernel_cuda_filter_write_feature")); + check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w); + + void *args[] = {&task->render_buffer.samples, + &task->reconstruction_state.buffer_params, + &task->filter_area, + &from_ptr, + &buffer_ptr, + &out_offset, + &task->rect}; + CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args); + check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index])); + + return !have_error(); + } + + bool denoising_detect_outliers(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr, + DenoisingTask *task, + int thread_index) + { + if (have_error()) + return false; + + CUfunction cuFilterDetectOutliers; + check_result_cuda_ret(cuModuleGetFunction( + &cuFilterDetectOutliers, cuda_filter_module, "kernel_cuda_filter_detect_outliers")); + check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE( + cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + void *args[] = {&image_ptr, + &variance_ptr, + &depth_ptr, + &output_ptr, + &task->rect, + &task->buffer.pass_stride}; + + CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args); + check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index])); + + return !have_error(); + } +}; + +bool device_optix_init() +{ + if (g_optixFunctionTable.optixDeviceContextCreate != NULL) + return true; // Already initialized function table + + // Need to initialize CUDA as well + if (!device_cuda_init()) + return false; + +# ifdef WITH_CUDA_DYNLOAD + // Load NVRTC function pointers for adaptive kernel compilation + if (DebugFlags().cuda.adaptive_compile && cuewInit(CUEW_INIT_NVRTC) != CUEW_SUCCESS) { + VLOG(1) + << "CUEW initialization failed for NVRTC. 
Adaptive kernel compilation won't be available."; + } +# endif + + const OptixResult result = optixInit(); + + if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) { + VLOG(1) + << "OptiX initialization failed because the installed driver does not support ABI version " + << OPTIX_ABI_VERSION; + return false; + } + else if (result != OPTIX_SUCCESS) { + VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result; + return false; + } + + // Loaded OptiX successfully! + return true; +} + +void device_optix_info(vector &devices) +{ + // Simply add all supported CUDA devices as OptiX devices again + vector cuda_devices; + device_cuda_info(cuda_devices); + + for (auto it = cuda_devices.begin(); it != cuda_devices.end();) { + DeviceInfo &info = *it; + assert(info.type == DEVICE_CUDA); + info.type = DEVICE_OPTIX; + info.id += "_OptiX"; + + // Figure out RTX support + CUdevice cuda_device = 0; + CUcontext cuda_context = NULL; + unsigned int rtcore_version = 0; + if (cuDeviceGet(&cuda_device, info.num) == CUDA_SUCCESS && + cuDevicePrimaryCtxRetain(&cuda_context, cuda_device) == CUDA_SUCCESS) { + OptixDeviceContext optix_context = NULL; + if (optixDeviceContextCreate(cuda_context, nullptr, &optix_context) == OPTIX_SUCCESS) { + optixDeviceContextGetProperty(optix_context, + OPTIX_DEVICE_PROPERTY_RTCORE_VERSION, + &rtcore_version, + sizeof(rtcore_version)); + optixDeviceContextDestroy(optix_context); + } + cuDevicePrimaryCtxRelease(cuda_device); + } + + // Only add devices with RTX support + if (rtcore_version == 0) + it = cuda_devices.erase(it); + else + ++it; + } + + devices.insert(devices.end(), cuda_devices.begin(), cuda_devices.end()); +} + +Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) +{ + return new OptiXDevice(info, stats, profiler, background); +} + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 41e57bb3e43..ea8aa197b6f 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -64,6 +64,10 @@ set(SRC_OPENCL_KERNELS kernels/opencl/filter.cl ) +set(SRC_OPTIX_KERNELS + kernels/optix/kernel_optix.cu +) + set(SRC_BVH_HEADERS bvh/bvh.h bvh/bvh_nodes.h @@ -95,6 +99,7 @@ set(SRC_HEADERS kernel_color.h kernel_compat_cpu.h kernel_compat_cuda.h + kernel_compat_optix.h kernel_compat_opencl.h kernel_differential.h kernel_emission.h @@ -140,6 +145,9 @@ set(SRC_KERNELS_CUDA_HEADERS kernels/cuda/kernel_cuda_image.h ) +set(SRC_KERNELS_OPTIX_HEADERS +) + set(SRC_KERNELS_OPENCL_HEADERS kernels/opencl/kernel_split_function.h kernels/opencl/kernel_opencl_image.h @@ -168,7 +176,7 @@ set(SRC_CLOSURE_HEADERS closure/volume.h closure/bsdf_principled_diffuse.h closure/bsdf_principled_sheen.h - closure/bsdf_hair_principled.h + closure/bsdf_hair_principled.h ) set(SRC_SVM_HEADERS @@ -476,6 +484,53 @@ if(WITH_CYCLES_CUDA_BINARIES) cycles_set_solution_folder(cycles_kernel_cuda) endif() +# OptiX PTX modules + +if(WITH_CYCLES_DEVICE_OPTIX) + foreach(input ${SRC_OPTIX_KERNELS}) + get_filename_component(input_we ${input} NAME_WE) + + set(output "${CMAKE_CURRENT_BINARY_DIR}/${input_we}.ptx") + set(cuda_flags + -I "${OPTIX_INCLUDE_DIR}" + -I "${CMAKE_CURRENT_SOURCE_DIR}/.." 
+ -I "${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda" + -arch=sm_30 + --use_fast_math + -o ${output}) + + if(WITH_CYCLES_DEBUG) + set(cuda_flags ${cuda_flags} + -D __KERNEL_DEBUG__) + endif() + + add_custom_command( + OUTPUT + ${output} + DEPENDS + ${input} + ${SRC_HEADERS} + ${SRC_KERNELS_CUDA_HEADERS} + ${SRC_KERNELS_OPTIX_HEADERS} + ${SRC_BVH_HEADERS} + ${SRC_SVM_HEADERS} + ${SRC_GEOM_HEADERS} + ${SRC_CLOSURE_HEADERS} + ${SRC_UTIL_HEADERS} + COMMAND + ${CUDA_NVCC_EXECUTABLE} --ptx ${cuda_flags} ${input} + WORKING_DIRECTORY + "${CMAKE_CURRENT_SOURCE_DIR}") + + list(APPEND optix_ptx ${output}) + + delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${output}" ${CYCLES_INSTALL_PATH}/lib) + endforeach() + + add_custom_target(cycles_kernel_optix ALL DEPENDS ${optix_ptx}) + cycles_set_solution_folder(cycles_kernel_optix) +endif() + # OSL module if(WITH_CYCLES_OSL) @@ -535,10 +590,12 @@ endif() cycles_add_library(cycles_kernel "${LIB}" ${SRC_CPU_KERNELS} ${SRC_CUDA_KERNELS} + ${SRC_OPTIX_KERNELS} ${SRC_OPENCL_KERNELS} ${SRC_HEADERS} ${SRC_KERNELS_CPU_HEADERS} ${SRC_KERNELS_CUDA_HEADERS} + ${SRC_KERNELS_OPTIX_HEADERS} ${SRC_KERNELS_OPENCL_HEADERS} ${SRC_BVH_HEADERS} ${SRC_CLOSURE_HEADERS} @@ -548,9 +605,24 @@ cycles_add_library(cycles_kernel "${LIB}" ${SRC_SPLIT_HEADERS} ) +source_group("bvh" FILES ${SRC_BVH_HEADERS}) +source_group("closure" FILES ${SRC_CLOSURE_HEADERS}) +source_group("filter" FILES ${SRC_FILTER_HEADERS}) +source_group("geom" FILES ${SRC_GEOM_HEADERS}) +source_group("kernel" FILES ${SRC_HEADERS}) +source_group("kernel\\split" FILES ${SRC_SPLIT_HEADERS}) +source_group("kernels\\cpu" FILES ${SRC_CPU_KERNELS} ${SRC_KERNELS_CPU_HEADERS}) +source_group("kernels\\cuda" FILES ${SRC_CUDA_KERNELS} ${SRC_KERNELS_CUDA_HEADERS}) +source_group("kernels\\opencl" FILES ${SRC_OPENCL_KERNELS} ${SRC_KERNELS_OPENCL_HEADERS}) +source_group("kernels\\optix" FILES ${SRC_OPTIX_KERNELS} ${SRC_KERNELS_OPTIX_HEADERS}) +source_group("svm" FILES ${SRC_SVM_HEADERS}) + if(WITH_CYCLES_CUDA) add_dependencies(cycles_kernel cycles_kernel_cuda) endif() +if(WITH_CYCLES_DEVICE_OPTIX) + add_dependencies(cycles_kernel cycles_kernel_optix) +endif() # OpenCL kernel @@ -564,9 +636,11 @@ endif() delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_OPENCL_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CUDA_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_OPTIX_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/optix) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_OPENCL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/optix) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter) diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp index 9be078b6fca..cffe2bfa70a 100644 --- a/intern/cycles/render/mesh.cpp +++ b/intern/cycles/render/mesh.cpp @@ 
-1139,9 +1139,9 @@ int Mesh::motion_step(float time) const return -1; } -bool Mesh::need_build_bvh(BVHLayout) const +bool Mesh::need_build_bvh(BVHLayout layout) const { - return !transform_applied || has_surface_bssrdf; + return !transform_applied || has_surface_bssrdf || layout == BVH_LAYOUT_OPTIX; } bool Mesh::is_instanced() const diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp index e52fda26845..3ce65802cff 100644 --- a/intern/cycles/util/util_debug.cpp +++ b/intern/cycles/util/util_debug.cpp @@ -86,6 +86,16 @@ void DebugFlags::CUDA::reset() split_kernel = false; } +DebugFlags::OptiX::OptiX() +{ + reset(); +} + +void DebugFlags::OptiX::reset() +{ + cuda_streams = 1; +} + DebugFlags::OpenCL::OpenCL() : device_type(DebugFlags::OpenCL::DEVICE_ALL), debug(false) { reset(); @@ -130,6 +140,7 @@ void DebugFlags::reset() viewport_static_bvh = false; cpu.reset(); cuda.reset(); + optix.reset(); opencl.reset(); } @@ -145,7 +156,10 @@ std::ostream &operator<<(std::ostream &os, DebugFlagsConstRef debug_flags) << " Split : " << string_from_bool(debug_flags.cpu.split_kernel) << "\n"; os << "CUDA flags:\n" - << " Adaptive Compile: " << string_from_bool(debug_flags.cuda.adaptive_compile) << "\n"; + << " Adaptive Compile : " << string_from_bool(debug_flags.cuda.adaptive_compile) << "\n"; + + os << "OptiX flags:\n" + << " CUDA streams : " << debug_flags.optix.cuda_streams << "\n"; const char *opencl_device_type; switch (debug_flags.opencl.device_type) { diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h index 5b0004ea768..cf6b442b878 100644 --- a/intern/cycles/util/util_debug.h +++ b/intern/cycles/util/util_debug.h @@ -99,6 +99,17 @@ class DebugFlags { bool split_kernel; }; + /* Descriptor of OptiX feature-set to be used. */ + struct OptiX { + OptiX(); + + /* Reset flags to their defaults. */ + void reset(); + + /* Number of CUDA streams to launch kernels concurrently from. */ + int cuda_streams; + }; + /* Descriptor of OpenCL feature-set to be used. */ struct OpenCL { OpenCL(); @@ -165,6 +176,9 @@ class DebugFlags { /* Requested CUDA flags. */ CUDA cuda; + /* Requested OptiX flags. */ + OptiX optix; + /* Requested OpenCL flags. */ OpenCL opencl; -- cgit v1.2.3
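For reference, a minimal host-side sketch of how the entry points added in device_optix.cpp fit together. This is illustrative only and not part of the patch; it assumes Stats and Profiler can be default-constructed and that DebugFlags() returns the writable debug-flags singleton, as it is used elsewhere in Cycles:

  if (device_optix_init()) {
    /* Lists only those CUDA devices that report a non-zero RTX core version. */
    vector<DeviceInfo> devices;
    device_optix_info(devices);

    if (!devices.empty()) {
      /* Optional: request that kernels be launched from more than one CUDA stream. */
      DebugFlags().optix.cuda_streams = 2;

      Stats stats;
      Profiler profiler;
      Device *device = device_optix_create(devices[0], stats, profiler, true /* background */);
      /* ... render, then ... */
      delete device;
    }
  }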