git.blender.org/blender.git
Diffstat (limited to 'intern/cycles/device')
-rw-r--r--  intern/cycles/device/cuda/device_cuda.h           1
-rw-r--r--  intern/cycles/device/cuda/device_cuda_impl.cpp   22
-rw-r--r--  intern/cycles/device/device.cpp                  15
-rw-r--r--  intern/cycles/device/device.h                    11
-rw-r--r--  intern/cycles/device/device_cpu.cpp              33
-rw-r--r--  intern/cycles/device/device_multi.cpp            77
-rw-r--r--  intern/cycles/device/device_optix.cpp           463
7 files changed, 355 insertions(+), 267 deletions(-)
diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h
index e5e3e24165d..c3271c3cfcf 100644
--- a/intern/cycles/device/cuda/device_cuda.h
+++ b/intern/cycles/device/cuda/device_cuda.h
@@ -71,6 +71,7 @@ class CUDADevice : public Device {
};
typedef map<device_memory *, CUDAMem> CUDAMemMap;
CUDAMemMap cuda_mem_map;
+ thread_mutex cuda_mem_map_mutex;
struct PixelMem {
GLuint cuPBO;
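The header change above adds a mutex guarding cuda_mem_map, since multiple render threads (and, with multi-device rendering, multiple sub-devices) can now allocate and free device memory concurrently. A minimal sketch of the pattern, using standard C++ in place of Cycles' thread_mutex/thread_scoped_lock wrappers (all names here are illustrative):

#include <cstddef>
#include <map>
#include <mutex>

// Illustrative stand-in for CUDAMemMap plus its new mutex: every access
// to the shared map takes the lock for the duration of the operation.
struct MemMap {
  std::map<const void *, size_t> entries;  // plays the role of cuda_mem_map
  std::mutex mutex;                        // plays the role of cuda_mem_map_mutex

  void alloc(const void *mem, size_t size) {
    std::lock_guard<std::mutex> lock(mutex);  // thread_scoped_lock equivalent
    entries[mem] = size;
  }

  void free(const void *mem) {
    std::lock_guard<std::mutex> lock(mutex);
    entries.erase(mem);
  }
};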
diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp
index 01adf10f252..6c26b247743 100644
--- a/intern/cycles/device/cuda/device_cuda_impl.cpp
+++ b/intern/cycles/device/cuda/device_cuda_impl.cpp
@@ -718,8 +718,10 @@ void CUDADevice::init_host_memory()
void CUDADevice::load_texture_info()
{
if (need_texture_info) {
- texture_info.copy_to_device();
+ /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
+ * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
need_texture_info = false;
+ texture_info.copy_to_device();
}
}
@@ -988,6 +990,7 @@ void CUDADevice::mem_alloc(device_memory &mem)
assert(!"mem_alloc not supported for global memory.");
}
else {
+ thread_scoped_lock lock(cuda_mem_map_mutex);
generic_alloc(mem);
}
}
@@ -1006,10 +1009,10 @@ void CUDADevice::mem_copy_to(device_memory &mem)
tex_alloc((device_texture &)mem);
}
else {
+ thread_scoped_lock lock(cuda_mem_map_mutex);
if (!mem.device_pointer) {
generic_alloc(mem);
}
-
generic_copy_to(mem);
}
}
@@ -1048,6 +1051,7 @@ void CUDADevice::mem_zero(device_memory &mem)
/* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
* regardless of mem.host_pointer and mem.shared_pointer. */
+ thread_scoped_lock lock(cuda_mem_map_mutex);
if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
const CUDAContextScope scope(this);
cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
@@ -1069,6 +1073,7 @@ void CUDADevice::mem_free(device_memory &mem)
tex_free((device_texture &)mem);
}
else {
+ thread_scoped_lock lock(cuda_mem_map_mutex);
generic_free(mem);
}
}
@@ -1092,6 +1097,7 @@ void CUDADevice::const_copy_to(const char *name, void *host, size_t size)
void CUDADevice::global_alloc(device_memory &mem)
{
if (mem.is_resident(this)) {
+ thread_scoped_lock lock(cuda_mem_map_mutex);
generic_alloc(mem);
generic_copy_to(mem);
}
@@ -1102,6 +1108,7 @@ void CUDADevice::global_alloc(device_memory &mem)
void CUDADevice::global_free(device_memory &mem)
{
if (mem.is_resident(this) && mem.device_pointer) {
+ thread_scoped_lock lock(cuda_mem_map_mutex);
generic_free(mem);
}
}
@@ -1170,6 +1177,8 @@ void CUDADevice::tex_alloc(device_texture &mem)
size_t src_pitch = mem.data_width * dsize * mem.data_elements;
size_t dst_pitch = src_pitch;
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+
if (!mem.is_resident(this)) {
cmem = &cuda_mem_map[&mem];
cmem->texobject = 0;
@@ -1257,6 +1266,9 @@ void CUDADevice::tex_alloc(device_texture &mem)
cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
}
+ /* Unlock mutex before resizing texture info, since that may attempt to lock it again. */
+ lock.unlock();
+
/* Resize once */
const uint slot = mem.slot;
if (slot >= texture_info.size()) {
@@ -1305,6 +1317,11 @@ void CUDADevice::tex_alloc(device_texture &mem)
texDesc.filterMode = filter_mode;
texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+ /* Lock again and refresh the data pointer (in case another thread modified the map in the
+ * meantime). */
+ lock.lock();
+ cmem = &cuda_mem_map[&mem];
+
cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
texture_info[slot].data = (uint64_t)cmem->texobject;
@@ -1318,6 +1335,7 @@ void CUDADevice::tex_free(device_texture &mem)
{
if (mem.device_pointer) {
CUDAContextScope scope(this);
+ thread_scoped_lock lock(cuda_mem_map_mutex);
const CUDAMem &cmem = cuda_mem_map[&mem];
if (cmem.texobject) {
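Two details in this file are easy to miss. First, load_texture_info() now clears need_texture_info before copying, because copy_to_device() can trigger move_textures_to_host(), which calls load_texture_info() again; clearing the flag first turns that nested call into a no-op. Second, tex_alloc() drops the lock while resizing texture_info and re-fetches its CUDAMem pointer after relocking, since another thread may have changed the map in the meantime. A hedged sketch of the flag pattern (names are illustrative, not the real Cycles types):

// Clear the dirty flag *before* doing the work: if the work itself
// re-enters this function, the nested call sees a clean flag and
// returns instead of recursing indefinitely.
struct TextureInfo {
  bool need_update = false;

  void flush() {
    if (need_update) {
      need_update = false;  // unset first ...
      copy_to_device();     // ... because this may call flush() again
    }
  }

  void copy_to_device() { /* may indirectly call flush() again */ }
};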
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index eb8fb8040e3..1efd628b79b 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -17,6 +17,8 @@
#include <stdlib.h>
#include <string.h>
+#include "bvh/bvh2.h"
+
#include "device/device.h"
#include "device/device_intern.h"
@@ -364,6 +366,19 @@ void Device::draw_pixels(device_memory &rgba,
}
}
+void Device::build_bvh(BVH *bvh, Progress &progress, bool refit)
+{
+ assert(bvh->params.bvh_layout == BVH_LAYOUT_BVH2);
+
+ BVH2 *const bvh2 = static_cast<BVH2 *>(bvh);
+ if (refit) {
+ bvh2->refit(progress);
+ }
+ else {
+ bvh2->build(progress, &stats);
+ }
+}
+
Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
{
#ifdef WITH_MULTI
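device.cpp now provides the shared BVH2 path as the base implementation of the new virtual Device::build_bvh(), and devices with their own acceleration structures override it, falling back to the base for the BVH2 case. A compilable sketch of that dispatch shape (types are illustrative, not the real Cycles classes):

#include <cassert>

enum BVHLayout { LAYOUT_BVH2, LAYOUT_EMBREE, LAYOUT_OPTIX };

struct BVH {
  BVHLayout layout;
};

struct Device {
  virtual ~Device() = default;

  // Base implementation: the software BVH2 build shared by all devices.
  virtual void build_bvh(BVH *bvh, bool refit) {
    assert(bvh->layout == LAYOUT_BVH2);
    (void)refit;  // build or refit the BVH2 here
  }
};

struct CPUDevice : public Device {
  void build_bvh(BVH *bvh, bool refit) override {
    if (bvh->layout == LAYOUT_EMBREE) {
      // device-specific Embree build/refit
      return;
    }
    Device::build_bvh(bvh, refit);  // shared fallback
  }
};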
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 2006db02ce7..e9b7cde7a16 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -373,12 +373,6 @@ class Device {
return NULL;
}
- /* Device specific pointer for BVH creation. Currently only used by Embree. */
- virtual void *bvh_device() const
- {
- return NULL;
- }
-
/* load/compile kernels, must be called before adding tasks */
virtual bool load_kernels(const DeviceRequestedFeatures & /*requested_features*/)
{
@@ -427,10 +421,7 @@ class Device {
const DeviceDrawParams &draw_params);
/* acceleration structure building */
- virtual bool build_optix_bvh(BVH *)
- {
- return false;
- }
+ virtual void build_bvh(BVH *bvh, Progress &progress, bool refit);
#ifdef WITH_NETWORK
/* networking */
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 6912ac1e638..fea4fc53d1f 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -47,6 +47,8 @@
#include "kernel/osl/osl_globals.h"
// clang-format on
+#include "bvh/bvh_embree.h"
+
#include "render/buffers.h"
#include "render/coverage.h"
@@ -188,6 +190,7 @@ class CPUDevice : public Device {
#endif
thread_spin_lock oidn_task_lock;
#ifdef WITH_EMBREE
+ RTCScene embree_scene = NULL;
RTCDevice embree_device;
#endif
@@ -472,6 +475,15 @@ class CPUDevice : public Device {
virtual void const_copy_to(const char *name, void *host, size_t size) override
{
+#ifdef WITH_EMBREE
+ if (strcmp(name, "__data") == 0) {
+ assert(size <= sizeof(KernelData));
+
+ // Update scene handle (since it is different for each device on multi devices)
+ KernelData *const data = (KernelData *)host;
+ data->bvh.scene = embree_scene;
+ }
+#endif
kernel_const_copy(&kernel_globals, name, host, size);
}
@@ -537,13 +549,26 @@ class CPUDevice : public Device {
#endif
}
- void *bvh_device() const override
+ void build_bvh(BVH *bvh, Progress &progress, bool refit) override
{
#ifdef WITH_EMBREE
- return embree_device;
-#else
- return NULL;
+ if (bvh->params.bvh_layout == BVH_LAYOUT_EMBREE ||
+ bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) {
+ BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh);
+ if (refit) {
+ bvh_embree->refit(progress);
+ }
+ else {
+ bvh_embree->build(progress, &stats, embree_device);
+ }
+
+ if (bvh->params.top_level) {
+ embree_scene = bvh_embree->scene;
+ }
+ }
+ else
#endif
+ Device::build_bvh(bvh, progress, refit);
}
void thread_run(DeviceTask &task)
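The const_copy_to() hook above patches the per-device RTCScene handle into KernelData just before upload, because on a multi device every sub-device builds and owns its own Embree scene. A small sketch of that patch-before-upload idea, with hypothetical stand-in types:

#include <cstring>

struct KernelDataStub {
  void *bvh_scene;  // stands in for KernelData::bvh.scene
};

// Before forwarding the constant block to the kernel, rewrite the scene
// handle so each device ends up with the pointer it owns.
void patch_and_upload(const char *name, void *host, void *device_scene) {
  if (std::strcmp(name, "__data") == 0) {
    static_cast<KernelDataStub *>(host)->bvh_scene = device_scene;
  }
  // ... then hand 'host' to the device's constant-copy routine
}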
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index 2e72a0b4393..ed6e764b995 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -17,11 +17,14 @@
#include <sstream>
#include <stdlib.h>
+#include "bvh/bvh_multi.h"
+
#include "device/device.h"
#include "device/device_intern.h"
#include "device/device_network.h"
#include "render/buffers.h"
+#include "render/geometry.h"
#include "util/util_foreach.h"
#include "util/util_list.h"
@@ -164,9 +167,24 @@ class MultiDevice : public Device {
virtual BVHLayoutMask get_bvh_layout_mask() const
{
BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL;
+ BVHLayoutMask bvh_layout_mask_all = BVH_LAYOUT_NONE;
foreach (const SubDevice &sub_device, devices) {
- bvh_layout_mask &= sub_device.device->get_bvh_layout_mask();
+ BVHLayoutMask device_bvh_layout_mask = sub_device.device->get_bvh_layout_mask();
+ bvh_layout_mask &= device_bvh_layout_mask;
+ bvh_layout_mask_all |= device_bvh_layout_mask;
+ }
+
+ /* With multiple OptiX devices, every device needs its own acceleration structure */
+ if (bvh_layout_mask == BVH_LAYOUT_OPTIX) {
+ return BVH_LAYOUT_MULTI_OPTIX;
}
+
+ /* When devices do not share a common BVH layout, fall back to creating one for each */
+ const BVHLayoutMask BVH_LAYOUT_OPTIX_EMBREE = (BVH_LAYOUT_OPTIX | BVH_LAYOUT_EMBREE);
+ if ((bvh_layout_mask_all & BVH_LAYOUT_OPTIX_EMBREE) == BVH_LAYOUT_OPTIX_EMBREE) {
+ return BVH_LAYOUT_MULTI_OPTIX_EMBREE;
+ }
+
return bvh_layout_mask;
}
@@ -227,21 +245,58 @@ class MultiDevice : public Device {
return result;
}
- bool build_optix_bvh(BVH *bvh)
+ void build_bvh(BVH *bvh, Progress &progress, bool refit) override
{
+ /* Try to build and share a single acceleration structure, if possible */
+ if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2) {
+ devices.back().device->build_bvh(bvh, progress, refit);
+ return;
+ }
+
+ BVHMulti *const bvh_multi = static_cast<BVHMulti *>(bvh);
+ bvh_multi->sub_bvhs.resize(devices.size());
+
+ vector<BVHMulti *> geom_bvhs;
+ geom_bvhs.reserve(bvh->geometry.size());
+ foreach (Geometry *geom, bvh->geometry) {
+ geom_bvhs.push_back(static_cast<BVHMulti *>(geom->bvh));
+ }
+
/* Broadcast acceleration structure build to all render devices */
+ size_t i = 0;
foreach (SubDevice &sub, devices) {
- if (!sub.device->build_optix_bvh(bvh))
- return false;
+ /* Change geometry BVH pointers to the sub BVH */
+ for (size_t k = 0; k < bvh->geometry.size(); ++k) {
+ bvh->geometry[k]->bvh = geom_bvhs[k]->sub_bvhs[i];
+ }
+
+ if (!bvh_multi->sub_bvhs[i]) {
+ BVHParams params = bvh->params;
+ if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX)
+ params.bvh_layout = BVH_LAYOUT_OPTIX;
+ else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE)
+ params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? BVH_LAYOUT_OPTIX :
+ BVH_LAYOUT_EMBREE;
+
+ /* Skip building a bottom level acceleration structure for non-instanced geometry on Embree
+ * (since they are put into the top level directly, see bvh_embree.cpp) */
+ if (!params.top_level && params.bvh_layout == BVH_LAYOUT_EMBREE &&
+ !bvh->geometry[0]->is_instanced()) {
+ i++;
+ continue;
+ }
+
+ bvh_multi->sub_bvhs[i] = BVH::create(params, bvh->geometry, bvh->objects, sub.device);
+ }
+
+ sub.device->build_bvh(bvh_multi->sub_bvhs[i], progress, refit);
+ i++;
}
- return true;
- }
- virtual void *bvh_device() const
- {
- /* CPU devices will always be at the back, so simply choose the last one.
- There should only ever be one CPU device anyway and we need the Embree device for it. */
- return devices.back().device->bvh_device();
+ /* Change geometry BVH pointers back to the multi BVH */
+ for (size_t k = 0; k < bvh->geometry.size(); ++k) {
+ bvh->geometry[k]->bvh = geom_bvhs[k];
+ }
}
virtual void *osl_memory()
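The layout negotiation in get_bvh_layout_mask() computes both the intersection (layouts every sub-device supports) and the union (layouts at least one sub-device supports): multiple OptiX devices each need a private acceleration structure, and a mixed OptiX/Embree device set falls back to one BVH per device. A sketch of that logic with plain bit masks (values illustrative, not the real BVHLayout enum):

#include <cstdint>
#include <vector>

enum : uint32_t {
  LAYOUT_NONE = 0,
  LAYOUT_BVH2 = 1 << 0,
  LAYOUT_EMBREE = 1 << 1,
  LAYOUT_OPTIX = 1 << 2,
  LAYOUT_MULTI_OPTIX = 1 << 3,
  LAYOUT_MULTI_OPTIX_EMBREE = 1 << 4,
};

uint32_t negotiate_layout(const std::vector<uint32_t> &device_masks) {
  uint32_t common = ~0u;  // layouts supported by every device
  uint32_t any = 0u;      // layouts supported by at least one device
  for (const uint32_t mask : device_masks) {
    common &= mask;
    any |= mask;
  }
  if (common == LAYOUT_OPTIX)
    return LAYOUT_MULTI_OPTIX;  // every OptiX device needs its own AS
  if ((any & (LAYOUT_OPTIX | LAYOUT_EMBREE)) == (LAYOUT_OPTIX | LAYOUT_EMBREE))
    return LAYOUT_MULTI_OPTIX_EMBREE;  // no common layout: one BVH per device
  return common;
}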
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
index c6276c1e955..a721f426dfe 100644
--- a/intern/cycles/device/device_optix.cpp
+++ b/intern/cycles/device/device_optix.cpp
@@ -31,6 +31,7 @@
# include "util/util_logging.h"
# include "util/util_md5.h"
# include "util/util_path.h"
+# include "util/util_progress.h"
# include "util/util_time.h"
# ifdef WITH_CUDA_DYNLOAD
@@ -186,7 +187,6 @@ class OptiXDevice : public CUDADevice {
bool motion_blur = false;
device_vector<SbtRecord> sbt_data;
device_only_memory<KernelParams> launch_params;
- vector<CUdeviceptr> as_mem;
OptixTraversableHandle tlas_handle = 0;
OptixDenoiser denoiser = NULL;
@@ -258,11 +258,6 @@ class OptiXDevice : public CUDADevice {
// Make CUDA context current
const CUDAContextScope scope(cuContext);
- // Free all acceleration structures
- for (CUdeviceptr mem : as_mem) {
- cuMemFree(mem);
- }
-
sbt_data.free();
texture_info.free();
launch_params.free();
@@ -1136,11 +1131,10 @@ class OptiXDevice : public CUDADevice {
}
}
- bool build_optix_bvh(const OptixBuildInput &build_input,
- uint16_t num_motion_steps,
- OptixTraversableHandle &out_handle,
- CUdeviceptr &out_data,
- OptixBuildOperation operation)
+ bool build_optix_bvh(BVHOptiX *bvh,
+ OptixBuildOperation operation,
+ const OptixBuildInput &build_input,
+ uint16_t num_motion_steps)
{
const CUDAContextScope scope(cuContext);
@@ -1166,24 +1160,21 @@ class OptiXDevice : public CUDADevice {
optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
// Allocate required output buffers
- device_only_memory<char> temp_mem(this, "temp_build_mem");
+ device_only_memory<char> temp_mem(this, "optix temp as build mem");
temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
if (!temp_mem.device_pointer)
return false; // Make sure temporary memory allocation succeeded
- // Move textures to host memory if there is not enough room
- size_t size = 0, free = 0;
- cuMemGetInfo(&free, &size);
- size = sizes.outputSizeInBytes + device_working_headroom;
- if (size >= free && can_map_host) {
- move_textures_to_host(size - free, false);
- }
-
+ device_only_memory<char> &out_data = bvh->as_data;
if (operation == OPTIX_BUILD_OPERATION_BUILD) {
- check_result_cuda_ret(cuMemAlloc(&out_data, sizes.outputSizeInBytes));
+ assert(out_data.device == this);
+ out_data.alloc_to_device(sizes.outputSizeInBytes);
+ if (!out_data.device_pointer)
+ return false;
+ }
+ else {
+ assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes);
}
-
- as_mem.push_back(out_data);
// Finally build the acceleration structure
OptixAccelEmitDesc compacted_size_prop;
@@ -1192,6 +1183,7 @@ class OptiXDevice : public CUDADevice {
// Make sure this pointer is 8-byte aligned
compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);
+ OptixTraversableHandle out_handle = 0;
check_result_optix_ret(optixAccelBuild(context,
NULL,
&options,
@@ -1199,11 +1191,12 @@ class OptiXDevice : public CUDADevice {
1,
temp_mem.device_pointer,
sizes.tempSizeInBytes,
- out_data,
+ out_data.device_pointer,
sizes.outputSizeInBytes,
&out_handle,
background ? &compacted_size_prop : NULL,
background ? 1 : 0));
+ bvh->traversable_handle = static_cast<uint64_t>(out_handle);
// Wait for all operations to finish
check_result_cuda_ret(cuStreamSynchronize(NULL));
@@ -1219,81 +1212,60 @@ class OptiXDevice : public CUDADevice {
// There is no point compacting if the size does not change
if (compacted_size < sizes.outputSizeInBytes) {
- CUdeviceptr compacted_data = 0;
- if (cuMemAlloc(&compacted_data, compacted_size) != CUDA_SUCCESS)
+ device_only_memory<char> compacted_data(this, "optix compacted as");
+ compacted_data.alloc_to_device(compacted_size);
+ if (!compacted_data.device_pointer)
// Do not compact if memory allocation for compacted acceleration structure fails
// Can just use the uncompacted one then, so succeed here regardless
return true;
- as_mem.push_back(compacted_data);
- check_result_optix_ret(optixAccelCompact(
- context, NULL, out_handle, compacted_data, compacted_size, &out_handle));
+ check_result_optix_ret(optixAccelCompact(context,
+ NULL,
+ out_handle,
+ compacted_data.device_pointer,
+ compacted_size,
+ &out_handle));
+ bvh->traversable_handle = static_cast<uint64_t>(out_handle);
// Wait for compaction to finish
check_result_cuda_ret(cuStreamSynchronize(NULL));
- // Free uncompacted acceleration structure
- cuMemFree(out_data);
- as_mem.erase(as_mem.end() - 2); // Remove 'out_data' from 'as_mem' array
+ std::swap(out_data.device_size, compacted_data.device_size);
+ std::swap(out_data.device_pointer, compacted_data.device_pointer);
}
}
return true;
}
- bool build_optix_bvh(BVH *bvh) override
+ void build_bvh(BVH *bvh, Progress &progress, bool refit) override
{
- assert(bvh->params.top_level);
-
- unsigned int num_instances = 0;
- unordered_map<Geometry *, OptixTraversableHandle> geometry;
- geometry.reserve(bvh->geometry.size());
+ BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
- // Free all previous acceleration structures which can not be refit
- std::set<CUdeviceptr> refit_mem;
+ progress.set_substatus("Building OptiX acceleration structure");
- for (Geometry *geom : bvh->geometry) {
- if (static_cast<BVHOptiX *>(geom->bvh)->do_refit) {
- refit_mem.insert(static_cast<BVHOptiX *>(geom->bvh)->optix_data_handle);
- }
- }
+ if (!bvh->params.top_level) {
+ assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1);
- for (CUdeviceptr mem : as_mem) {
- if (refit_mem.find(mem) == refit_mem.end()) {
- cuMemFree(mem);
- }
- }
-
- as_mem.clear();
-
- // Build bottom level acceleration structures (BLAS)
- // Note: Always keep this logic in sync with bvh_optix.cpp!
- for (Object *ob : bvh->objects) {
- // Skip geometry for which acceleration structure already exists
- Geometry *geom = ob->get_geometry();
- if (geometry.find(geom) != geometry.end())
- continue;
-
- OptixTraversableHandle handle;
- OptixBuildOperation operation;
- CUdeviceptr out_data;
- // Refit is only possible in viewport for now.
- if (static_cast<BVHOptiX *>(geom->bvh)->do_refit && !background) {
- out_data = static_cast<BVHOptiX *>(geom->bvh)->optix_data_handle;
- handle = static_cast<BVHOptiX *>(geom->bvh)->optix_handle;
+ // Refit is only possible in viewport for now (because AS is built with
+ // OPTIX_BUILD_FLAG_ALLOW_UPDATE only there, see above)
+ OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD;
+ if (refit && !background) {
+ assert(bvh_optix->traversable_handle != 0);
operation = OPTIX_BUILD_OPERATION_UPDATE;
}
else {
- out_data = 0;
- handle = 0;
- operation = OPTIX_BUILD_OPERATION_BUILD;
+ bvh_optix->as_data.free();
+ bvh_optix->traversable_handle = 0;
}
+ // Build bottom level acceleration structures (BLAS)
+ Geometry *const geom = bvh->geometry[0];
if (geom->geometry_type == Geometry::HAIR) {
// Build BLAS for curve primitives
- Hair *const hair = static_cast<Hair *const>(ob->get_geometry());
+ Hair *const hair = static_cast<Hair *const>(geom);
if (hair->num_curves() == 0) {
- continue;
+ return;
}
const size_t num_segments = hair->num_segments();
@@ -1304,10 +1276,10 @@ class OptiXDevice : public CUDADevice {
num_motion_steps = hair->get_motion_steps();
}
- device_vector<OptixAabb> aabb_data(this, "temp_aabb_data", MEM_READ_ONLY);
+ device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY);
# if OPTIX_ABI_VERSION >= 36
- device_vector<int> index_data(this, "temp_index_data", MEM_READ_ONLY);
- device_vector<float4> vertex_data(this, "temp_vertex_data", MEM_READ_ONLY);
+ device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
+ device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
// Four control points for each curve segment
const size_t num_vertices = num_segments * 4;
if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
@@ -1325,7 +1297,7 @@ class OptiXDevice : public CUDADevice {
size_t center_step = (num_motion_steps - 1) / 2;
if (step != center_step) {
size_t attr_offset = (step > center_step) ? step - 1 : step;
- // Technically this is a float4 array, but sizeof(float3) is the same as sizeof(float4)
+ // Technically this is a float4 array, but sizeof(float3) == sizeof(float4)
keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size();
}
@@ -1452,22 +1424,15 @@ class OptiXDevice : public CUDADevice {
# endif
}
- // Allocate memory for new BLAS and build it
- if (build_optix_bvh(build_input, num_motion_steps, handle, out_data, operation)) {
- geometry.insert({ob->get_geometry(), handle});
- static_cast<BVHOptiX *>(geom->bvh)->optix_data_handle = out_data;
- static_cast<BVHOptiX *>(geom->bvh)->optix_handle = handle;
- static_cast<BVHOptiX *>(geom->bvh)->do_refit = false;
- }
- else {
- return false;
+ if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
+ progress.set_error("Failed to build OptiX acceleration structure");
}
}
else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) {
// Build BLAS for triangle primitives
- Mesh *const mesh = static_cast<Mesh *const>(ob->get_geometry());
+ Mesh *const mesh = static_cast<Mesh *const>(geom);
if (mesh->num_triangles() == 0) {
- continue;
+ return;
}
const size_t num_verts = mesh->get_verts().size();
@@ -1478,12 +1443,12 @@ class OptiXDevice : public CUDADevice {
num_motion_steps = mesh->get_motion_steps();
}
- device_vector<int> index_data(this, "temp_index_data", MEM_READ_ONLY);
+ device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
index_data.alloc(mesh->get_triangles().size());
memcpy(index_data.data(),
mesh->get_triangles().data(),
mesh->get_triangles().size() * sizeof(int));
- device_vector<float3> vertex_data(this, "temp_vertex_data", MEM_READ_ONLY);
+ device_vector<float3> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
vertex_data.alloc(num_verts * num_motion_steps);
for (size_t step = 0; step < num_motion_steps; ++step) {
@@ -1528,190 +1493,208 @@ class OptiXDevice : public CUDADevice {
build_input.triangleArray.numSbtRecords = 1;
build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset;
- // Allocate memory for new BLAS and build it
- if (build_optix_bvh(build_input, num_motion_steps, handle, out_data, operation)) {
- geometry.insert({ob->get_geometry(), handle});
- static_cast<BVHOptiX *>(geom->bvh)->optix_data_handle = out_data;
- static_cast<BVHOptiX *>(geom->bvh)->optix_handle = handle;
- static_cast<BVHOptiX *>(geom->bvh)->do_refit = false;
- }
- else {
- return false;
+ if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
+ progress.set_error("Failed to build OptiX acceleration structure");
}
}
}
+ else {
+ unsigned int num_instances = 0;
+
+ bvh_optix->as_data.free();
+ bvh_optix->traversable_handle = 0;
+ bvh_optix->motion_transform_data.free();
- // Fill instance descriptions
+ // Fill instance descriptions
# if OPTIX_ABI_VERSION < 41
- device_vector<OptixAabb> aabbs(this, "tlas_aabbs", MEM_READ_ONLY);
- aabbs.alloc(bvh->objects.size());
+ device_vector<OptixAabb> aabbs(this, "optix tlas aabbs", MEM_READ_ONLY);
+ aabbs.alloc(bvh->objects.size());
# endif
- device_vector<OptixInstance> instances(this, "tlas_instances", MEM_READ_ONLY);
- instances.alloc(bvh->objects.size());
-
- for (Object *ob : bvh->objects) {
- // Skip non-traceable objects
- if (!ob->is_traceable())
- continue;
-
- // Create separate instance for triangle/curve meshes of an object
- const auto handle_it = geometry.find(ob->get_geometry());
- if (handle_it == geometry.end()) {
- continue;
+ device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY);
+ instances.alloc(bvh->objects.size());
+
+ // Calculate total motion transform size and allocate memory for them
+ size_t motion_transform_offset = 0;
+ if (motion_blur) {
+ size_t total_motion_transform_size = 0;
+ for (Object *const ob : bvh->objects) {
+ if (ob->is_traceable() && ob->use_motion()) {
+ total_motion_transform_size = align_up(total_motion_transform_size,
+ OPTIX_TRANSFORM_BYTE_ALIGNMENT);
+ const size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
+ total_motion_transform_size = total_motion_transform_size +
+ sizeof(OptixSRTMotionTransform) +
+ motion_keys * sizeof(OptixSRTData);
+ }
+ }
+
+ assert(bvh_optix->motion_transform_data.device == this);
+ bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size);
}
- OptixTraversableHandle handle = handle_it->second;
+
+ for (Object *ob : bvh->objects) {
+ // Skip non-traceable objects
+ if (!ob->is_traceable())
+ continue;
+
+ BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh);
+ OptixTraversableHandle handle = blas->traversable_handle;
# if OPTIX_ABI_VERSION < 41
- OptixAabb &aabb = aabbs[num_instances];
- aabb.minX = ob->bounds.min.x;
- aabb.minY = ob->bounds.min.y;
- aabb.minZ = ob->bounds.min.z;
- aabb.maxX = ob->bounds.max.x;
- aabb.maxY = ob->bounds.max.y;
- aabb.maxZ = ob->bounds.max.z;
+ OptixAabb &aabb = aabbs[num_instances];
+ aabb.minX = ob->bounds.min.x;
+ aabb.minY = ob->bounds.min.y;
+ aabb.minZ = ob->bounds.min.z;
+ aabb.maxX = ob->bounds.max.x;
+ aabb.maxY = ob->bounds.max.y;
+ aabb.maxZ = ob->bounds.max.z;
# endif
- OptixInstance &instance = instances[num_instances++];
- memset(&instance, 0, sizeof(instance));
+ OptixInstance &instance = instances[num_instances++];
+ memset(&instance, 0, sizeof(instance));
- // Clear transform to identity matrix
- instance.transform[0] = 1.0f;
- instance.transform[5] = 1.0f;
- instance.transform[10] = 1.0f;
+ // Clear transform to identity matrix
+ instance.transform[0] = 1.0f;
+ instance.transform[5] = 1.0f;
+ instance.transform[10] = 1.0f;
- // Set user instance ID to object index
- instance.instanceId = ob->get_device_index();
+ // Set user instance ID to object index
+ instance.instanceId = ob->get_device_index();
- // Have to have at least one bit in the mask, or else instance would always be culled
- instance.visibilityMask = 1;
+ // Have to have at least one bit in the mask, or else instance would always be culled
+ instance.visibilityMask = 1;
- if (ob->get_geometry()->has_volume) {
- // Volumes have a special bit set in the visibility mask so a trace can mask only volumes
- instance.visibilityMask |= 2;
- }
+ if (ob->get_geometry()->has_volume) {
+ // Volumes have a special bit set in the visibility mask so a trace can mask only volumes
+ instance.visibilityMask |= 2;
+ }
- if (ob->get_geometry()->geometry_type == Geometry::HAIR) {
- // Same applies to curves (so they can be skipped in local trace calls)
- instance.visibilityMask |= 4;
+ if (ob->get_geometry()->geometry_type == Geometry::HAIR) {
+ // Same applies to curves (so they can be skipped in local trace calls)
+ instance.visibilityMask |= 4;
# if OPTIX_ABI_VERSION >= 36
- if (motion_blur && ob->get_geometry()->has_motion_blur() &&
- DebugFlags().optix.curves_api &&
- static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
- // Select between motion blur and non-motion blur built-in intersection module
- instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
- }
+ if (motion_blur && ob->get_geometry()->has_motion_blur() &&
+ DebugFlags().optix.curves_api &&
+ static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
+ // Select between motion blur and non-motion blur built-in intersection module
+ instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
+ }
# endif
- }
-
- // Insert motion traversable if object has motion
- if (motion_blur && ob->use_motion()) {
- size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
- size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
- motion_keys * sizeof(OptixSRTData);
-
- const CUDAContextScope scope(cuContext);
-
- CUdeviceptr motion_transform_gpu = 0;
- check_result_cuda_ret(cuMemAlloc(&motion_transform_gpu, motion_transform_size));
- as_mem.push_back(motion_transform_gpu);
-
- // Allocate host side memory for motion transform and fill it with transform data
- OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
- new uint8_t[motion_transform_size]);
- motion_transform.child = handle;
- motion_transform.motionOptions.numKeys = ob->get_motion().size();
- motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
- motion_transform.motionOptions.timeBegin = 0.0f;
- motion_transform.motionOptions.timeEnd = 1.0f;
-
- OptixSRTData *const srt_data = motion_transform.srtData;
- array<DecomposedTransform> decomp(ob->get_motion().size());
- transform_motion_decompose(
- decomp.data(), ob->get_motion().data(), ob->get_motion().size());
-
- for (size_t i = 0; i < ob->get_motion().size(); ++i) {
- // Scale
- srt_data[i].sx = decomp[i].y.w; // scale.x.x
- srt_data[i].sy = decomp[i].z.w; // scale.y.y
- srt_data[i].sz = decomp[i].w.w; // scale.z.z
-
- // Shear
- srt_data[i].a = decomp[i].z.x; // scale.x.y
- srt_data[i].b = decomp[i].z.y; // scale.x.z
- srt_data[i].c = decomp[i].w.x; // scale.y.z
- assert(decomp[i].z.z == 0.0f); // scale.y.x
- assert(decomp[i].w.y == 0.0f); // scale.z.x
- assert(decomp[i].w.z == 0.0f); // scale.z.y
-
- // Pivot point
- srt_data[i].pvx = 0.0f;
- srt_data[i].pvy = 0.0f;
- srt_data[i].pvz = 0.0f;
-
- // Rotation
- srt_data[i].qx = decomp[i].x.x;
- srt_data[i].qy = decomp[i].x.y;
- srt_data[i].qz = decomp[i].x.z;
- srt_data[i].qw = decomp[i].x.w;
-
- // Translation
- srt_data[i].tx = decomp[i].y.x;
- srt_data[i].ty = decomp[i].y.y;
- srt_data[i].tz = decomp[i].y.z;
}
- // Upload motion transform to GPU
- cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
- delete[] reinterpret_cast<uint8_t *>(&motion_transform);
+ // Insert motion traversable if object has motion
+ if (motion_blur && ob->use_motion()) {
+ size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
+ size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
+ motion_keys * sizeof(OptixSRTData);
+
+ const CUDAContextScope scope(cuContext);
+
+ motion_transform_offset = align_up(motion_transform_offset,
+ OPTIX_TRANSFORM_BYTE_ALIGNMENT);
+ CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer +
+ motion_transform_offset;
+ motion_transform_offset += motion_transform_size;
+
+ // Allocate host side memory for motion transform and fill it with transform data
+ OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
+ new uint8_t[motion_transform_size]);
+ motion_transform.child = handle;
+ motion_transform.motionOptions.numKeys = ob->get_motion().size();
+ motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
+ motion_transform.motionOptions.timeBegin = 0.0f;
+ motion_transform.motionOptions.timeEnd = 1.0f;
+
+ OptixSRTData *const srt_data = motion_transform.srtData;
+ array<DecomposedTransform> decomp(ob->get_motion().size());
+ transform_motion_decompose(
+ decomp.data(), ob->get_motion().data(), ob->get_motion().size());
+
+ for (size_t i = 0; i < ob->get_motion().size(); ++i) {
+ // Scale
+ srt_data[i].sx = decomp[i].y.w; // scale.x.x
+ srt_data[i].sy = decomp[i].z.w; // scale.y.y
+ srt_data[i].sz = decomp[i].w.w; // scale.z.z
+
+ // Shear
+ srt_data[i].a = decomp[i].z.x; // scale.x.y
+ srt_data[i].b = decomp[i].z.y; // scale.x.z
+ srt_data[i].c = decomp[i].w.x; // scale.y.z
+ assert(decomp[i].z.z == 0.0f); // scale.y.x
+ assert(decomp[i].w.y == 0.0f); // scale.z.x
+ assert(decomp[i].w.z == 0.0f); // scale.z.y
+
+ // Pivot point
+ srt_data[i].pvx = 0.0f;
+ srt_data[i].pvy = 0.0f;
+ srt_data[i].pvz = 0.0f;
+
+ // Rotation
+ srt_data[i].qx = decomp[i].x.x;
+ srt_data[i].qy = decomp[i].x.y;
+ srt_data[i].qz = decomp[i].x.z;
+ srt_data[i].qw = decomp[i].x.w;
+
+ // Translation
+ srt_data[i].tx = decomp[i].y.x;
+ srt_data[i].ty = decomp[i].y.y;
+ srt_data[i].tz = decomp[i].y.z;
+ }
- // Disable instance transform if object uses motion transform already
- instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+ // Upload motion transform to GPU
+ cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
+ delete[] reinterpret_cast<uint8_t *>(&motion_transform);
- // Get traversable handle to motion transform
- optixConvertPointerToTraversableHandle(context,
- motion_transform_gpu,
- OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
- &instance.traversableHandle);
- }
- else {
- instance.traversableHandle = handle;
+ // Disable instance transform if object uses motion transform already
+ instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
- if (ob->get_geometry()->is_instanced()) {
- // Set transform matrix
- memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform));
+ // Get traversable handle to motion transform
+ optixConvertPointerToTraversableHandle(context,
+ motion_transform_gpu,
+ OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
+ &instance.traversableHandle);
}
else {
- // Disable instance transform if geometry already has it applied to vertex data
- instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
- // Non-instanced objects read ID from prim_object, so
- // distinguish them from instanced objects with high bit set
- instance.instanceId |= 0x800000;
+ instance.traversableHandle = handle;
+
+ if (ob->get_geometry()->is_instanced()) {
+ // Set transform matrix
+ memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform));
+ }
+ else {
+ // Disable instance transform if geometry already has it applied to vertex data
+ instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+ // Non-instanced objects read ID from prim_object, so
+ // distinguish them from instanced objects with high bit set
+ instance.instanceId |= 0x800000;
+ }
}
}
- }
- // Upload instance descriptions
+ // Upload instance descriptions
# if OPTIX_ABI_VERSION < 41
- aabbs.resize(num_instances);
- aabbs.copy_to_device();
+ aabbs.resize(num_instances);
+ aabbs.copy_to_device();
# endif
- instances.resize(num_instances);
- instances.copy_to_device();
+ instances.resize(num_instances);
+ instances.copy_to_device();
- // Build top-level acceleration structure (TLAS)
- OptixBuildInput build_input = {};
- build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
+ // Build top-level acceleration structure (TLAS)
+ OptixBuildInput build_input = {};
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
# if OPTIX_ABI_VERSION < 41 // Instance AABBs no longer need to be set since OptiX 7.2
- build_input.instanceArray.aabbs = aabbs.device_pointer;
- build_input.instanceArray.numAabbs = num_instances;
+ build_input.instanceArray.aabbs = aabbs.device_pointer;
+ build_input.instanceArray.numAabbs = num_instances;
# endif
- build_input.instanceArray.instances = instances.device_pointer;
- build_input.instanceArray.numInstances = num_instances;
+ build_input.instanceArray.instances = instances.device_pointer;
+ build_input.instanceArray.numInstances = num_instances;
- CUdeviceptr out_data = 0;
- tlas_handle = 0;
- return build_optix_bvh(build_input, 0, tlas_handle, out_data, OPTIX_BUILD_OPERATION_BUILD);
+ if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) {
+ progress.set_error("Failed to build OptiX acceleration structure");
+ }
+ tlas_handle = bvh_optix->traversable_handle;
+ }
}
void const_copy_to(const char *name, void *host, size_t size) override
@@ -1724,7 +1707,7 @@ class OptiXDevice : public CUDADevice {
if (strcmp(name, "__data") == 0) {
assert(size <= sizeof(KernelData));
- // Fix traversable handle on multi devices
+ // Update traversable handle (since it is different for each device on multi devices)
KernelData *const data = (KernelData *)host;
*(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
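The OptiX refactor moves acceleration-structure storage out of the device-wide as_mem list and into device_only_memory owned by each BVHOptiX, so its lifetime follows the BVH object; motion transforms are likewise packed into a single aligned allocation per BVH instead of one cuMemAlloc each. After compaction, the smaller buffer is swapped into place and the full-size build output is freed when the temporary goes out of scope. A minimal sketch of that swap-based ownership handoff (Buffer is a hypothetical stand-in for device_only_memory):

#include <cstddef>
#include <utility>

struct Buffer {
  void *pointer = nullptr;  // stands in for device_pointer
  size_t size = 0;          // stands in for device_size
  ~Buffer() { /* release the device allocation here */ }
};

// Swap the compacted buffer into the BVH-owned slot; the old full-size
// allocation now lives in 'compacted' and is released when it is
// destroyed at the end of the enclosing scope.
void adopt_compacted(Buffer &as_data, Buffer &compacted) {
  std::swap(as_data.pointer, compacted.pointer);
  std::swap(as_data.size, compacted.size);
}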