git.blender.org/blender.git
author     Patrick Mours <pmours@nvidia.com>   2020-12-10 16:18:25 +0300
committer  Patrick Mours <pmours@nvidia.com>   2020-12-11 15:24:29 +0300
commit     bfb6fce6594e9cf133bd18aee311c1e5e32dc799 (patch)
tree       7c813e17ea87e9aae64221b3ac7a8d42ab894c85 /intern/cycles/device
parent     d72ec16e70721408c875040325c984941687b4a2 (diff)
Cycles: Add CPU+GPU rendering support with OptiX
Adds support for building multiple BVH types in order to support using both
CPU and OptiX devices for rendering simultaneously. Primitive packing for
Embree and OptiX is now standalone, so it only needs to be run once and can
be shared between the two.

Additionally, BVH building was made a device call, so that each device
backend can decide how to perform the building. The multi-device for
instance creates a special multi-BVH that holds references to several
sub-BVHs, one for each sub-device.

Reviewed By: brecht, kevindietrich

Differential Revision: https://developer.blender.org/D9718
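To illustrate the device-call design described above, a minimal C++ sketch follows
(hypothetical, with simplified type and member names, not the actual Cycles sources):
BVH building becomes a virtual method on the device, and the multi-device override
fans the build out to one sub-BVH per sub-device, so the CPU (Embree) and GPU (OptiX)
backends can each build the acceleration structure they natively support.

#include <vector>

struct BVH { /* geometry, params, per-device sub-BVHs in the multi case */ };
struct Progress { /* build status and error reporting */ };

class Device {
 public:
  virtual ~Device() = default;

  virtual void build_bvh(BVH * /*bvh*/, Progress & /*progress*/, bool /*refit*/)
  {
    /* Default path: build the shared BVH2 layout usable by any backend. */
  }
};

class MultiDevice : public Device {
  std::vector<Device *> sub_devices; /* e.g. one CPU device and one OptiX device */

 public:
  void build_bvh(BVH *bvh, Progress &progress, bool refit) override
  {
    /* Broadcast the build: each backend decides how to build its own sub-BVH
     * (Embree scene on the CPU, OptiX acceleration structure on the GPU). */
    for (Device *sub : sub_devices) {
      BVH *sub_bvh = bvh; /* in Cycles this would be the matching sub-BVH of a multi-BVH */
      sub->build_bvh(sub_bvh, progress, refit);
    }
  }
};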
Diffstat (limited to 'intern/cycles/device')
-rw-r--r--  intern/cycles/device/cuda/device_cuda.h          1
-rw-r--r--  intern/cycles/device/cuda/device_cuda_impl.cpp   22
-rw-r--r--  intern/cycles/device/device.cpp                  15
-rw-r--r--  intern/cycles/device/device.h                    11
-rw-r--r--  intern/cycles/device/device_cpu.cpp              33
-rw-r--r--  intern/cycles/device/device_multi.cpp            77
-rw-r--r--  intern/cycles/device/device_optix.cpp           463
7 files changed, 355 insertions, 267 deletions
diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h
index e5e3e24165d..c3271c3cfcf 100644
--- a/intern/cycles/device/cuda/device_cuda.h
+++ b/intern/cycles/device/cuda/device_cuda.h
@@ -71,6 +71,7 @@ class CUDADevice : public Device {
};
typedef map<device_memory *, CUDAMem> CUDAMemMap;
CUDAMemMap cuda_mem_map;
+ thread_mutex cuda_mem_map_mutex;
struct PixelMem {
GLuint cuPBO;
diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp
index 01adf10f252..6c26b247743 100644
--- a/intern/cycles/device/cuda/device_cuda_impl.cpp
+++ b/intern/cycles/device/cuda/device_cuda_impl.cpp
@@ -718,8 +718,10 @@ void CUDADevice::init_host_memory()
void CUDADevice::load_texture_info()
{
if (need_texture_info) {
- texture_info.copy_to_device();
+ /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
+ * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
need_texture_info = false;
+ texture_info.copy_to_device();
}
}
@@ -988,6 +990,7 @@ void CUDADevice::mem_alloc(device_memory &mem)
assert(!"mem_alloc not supported for global memory.");
}
else {
+ thread_scoped_lock lock(cuda_mem_map_mutex);
generic_alloc(mem);
}
}
@@ -1006,10 +1009,10 @@ void CUDADevice::mem_copy_to(device_memory &mem)
tex_alloc((device_texture &)mem);
}
else {
+ thread_scoped_lock lock(cuda_mem_map_mutex);
if (!mem.device_pointer) {
generic_alloc(mem);
}
-
generic_copy_to(mem);
}
}
@@ -1048,6 +1051,7 @@ void CUDADevice::mem_zero(device_memory &mem)
/* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
* regardless of mem.host_pointer and mem.shared_pointer. */
+ thread_scoped_lock lock(cuda_mem_map_mutex);
if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
const CUDAContextScope scope(this);
cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
@@ -1069,6 +1073,7 @@ void CUDADevice::mem_free(device_memory &mem)
tex_free((device_texture &)mem);
}
else {
+ thread_scoped_lock lock(cuda_mem_map_mutex);
generic_free(mem);
}
}
@@ -1092,6 +1097,7 @@ void CUDADevice::const_copy_to(const char *name, void *host, size_t size)
void CUDADevice::global_alloc(device_memory &mem)
{
if (mem.is_resident(this)) {
+ thread_scoped_lock lock(cuda_mem_map_mutex);
generic_alloc(mem);
generic_copy_to(mem);
}
@@ -1102,6 +1108,7 @@ void CUDADevice::global_alloc(device_memory &mem)
void CUDADevice::global_free(device_memory &mem)
{
if (mem.is_resident(this) && mem.device_pointer) {
+ thread_scoped_lock lock(cuda_mem_map_mutex);
generic_free(mem);
}
}
@@ -1170,6 +1177,8 @@ void CUDADevice::tex_alloc(device_texture &mem)
size_t src_pitch = mem.data_width * dsize * mem.data_elements;
size_t dst_pitch = src_pitch;
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+
if (!mem.is_resident(this)) {
cmem = &cuda_mem_map[&mem];
cmem->texobject = 0;
@@ -1257,6 +1266,9 @@ void CUDADevice::tex_alloc(device_texture &mem)
cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
}
+ /* Unlock mutex before resizing texture info, since that may attempt to lock it again. */
+ lock.unlock();
+
/* Resize once */
const uint slot = mem.slot;
if (slot >= texture_info.size()) {
@@ -1305,6 +1317,11 @@ void CUDADevice::tex_alloc(device_texture &mem)
texDesc.filterMode = filter_mode;
texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+ /* Lock again and refresh the data pointer (in case another thread modified the map in the
+ * meantime). */
+ lock.lock();
+ cmem = &cuda_mem_map[&mem];
+
cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
texture_info[slot].data = (uint64_t)cmem->texobject;
@@ -1318,6 +1335,7 @@ void CUDADevice::tex_free(device_texture &mem)
{
if (mem.device_pointer) {
CUDAContextScope scope(this);
+ thread_scoped_lock lock(cuda_mem_map_mutex);
const CUDAMem &cmem = cuda_mem_map[&mem];
if (cmem.texobject) {
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index eb8fb8040e3..1efd628b79b 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -17,6 +17,8 @@
#include <stdlib.h>
#include <string.h>
+#include "bvh/bvh2.h"
+
#include "device/device.h"
#include "device/device_intern.h"
@@ -364,6 +366,19 @@ void Device::draw_pixels(device_memory &rgba,
}
}
+void Device::build_bvh(BVH *bvh, Progress &progress, bool refit)
+{
+ assert(bvh->params.bvh_layout == BVH_LAYOUT_BVH2);
+
+ BVH2 *const bvh2 = static_cast<BVH2 *>(bvh);
+ if (refit) {
+ bvh2->refit(progress);
+ }
+ else {
+ bvh2->build(progress, &stats);
+ }
+}
+
Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
{
#ifdef WITH_MULTI
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 2006db02ce7..e9b7cde7a16 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -373,12 +373,6 @@ class Device {
return NULL;
}
- /* Device specific pointer for BVH creation. Currently only used by Embree. */
- virtual void *bvh_device() const
- {
- return NULL;
- }
-
/* load/compile kernels, must be called before adding tasks */
virtual bool load_kernels(const DeviceRequestedFeatures & /*requested_features*/)
{
@@ -427,10 +421,7 @@ class Device {
const DeviceDrawParams &draw_params);
/* acceleration structure building */
- virtual bool build_optix_bvh(BVH *)
- {
- return false;
- }
+ virtual void build_bvh(BVH *bvh, Progress &progress, bool refit);
#ifdef WITH_NETWORK
/* networking */
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 6912ac1e638..fea4fc53d1f 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -47,6 +47,8 @@
#include "kernel/osl/osl_globals.h"
// clang-format on
+#include "bvh/bvh_embree.h"
+
#include "render/buffers.h"
#include "render/coverage.h"
@@ -188,6 +190,7 @@ class CPUDevice : public Device {
#endif
thread_spin_lock oidn_task_lock;
#ifdef WITH_EMBREE
+ RTCScene embree_scene = NULL;
RTCDevice embree_device;
#endif
@@ -472,6 +475,15 @@ class CPUDevice : public Device {
virtual void const_copy_to(const char *name, void *host, size_t size) override
{
+#ifdef WITH_EMBREE
+ if (strcmp(name, "__data") == 0) {
+ assert(size <= sizeof(KernelData));
+
+ // Update scene handle (since it is different for each device on multi devices)
+ KernelData *const data = (KernelData *)host;
+ data->bvh.scene = embree_scene;
+ }
+#endif
kernel_const_copy(&kernel_globals, name, host, size);
}
@@ -537,13 +549,26 @@ class CPUDevice : public Device {
#endif
}
- void *bvh_device() const override
+ void build_bvh(BVH *bvh, Progress &progress, bool refit) override
{
#ifdef WITH_EMBREE
- return embree_device;
-#else
- return NULL;
+ if (bvh->params.bvh_layout == BVH_LAYOUT_EMBREE ||
+ bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) {
+ BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh);
+ if (refit) {
+ bvh_embree->refit(progress);
+ }
+ else {
+ bvh_embree->build(progress, &stats, embree_device);
+ }
+
+ if (bvh->params.top_level) {
+ embree_scene = bvh_embree->scene;
+ }
+ }
+ else
#endif
+ Device::build_bvh(bvh, progress, refit);
}
void thread_run(DeviceTask &task)
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index 2e72a0b4393..ed6e764b995 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -17,11 +17,14 @@
#include <sstream>
#include <stdlib.h>
+#include "bvh/bvh_multi.h"
+
#include "device/device.h"
#include "device/device_intern.h"
#include "device/device_network.h"
#include "render/buffers.h"
+#include "render/geometry.h"
#include "util/util_foreach.h"
#include "util/util_list.h"
@@ -164,9 +167,24 @@ class MultiDevice : public Device {
virtual BVHLayoutMask get_bvh_layout_mask() const
{
BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL;
+ BVHLayoutMask bvh_layout_mask_all = BVH_LAYOUT_NONE;
foreach (const SubDevice &sub_device, devices) {
- bvh_layout_mask &= sub_device.device->get_bvh_layout_mask();
+ BVHLayoutMask device_bvh_layout_mask = sub_device.device->get_bvh_layout_mask();
+ bvh_layout_mask &= device_bvh_layout_mask;
+ bvh_layout_mask_all |= device_bvh_layout_mask;
+ }
+
+ /* With multiple OptiX devices, every device needs its own acceleration structure */
+ if (bvh_layout_mask == BVH_LAYOUT_OPTIX) {
+ return BVH_LAYOUT_MULTI_OPTIX;
}
+
+ /* When devices do not share a common BVH layout, fall back to creating one for each */
+ const BVHLayoutMask BVH_LAYOUT_OPTIX_EMBREE = (BVH_LAYOUT_OPTIX | BVH_LAYOUT_EMBREE);
+ if ((bvh_layout_mask_all & BVH_LAYOUT_OPTIX_EMBREE) == BVH_LAYOUT_OPTIX_EMBREE) {
+ return BVH_LAYOUT_MULTI_OPTIX_EMBREE;
+ }
+
return bvh_layout_mask;
}
@@ -227,21 +245,58 @@ class MultiDevice : public Device {
return result;
}
- bool build_optix_bvh(BVH *bvh)
+ void build_bvh(BVH *bvh, Progress &progress, bool refit) override
{
+ /* Try to build and share a single acceleration structure, if possible */
+ if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2) {
+ devices.back().device->build_bvh(bvh, progress, refit);
+ return;
+ }
+
+ BVHMulti *const bvh_multi = static_cast<BVHMulti *>(bvh);
+ bvh_multi->sub_bvhs.resize(devices.size());
+
+ vector<BVHMulti *> geom_bvhs;
+ geom_bvhs.reserve(bvh->geometry.size());
+ foreach (Geometry *geom, bvh->geometry) {
+ geom_bvhs.push_back(static_cast<BVHMulti *>(geom->bvh));
+ }
+
/* Broadcast acceleration structure build to all render devices */
+ size_t i = 0;
foreach (SubDevice &sub, devices) {
- if (!sub.device->build_optix_bvh(bvh))
- return false;
+ /* Change geometry BVH pointers to the sub BVH */
+ for (size_t k = 0; k < bvh->geometry.size(); ++k) {
+ bvh->geometry[k]->bvh = geom_bvhs[k]->sub_bvhs[i];
+ }
+
+ if (!bvh_multi->sub_bvhs[i]) {
+ BVHParams params = bvh->params;
+ if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX)
+ params.bvh_layout = BVH_LAYOUT_OPTIX;
+ else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE)
+ params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? BVH_LAYOUT_OPTIX :
+ BVH_LAYOUT_EMBREE;
+
+ /* Skip building a bottom level acceleration structure for non-instanced geometry on Embree
+ * (since they are put into the top level directly, see bvh_embree.cpp) */
+ if (!params.top_level && params.bvh_layout == BVH_LAYOUT_EMBREE &&
+ !bvh->geometry[0]->is_instanced()) {
+ i++;
+ continue;
+ }
+
+ bvh_multi->sub_bvhs[i] = BVH::create(params, bvh->geometry, bvh->objects, sub.device);
+ }
+
+ sub.device->build_bvh(bvh_multi->sub_bvhs[i], progress, refit);
+ i++;
}
- return true;
- }
- virtual void *bvh_device() const
- {
- /* CPU devices will always be at the back, so simply choose the last one.
- There should only ever be one CPU device anyway and we need the Embree device for it. */
- return devices.back().device->bvh_device();
+ /* Change geometry BVH pointers back to the multi BVH */
+ for (size_t k = 0; k < bvh->geometry.size(); ++k) {
+ bvh->geometry[k]->bvh = geom_bvhs[k];
+ }
}
virtual void *osl_memory()
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
index c6276c1e955..a721f426dfe 100644
--- a/intern/cycles/device/device_optix.cpp
+++ b/intern/cycles/device/device_optix.cpp
@@ -31,6 +31,7 @@
# include "util/util_logging.h"
# include "util/util_md5.h"
# include "util/util_path.h"
+# include "util/util_progress.h"
# include "util/util_time.h"
# ifdef WITH_CUDA_DYNLOAD
@@ -186,7 +187,6 @@ class OptiXDevice : public CUDADevice {
bool motion_blur = false;
device_vector<SbtRecord> sbt_data;
device_only_memory<KernelParams> launch_params;
- vector<CUdeviceptr> as_mem;
OptixTraversableHandle tlas_handle = 0;
OptixDenoiser denoiser = NULL;
@@ -258,11 +258,6 @@ class OptiXDevice : public CUDADevice {
// Make CUDA context current
const CUDAContextScope scope(cuContext);
- // Free all acceleration structures
- for (CUdeviceptr mem : as_mem) {
- cuMemFree(mem);
- }
-
sbt_data.free();
texture_info.free();
launch_params.free();
@@ -1136,11 +1131,10 @@ class OptiXDevice : public CUDADevice {
}
}
- bool build_optix_bvh(const OptixBuildInput &build_input,
- uint16_t num_motion_steps,
- OptixTraversableHandle &out_handle,
- CUdeviceptr &out_data,
- OptixBuildOperation operation)
+ bool build_optix_bvh(BVHOptiX *bvh,
+ OptixBuildOperation operation,
+ const OptixBuildInput &build_input,
+ uint16_t num_motion_steps)
{
const CUDAContextScope scope(cuContext);
@@ -1166,24 +1160,21 @@ class OptiXDevice : public CUDADevice {
optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
// Allocate required output buffers
- device_only_memory<char> temp_mem(this, "temp_build_mem");
+ device_only_memory<char> temp_mem(this, "optix temp as build mem");
temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
if (!temp_mem.device_pointer)
return false; // Make sure temporary memory allocation succeeded
- // Move textures to host memory if there is not enough room
- size_t size = 0, free = 0;
- cuMemGetInfo(&free, &size);
- size = sizes.outputSizeInBytes + device_working_headroom;
- if (size >= free && can_map_host) {
- move_textures_to_host(size - free, false);
- }
-
+ device_only_memory<char> &out_data = bvh->as_data;
if (operation == OPTIX_BUILD_OPERATION_BUILD) {
- check_result_cuda_ret(cuMemAlloc(&out_data, sizes.outputSizeInBytes));
+ assert(out_data.device == this);
+ out_data.alloc_to_device(sizes.outputSizeInBytes);
+ if (!out_data.device_pointer)
+ return false;
+ }
+ else {
+ assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes);
}
-
- as_mem.push_back(out_data);
// Finally build the acceleration structure
OptixAccelEmitDesc compacted_size_prop;
@@ -1192,6 +1183,7 @@ class OptiXDevice : public CUDADevice {
// Make sure this pointer is 8-byte aligned
compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);
+ OptixTraversableHandle out_handle = 0;
check_result_optix_ret(optixAccelBuild(context,
NULL,
&options,
@@ -1199,11 +1191,12 @@ class OptiXDevice : public CUDADevice {
1,
temp_mem.device_pointer,
sizes.tempSizeInBytes,
- out_data,
+ out_data.device_pointer,
sizes.outputSizeInBytes,
&out_handle,
background ? &compacted_size_prop : NULL,
background ? 1 : 0));
+ bvh->traversable_handle = static_cast<uint64_t>(out_handle);
// Wait for all operations to finish
check_result_cuda_ret(cuStreamSynchronize(NULL));
@@ -1219,81 +1212,60 @@ class OptiXDevice : public CUDADevice {
// There is no point compacting if the size does not change
if (compacted_size < sizes.outputSizeInBytes) {
- CUdeviceptr compacted_data = 0;
- if (cuMemAlloc(&compacted_data, compacted_size) != CUDA_SUCCESS)
+ device_only_memory<char> compacted_data(this, "optix compacted as");
+ compacted_data.alloc_to_device(compacted_size);
+ if (!compacted_data.device_pointer)
// Do not compact if memory allocation for compacted acceleration structure fails
// Can just use the uncompacted one then, so succeed here regardless
return true;
- as_mem.push_back(compacted_data);
- check_result_optix_ret(optixAccelCompact(
- context, NULL, out_handle, compacted_data, compacted_size, &out_handle));
+ check_result_optix_ret(optixAccelCompact(context,
+ NULL,
+ out_handle,
+ compacted_data.device_pointer,
+ compacted_size,
+ &out_handle));
+ bvh->traversable_handle = static_cast<uint64_t>(out_handle);
// Wait for compaction to finish
check_result_cuda_ret(cuStreamSynchronize(NULL));
- // Free uncompacted acceleration structure
- cuMemFree(out_data);
- as_mem.erase(as_mem.end() - 2); // Remove 'out_data' from 'as_mem' array
+ std::swap(out_data.device_size, compacted_data.device_size);
+ std::swap(out_data.device_pointer, compacted_data.device_pointer);
}
}
return true;
}
- bool build_optix_bvh(BVH *bvh) override
+ void build_bvh(BVH *bvh, Progress &progress, bool refit) override
{
- assert(bvh->params.top_level);
-
- unsigned int num_instances = 0;
- unordered_map<Geometry *, OptixTraversableHandle> geometry;
- geometry.reserve(bvh->geometry.size());
+ BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
- // Free all previous acceleration structures which can not be refit
- std::set<CUdeviceptr> refit_mem;
+ progress.set_substatus("Building OptiX acceleration structure");
- for (Geometry *geom : bvh->geometry) {
- if (static_cast<BVHOptiX *>(geom->bvh)->do_refit) {
- refit_mem.insert(static_cast<BVHOptiX *>(geom->bvh)->optix_data_handle);
- }
- }
+ if (!bvh->params.top_level) {
+ assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1);
- for (CUdeviceptr mem : as_mem) {
- if (refit_mem.find(mem) == refit_mem.end()) {
- cuMemFree(mem);
- }
- }
-
- as_mem.clear();
-
- // Build bottom level acceleration structures (BLAS)
- // Note: Always keep this logic in sync with bvh_optix.cpp!
- for (Object *ob : bvh->objects) {
- // Skip geometry for which acceleration structure already exists
- Geometry *geom = ob->get_geometry();
- if (geometry.find(geom) != geometry.end())
- continue;
-
- OptixTraversableHandle handle;
- OptixBuildOperation operation;
- CUdeviceptr out_data;
- // Refit is only possible in viewport for now.
- if (static_cast<BVHOptiX *>(geom->bvh)->do_refit && !background) {
- out_data = static_cast<BVHOptiX *>(geom->bvh)->optix_data_handle;
- handle = static_cast<BVHOptiX *>(geom->bvh)->optix_handle;
+ // Refit is only possible in viewport for now (because AS is built with
+ // OPTIX_BUILD_FLAG_ALLOW_UPDATE only there, see above)
+ OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD;
+ if (refit && !background) {
+ assert(bvh_optix->traversable_handle != 0);
operation = OPTIX_BUILD_OPERATION_UPDATE;
}
else {
- out_data = 0;
- handle = 0;
- operation = OPTIX_BUILD_OPERATION_BUILD;
+ bvh_optix->as_data.free();
+ bvh_optix->traversable_handle = 0;
}
+ // Build bottom level acceleration structures (BLAS)
+ Geometry *const geom = bvh->geometry[0];
if (geom->geometry_type == Geometry::HAIR) {
// Build BLAS for curve primitives
- Hair *const hair = static_cast<Hair *const>(ob->get_geometry());
+ Hair *const hair = static_cast<Hair *const>(geom);
if (hair->num_curves() == 0) {
- continue;
+ return;
}
const size_t num_segments = hair->num_segments();
@@ -1304,10 +1276,10 @@ class OptiXDevice : public CUDADevice {
num_motion_steps = hair->get_motion_steps();
}
- device_vector<OptixAabb> aabb_data(this, "temp_aabb_data", MEM_READ_ONLY);
+ device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY);
# if OPTIX_ABI_VERSION >= 36
- device_vector<int> index_data(this, "temp_index_data", MEM_READ_ONLY);
- device_vector<float4> vertex_data(this, "temp_vertex_data", MEM_READ_ONLY);
+ device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
+ device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
// Four control points for each curve segment
const size_t num_vertices = num_segments * 4;
if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
@@ -1325,7 +1297,7 @@ class OptiXDevice : public CUDADevice {
size_t center_step = (num_motion_steps - 1) / 2;
if (step != center_step) {
size_t attr_offset = (step > center_step) ? step - 1 : step;
- // Technically this is a float4 array, but sizeof(float3) is the same as sizeof(float4)
+ // Technically this is a float4 array, but sizeof(float3) == sizeof(float4)
keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size();
}
@@ -1452,22 +1424,15 @@ class OptiXDevice : public CUDADevice {
# endif
}
- // Allocate memory for new BLAS and build it
- if (build_optix_bvh(build_input, num_motion_steps, handle, out_data, operation)) {
- geometry.insert({ob->get_geometry(), handle});
- static_cast<BVHOptiX *>(geom->bvh)->optix_data_handle = out_data;
- static_cast<BVHOptiX *>(geom->bvh)->optix_handle = handle;
- static_cast<BVHOptiX *>(geom->bvh)->do_refit = false;
- }
- else {
- return false;
+ if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
+ progress.set_error("Failed to build OptiX acceleration structure");
}
}
else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) {
// Build BLAS for triangle primitives
- Mesh *const mesh = static_cast<Mesh *const>(ob->get_geometry());
+ Mesh *const mesh = static_cast<Mesh *const>(geom);
if (mesh->num_triangles() == 0) {
- continue;
+ return;
}
const size_t num_verts = mesh->get_verts().size();
@@ -1478,12 +1443,12 @@ class OptiXDevice : public CUDADevice {
num_motion_steps = mesh->get_motion_steps();
}
- device_vector<int> index_data(this, "temp_index_data", MEM_READ_ONLY);
+ device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
index_data.alloc(mesh->get_triangles().size());
memcpy(index_data.data(),
mesh->get_triangles().data(),
mesh->get_triangles().size() * sizeof(int));
- device_vector<float3> vertex_data(this, "temp_vertex_data", MEM_READ_ONLY);
+ device_vector<float3> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
vertex_data.alloc(num_verts * num_motion_steps);
for (size_t step = 0; step < num_motion_steps; ++step) {
@@ -1528,190 +1493,208 @@ class OptiXDevice : public CUDADevice {
build_input.triangleArray.numSbtRecords = 1;
build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset;
- // Allocate memory for new BLAS and build it
- if (build_optix_bvh(build_input, num_motion_steps, handle, out_data, operation)) {
- geometry.insert({ob->get_geometry(), handle});
- static_cast<BVHOptiX *>(geom->bvh)->optix_data_handle = out_data;
- static_cast<BVHOptiX *>(geom->bvh)->optix_handle = handle;
- static_cast<BVHOptiX *>(geom->bvh)->do_refit = false;
- }
- else {
- return false;
+ if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
+ progress.set_error("Failed to build OptiX acceleration structure");
}
}
}
+ else {
+ unsigned int num_instances = 0;
+
+ bvh_optix->as_data.free();
+ bvh_optix->traversable_handle = 0;
+ bvh_optix->motion_transform_data.free();
- // Fill instance descriptions
+ // Fill instance descriptions
# if OPTIX_ABI_VERSION < 41
- device_vector<OptixAabb> aabbs(this, "tlas_aabbs", MEM_READ_ONLY);
- aabbs.alloc(bvh->objects.size());
+ device_vector<OptixAabb> aabbs(this, "optix tlas aabbs", MEM_READ_ONLY);
+ aabbs.alloc(bvh->objects.size());
# endif
- device_vector<OptixInstance> instances(this, "tlas_instances", MEM_READ_ONLY);
- instances.alloc(bvh->objects.size());
-
- for (Object *ob : bvh->objects) {
- // Skip non-traceable objects
- if (!ob->is_traceable())
- continue;
-
- // Create separate instance for triangle/curve meshes of an object
- const auto handle_it = geometry.find(ob->get_geometry());
- if (handle_it == geometry.end()) {
- continue;
+ device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY);
+ instances.alloc(bvh->objects.size());
+
+ // Calculate total motion transform size and allocate memory for them
+ size_t motion_transform_offset = 0;
+ if (motion_blur) {
+ size_t total_motion_transform_size = 0;
+ for (Object *const ob : bvh->objects) {
+ if (ob->is_traceable() && ob->use_motion()) {
+ total_motion_transform_size = align_up(total_motion_transform_size,
+ OPTIX_TRANSFORM_BYTE_ALIGNMENT);
+ const size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
+ total_motion_transform_size = total_motion_transform_size +
+ sizeof(OptixSRTMotionTransform) +
+ motion_keys * sizeof(OptixSRTData);
+ }
+ }
+
+ assert(bvh_optix->motion_transform_data.device == this);
+ bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size);
}
- OptixTraversableHandle handle = handle_it->second;
+
+ for (Object *ob : bvh->objects) {
+ // Skip non-traceable objects
+ if (!ob->is_traceable())
+ continue;
+
+ BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh);
+ OptixTraversableHandle handle = blas->traversable_handle;
# if OPTIX_ABI_VERSION < 41
- OptixAabb &aabb = aabbs[num_instances];
- aabb.minX = ob->bounds.min.x;
- aabb.minY = ob->bounds.min.y;
- aabb.minZ = ob->bounds.min.z;
- aabb.maxX = ob->bounds.max.x;
- aabb.maxY = ob->bounds.max.y;
- aabb.maxZ = ob->bounds.max.z;
+ OptixAabb &aabb = aabbs[num_instances];
+ aabb.minX = ob->bounds.min.x;
+ aabb.minY = ob->bounds.min.y;
+ aabb.minZ = ob->bounds.min.z;
+ aabb.maxX = ob->bounds.max.x;
+ aabb.maxY = ob->bounds.max.y;
+ aabb.maxZ = ob->bounds.max.z;
# endif
- OptixInstance &instance = instances[num_instances++];
- memset(&instance, 0, sizeof(instance));
+ OptixInstance &instance = instances[num_instances++];
+ memset(&instance, 0, sizeof(instance));
- // Clear transform to identity matrix
- instance.transform[0] = 1.0f;
- instance.transform[5] = 1.0f;
- instance.transform[10] = 1.0f;
+ // Clear transform to identity matrix
+ instance.transform[0] = 1.0f;
+ instance.transform[5] = 1.0f;
+ instance.transform[10] = 1.0f;
- // Set user instance ID to object index
- instance.instanceId = ob->get_device_index();
+ // Set user instance ID to object index
+ instance.instanceId = ob->get_device_index();
- // Have to have at least one bit in the mask, or else instance would always be culled
- instance.visibilityMask = 1;
+ // Have to have at least one bit in the mask, or else instance would always be culled
+ instance.visibilityMask = 1;
- if (ob->get_geometry()->has_volume) {
- // Volumes have a special bit set in the visibility mask so a trace can mask only volumes
- instance.visibilityMask |= 2;
- }
+ if (ob->get_geometry()->has_volume) {
+ // Volumes have a special bit set in the visibility mask so a trace can mask only volumes
+ instance.visibilityMask |= 2;
+ }
- if (ob->get_geometry()->geometry_type == Geometry::HAIR) {
- // Same applies to curves (so they can be skipped in local trace calls)
- instance.visibilityMask |= 4;
+ if (ob->get_geometry()->geometry_type == Geometry::HAIR) {
+ // Same applies to curves (so they can be skipped in local trace calls)
+ instance.visibilityMask |= 4;
# if OPTIX_ABI_VERSION >= 36
- if (motion_blur && ob->get_geometry()->has_motion_blur() &&
- DebugFlags().optix.curves_api &&
- static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
- // Select between motion blur and non-motion blur built-in intersection module
- instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
- }
+ if (motion_blur && ob->get_geometry()->has_motion_blur() &&
+ DebugFlags().optix.curves_api &&
+ static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
+ // Select between motion blur and non-motion blur built-in intersection module
+ instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
+ }
# endif
- }
-
- // Insert motion traversable if object has motion
- if (motion_blur && ob->use_motion()) {
- size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
- size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
- motion_keys * sizeof(OptixSRTData);
-
- const CUDAContextScope scope(cuContext);
-
- CUdeviceptr motion_transform_gpu = 0;
- check_result_cuda_ret(cuMemAlloc(&motion_transform_gpu, motion_transform_size));
- as_mem.push_back(motion_transform_gpu);
-
- // Allocate host side memory for motion transform and fill it with transform data
- OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
- new uint8_t[motion_transform_size]);
- motion_transform.child = handle;
- motion_transform.motionOptions.numKeys = ob->get_motion().size();
- motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
- motion_transform.motionOptions.timeBegin = 0.0f;
- motion_transform.motionOptions.timeEnd = 1.0f;
-
- OptixSRTData *const srt_data = motion_transform.srtData;
- array<DecomposedTransform> decomp(ob->get_motion().size());
- transform_motion_decompose(
- decomp.data(), ob->get_motion().data(), ob->get_motion().size());
-
- for (size_t i = 0; i < ob->get_motion().size(); ++i) {
- // Scale
- srt_data[i].sx = decomp[i].y.w; // scale.x.x
- srt_data[i].sy = decomp[i].z.w; // scale.y.y
- srt_data[i].sz = decomp[i].w.w; // scale.z.z
-
- // Shear
- srt_data[i].a = decomp[i].z.x; // scale.x.y
- srt_data[i].b = decomp[i].z.y; // scale.x.z
- srt_data[i].c = decomp[i].w.x; // scale.y.z
- assert(decomp[i].z.z == 0.0f); // scale.y.x
- assert(decomp[i].w.y == 0.0f); // scale.z.x
- assert(decomp[i].w.z == 0.0f); // scale.z.y
-
- // Pivot point
- srt_data[i].pvx = 0.0f;
- srt_data[i].pvy = 0.0f;
- srt_data[i].pvz = 0.0f;
-
- // Rotation
- srt_data[i].qx = decomp[i].x.x;
- srt_data[i].qy = decomp[i].x.y;
- srt_data[i].qz = decomp[i].x.z;
- srt_data[i].qw = decomp[i].x.w;
-
- // Translation
- srt_data[i].tx = decomp[i].y.x;
- srt_data[i].ty = decomp[i].y.y;
- srt_data[i].tz = decomp[i].y.z;
}
- // Upload motion transform to GPU
- cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
- delete[] reinterpret_cast<uint8_t *>(&motion_transform);
+ // Insert motion traversable if object has motion
+ if (motion_blur && ob->use_motion()) {
+ size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
+ size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
+ motion_keys * sizeof(OptixSRTData);
+
+ const CUDAContextScope scope(cuContext);
+
+ motion_transform_offset = align_up(motion_transform_offset,
+ OPTIX_TRANSFORM_BYTE_ALIGNMENT);
+ CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer +
+ motion_transform_offset;
+ motion_transform_offset += motion_transform_size;
+
+ // Allocate host side memory for motion transform and fill it with transform data
+ OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
+ new uint8_t[motion_transform_size]);
+ motion_transform.child = handle;
+ motion_transform.motionOptions.numKeys = ob->get_motion().size();
+ motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
+ motion_transform.motionOptions.timeBegin = 0.0f;
+ motion_transform.motionOptions.timeEnd = 1.0f;
+
+ OptixSRTData *const srt_data = motion_transform.srtData;
+ array<DecomposedTransform> decomp(ob->get_motion().size());
+ transform_motion_decompose(
+ decomp.data(), ob->get_motion().data(), ob->get_motion().size());
+
+ for (size_t i = 0; i < ob->get_motion().size(); ++i) {
+ // Scale
+ srt_data[i].sx = decomp[i].y.w; // scale.x.x
+ srt_data[i].sy = decomp[i].z.w; // scale.y.y
+ srt_data[i].sz = decomp[i].w.w; // scale.z.z
+
+ // Shear
+ srt_data[i].a = decomp[i].z.x; // scale.x.y
+ srt_data[i].b = decomp[i].z.y; // scale.x.z
+ srt_data[i].c = decomp[i].w.x; // scale.y.z
+ assert(decomp[i].z.z == 0.0f); // scale.y.x
+ assert(decomp[i].w.y == 0.0f); // scale.z.x
+ assert(decomp[i].w.z == 0.0f); // scale.z.y
+
+ // Pivot point
+ srt_data[i].pvx = 0.0f;
+ srt_data[i].pvy = 0.0f;
+ srt_data[i].pvz = 0.0f;
+
+ // Rotation
+ srt_data[i].qx = decomp[i].x.x;
+ srt_data[i].qy = decomp[i].x.y;
+ srt_data[i].qz = decomp[i].x.z;
+ srt_data[i].qw = decomp[i].x.w;
+
+ // Translation
+ srt_data[i].tx = decomp[i].y.x;
+ srt_data[i].ty = decomp[i].y.y;
+ srt_data[i].tz = decomp[i].y.z;
+ }
- // Disable instance transform if object uses motion transform already
- instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+ // Upload motion transform to GPU
+ cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
+ delete[] reinterpret_cast<uint8_t *>(&motion_transform);
- // Get traversable handle to motion transform
- optixConvertPointerToTraversableHandle(context,
- motion_transform_gpu,
- OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
- &instance.traversableHandle);
- }
- else {
- instance.traversableHandle = handle;
+ // Disable instance transform if object uses motion transform already
+ instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
- if (ob->get_geometry()->is_instanced()) {
- // Set transform matrix
- memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform));
+ // Get traversable handle to motion transform
+ optixConvertPointerToTraversableHandle(context,
+ motion_transform_gpu,
+ OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
+ &instance.traversableHandle);
}
else {
- // Disable instance transform if geometry already has it applied to vertex data
- instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
- // Non-instanced objects read ID from prim_object, so
- // distinguish them from instanced objects with high bit set
- instance.instanceId |= 0x800000;
+ instance.traversableHandle = handle;
+
+ if (ob->get_geometry()->is_instanced()) {
+ // Set transform matrix
+ memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform));
+ }
+ else {
+ // Disable instance transform if geometry already has it applied to vertex data
+ instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+ // Non-instanced objects read ID from prim_object, so
+ // distinguish them from instanced objects with high bit set
+ instance.instanceId |= 0x800000;
+ }
}
}
- }
- // Upload instance descriptions
+ // Upload instance descriptions
# if OPTIX_ABI_VERSION < 41
- aabbs.resize(num_instances);
- aabbs.copy_to_device();
+ aabbs.resize(num_instances);
+ aabbs.copy_to_device();
# endif
- instances.resize(num_instances);
- instances.copy_to_device();
+ instances.resize(num_instances);
+ instances.copy_to_device();
- // Build top-level acceleration structure (TLAS)
- OptixBuildInput build_input = {};
- build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
+ // Build top-level acceleration structure (TLAS)
+ OptixBuildInput build_input = {};
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
# if OPTIX_ABI_VERSION < 41 // Instance AABBs no longer need to be set since OptiX 7.2
- build_input.instanceArray.aabbs = aabbs.device_pointer;
- build_input.instanceArray.numAabbs = num_instances;
+ build_input.instanceArray.aabbs = aabbs.device_pointer;
+ build_input.instanceArray.numAabbs = num_instances;
# endif
- build_input.instanceArray.instances = instances.device_pointer;
- build_input.instanceArray.numInstances = num_instances;
+ build_input.instanceArray.instances = instances.device_pointer;
+ build_input.instanceArray.numInstances = num_instances;
- CUdeviceptr out_data = 0;
- tlas_handle = 0;
- return build_optix_bvh(build_input, 0, tlas_handle, out_data, OPTIX_BUILD_OPERATION_BUILD);
+ if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) {
+ progress.set_error("Failed to build OptiX acceleration structure");
+ }
+ tlas_handle = bvh_optix->traversable_handle;
+ }
}
void const_copy_to(const char *name, void *host, size_t size) override
@@ -1724,7 +1707,7 @@ class OptiXDevice : public CUDADevice {
if (strcmp(name, "__data") == 0) {
assert(size <= sizeof(KernelData));
- // Fix traversable handle on multi devices
+ // Update traversable handle (since it is different for each device on multi devices)
KernelData *const data = (KernelData *)host;
*(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;