git.blender.org/blender.git
author     Patrick Mours <pmours@nvidia.com>    2019-12-05 21:17:01 +0300
committer  Patrick Mours <pmours@nvidia.com>    2019-12-09 16:32:12 +0300
commit     baeb11826b9fe5525db6dd05ba5271949079fc1e (patch)
tree       9d646d69ef51c3fb1dab3d76798936160b1560f9 /intern/cycles
parent     c9dc57be3ae59fac84b243f8bfdaddaa9b5e2f20 (diff)
Cycles: Add OptiX acceleration structure compaction
This adds compaction support for OptiX acceleration structures, which reduces the device memory footprint in a post-build step. Depending on the scene this can reduce the amount of used device memory quite a bit and even improve performance, since a smaller acceleration structure improves cache usage. It is only enabled for background renders, to keep acceleration structure builds fast in the viewport.

Also fixes a bug in the memory management for OptiX acceleration structures: these were held in a dynamic vector of 'device_memory' instances and used the mem_alloc/mem_free functions. However, those track memory instances in the 'cuda_mem_map' via pointers to 'device_memory' (which works fine everywhere else, since those are never copied or moved). But the vector may decide to reallocate at some point, which invalidates those pointers and results in nasty accesses to invalid memory. So it is not actually safe to move a 'device_memory' object, and therefore this removes the move operator overloads again.

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D6369
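[Note: the following is a minimal standalone sketch, not Cycles code, of the reallocation bug described above. 'Block' stands in for 'device_memory' and 'registry' for 'cuda_mem_map'; both names are illustrative only.]

    #include <cstdio>
    #include <map>
    #include <vector>

    // Stand-in for 'device_memory': objects registered by address in an
    // external map on allocation, the way mem_alloc fills 'cuda_mem_map'.
    struct Block {
      int id;
    };

    int main()
    {
      std::vector<Block> blocks;        // like the old vector of 'device_memory' in as_mem
      std::map<Block *, int> registry;  // like 'cuda_mem_map': keyed by object address

      blocks.push_back(Block{0});
      registry[&blocks.back()] = 0;  // register the element's current address

      Block *before = &blocks[0];

      // Growing the vector may reallocate its storage, moving every element
      // to a new address. All addresses stored as keys in 'registry' then
      // point at freed memory.
      for (int i = 1; i < 1000; ++i) {
        blocks.push_back(Block{i});
        registry[&blocks.back()] = i;
      }

      if (before != &blocks[0]) {
        std::printf("vector reallocated: earlier registry keys now dangle\n");
      }

      // Dereferencing one of the stale keys would be undefined behavior.
      // This is why the patch stores plain CUdeviceptr handles in 'as_mem'
      // and removes the 'device_memory' move constructors.
      return 0;
    }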
Diffstat (limited to 'intern/cycles')
-rw-r--r--  intern/cycles/device/device_memory.cpp    25
-rw-r--r--  intern/cycles/device/device_memory.h       12
-rw-r--r--  intern/cycles/device/device_optix.cpp     139
-rw-r--r--  intern/cycles/kernel/svm/svm_bevel.h        2
4 files changed, 101 insertions, 77 deletions
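[Note: for readers skimming the patch, the core of the new compaction path in device_optix.cpp condenses to the sketch below. Cycles' error-check macros and the background/viewport split are omitted; 'context', 'build_input' and 'sizes' (from optixAccelComputeMemoryUsage) are assumed already set up as in the diff, and 'align_up' is Cycles' utility helper.]

    // Condensed build-then-compact sequence (error checking omitted).
    OptixAccelBuildOptions options = {};
    options.operation = OPTIX_BUILD_OPERATION_BUILD;
    // Compaction is only possible if ALLOW_COMPACTION was set at build time.
    options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE |
                         OPTIX_BUILD_FLAG_ALLOW_COMPACTION;

    CUdeviceptr temp = 0, out_data = 0;
    // Reserve 8 extra aligned bytes at the end of the temp buffer to
    // receive the emitted compacted-size property.
    cuMemAlloc(&temp, align_up(sizes.tempSizeInBytes, 8) + 8);
    cuMemAlloc(&out_data, sizes.outputSizeInBytes);

    OptixAccelEmitDesc compacted_size_prop;
    compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
    compacted_size_prop.result = align_up(temp + sizes.tempSizeInBytes, 8);

    OptixTraversableHandle handle = 0;
    optixAccelBuild(context, NULL, &options, &build_input, 1,
                    temp, align_up(sizes.tempSizeInBytes, 8) + 8,
                    out_data, sizes.outputSizeInBytes,
                    &handle, &compacted_size_prop, 1);
    cuStreamSynchronize(NULL);

    // Read back the compacted size; skip compaction if it would not shrink.
    uint64_t compacted_size = 0;
    cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size));
    if (compacted_size < sizes.outputSizeInBytes) {
      CUdeviceptr compacted_data = 0;
      cuMemAlloc(&compacted_data, compacted_size);
      // optixAccelCompact copies into the smaller buffer and yields a new handle.
      optixAccelCompact(context, NULL, handle, compacted_data, compacted_size, &handle);
      cuStreamSynchronize(NULL);
      cuMemFree(out_data);  // the uncompacted buffer is no longer referenced
    }
    cuMemFree(temp);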
diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp
index c106b4505db..3a99a49dffc 100644
--- a/intern/cycles/device/device_memory.cpp
+++ b/intern/cycles/device/device_memory.cpp
@@ -47,31 +47,6 @@ device_memory::~device_memory()
assert(shared_counter == 0);
}
-device_memory::device_memory(device_memory &&other)
- : data_type(other.data_type),
- data_elements(other.data_elements),
- data_size(other.data_size),
- device_size(other.device_size),
- data_width(other.data_width),
- data_height(other.data_height),
- data_depth(other.data_depth),
- type(other.type),
- name(other.name),
- interpolation(other.interpolation),
- extension(other.extension),
- device(other.device),
- device_pointer(other.device_pointer),
- host_pointer(other.host_pointer),
- shared_pointer(other.shared_pointer),
- shared_counter(other.shared_counter)
-{
- other.device_size = 0;
- other.device_pointer = 0;
- other.host_pointer = 0;
- other.shared_pointer = 0;
- other.shared_counter = 0;
-}
-
void *device_memory::host_alloc(size_t size)
{
if (!size) {
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index f8324e2a214..60740807568 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -235,9 +235,6 @@ class device_memory {
device_memory(const device_memory &) = delete;
device_memory &operator=(const device_memory &) = delete;
- /* But moving is possible. */
- device_memory(device_memory &&);
-
/* Host allocation on the device. All host_pointer memory should be
* allocated with these functions, for devices that support using
* the same pointer for host and device. */
@@ -275,11 +272,6 @@ template<typename T> class device_only_memory : public device_memory {
free();
}
- device_only_memory(device_only_memory &&other)
- : device_memory(static_cast<device_memory &&>(other))
- {
- }
-
void alloc_to_device(size_t num, bool shrink_to_fit = true)
{
size_t new_size = num;
@@ -338,10 +330,6 @@ template<typename T> class device_vector : public device_memory {
free();
}
- device_vector(device_vector &&other) : device_memory(static_cast<device_memory &&>(other))
- {
- }
-
/* Host memory allocation. */
T *alloc(size_t width, size_t height = 0, size_t depth = 0)
{
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
index ae3ab7e1fc2..7335e0bc64d 100644
--- a/intern/cycles/device/device_optix.cpp
+++ b/intern/cycles/device/device_optix.cpp
@@ -174,7 +174,7 @@ class OptiXDevice : public Device {
device_vector<SbtRecord> sbt_data;
device_vector<TextureInfo> texture_info;
device_only_memory<KernelParams> launch_params;
- vector<device_only_memory<uint8_t>> as_mem;
+ vector<CUdeviceptr> as_mem;
OptixTraversableHandle tlas_handle = 0;
// TODO(pmours): This is copied from device_cuda.cpp, so move to common code eventually
@@ -269,6 +269,9 @@ class OptiXDevice : public Device {
task_pool.stop();
// Free all acceleration structures
+ for (CUdeviceptr mem : as_mem) {
+ cuMemFree(mem);
+ }
as_mem.clear();
sbt_data.free();
@@ -831,7 +834,6 @@ class OptiXDevice : public Device {
bool build_optix_bvh(const OptixBuildInput &build_input,
uint16_t num_motion_steps,
- device_memory &out_data,
OptixTraversableHandle &out_handle)
{
out_handle = 0;
@@ -842,7 +844,15 @@ class OptiXDevice : public Device {
OptixAccelBufferSizes sizes = {};
OptixAccelBuildOptions options;
options.operation = OPTIX_BUILD_OPERATION_BUILD;
- options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE;
+ if (background) {
+ // Prefer best performance and lowest memory consumption in background
+ options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
+ }
+ else {
+ // Prefer fast updates in viewport
+ options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD;
+ }
+
options.motionOptions.numKeys = num_motion_steps;
options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
options.motionOptions.timeBegin = 0.0f;
@@ -853,31 +863,75 @@ class OptiXDevice : public Device {
// Allocate required output buffers
device_only_memory<char> temp_mem(this, "temp_build_mem");
- temp_mem.alloc_to_device(sizes.tempSizeInBytes);
+ temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
+ if (!temp_mem.device_pointer)
+ return false; // Make sure temporary memory allocation succeeded
+
+ // Move textures to host memory if there is not enough room
+ size_t size = 0, free = 0;
+ cuMemGetInfo(&free, &size);
+ size = sizes.outputSizeInBytes + device_working_headroom;
+ if (size >= free && can_map_host) {
+ move_textures_to_host(size - free, false);
+ }
- out_data.type = MEM_DEVICE_ONLY;
- out_data.data_type = TYPE_UNKNOWN;
- out_data.data_elements = 1;
- out_data.data_size = sizes.outputSizeInBytes;
- mem_alloc(out_data);
+ CUdeviceptr out_data = 0;
+ check_result_cuda_ret(cuMemAlloc(&out_data, sizes.outputSizeInBytes));
+ as_mem.push_back(out_data);
// Finally build the acceleration structure
+ OptixAccelEmitDesc compacted_size_prop;
+ compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
+ // A tiny space was allocated for this property at the end of the temporary buffer above
+ // Make sure this pointer is 8-byte aligned
+ compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);
+
check_result_optix_ret(optixAccelBuild(context,
NULL,
&options,
&build_input,
1,
temp_mem.device_pointer,
- sizes.tempSizeInBytes,
- out_data.device_pointer,
+ temp_mem.device_size,
+ out_data,
sizes.outputSizeInBytes,
&out_handle,
- NULL,
- 0));
+ &compacted_size_prop,
+ 1));
// Wait for all operations to finish
check_result_cuda_ret(cuStreamSynchronize(NULL));
+ // Compact acceleration structure to save memory (do not do this in viewport for faster builds)
+ if (background) {
+ uint64_t compacted_size = sizes.outputSizeInBytes;
+ check_result_cuda_ret(
+ cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size)));
+
+ // Temporary memory is no longer needed, so free it now to make space
+ temp_mem.free();
+
+ // There is no point compacting if the size does not change
+ if (compacted_size < sizes.outputSizeInBytes) {
+ CUdeviceptr compacted_data = 0;
+ if (cuMemAlloc(&compacted_data, compacted_size) != CUDA_SUCCESS)
+ // Do not compact if memory allocation for compacted acceleration structure fails
+ // Can just use the uncompacted one then, so succeed here regardless
+ return true;
+ as_mem.push_back(compacted_data);
+
+ check_result_optix_ret(optixAccelCompact(
+ context, NULL, out_handle, compacted_data, compacted_size, &out_handle));
+
+ // Wait for compaction to finish
+ check_result_cuda_ret(cuStreamSynchronize(NULL));
+
+ // Free uncompacted acceleration structure
+ cuMemFree(out_data);
+ as_mem.erase(as_mem.end() - 2); // Remove 'out_data' from 'as_mem' array
+ }
+ }
+
return true;
}
@@ -889,7 +943,10 @@ class OptiXDevice : public Device {
unordered_map<Mesh *, vector<OptixTraversableHandle>> meshes;
meshes.reserve(bvh->meshes.size());
- // Free all previous acceleration structure
+ // Free all previous acceleration structures
+ for (CUdeviceptr mem : as_mem) {
+ cuMemFree(mem);
+ }
as_mem.clear();
// Build bottom level acceleration structures (BLAS)
@@ -968,9 +1025,8 @@ class OptiXDevice : public Device {
build_input.aabbArray.primitiveIndexOffset = mesh->prim_offset;
// Allocate memory for new BLAS and build it
- as_mem.emplace_back(this, "blas");
handles.emplace_back();
- if (!build_optix_bvh(build_input, num_motion_steps, as_mem.back(), handles.back()))
+ if (!build_optix_bvh(build_input, num_motion_steps, handles.back()))
return false;
}
@@ -1034,9 +1090,8 @@ class OptiXDevice : public Device {
build_input.triangleArray.primitiveIndexOffset = mesh->prim_offset + mesh->num_segments();
// Allocate memory for new BLAS and build it
- as_mem.emplace_back(this, "blas");
handles.emplace_back();
- if (!build_optix_bvh(build_input, num_motion_steps, as_mem.back(), handles.back()))
+ if (!build_optix_bvh(build_input, num_motion_steps, handles.back()))
return false;
}
@@ -1081,15 +1136,17 @@ class OptiXDevice : public Device {
// Insert motion traversable if object has motion
if (motion_blur && ob->use_motion()) {
- as_mem.emplace_back(this, "motion_transform");
- device_only_memory<uint8_t> &motion_transform_gpu = as_mem.back();
- motion_transform_gpu.alloc_to_device(sizeof(OptixSRTMotionTransform) +
- (max(ob->motion.size(), 2) - 2) *
- sizeof(OptixSRTData));
+ size_t motion_keys = max(ob->motion.size(), 2) - 2;
+ size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
+ motion_keys * sizeof(OptixSRTData);
+
+ CUdeviceptr motion_transform_gpu = 0;
+ check_result_cuda_ret(cuMemAlloc(&motion_transform_gpu, motion_transform_size));
+ as_mem.push_back(motion_transform_gpu);
// Allocate host side memory for motion transform and fill it with transform data
OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
- motion_transform_gpu.host_pointer = new uint8_t[motion_transform_gpu.memory_size()]);
+ new uint8_t[motion_transform_size]);
motion_transform.child = handle;
motion_transform.motionOptions.numKeys = ob->motion.size();
motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
@@ -1101,38 +1158,43 @@ class OptiXDevice : public Device {
transform_motion_decompose(decomp.data(), ob->motion.data(), ob->motion.size());
for (size_t i = 0; i < ob->motion.size(); ++i) {
- // scaling
- srt_data[i].a = decomp[i].z.x; // scale.x.y
- srt_data[i].b = decomp[i].z.y; // scale.x.z
- srt_data[i].c = decomp[i].w.x; // scale.y.z
+ // Scale
srt_data[i].sx = decomp[i].y.w; // scale.x.x
srt_data[i].sy = decomp[i].z.w; // scale.y.y
srt_data[i].sz = decomp[i].w.w; // scale.z.z
- srt_data[i].pvx = 0;
- srt_data[i].pvy = 0;
- srt_data[i].pvz = 0;
- // rotation
+
+ // Shear
+ srt_data[i].a = decomp[i].z.x; // scale.x.y
+ srt_data[i].b = decomp[i].z.y; // scale.x.z
+ srt_data[i].c = decomp[i].w.x; // scale.y.z
+
+ // Pivot point
+ srt_data[i].pvx = 0.0f;
+ srt_data[i].pvy = 0.0f;
+ srt_data[i].pvz = 0.0f;
+
+ // Rotation
srt_data[i].qx = decomp[i].x.x;
srt_data[i].qy = decomp[i].x.y;
srt_data[i].qz = decomp[i].x.z;
srt_data[i].qw = decomp[i].x.w;
- // transform
+
+ // Translation
srt_data[i].tx = decomp[i].y.x;
srt_data[i].ty = decomp[i].y.y;
srt_data[i].tz = decomp[i].y.z;
}
// Upload motion transform to GPU
- mem_copy_to(motion_transform_gpu);
- delete[] reinterpret_cast<uint8_t *>(motion_transform_gpu.host_pointer);
- motion_transform_gpu.host_pointer = 0;
+ cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
+ delete[] reinterpret_cast<uint8_t *>(&motion_transform);
// Disable instance transform if object uses motion transform already
instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
// Get traversable handle to motion transform
optixConvertPointerToTraversableHandle(context,
- motion_transform_gpu.device_pointer,
+ motion_transform_gpu,
OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
&instance.traversableHandle);
}
@@ -1168,8 +1230,7 @@ class OptiXDevice : public Device {
build_input.instanceArray.aabbs = aabbs.device_pointer;
build_input.instanceArray.numAabbs = num_instances;
- as_mem.emplace_back(this, "tlas");
- return build_optix_bvh(build_input, 0, as_mem.back(), tlas_handle);
+ return build_optix_bvh(build_input, 0, tlas_handle);
}
void update_texture_info()
diff --git a/intern/cycles/kernel/svm/svm_bevel.h b/intern/cycles/kernel/svm/svm_bevel.h
index 434502f31f9..bf5957ec9e4 100644
--- a/intern/cycles/kernel/svm/svm_bevel.h
+++ b/intern/cycles/kernel/svm/svm_bevel.h
@@ -53,7 +53,7 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
float3 sum_N = make_float3(0.0f, 0.0f, 0.0f);
for (int sample = 0; sample < num_samples; sample++) {
- float disk_u = 0.0f, disk_v = 0.0f;
+ float disk_u, disk_v;
path_branched_rng_2D(
kg, state->rng_hash, state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v);