From f4f8b6dde32b0438e0b97a6d8ebeb89802987127 Mon Sep 17 00:00:00 2001
From: Patrick Mours <pmours@nvidia.com>
Date: Wed, 3 Mar 2021 14:35:50 +0100
Subject: Cycles: Change device-only memory to actually only allocate on the
 device

This patch changes the `MEM_DEVICE_ONLY` type to allocate only on the device and to fail
when that is no longer possible because the device is out of memory (OptiX acceleration
structures must not be allocated in host memory). It also fixes high peak memory usage
during OptiX acceleration structure building by building them one at a time.

Reviewed By: brecht

Maniphest Tasks: T85985

Differential Revision: https://developer.blender.org/D10535
---
 intern/cycles/bvh/bvh_optix.cpp                |  4 ++--
 intern/cycles/device/cuda/device_cuda_impl.cpp | 12 +++++++++---
 intern/cycles/device/device_cpu.cpp            |  5 ++---
 intern/cycles/device/device_denoising.h        |  3 ++-
 intern/cycles/device/device_memory.h           |  4 ++--
 intern/cycles/device/device_optix.cpp          | 23 ++++++++++++++++-------
 6 files changed, 33 insertions(+), 18 deletions(-)

(limited to 'intern')

diff --git a/intern/cycles/bvh/bvh_optix.cpp b/intern/cycles/bvh/bvh_optix.cpp
index e094f339ede..d630e8965dc 100644
--- a/intern/cycles/bvh/bvh_optix.cpp
+++ b/intern/cycles/bvh/bvh_optix.cpp
@@ -27,8 +27,8 @@ BVHOptiX::BVHOptiX(const BVHParams &params_,
                    Device *device)
     : BVH(params_, geometry_, objects_),
       traversable_handle(0),
-      as_data(device, params_.top_level ? "optix tlas" : "optix blas"),
-      motion_transform_data(device, "optix motion transform")
+      as_data(device, params_.top_level ? "optix tlas" : "optix blas", false),
+      motion_transform_data(device, "optix motion transform", false)
 {
 }
 
diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp
index 44a51835f4c..5b62292ca55 100644
--- a/intern/cycles/device/cuda/device_cuda_impl.cpp
+++ b/intern/cycles/device/cuda/device_cuda_impl.cpp
@@ -854,7 +854,7 @@ CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_
 
   void *shared_pointer = 0;
 
-  if (mem_alloc_result != CUDA_SUCCESS && can_map_host) {
+  if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) {
     if (mem.shared_pointer) {
       /* Another device already allocated host memory. */
       mem_alloc_result = CUDA_SUCCESS;
@@ -877,8 +877,14 @@ CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_
   }
 
   if (mem_alloc_result != CUDA_SUCCESS) {
-    status = " failed, out of device and host memory";
-    set_error("System is out of GPU and shared host memory");
+    if (mem.type == MEM_DEVICE_ONLY) {
+      status = " failed, out of device memory";
+      set_error("System is out of GPU memory");
+    }
+    else {
+      status = " failed, out of device and host memory";
+      set_error("System is out of GPU and shared host memory");
+    }
   }
 
   if (mem.name) {
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index fdfd3f83be6..e2f9c7391da 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -396,8 +396,7 @@ class CPUDevice : public Device {
                 << string_human_readable_size(mem.memory_size()) << ")";
       }
 
-      if (mem.type == MEM_DEVICE_ONLY) {
-        assert(!mem.host_pointer);
+      if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) {
         size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES;
         void *data = util_aligned_malloc(mem.memory_size(), alignment);
         mem.device_pointer = (device_ptr)data;
@@ -459,7 +458,7 @@ class CPUDevice : public Device {
       tex_free((device_texture &)mem);
     }
     else if (mem.device_pointer) {
-      if (mem.type == MEM_DEVICE_ONLY) {
+      if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) {
         util_aligned_free((void *)mem.device_pointer);
       }
       mem.device_pointer = 0;
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
index 2c0dc23b44a..bb8bdfdd225 100644
--- a/intern/cycles/device/device_denoising.h
+++ b/intern/cycles/device/device_denoising.h
@@ -171,7 +171,8 @@ class DenoisingTask {
     bool gpu_temporary_mem;
 
     DenoiseBuffers(Device *device)
-        : mem(device, "denoising pixel buffer"), temporary_mem(device, "denoising temporary mem")
+        : mem(device, "denoising pixel buffer"),
+          temporary_mem(device, "denoising temporary mem", true)
     {
     }
   } buffer;
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 1f63a152458..97459b9ae6a 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -270,8 +270,8 @@ class device_memory {
 
 template<typename T> class device_only_memory : public device_memory {
  public:
-  device_only_memory(Device *device, const char *name)
-      : device_memory(device, name, MEM_DEVICE_ONLY)
+  device_only_memory(Device *device, const char *name, bool allow_host_memory_fallback = false)
+      : device_memory(device, name, allow_host_memory_fallback ? MEM_READ_WRITE : MEM_DEVICE_ONLY)
   {
     data_type = device_type_traits<T>::data_type;
     data_elements = max(device_type_traits<T>::num_elements, 1);
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
index 07ce63f5394..51e1a0033ba 100644
--- a/intern/cycles/device/device_optix.cpp
+++ b/intern/cycles/device/device_optix.cpp
@@ -197,8 +197,8 @@ class OptiXDevice : public CUDADevice {
   OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
       : CUDADevice(info_, stats_, profiler_, background_),
         sbt_data(this, "__sbt", MEM_READ_ONLY),
-        launch_params(this, "__params"),
-        denoiser_state(this, "__denoiser_state")
+        launch_params(this, "__params", false),
+        denoiser_state(this, "__denoiser_state", true)
   {
     // Store number of CUDA streams in device info
     info.cpu_threads = DebugFlags().optix.cuda_streams;
@@ -878,8 +878,8 @@ class OptiXDevice : public CUDADevice {
       device_ptr input_ptr = rtile.buffer + pixel_offset;
 
       // Copy tile data into a common buffer if necessary
-      device_only_memory<float> input(this, "denoiser input");
-      device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_WRITE);
+      device_only_memory<float> input(this, "denoiser input", true);
+      device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_ONLY);
 
       bool contiguous_memory = true;
       for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
@@ -924,7 +924,7 @@ class OptiXDevice : public CUDADevice {
       }
 
 #  if OPTIX_DENOISER_NO_PIXEL_STRIDE
-      device_only_memory<float> input_rgb(this, "denoiser input rgb");
+      device_only_memory<float> input_rgb(this, "denoiser input rgb", true);
       input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.input_passes);
 
       void *input_args[] = {&input_rgb.device_pointer,
@@ -1146,6 +1146,13 @@ class OptiXDevice : public CUDADevice {
                        const OptixBuildInput &build_input,
                        uint16_t num_motion_steps)
   {
+    /* Allocate and build acceleration structures only one at a time, to prevent parallel builds
+     * from running out of memory (since both original and compacted acceleration structure memory
+     * may be allocated at the same time for the duration of this function). The builds would
+     * otherwise happen on the same CUDA stream anyway. */
+    static thread_mutex mutex;
+    thread_scoped_lock lock(mutex);
+
     const CUDAContextScope scope(cuContext);
 
     // Compute memory usage
@@ -1170,11 +1177,12 @@ class OptiXDevice : public CUDADevice {
         optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
 
     // Allocate required output buffers
-    device_only_memory<char> temp_mem(this, "optix temp as build mem");
+    device_only_memory<char> temp_mem(this, "optix temp as build mem", true);
     temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
     if (!temp_mem.device_pointer)
       return false;  // Make sure temporary memory allocation succeeded
 
+    // Acceleration structure memory has to be allocated on the device (not allowed to be on host)
     device_only_memory<char> &out_data = bvh->as_data;
     if (operation == OPTIX_BUILD_OPERATION_BUILD) {
       assert(out_data.device == this);
@@ -1222,7 +1230,7 @@ class OptiXDevice : public CUDADevice {
 
       // There is no point compacting if the size does not change
       if (compacted_size < sizes.outputSizeInBytes) {
-        device_only_memory<char> compacted_data(this, "optix compacted as");
+        device_only_memory<char> compacted_data(this, "optix compacted as", false);
         compacted_data.alloc_to_device(compacted_size);
         if (!compacted_data.device_pointer)
           // Do not compact if memory allocation for compacted acceleration structure fails
@@ -1242,6 +1250,7 @@ class OptiXDevice : public CUDADevice {
 
         std::swap(out_data.device_size, compacted_data.device_size);
         std::swap(out_data.device_pointer, compacted_data.device_pointer);
+        // Original acceleration structure memory is freed when 'compacted_data' goes out of scope
       }
     }
 
-- 
cgit v1.2.3