Fix CUDA out of memory error with OptiX viewport denoising on small GPUs

The denoiser state is now allocated through the Cycles device memory allocator, which evicts textures to host memory when there is not enough space on the device. As a result, the denoiser state allocation no longer simply fails when device memory is exhausted; instead, space is freed up so that it can succeed. This also simplifies the code somewhat.
author: Patrick Mours <pmours@nvidia.com> 2020-02-28 17:57:44 +0300
committer: Patrick Mours <pmours@nvidia.com> 2020-02-28 17:58:17 +0300
commit: 0cea9353fdfef4ae736d9894e4f6a0a8cef99619 (patch)
tree: 57b83f0f35a7b0628301b7d2c9cf8cd083788852 /intern
parent: 27fa33c14326e08b61955b0034b5b4b1f2552591 (diff)
1 file changed, 13 insertions, 21 deletions
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
index 2ce8bed3783..0121b89e9d8 100644
--- a/intern/cycles/device/device_optix.cpp
+++ b/intern/cycles/device/device_optix.cpp
@@ -186,14 +186,15 @@ class OptiXDevice : public CUDADevice {
   OptixTraversableHandle tlas_handle = 0;
 
   OptixDenoiser denoiser = NULL;
-  pair<int2, CUdeviceptr> denoiser_state = {};
+  device_only_memory<unsigned char> denoiser_state;
   int denoiser_input_passes = 0;
 
  public:
   OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
       : CUDADevice(info_, stats_, profiler_, background_),
         sbt_data(this, "__sbt", MEM_READ_ONLY),
-        launch_params(this, "__params")
+        launch_params(this, "__params"),
+        denoiser_state(this, "__denoiser_state")
   {
     // Store number of CUDA streams in device info
     info.cpu_threads = DebugFlags().optix.cuda_streams;
@@ -255,13 +256,10 @@ class OptiXDevice : public CUDADevice {
       cuMemFree(mem);
     }
 
-    if (denoiser_state.second) {
-      cuMemFree(denoiser_state.second);
-    }
-
     sbt_data.free();
     texture_info.free();
     launch_params.free();
+    denoiser_state.free();
 
     // Unload modules
     if (optix_module != NULL)
@@ -813,32 +811,26 @@ class OptiXDevice : public CUDADevice {
       check_result_optix_ret(
           optixDenoiserComputeMemoryResources(denoiser, rect_size.x, rect_size.y, &sizes));
 
-      auto &state = denoiser_state.second;
-      auto &state_size = denoiser_state.first;
       const size_t scratch_size = sizes.recommendedScratchSizeInBytes;
       const size_t scratch_offset = sizes.stateSizeInBytes;
 
       // Allocate denoiser state if tile size has changed since last setup
-      if (state_size.x != rect_size.x || state_size.y != rect_size.y || recreate_denoiser) {
-        // Free existing state before allocating new one
-        if (state) {
-          cuMemFree(state);
-          state = 0;
-        }
-
-        check_result_cuda_ret(cuMemAlloc(&state, scratch_offset + scratch_size));
+      if (recreate_denoiser || (denoiser_state.data_width != rect_size.x ||
+                                denoiser_state.data_height != rect_size.y)) {
+        denoiser_state.alloc_to_device(scratch_offset + scratch_size);
 
         // Initialize denoiser state for the current tile size
         check_result_optix_ret(optixDenoiserSetup(denoiser,
                                                   0,
                                                   rect_size.x,
                                                   rect_size.y,
-                                                  state,
+                                                  denoiser_state.device_pointer,
                                                   scratch_offset,
-                                                  state + scratch_offset,
+                                                  denoiser_state.device_pointer + scratch_offset,
                                                   scratch_size));
 
-        state_size = rect_size;
+        denoiser_state.data_width = rect_size.x;
+        denoiser_state.data_height = rect_size.y;
       }
 
       // Set up input and output layer information
@@ -880,14 +872,14 @@ class OptiXDevice : public CUDADevice {
       check_result_optix_ret(optixDenoiserInvoke(denoiser,
                                                  0,
                                                  &params,
-                                                 state,
+                                                 denoiser_state.device_pointer,
                                                  scratch_offset,
                                                  input_layers,
                                                  task.denoising.optix_input_passes,
                                                  overlap_offset.x,
                                                  overlap_offset.y,
                                                  output_layers,
-                                                 state + scratch_offset,
+                                                 denoiser_state.device_pointer + scratch_offset,
                                                  scratch_size));
 
 #  if OPTIX_DENOISER_NO_PIXEL_STRIDE
author	Patrick Mours <pmours@nvidia.com>	2020-02-28 17:57:44 +0300
committer	Patrick Mours <pmours@nvidia.com>	2020-02-28 17:58:17 +0300
commit	0cea9353fdfef4ae736d9894e4f6a0a8cef99619 (patch)
tree	57b83f0f35a7b0628301b7d2c9cf8cd083788852 /intern
parent	27fa33c14326e08b61955b0034b5b4b1f2552591 (diff)