From 4fbeb3e6be3ed03e55ebd8a9d41c80c2a4f08815 Mon Sep 17 00:00:00 2001
From: James Horsley
Date: Wed, 27 Jan 2021 14:04:37 +0100
Subject: Fix T85089: Crash when rendering scene that does not fit into GPU memory with CUDA/OptiX

The "cuda_mem_map_mutex" was potentially being locked recursively during
the call to "CUDADevice::move_textures_to_host", which crashed.

This moves around the locking and unlocking of "cuda_mem_map_mutex", so
that it doesn't call a function that locks it while still holding the lock.

Reviewed By: pmoursnv

Maniphest Tasks: T85089, T84734

Differential Revision: https://developer.blender.org/D10219
---
 intern/cycles/device/cuda/device_cuda_impl.cpp | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp
index 307d1469ba5..cff30eb9b48 100644
--- a/intern/cycles/device/cuda/device_cuda_impl.cpp
+++ b/intern/cycles/device/cuda/device_cuda_impl.cpp
@@ -742,6 +742,7 @@ void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
     size_t max_size = 0;
     bool max_is_image = false;
 
+    thread_scoped_lock lock(cuda_mem_map_mutex);
     foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
       device_memory &mem = *pair.first;
       CUDAMem *cmem = &pair.second;
@@ -773,6 +774,7 @@ void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
         max_mem = &mem;
       }
     }
+    lock.unlock();
 
     /* Move to host memory. This part is mutex protected since
      * multiple CUDA devices could be moving the memory. The
@@ -894,6 +896,7 @@ CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_
   }
 
   /* Insert into map of allocations. */
+  thread_scoped_lock lock(cuda_mem_map_mutex);
   CUDAMem *cmem = &cuda_mem_map[&mem];
   if (shared_pointer != 0) {
     /* Replace host pointer with our host allocation. Only works if
@@ -935,6 +938,7 @@ void CUDADevice::generic_copy_to(device_memory &mem)
   /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
    * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
    * mem.host_pointer. */
+  thread_scoped_lock lock(cuda_mem_map_mutex);
   if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
     const CUDAContextScope scope(this);
     cuda_assert(
@@ -946,6 +950,7 @@ void CUDADevice::generic_free(device_memory &mem)
 {
   if (mem.device_pointer) {
     CUDAContextScope scope(this);
+    thread_scoped_lock lock(cuda_mem_map_mutex);
     const CUDAMem &cmem = cuda_mem_map[&mem];
 
     /* If cmem.use_mapped_host is true, reference counting is used
@@ -990,7 +995,6 @@ void CUDADevice::mem_alloc(device_memory &mem)
     assert(!"mem_alloc not supported for global memory.");
   }
   else {
-    thread_scoped_lock lock(cuda_mem_map_mutex);
     generic_alloc(mem);
   }
 }
@@ -1009,7 +1013,6 @@ void CUDADevice::mem_copy_to(device_memory &mem)
     tex_alloc((device_texture &)mem);
   }
   else {
-    thread_scoped_lock lock(cuda_mem_map_mutex);
     if (!mem.device_pointer) {
       generic_alloc(mem);
     }
@@ -1073,7 +1076,6 @@ void CUDADevice::mem_free(device_memory &mem)
     tex_free((device_texture &)mem);
   }
   else {
-    thread_scoped_lock lock(cuda_mem_map_mutex);
     generic_free(mem);
   }
 }
@@ -1097,7 +1099,6 @@ void CUDADevice::const_copy_to(const char *name, void *host, size_t size)
 void CUDADevice::global_alloc(device_memory &mem)
 {
   if (mem.is_resident(this)) {
-    thread_scoped_lock lock(cuda_mem_map_mutex);
     generic_alloc(mem);
     generic_copy_to(mem);
   }
@@ -1108,7 +1109,6 @@ void CUDADevice::global_alloc(device_memory &mem)
 void CUDADevice::global_free(device_memory &mem)
 {
   if (mem.is_resident(this) && mem.device_pointer) {
-    thread_scoped_lock lock(cuda_mem_map_mutex);
     generic_free(mem);
   }
 }
@@ -1177,9 +1177,8 @@ void CUDADevice::tex_alloc(device_texture &mem)
   size_t src_pitch = mem.data_width * dsize * mem.data_elements;
   size_t dst_pitch = src_pitch;
 
-  thread_scoped_lock lock(cuda_mem_map_mutex);
-
   if (!mem.is_resident(this)) {
+    thread_scoped_lock lock(cuda_mem_map_mutex);
     cmem = &cuda_mem_map[&mem];
     cmem->texobject = 0;
 
@@ -1229,6 +1228,7 @@ void CUDADevice::tex_alloc(device_texture &mem)
     mem.device_size = size;
     stats.mem_alloc(size);
 
+    thread_scoped_lock lock(cuda_mem_map_mutex);
     cmem = &cuda_mem_map[&mem];
     cmem->texobject = 0;
     cmem->array = array_3d;
@@ -1266,9 +1266,6 @@ void CUDADevice::tex_alloc(device_texture &mem)
     cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
   }
 
-  /* Unlock mutex before resizing texture info, since that may attempt to lock it again. */
-  lock.unlock();
-
   /* Resize once */
   const uint slot = mem.slot;
   if (slot >= texture_info.size()) {
@@ -1317,9 +1314,7 @@ void CUDADevice::tex_alloc(device_texture &mem)
     texDesc.filterMode = filter_mode;
     texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
 
-    /* Lock again and refresh the data pointer (in case another thread modified the map in the
-     * meantime). */
-    lock.lock();
+    thread_scoped_lock lock(cuda_mem_map_mutex);
     cmem = &cuda_mem_map[&mem];
 
     cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
@@ -1357,6 +1352,7 @@ void CUDADevice::tex_free(device_texture &mem)
       cuda_mem_map.erase(cuda_mem_map.find(&mem));
     }
     else {
+      lock.unlock();
      generic_free(mem);
     }
   }
--
cgit v1.2.3
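
As an illustration of the locking problem the commit message describes: locking a
non-recursive mutex a second time on the same thread is undefined behaviour in C++
and typically deadlocks or crashes. The minimal sketch below shows the before/after
locking pattern, assuming "thread_scoped_lock" behaves like std::unique_lock<std::mutex>.
All type and function names in the sketch are simplified stand-ins, not the actual
Cycles API.

#include <cstddef>
#include <map>
#include <mutex>

/* Simplified stand-ins for the Cycles types; names are illustrative only. */
struct DeviceMemory {
  std::size_t device_size = 0;
};
using MemMap = std::map<DeviceMemory *, int>;

static std::mutex mem_map_mutex; /* non-recursive, like cuda_mem_map_mutex */
static MemMap mem_map;

/* Callee that takes the lock itself, as generic_alloc/generic_free do. */
void generic_free(DeviceMemory *mem)
{
  std::unique_lock<std::mutex> lock(mem_map_mutex);
  mem_map.erase(mem);
}

/* Broken pattern (pre-fix): the caller still holds the lock while calling a
 * function that locks the same non-recursive mutex, which is undefined
 * behaviour and typically deadlocks or crashes. */
void move_to_host_broken(DeviceMemory *mem)
{
  std::unique_lock<std::mutex> lock(mem_map_mutex);
  /* ... iterate mem_map to pick an allocation to evict ... */
  generic_free(mem); /* relocks mem_map_mutex while it is already held */
}

/* Fixed pattern (what the patch does): keep the lock scoped to the map access
 * and release it before calling back into code that locks again. */
void move_to_host_fixed(DeviceMemory *mem)
{
  std::unique_lock<std::mutex> lock(mem_map_mutex);
  /* ... iterate mem_map to pick an allocation to evict ... */
  lock.unlock();
  generic_free(mem); /* safe: the mutex is free when the callee locks it */
}

int main()
{
  DeviceMemory mem;
  mem_map[&mem] = 0;
  move_to_host_fixed(&mem); /* completes; the broken variant would hang or abort */
  return 0;
}

The fixed variant mirrors what the patch does in move_textures_to_host, tex_alloc and
tex_free: hold the lock only around the map accesses and release it before calling
into generic_alloc/generic_free, which now take the lock themselves.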