From 9a9e93e8043b515360eacd71c20b193c9e15204d Mon Sep 17 00:00:00 2001
From: Ha Hyung-jin
Date: Tue, 5 Nov 2019 16:27:52 +0100
Subject: Fix T71071: errors when using multiple CUDA/Optix GPUs and host mapped memory

The multi device code did not correctly handle cases where some GPUs store
a resource in device memory and others store it in host mapped memory.

Differential Revision: https://developer.blender.org/D6126
---
 intern/cycles/device/device_cuda.cpp   | 111 +++++++++++++++++++++------------
 intern/cycles/device/device_memory.cpp |   9 ++-
 intern/cycles/device/device_memory.h   |   2 +
 intern/cycles/device/device_optix.cpp  | 107 +++++++++++++++++++------------
 4 files changed, 147 insertions(+), 82 deletions(-)

(limited to 'intern')

diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 00dd37f089c..b0a12c258b9 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -142,14 +142,15 @@ class CUDADevice : public Device {
   CUDASplitKernel *split_kernel;

   struct CUDAMem {
-    CUDAMem() : texobject(0), array(0), map_host_pointer(0), free_map_host(false)
+    CUDAMem() : texobject(0), array(0), use_mapped_host(false)
     {
     }

     CUtexObject texobject;
     CUarray array;
-    void *map_host_pointer;
-    bool free_map_host;
+
+    /* If true, mapped host memory in shared_pointer is being used. */
+    bool use_mapped_host;
   };
   typedef map<device_memory *, CUDAMem> CUDAMemMap;
   CUDAMemMap cuda_mem_map;
@@ -726,7 +727,7 @@ class CUDADevice : public Device {
       }

       /* Already in host memory. */
-      if (cmem->map_host_pointer) {
+      if (cmem->use_mapped_host) {
         continue;
       }

@@ -801,7 +802,7 @@ class CUDADevice : public Device {
     cuMemGetInfo(&free, &total);

     /* Move textures to host memory if needed. */
-    if (!move_texture_to_host && !is_image && (size + headroom) >= free) {
+    if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
       move_textures_to_host(size + headroom - free, is_texture);
       cuMemGetInfo(&free, &total);
     }
@@ -815,49 +816,36 @@ class CUDADevice : public Device {
     }

     /* Fall back to mapped host memory if needed and possible. */
-    void *map_host_pointer = 0;
-    bool free_map_host = false;
-    if (mem_alloc_result != CUDA_SUCCESS && can_map_host &&
-        map_host_used + size < map_host_limit) {
+    void *shared_pointer = 0;
+
+    if (mem_alloc_result != CUDA_SUCCESS && can_map_host) {
       if (mem.shared_pointer) {
         /* Another device already allocated host memory. */
         mem_alloc_result = CUDA_SUCCESS;
-        map_host_pointer = mem.shared_pointer;
+        shared_pointer = mem.shared_pointer;
       }
-      else {
+      else if (map_host_used + size < map_host_limit) {
         /* Allocate host memory ourselves. */
         mem_alloc_result = cuMemHostAlloc(
-            &map_host_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
-        mem.shared_pointer = map_host_pointer;
-        free_map_host = true;
+            &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
+
+        assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
+               (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
       }

       if (mem_alloc_result == CUDA_SUCCESS) {
-        cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, mem.shared_pointer, 0));
+        cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
         map_host_used += size;
         status = " in host memory";
-
-        /* Replace host pointer with our host allocation. Only works if
-         * CUDA memory layout is the same and has no pitch padding. Also
-         * does not work if we move textures to host during a render,
-         * since other devices might be using the memory. */
-        if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
-            mem.host_pointer != mem.shared_pointer) {
-          memcpy(mem.shared_pointer, mem.host_pointer, size);
-          mem.host_free();
-          mem.host_pointer = mem.shared_pointer;
-        }
       }
       else {
         status = " failed, out of host memory";
       }
     }
-    else if (mem_alloc_result != CUDA_SUCCESS) {
-      status = " failed, out of device and host memory";
-    }

     if (mem_alloc_result != CUDA_SUCCESS) {
+      status = " failed, out of device and host memory";
       cuda_assert(mem_alloc_result);
     }

@@ -877,8 +865,34 @@ class CUDADevice : public Device {

     /* Insert into map of allocations. */
     CUDAMem *cmem = &cuda_mem_map[&mem];
-    cmem->map_host_pointer = map_host_pointer;
-    cmem->free_map_host = free_map_host;
+    if (shared_pointer != 0) {
+      /* Replace host pointer with our host allocation. Only works if
+       * CUDA memory layout is the same and has no pitch padding. Also
+       * does not work if we move textures to host during a render,
+       * since other devices might be using the memory. */
+
+      if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
+          mem.host_pointer != shared_pointer) {
+        memcpy(shared_pointer, mem.host_pointer, size);
+
+        /* A call to device_memory::host_free() should be preceded by
+         * a call to device_memory::device_free() for host memory
+         * allocated by a device to be handled properly. Two exceptions
+         * are here and a call in OptiXDevice::generic_alloc(), where
+         * the current host memory can be assumed to be allocated by
+         * device_memory::host_alloc(), not by a device. */
+
+        mem.host_free();
+        mem.host_pointer = shared_pointer;
+      }
+      mem.shared_pointer = shared_pointer;
+      mem.shared_counter++;
+      cmem->use_mapped_host = true;
+    }
+    else {
+      cmem->use_mapped_host = false;
+    }
+
     return cmem;
   }

@@ -887,7 +901,12 @@ class CUDADevice : public Device {
     if (mem.host_pointer && mem.device_pointer) {
       CUDAContextScope scope(this);

-      if (mem.host_pointer != mem.shared_pointer) {
+      /* If use_mapped_host of mem is false, the current device only
+       * uses device memory allocated by cuMemAlloc regardless of
+       * mem.host_pointer and mem.shared_pointer, and should copy
+       * data from mem.host_pointer. */
+
+      if (cuda_mem_map[&mem].use_mapped_host == false || mem.host_pointer != mem.shared_pointer) {
         cuda_assert(cuMemcpyHtoD(
             cuda_device_ptr(mem.device_pointer), mem.host_pointer, mem.memory_size()));
       }
@@ -900,16 +919,21 @@ class CUDADevice : public Device {
       CUDAContextScope scope(this);
       const CUDAMem &cmem = cuda_mem_map[&mem];

-      if (cmem.map_host_pointer) {
-        /* Free host memory. */
-        if (cmem.free_map_host) {
-          cuMemFreeHost(cmem.map_host_pointer);
-          if (mem.host_pointer == mem.shared_pointer) {
-            mem.host_pointer = 0;
+      /* If cmem.use_mapped_host is true, reference counting is used
+       * to safely free mapped host memory. */
+
+      if (cmem.use_mapped_host) {
+        assert(mem.shared_pointer);
+        if (mem.shared_pointer) {
+          assert(mem.shared_counter > 0);
+          if (--mem.shared_counter == 0) {
+            if (mem.host_pointer == mem.shared_pointer) {
+              mem.host_pointer = 0;
+            }
+            cuMemFreeHost(mem.shared_pointer);
+            mem.shared_pointer = 0;
           }
-          mem.shared_pointer = 0;
         }
-
         map_host_used -= mem.device_size;
       }
       else {
@@ -989,7 +1013,12 @@ class CUDADevice : public Device {
       memset(mem.host_pointer, 0, mem.memory_size());
     }

-    if (mem.device_pointer && (!mem.host_pointer || mem.host_pointer != mem.shared_pointer)) {
+    /* If use_mapped_host of mem is false, mem.device_pointer currently
+     * refers to device memory regardless of mem.host_pointer and
+     * mem.shared_pointer. */
+
+    if (mem.device_pointer &&
+        (cuda_mem_map[&mem].use_mapped_host == false || mem.host_pointer != mem.shared_pointer)) {
       CUDAContextScope scope(this);
       cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()));
     }
diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp
index c8a71bf4b3b..c106b4505db 100644
--- a/intern/cycles/device/device_memory.cpp
+++ b/intern/cycles/device/device_memory.cpp
@@ -36,12 +36,15 @@ device_memory::device_memory(Device *device, const char *name, MemoryType type)
       device(device),
       device_pointer(0),
       host_pointer(0),
-      shared_pointer(0)
+      shared_pointer(0),
+      shared_counter(0)
 {
 }

 device_memory::~device_memory()
 {
+  assert(shared_pointer == 0);
+  assert(shared_counter == 0);
 }

 device_memory::device_memory(device_memory &&other)
@@ -59,12 +62,14 @@ device_memory::device_memory(device_memory &&other)
       device(other.device),
       device_pointer(other.device_pointer),
       host_pointer(other.host_pointer),
-      shared_pointer(other.shared_pointer)
+      shared_pointer(other.shared_pointer),
+      shared_counter(other.shared_counter)
 {
   other.device_size = 0;
   other.device_pointer = 0;
   other.host_pointer = 0;
   other.shared_pointer = 0;
+  other.shared_counter = 0;
 }

 void *device_memory::host_alloc(size_t size)
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 272da6a9ad3..f8324e2a214 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -216,6 +216,8 @@ class device_memory {
   device_ptr device_pointer;
   void *host_pointer;
   void *shared_pointer;
+  /* reference counter for shared_pointer */
+  int shared_counter;

   virtual ~device_memory();

diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
index 370e3cf4085..3162b72a0dc 100644
--- a/intern/cycles/device/device_optix.cpp
+++ b/intern/cycles/device/device_optix.cpp
@@ -138,7 +138,7 @@ class OptiXDevice : public Device {
     bool free_map_host = false;
     CUarray array = NULL;
     CUtexObject texobject = 0;
-    void *map_host_pointer = nullptr;
+    bool use_mapped_host = false;
   };

   // Helper class to manage current CUDA context
@@ -1234,7 +1234,7 @@ class OptiXDevice : public Device {
     cuMemGetInfo(&free, &total);

     /* Move textures to host memory if needed. */
-    if (!move_texture_to_host && !is_image && (size + headroom) >= free) {
+    if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
       move_textures_to_host(size + headroom - free, is_texture);
       cuMemGetInfo(&free, &total);
     }
@@ -1248,39 +1248,27 @@ class OptiXDevice : public Device {
     }
     /* Fall back to mapped host memory if needed and possible. */
-    void *map_host_pointer = 0;
-    bool free_map_host = false;
+    void *shared_pointer = 0;

-    if (mem_alloc_result != CUDA_SUCCESS && can_map_host &&
-        map_host_used + size < map_host_limit) {
+    if (mem_alloc_result != CUDA_SUCCESS && can_map_host) {
       if (mem.shared_pointer) {
         /* Another device already allocated host memory. */
         mem_alloc_result = CUDA_SUCCESS;
-        map_host_pointer = mem.shared_pointer;
+        shared_pointer = mem.shared_pointer;
       }
-      else {
+      else if (map_host_used + size < map_host_limit) {
         /* Allocate host memory ourselves. */
         mem_alloc_result = cuMemHostAlloc(
-            &map_host_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
-        mem.shared_pointer = map_host_pointer;
-        free_map_host = true;
+            &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
+
+        assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
+               (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
       }

       if (mem_alloc_result == CUDA_SUCCESS) {
-        cuMemHostGetDevicePointer_v2(&device_pointer, mem.shared_pointer, 0);
+        cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0);
         map_host_used += size;
         status = " in host memory";
-
-        /* Replace host pointer with our host allocation. Only works if
-         * CUDA memory layout is the same and has no pitch padding. Also
-         * does not work if we move textures to host during a render,
-         * since other devices might be using the memory. */
-        if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
-            mem.host_pointer != mem.shared_pointer) {
-          memcpy(mem.shared_pointer, mem.host_pointer, size);
-          mem.host_free();
-          mem.host_pointer = mem.shared_pointer;
-        }
       }
       else {
         status = " failed, out of host memory";
       }
@@ -1311,8 +1299,34 @@ class OptiXDevice : public Device {

     /* Insert into map of allocations. */
     CUDAMem *cmem = &cuda_mem_map[&mem];
-    cmem->map_host_pointer = map_host_pointer;
-    cmem->free_map_host = free_map_host;
+    if (shared_pointer != 0) {
+      /* Replace host pointer with our host allocation. Only works if
+       * CUDA memory layout is the same and has no pitch padding. Also
+       * does not work if we move textures to host during a render,
+       * since other devices might be using the memory. */
+
+      if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
+          mem.host_pointer != shared_pointer) {
+        memcpy(shared_pointer, mem.host_pointer, size);
+
+        /* A call to device_memory::host_free() should be preceded by
+         * a call to device_memory::device_free() for host memory
+         * allocated by a device to be handled properly. Two exceptions
+         * are here and a call in CUDADevice::generic_alloc(), where
+         * the current host memory can be assumed to be allocated by
+         * device_memory::host_alloc(), not by a device. */
+
+        mem.host_free();
+        mem.host_pointer = shared_pointer;
+      }
+      mem.shared_pointer = shared_pointer;
+      mem.shared_counter++;
+      cmem->use_mapped_host = true;
+    }
+    else {
+      cmem->use_mapped_host = false;
+    }
+
     return cmem;
   }

@@ -1560,7 +1574,12 @@ class OptiXDevice : public Device {
     if (mem.host_pointer && mem.device_pointer) {
       CUDAContextScope scope(cuda_context);

-      if (mem.host_pointer != mem.shared_pointer) {
+      /* If use_mapped_host of mem is false, the current device only
+       * uses device memory allocated by cuMemAlloc regardless of
+       * mem.host_pointer and mem.shared_pointer, and should copy
+       * data from mem.host_pointer. */
+
+      if (cuda_mem_map[&mem].use_mapped_host == false || mem.host_pointer != mem.shared_pointer) {
         check_result_cuda(
             cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
       }
@@ -1595,14 +1614,19 @@ class OptiXDevice : public Device {
   {
     if (mem.host_pointer)
       memset(mem.host_pointer, 0, mem.memory_size());
-    if (mem.host_pointer && mem.host_pointer == mem.shared_pointer)
-      return;  // This is shared host memory, so no device memory to update

     if (!mem.device_pointer)
       mem_alloc(mem);  // Need to allocate memory first if it does not exist yet

-    const CUDAContextScope scope(cuda_context);
-    check_result_cuda(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
+    /* If use_mapped_host of mem is false, mem.device_pointer currently
+     * refers to device memory regardless of mem.host_pointer and
+     * mem.shared_pointer. */
+
+    if (mem.device_pointer &&
+        (cuda_mem_map[&mem].use_mapped_host == false || mem.host_pointer != mem.shared_pointer)) {
+      const CUDAContextScope scope(cuda_context);
+      check_result_cuda(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
+    }
   }

   void mem_free(device_memory &mem) override
@@ -1624,16 +1648,21 @@ class OptiXDevice : public Device {
     CUDAContextScope scope(cuda_context);
     const CUDAMem &cmem = cuda_mem_map[&mem];

-    if (cmem.map_host_pointer) {
-      /* Free host memory. */
-      if (cmem.free_map_host) {
-        cuMemFreeHost(cmem.map_host_pointer);
-        if (mem.host_pointer == mem.shared_pointer) {
-          mem.host_pointer = 0;
+    /* If cmem.use_mapped_host is true, reference counting is used
+     * to safely free mapped host memory. */
+
+    if (cmem.use_mapped_host) {
+      assert(mem.shared_pointer);
+      if (mem.shared_pointer) {
+        assert(mem.shared_counter > 0);
+        if (--mem.shared_counter == 0) {
+          if (mem.host_pointer == mem.shared_pointer) {
+            mem.host_pointer = 0;
+          }
+          cuMemFreeHost(mem.shared_pointer);
+          mem.shared_pointer = 0;
         }
-        mem.shared_pointer = 0;
       }
-
       map_host_used -= mem.device_size;
     }
     else {
@@ -1699,7 +1728,7 @@ class OptiXDevice : public Device {
       }

       /* Already in host memory. */
-      if (cmem->map_host_pointer) {
+      if (cmem->use_mapped_host) {
         continue;
       }

--
cgit v1.2.3
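
For illustration, the reference-counting scheme that this patch attaches to mapped host memory can be sketched in isolation as below. This is a simplified, hypothetical example rather than Cycles code: SharedHostBuffer, device_alloc_mapped() and device_free_mapped() are made-up names, std::malloc/std::free stand in for cuMemHostAlloc/cuMemFreeHost, and synchronization is ignored. It demonstrates the behavior the patch relies on: every device that falls back to the shared host allocation takes a reference (shared_counter), and only the last device to release it actually frees the buffer, so one GPU can no longer free memory that another GPU still maps.

/* Minimal standalone sketch of the shared_pointer/shared_counter scheme.
 * Illustrative only; not part of the commit above. */

#include <cassert>
#include <cstdlib>

struct SharedHostBuffer {
  void *shared_pointer = nullptr; /* host allocation shared by all devices */
  int shared_counter = 0;         /* number of devices still referencing it */
};

/* A device that cannot fit the resource in device memory falls back to the
 * shared mapped host allocation, creating it on first use. */
static void device_alloc_mapped(SharedHostBuffer &buf, std::size_t size)
{
  if (buf.shared_pointer == nullptr) {
    buf.shared_pointer = std::malloc(size); /* stands in for cuMemHostAlloc */
  }
  buf.shared_counter++; /* this device now holds a reference */
}

/* Freeing from one device must not pull the allocation out from under the
 * other devices that still map it. */
static void device_free_mapped(SharedHostBuffer &buf)
{
  assert(buf.shared_counter > 0);
  if (--buf.shared_counter == 0) {
    std::free(buf.shared_pointer); /* stands in for cuMemFreeHost */
    buf.shared_pointer = nullptr;
  }
}

int main()
{
  SharedHostBuffer buf;
  device_alloc_mapped(buf, 1024); /* GPU 0 falls back to host mapped memory */
  device_alloc_mapped(buf, 1024); /* GPU 1 reuses the same allocation */
  device_free_mapped(buf);        /* GPU 0 frees: the buffer must survive */
  assert(buf.shared_pointer != nullptr);
  device_free_mapped(buf);        /* GPU 1 frees: now it is released */
  assert(buf.shared_pointer == nullptr);
  return 0;
}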