From 9a9e93e8043b515360eacd71c20b193c9e15204d Mon Sep 17 00:00:00 2001
From: Ha Hyung-jin
Date: Tue, 5 Nov 2019 16:27:52 +0100
Subject: Fix T71071: errors when using multiple CUDA/Optix GPUs and host mapped memory

The multi device code did not correctly handle cases where some GPUs store
a resource in device memory and others store it in host mapped memory.

Differential Revision: https://developer.blender.org/D6126
---
 intern/cycles/device/device_cuda.cpp   | 111 +++++++++++++++++++++------------
 intern/cycles/device/device_memory.cpp |   9 ++-
 intern/cycles/device/device_memory.h   |   2 +
 intern/cycles/device/device_optix.cpp  | 107 +++++++++++++++++++------------
 4 files changed, 147 insertions(+), 82 deletions(-)

(limited to 'intern')

diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 00dd37f089c..b0a12c258b9 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -142,14 +142,15 @@ class CUDADevice : public Device {
   CUDASplitKernel *split_kernel;

   struct CUDAMem {
-    CUDAMem() : texobject(0), array(0), map_host_pointer(0), free_map_host(false)
+    CUDAMem() : texobject(0), array(0), use_mapped_host(false)
     {
     }

     CUtexObject texobject;
     CUarray array;
-    void *map_host_pointer;
-    bool free_map_host;
+
+    /* If true, mapped host memory in shared_pointer is being used. */
+    bool use_mapped_host;
   };
   typedef map<device_memory *, CUDAMem> CUDAMemMap;
   CUDAMemMap cuda_mem_map;
@@ -726,7 +727,7 @@ class CUDADevice : public Device {
       }

       /* Already in host memory. */
-      if (cmem->map_host_pointer) {
+      if (cmem->use_mapped_host) {
         continue;
       }

@@ -801,7 +802,7 @@ class CUDADevice : public Device {
     cuMemGetInfo(&free, &total);

     /* Move textures to host memory if needed. */
-    if (!move_texture_to_host && !is_image && (size + headroom) >= free) {
+    if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
       move_textures_to_host(size + headroom - free, is_texture);
       cuMemGetInfo(&free, &total);
     }
@@ -815,49 +816,36 @@ class CUDADevice : public Device {
     }

     /* Fall back to mapped host memory if needed and possible. */
-    void *map_host_pointer = 0;
-    bool free_map_host = false;
-    if (mem_alloc_result != CUDA_SUCCESS && can_map_host &&
-        map_host_used + size < map_host_limit) {
+    void *shared_pointer = 0;
+
+    if (mem_alloc_result != CUDA_SUCCESS && can_map_host) {
       if (mem.shared_pointer) {
         /* Another device already allocated host memory. */
         mem_alloc_result = CUDA_SUCCESS;
-        map_host_pointer = mem.shared_pointer;
+        shared_pointer = mem.shared_pointer;
       }
-      else {
+      else if (map_host_used + size < map_host_limit) {
         /* Allocate host memory ourselves. */
         mem_alloc_result = cuMemHostAlloc(
-            &map_host_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
-        mem.shared_pointer = map_host_pointer;
-        free_map_host = true;
+            &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
+
+        assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
+               (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
       }

       if (mem_alloc_result == CUDA_SUCCESS) {
-        cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, mem.shared_pointer, 0));
+        cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
         map_host_used += size;
         status = " in host memory";
-
-        /* Replace host pointer with our host allocation. Only works if
-         * CUDA memory layout is the same and has no pitch padding. Also
-         * does not work if we move textures to host during a render,
-         * since other devices might be using the memory. */
-        if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
-            mem.host_pointer != mem.shared_pointer) {
-          memcpy(mem.shared_pointer, mem.host_pointer, size);
-          mem.host_free();
-          mem.host_pointer = mem.shared_pointer;
-        }
       }
       else {
         status = " failed, out of host memory";
       }
     }
-    else if (mem_alloc_result != CUDA_SUCCESS) {
-      status = " failed, out of device and host memory";
-    }

     if (mem_alloc_result != CUDA_SUCCESS) {
+      status = " failed, out of device and host memory";
       cuda_assert(mem_alloc_result);
     }

@@ -877,8 +865,34 @@ class CUDADevice : public Device {

     /* Insert into map of allocations. */
     CUDAMem *cmem = &cuda_mem_map[&mem];
-    cmem->map_host_pointer = map_host_pointer;
-    cmem->free_map_host = free_map_host;
+    if (shared_pointer != 0) {
+      /* Replace host pointer with our host allocation. Only works if
+       * CUDA memory layout is the same and has no pitch padding. Also
+       * does not work if we move textures to host during a render,
+       * since other devices might be using the memory. */
+
+      if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
+          mem.host_pointer != shared_pointer) {
+        memcpy(shared_pointer, mem.host_pointer, size);
+
+        /* A call to device_memory::host_free() should be preceded by
+         * a call to device_memory::device_free() for host memory
+         * allocated by a device to be handled properly. Two exceptions
+         * are here and a call in OptiXDevice::generic_alloc(), where
+         * the current host memory can be assumed to be allocated by
+         * device_memory::host_alloc(), not by a device. */
+
+        mem.host_free();
+        mem.host_pointer = shared_pointer;
+      }
+      mem.shared_pointer = shared_pointer;
+      mem.shared_counter++;
+      cmem->use_mapped_host = true;
+    }
+    else {
+      cmem->use_mapped_host = false;
+    }
+
     return cmem;
   }

@@ -887,7 +901,12 @@ class CUDADevice : public Device {
     if (mem.host_pointer && mem.device_pointer) {
       CUDAContextScope scope(this);

-      if (mem.host_pointer != mem.shared_pointer) {
+      /* If use_mapped_host of mem is false, the current device only
+       * uses device memory allocated by cuMemAlloc regardless of
+       * mem.host_pointer and mem.shared_pointer, and should copy
+       * data from mem.host_pointer. */
+
+      if (cuda_mem_map[&mem].use_mapped_host == false || mem.host_pointer != mem.shared_pointer) {
         cuda_assert(cuMemcpyHtoD(
             cuda_device_ptr(mem.device_pointer), mem.host_pointer, mem.memory_size()));
       }
@@ -900,16 +919,21 @@ class CUDADevice : public Device {
       CUDAContextScope scope(this);
       const CUDAMem &cmem = cuda_mem_map[&mem];

-      if (cmem.map_host_pointer) {
-        /* Free host memory. */
-        if (cmem.free_map_host) {
-          cuMemFreeHost(cmem.map_host_pointer);
-          if (mem.host_pointer == mem.shared_pointer) {
-            mem.host_pointer = 0;
+      /* If cmem.use_mapped_host is true, reference counting is used
+       * to safely free mapped host memory. */
+
+      if (cmem.use_mapped_host) {
+        assert(mem.shared_pointer);
+        if (mem.shared_pointer) {
+          assert(mem.shared_counter > 0);
+          if (--mem.shared_counter == 0) {
+            if (mem.host_pointer == mem.shared_pointer) {
+              mem.host_pointer = 0;
+            }
+            cuMemFreeHost(mem.shared_pointer);
+            mem.shared_pointer = 0;
           }
-          mem.shared_pointer = 0;
         }
-
         map_host_used -= mem.device_size;
       }
       else {
@@ -989,7 +1013,12 @@ class CUDADevice : public Device {
       memset(mem.host_pointer, 0, mem.memory_size());
     }

-    if (mem.device_pointer && (!mem.host_pointer || mem.host_pointer != mem.shared_pointer)) {
+    /* If use_mapped_host of mem is false, mem.device_pointer currently
+     * refers to device memory regardless of mem.host_pointer and
+     * mem.shared_pointer. */
+
+    if (mem.device_pointer &&
+        (cuda_mem_map[&mem].use_mapped_host == false || mem.host_pointer != mem.shared_pointer)) {
       CUDAContextScope scope(this);
       cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()));
     }
diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp
index c8a71bf4b3b..c106b4505db 100644
--- a/intern/cycles/device/device_memory.cpp
+++ b/intern/cycles/device/device_memory.cpp
@@ -36,12 +36,15 @@ device_memory::device_memory(Device *device, const char *name, MemoryType type)
       device(device),
       device_pointer(0),
       host_pointer(0),
-      shared_pointer(0)
+      shared_pointer(0),
+      shared_counter(0)
 {
 }

 device_memory::~device_memory()
 {
+  assert(shared_pointer == 0);
+  assert(shared_counter == 0);
 }

 device_memory::device_memory(device_memory &&other)
@@ -59,12 +62,14 @@ device_memory::device_memory(device_memory &&other)
       device(other.device),
       device_pointer(other.device_pointer),
       host_pointer(other.host_pointer),
-      shared_pointer(other.shared_pointer)
+      shared_pointer(other.shared_pointer),
+      shared_counter(other.shared_counter)
 {
   other.device_size = 0;
   other.device_pointer = 0;
   other.host_pointer = 0;
   other.shared_pointer = 0;
+  other.shared_counter = 0;
 }

 void *device_memory::host_alloc(size_t size)
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 272da6a9ad3..f8324e2a214 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -216,6 +216,8 @@ class device_memory {
   device_ptr device_pointer;
   void *host_pointer;
   void *shared_pointer;
+  /* reference counter for shared_pointer */
+  int shared_counter;

   virtual ~device_memory();

diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
index 370e3cf4085..3162b72a0dc 100644
--- a/intern/cycles/device/device_optix.cpp
+++ b/intern/cycles/device/device_optix.cpp
@@ -138,7 +138,7 @@ class OptiXDevice : public Device {
     bool free_map_host = false;
     CUarray array = NULL;
     CUtexObject texobject = 0;
-    void *map_host_pointer = nullptr;
+    bool use_mapped_host = false;
   };

   // Helper class to manage current CUDA context
@@ -1234,7 +1234,7 @@ class OptiXDevice : public Device {
     cuMemGetInfo(&free, &total);

     /* Move textures to host memory if needed. */
-    if (!move_texture_to_host && !is_image && (size + headroom) >= free) {
+    if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
       move_textures_to_host(size + headroom - free, is_texture);
       cuMemGetInfo(&free, &total);
     }
@@ -1248,39 +1248,27 @@ class OptiXDevice : public Device {
     }
     /* Fall back to mapped host memory if needed and possible. */
-    void *map_host_pointer = 0;
-    bool free_map_host = false;
+    void *shared_pointer = 0;

-    if (mem_alloc_result != CUDA_SUCCESS && can_map_host &&
-        map_host_used + size < map_host_limit) {
+    if (mem_alloc_result != CUDA_SUCCESS && can_map_host) {
       if (mem.shared_pointer) {
         /* Another device already allocated host memory. */
         mem_alloc_result = CUDA_SUCCESS;
-        map_host_pointer = mem.shared_pointer;
+        shared_pointer = mem.shared_pointer;
       }
-      else {
+      else if (map_host_used + size < map_host_limit) {
         /* Allocate host memory ourselves. */
         mem_alloc_result = cuMemHostAlloc(
-            &map_host_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
-        mem.shared_pointer = map_host_pointer;
-        free_map_host = true;
+            &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
+
+        assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
+               (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
       }

       if (mem_alloc_result == CUDA_SUCCESS) {
-        cuMemHostGetDevicePointer_v2(&device_pointer, mem.shared_pointer, 0);
+        cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0);
         map_host_used += size;
         status = " in host memory";
-
-        /* Replace host pointer with our host allocation. Only works if
-         * CUDA memory layout is the same and has no pitch padding. Also
-         * does not work if we move textures to host during a render,
-         * since other devices might be using the memory. */
-        if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
-            mem.host_pointer != mem.shared_pointer) {
-          memcpy(mem.shared_pointer, mem.host_pointer, size);
-          mem.host_free();
-          mem.host_pointer = mem.shared_pointer;
-        }
       }
       else {
         status = " failed, out of host memory";
       }
@@ -1311,8 +1299,34 @@ class OptiXDevice : public Device {

     /* Insert into map of allocations. */
     CUDAMem *cmem = &cuda_mem_map[&mem];
-    cmem->map_host_pointer = map_host_pointer;
-    cmem->free_map_host = free_map_host;
+    if (shared_pointer != 0) {
+      /* Replace host pointer with our host allocation. Only works if
+       * CUDA memory layout is the same and has no pitch padding. Also
+       * does not work if we move textures to host during a render,
+       * since other devices might be using the memory. */
+
+      if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
+          mem.host_pointer != shared_pointer) {
+        memcpy(shared_pointer, mem.host_pointer, size);
+
+        /* A call to device_memory::host_free() should be preceded by
+         * a call to device_memory::device_free() for host memory
+         * allocated by a device to be handled properly. Two exceptions
+         * are here and a call in CUDADevice::generic_alloc(), where
+         * the current host memory can be assumed to be allocated by
+         * device_memory::host_alloc(), not by a device. */
+
+        mem.host_free();
+        mem.host_pointer = shared_pointer;
+      }
+      mem.shared_pointer = shared_pointer;
+      mem.shared_counter++;
+      cmem->use_mapped_host = true;
+    }
+    else {
+      cmem->use_mapped_host = false;
+    }
+
     return cmem;
   }

@@ -1560,7 +1574,12 @@ class OptiXDevice : public Device {
     if (mem.host_pointer && mem.device_pointer) {
       CUDAContextScope scope(cuda_context);

-      if (mem.host_pointer != mem.shared_pointer) {
+      /* If use_mapped_host of mem is false, the current device only
+       * uses device memory allocated by cuMemAlloc regardless of
+       * mem.host_pointer and mem.shared_pointer, and should copy
+       * data from mem.host_pointer. */
+
+      if (cuda_mem_map[&mem].use_mapped_host == false || mem.host_pointer != mem.shared_pointer) {
         check_result_cuda(
             cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
       }
@@ -1595,14 +1614,19 @@ class OptiXDevice : public Device {
   {
     if (mem.host_pointer)
       memset(mem.host_pointer, 0, mem.memory_size());
-    if (mem.host_pointer && mem.host_pointer == mem.shared_pointer)
-      return;  // This is shared host memory, so no device memory to update

     if (!mem.device_pointer)
       mem_alloc(mem);  // Need to allocate memory first if it does not exist yet

-    const CUDAContextScope scope(cuda_context);
-    check_result_cuda(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
+    /* If use_mapped_host of mem is false, mem.device_pointer currently
+     * refers to device memory regardless of mem.host_pointer and
+     * mem.shared_pointer. */
+
+    if (mem.device_pointer &&
+        (cuda_mem_map[&mem].use_mapped_host == false || mem.host_pointer != mem.shared_pointer)) {
+      const CUDAContextScope scope(cuda_context);
+      check_result_cuda(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
+    }
   }

   void mem_free(device_memory &mem) override
@@ -1624,16 +1648,21 @@ class OptiXDevice : public Device {
     CUDAContextScope scope(cuda_context);
     const CUDAMem &cmem = cuda_mem_map[&mem];

-    if (cmem.map_host_pointer) {
-      /* Free host memory. */
-      if (cmem.free_map_host) {
-        cuMemFreeHost(cmem.map_host_pointer);
-        if (mem.host_pointer == mem.shared_pointer) {
-          mem.host_pointer = 0;
+    /* If cmem.use_mapped_host is true, reference counting is used
+     * to safely free mapped host memory. */
+
+    if (cmem.use_mapped_host) {
+      assert(mem.shared_pointer);
+      if (mem.shared_pointer) {
+        assert(mem.shared_counter > 0);
+        if (--mem.shared_counter == 0) {
+          if (mem.host_pointer == mem.shared_pointer) {
+            mem.host_pointer = 0;
+          }
+          cuMemFreeHost(mem.shared_pointer);
+          mem.shared_pointer = 0;
         }
-        mem.shared_pointer = 0;
       }
-
       map_host_used -= mem.device_size;
     }
     else {
@@ -1699,7 +1728,7 @@ class OptiXDevice : public Device {
       }

       /* Already in host memory. */
-      if (cmem->map_host_pointer) {
+      if (cmem->use_mapped_host) {
         continue;
       }

--
cgit v1.2.3
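
For illustration, the reference-counting scheme that this patch attaches to mapped host memory can be sketched in isolation as below. This is a simplified, hypothetical example rather than Cycles code: SharedHostBuffer, device_alloc_mapped() and device_free_mapped() are made-up names, std::malloc/std::free stand in for cuMemHostAlloc/cuMemFreeHost, and synchronization is ignored. It demonstrates the behavior the patch relies on: every device that falls back to the shared host allocation takes a reference (shared_counter), and only the last device to release it actually frees the buffer, so one GPU can no longer free memory that another GPU still maps.

/* Minimal standalone sketch of the shared_pointer/shared_counter scheme.
 * Illustrative only; not part of the commit above. */

#include <cassert>
#include <cstdlib>

struct SharedHostBuffer {
  void *shared_pointer = nullptr; /* host allocation shared by all devices */
  int shared_counter = 0;         /* number of devices still referencing it */
};

/* A device that cannot fit the resource in device memory falls back to the
 * shared mapped host allocation, creating it on first use. */
static void device_alloc_mapped(SharedHostBuffer &buf, std::size_t size)
{
  if (buf.shared_pointer == nullptr) {
    buf.shared_pointer = std::malloc(size); /* stands in for cuMemHostAlloc */
  }
  buf.shared_counter++; /* this device now holds a reference */
}

/* Freeing from one device must not pull the allocation out from under the
 * other devices that still map it. */
static void device_free_mapped(SharedHostBuffer &buf)
{
  assert(buf.shared_counter > 0);
  if (--buf.shared_counter == 0) {
    std::free(buf.shared_pointer); /* stands in for cuMemFreeHost */
    buf.shared_pointer = nullptr;
  }
}

int main()
{
  SharedHostBuffer buf;
  device_alloc_mapped(buf, 1024); /* GPU 0 falls back to host mapped memory */
  device_alloc_mapped(buf, 1024); /* GPU 1 reuses the same allocation */
  device_free_mapped(buf);        /* GPU 0 frees: the buffer must survive */
  assert(buf.shared_pointer != nullptr);
  device_free_mapped(buf);        /* GPU 1 frees: now it is released */
  assert(buf.shared_pointer == nullptr);
  return 0;
}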