From 336ca6796a7a6ee26ff6d889643df07a37efa554 Mon Sep 17 00:00:00 2001 From: Sergey Sharybin Date: Mon, 22 Nov 2021 17:07:55 +0100 Subject: Fix T90308: Cycles crash copying memory from device to host Happens when device runs out of memory and Cycles is moving some textures to the host memory. The delayed memory free for OptiX BVH was moving data from one device_memory to another, leaving the original device memory in an invalid state. This was ruining the allocation map in the CUDA device which is using pointer to the device_memory. This change makes it so the memory pointer is stolen from BVH into the delayed memory free list. Additionally, forbid copying and moving instances of device_memory and added sanity checks in the device implementation. Differential Revision: https://developer.blender.org/D13316 --- intern/cycles/device/cuda/device_impl.cpp | 2 ++ intern/cycles/device/hip/device_impl.cpp | 2 ++ intern/cycles/device/memory.cpp | 39 ------------------------------ intern/cycles/device/memory.h | 9 +++++-- intern/cycles/device/optix/device_impl.cpp | 14 +++++------ intern/cycles/device/optix/device_impl.h | 3 ++- 6 files changed, 20 insertions(+), 49 deletions(-) (limited to 'intern/cycles/device') diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp index 2bb0592bcc5..20945796a2d 100644 --- a/intern/cycles/device/cuda/device_impl.cpp +++ b/intern/cycles/device/cuda/device_impl.cpp @@ -777,6 +777,7 @@ void CUDADevice::generic_free(device_memory &mem) if (mem.device_pointer) { CUDAContextScope scope(this); thread_scoped_lock lock(cuda_mem_map_mutex); + DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end()); const CUDAMem &cmem = cuda_mem_map[&mem]; /* If cmem.use_mapped_host is true, reference counting is used @@ -1145,6 +1146,7 @@ void CUDADevice::tex_free(device_texture &mem) if (mem.device_pointer) { CUDAContextScope scope(this); thread_scoped_lock lock(cuda_mem_map_mutex); + DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end()); const CUDAMem &cmem = cuda_mem_map[&mem]; if (cmem.texobject) { diff --git a/intern/cycles/device/hip/device_impl.cpp b/intern/cycles/device/hip/device_impl.cpp index 76e300eb132..5b0e2951c39 100644 --- a/intern/cycles/device/hip/device_impl.cpp +++ b/intern/cycles/device/hip/device_impl.cpp @@ -745,6 +745,7 @@ void HIPDevice::generic_free(device_memory &mem) if (mem.device_pointer) { HIPContextScope scope(this); thread_scoped_lock lock(hip_mem_map_mutex); + DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end()); const HIPMem &cmem = hip_mem_map[&mem]; /* If cmem.use_mapped_host is true, reference counting is used @@ -1114,6 +1115,7 @@ void HIPDevice::tex_free(device_texture &mem) if (mem.device_pointer) { HIPContextScope scope(this); thread_scoped_lock lock(hip_mem_map_mutex); + DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end()); const HIPMem &cmem = hip_mem_map[&mem]; if (cmem.texobject) { diff --git a/intern/cycles/device/memory.cpp b/intern/cycles/device/memory.cpp index f162b00d9f7..86bf2542c92 100644 --- a/intern/cycles/device/memory.cpp +++ b/intern/cycles/device/memory.cpp @@ -44,45 +44,6 @@ device_memory::device_memory(Device *device, const char *name, MemoryType type) { } -device_memory::device_memory(device_memory &&other) noexcept - : data_type(other.data_type), - data_elements(other.data_elements), - data_size(other.data_size), - device_size(other.device_size), - data_width(other.data_width), - data_height(other.data_height), - data_depth(other.data_depth), - type(other.type), - name(other.name), - device(other.device), - device_pointer(other.device_pointer), - host_pointer(other.host_pointer), - shared_pointer(other.shared_pointer), - shared_counter(other.shared_counter), - original_device_ptr(other.original_device_ptr), - original_device_size(other.original_device_size), - original_device(other.original_device), - need_realloc_(other.need_realloc_), - modified(other.modified) -{ - other.data_elements = 0; - other.data_size = 0; - other.device_size = 0; - other.data_width = 0; - other.data_height = 0; - other.data_depth = 0; - other.device = 0; - other.device_pointer = 0; - other.host_pointer = 0; - other.shared_pointer = 0; - other.shared_counter = 0; - other.original_device_ptr = 0; - other.original_device_size = 0; - other.original_device = 0; - other.need_realloc_ = false; - other.modified = false; -} - device_memory::~device_memory() { assert(shared_pointer == 0); diff --git a/intern/cycles/device/memory.h b/intern/cycles/device/memory.h index 281c54cc6a5..e04142117aa 100644 --- a/intern/cycles/device/memory.h +++ b/intern/cycles/device/memory.h @@ -281,11 +281,16 @@ class device_memory { /* Only create through subclasses. */ device_memory(Device *device, const char *name, MemoryType type); - device_memory(device_memory &&other) noexcept; - /* No copying allowed. */ + /* No copying and allowed. + * + * This is because device implementation might need to register device memory in an allocation + * map of some sort and use pointer as a key to identify blocks. Moving data from one place to + * another bypassing device allocation routines will make those maps hard to maintain. */ device_memory(const device_memory &) = delete; + device_memory(device_memory &&other) noexcept = delete; device_memory &operator=(const device_memory &) = delete; + device_memory &operator=(device_memory &&) = delete; /* Host allocation on the device. All host_pointer memory should be * allocated with these functions, for devices that support using diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp index bb690551c04..6e897e3831f 100644 --- a/intern/cycles/device/optix/device_impl.cpp +++ b/intern/cycles/device/optix/device_impl.cpp @@ -1032,7 +1032,7 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh, return false; } - device_only_memory &out_data = bvh->as_data; + device_only_memory &out_data = *bvh->as_data; if (operation == OPTIX_BUILD_OPERATION_BUILD) { assert(out_data.device == this); out_data.alloc_to_device(sizes.outputSizeInBytes); @@ -1123,7 +1123,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) operation = OPTIX_BUILD_OPERATION_UPDATE; } else { - bvh_optix->as_data.free(); + bvh_optix->as_data->free(); bvh_optix->traversable_handle = 0; } @@ -1344,9 +1344,9 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) unsigned int num_instances = 0; unsigned int max_num_instances = 0xFFFFFFFF; - bvh_optix->as_data.free(); + bvh_optix->as_data->free(); bvh_optix->traversable_handle = 0; - bvh_optix->motion_transform_data.free(); + bvh_optix->motion_transform_data->free(); optixDeviceContextGetProperty(context, OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID, @@ -1379,8 +1379,8 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) } } - assert(bvh_optix->motion_transform_data.device == this); - bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size); + assert(bvh_optix->motion_transform_data->device == this); + bvh_optix->motion_transform_data->alloc_to_device(total_motion_transform_size); } for (Object *ob : bvh->objects) { @@ -1441,7 +1441,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) motion_transform_offset = align_up(motion_transform_offset, OPTIX_TRANSFORM_BYTE_ALIGNMENT); - CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer + + CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data->device_pointer + motion_transform_offset; motion_transform_offset += motion_transform_size; diff --git a/intern/cycles/device/optix/device_impl.h b/intern/cycles/device/optix/device_impl.h index 5cfc249b430..1b43972d99f 100644 --- a/intern/cycles/device/optix/device_impl.h +++ b/intern/cycles/device/optix/device_impl.h @@ -23,6 +23,7 @@ # include "device/optix/queue.h" # include "device/optix/util.h" # include "kernel/types.h" +# include "util/unique_ptr.h" CCL_NAMESPACE_BEGIN @@ -76,7 +77,7 @@ class OptiXDevice : public CUDADevice { device_only_memory launch_params; OptixTraversableHandle tlas_handle = 0; - vector> delayed_free_bvh_memory; + vector>> delayed_free_bvh_memory; thread_mutex delayed_free_bvh_mutex; class Denoiser { -- cgit v1.2.3