From f4f8b6dde32b0438e0b97a6d8ebeb89802987127 Mon Sep 17 00:00:00 2001 From: Patrick Mours Date: Wed, 3 Mar 2021 14:35:50 +0100 Subject: Cycles: Change device-only memory to actually only allocate on the device This patch changes the `MEM_DEVICE_ONLY` type to only allocate on the device and fail if that is not possible anymore because out-of-memory (since OptiX acceleration structures may not be allocated in host memory). It also fixes high peak memory usage during OptiX acceleration structure building. Reviewed By: brecht Maniphest Tasks: T85985 Differential Revision: https://developer.blender.org/D10535 --- intern/cycles/bvh/bvh_optix.cpp | 4 ++-- intern/cycles/device/cuda/device_cuda_impl.cpp | 12 +++++++++--- intern/cycles/device/device_cpu.cpp | 5 ++--- intern/cycles/device/device_denoising.h | 3 ++- intern/cycles/device/device_memory.h | 4 ++-- intern/cycles/device/device_optix.cpp | 23 ++++++++++++++++------- 6 files changed, 33 insertions(+), 18 deletions(-) (limited to 'intern') diff --git a/intern/cycles/bvh/bvh_optix.cpp b/intern/cycles/bvh/bvh_optix.cpp index e094f339ede..d630e8965dc 100644 --- a/intern/cycles/bvh/bvh_optix.cpp +++ b/intern/cycles/bvh/bvh_optix.cpp @@ -27,8 +27,8 @@ BVHOptiX::BVHOptiX(const BVHParams &params_, Device *device) : BVH(params_, geometry_, objects_), traversable_handle(0), - as_data(device, params_.top_level ? "optix tlas" : "optix blas"), - motion_transform_data(device, "optix motion transform") + as_data(device, params_.top_level ? 
"optix tlas" : "optix blas", false), + motion_transform_data(device, "optix motion transform", false) { } diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp index 44a51835f4c..5b62292ca55 100644 --- a/intern/cycles/device/cuda/device_cuda_impl.cpp +++ b/intern/cycles/device/cuda/device_cuda_impl.cpp @@ -854,7 +854,7 @@ CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_ void *shared_pointer = 0; - if (mem_alloc_result != CUDA_SUCCESS && can_map_host) { + if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) { if (mem.shared_pointer) { /* Another device already allocated host memory. */ mem_alloc_result = CUDA_SUCCESS; @@ -877,8 +877,14 @@ CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_ } if (mem_alloc_result != CUDA_SUCCESS) { - status = " failed, out of device and host memory"; - set_error("System is out of GPU and shared host memory"); + if (mem.type == MEM_DEVICE_ONLY) { + status = " failed, out of device memory"; + set_error("System is out of GPU memory"); + } + else { + status = " failed, out of device and host memory"; + set_error("System is out of GPU and shared host memory"); + } } if (mem.name) { diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index fdfd3f83be6..e2f9c7391da 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -396,8 +396,7 @@ class CPUDevice : public Device { << string_human_readable_size(mem.memory_size()) << ")"; } - if (mem.type == MEM_DEVICE_ONLY) { - assert(!mem.host_pointer); + if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) { size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES; void *data = util_aligned_malloc(mem.memory_size(), alignment); mem.device_pointer = (device_ptr)data; @@ -459,7 +458,7 @@ class CPUDevice : public Device { tex_free((device_texture &)mem); } else if (mem.device_pointer) { - if 
(mem.type == MEM_DEVICE_ONLY) { + if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) { util_aligned_free((void *)mem.device_pointer); } mem.device_pointer = 0; diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h index 2c0dc23b44a..bb8bdfdd225 100644 --- a/intern/cycles/device/device_denoising.h +++ b/intern/cycles/device/device_denoising.h @@ -171,7 +171,8 @@ class DenoisingTask { bool gpu_temporary_mem; DenoiseBuffers(Device *device) - : mem(device, "denoising pixel buffer"), temporary_mem(device, "denoising temporary mem") + : mem(device, "denoising pixel buffer"), + temporary_mem(device, "denoising temporary mem", true) { } } buffer; diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h index 1f63a152458..97459b9ae6a 100644 --- a/intern/cycles/device/device_memory.h +++ b/intern/cycles/device/device_memory.h @@ -270,8 +270,8 @@ class device_memory { template class device_only_memory : public device_memory { public: - device_only_memory(Device *device, const char *name) - : device_memory(device, name, MEM_DEVICE_ONLY) + device_only_memory(Device *device, const char *name, bool allow_host_memory_fallback = false) + : device_memory(device, name, allow_host_memory_fallback ? 
MEM_READ_WRITE : MEM_DEVICE_ONLY) { data_type = device_type_traits::data_type; data_elements = max(device_type_traits::num_elements, 1); diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp index 07ce63f5394..51e1a0033ba 100644 --- a/intern/cycles/device/device_optix.cpp +++ b/intern/cycles/device/device_optix.cpp @@ -197,8 +197,8 @@ class OptiXDevice : public CUDADevice { OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_) : CUDADevice(info_, stats_, profiler_, background_), sbt_data(this, "__sbt", MEM_READ_ONLY), - launch_params(this, "__params"), - denoiser_state(this, "__denoiser_state") + launch_params(this, "__params", false), + denoiser_state(this, "__denoiser_state", true) { // Store number of CUDA streams in device info info.cpu_threads = DebugFlags().optix.cuda_streams; @@ -878,8 +878,8 @@ class OptiXDevice : public CUDADevice { device_ptr input_ptr = rtile.buffer + pixel_offset; // Copy tile data into a common buffer if necessary - device_only_memory input(this, "denoiser input"); - device_vector tile_info_mem(this, "denoiser tile info", MEM_READ_WRITE); + device_only_memory input(this, "denoiser input", true); + device_vector tile_info_mem(this, "denoiser tile info", MEM_READ_ONLY); bool contiguous_memory = true; for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { @@ -924,7 +924,7 @@ class OptiXDevice : public CUDADevice { } # if OPTIX_DENOISER_NO_PIXEL_STRIDE - device_only_memory input_rgb(this, "denoiser input rgb"); + device_only_memory input_rgb(this, "denoiser input rgb", true); input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.input_passes); void *input_args[] = {&input_rgb.device_pointer, @@ -1146,6 +1146,13 @@ class OptiXDevice : public CUDADevice { const OptixBuildInput &build_input, uint16_t num_motion_steps) { + /* Allocate and build acceleration structures only one at a time, to prevent parallel builds + * from running out of memory (since both 
original and compacted acceleration structure memory + * may be allocated at the same time for the duration of this function). The builds would + * otherwise happen on the same CUDA stream anyway. */ + static thread_mutex mutex; + thread_scoped_lock lock(mutex); + const CUDAContextScope scope(cuContext); // Compute memory usage @@ -1170,11 +1177,12 @@ class OptiXDevice : public CUDADevice { optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes)); // Allocate required output buffers - device_only_memory temp_mem(this, "optix temp as build mem"); + device_only_memory temp_mem(this, "optix temp as build mem", true); temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8); if (!temp_mem.device_pointer) return false; // Make sure temporary memory allocation succeeded + // Acceleration structure memory has to be allocated on the device (not allowed to be on host) device_only_memory &out_data = bvh->as_data; if (operation == OPTIX_BUILD_OPERATION_BUILD) { assert(out_data.device == this); @@ -1222,7 +1230,7 @@ class OptiXDevice : public CUDADevice { // There is no point compacting if the size does not change if (compacted_size < sizes.outputSizeInBytes) { - device_only_memory compacted_data(this, "optix compacted as"); + device_only_memory compacted_data(this, "optix compacted as", false); compacted_data.alloc_to_device(compacted_size); if (!compacted_data.device_pointer) // Do not compact if memory allocation for compacted acceleration structure fails @@ -1242,6 +1250,7 @@ class OptiXDevice : public CUDADevice { std::swap(out_data.device_size, compacted_data.device_size); std::swap(out_data.device_pointer, compacted_data.device_pointer); + // Original acceleration structure memory is freed when 'compacted_data' goes out of scope } } -- cgit v1.2.3