-rw-r--r--  intern/cycles/device/device_memory.h                    |   1
-rw-r--r--  intern/cycles/device/device_optix.cpp                   | 675
-rw-r--r--  source/blender/windowmanager/intern/wm_splash_screen.c  |   2
3 files changed, 504 insertions(+), 174 deletions(-)
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h index 5b43ce8b0bc..272da6a9ad3 100644 --- a/intern/cycles/device/device_memory.h +++ b/intern/cycles/device/device_memory.h @@ -224,6 +224,7 @@ class device_memory { protected: friend class CUDADevice; + friend class OptiXDevice; /* Only create through subclasses. */ device_memory(Device *device, const char *name, MemoryType type); diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp index 6f4734059da..e230662e698 100644 --- a/intern/cycles/device/device_optix.cpp +++ b/intern/cycles/device/device_optix.cpp @@ -177,7 +177,14 @@ class OptiXDevice : public Device { vector<device_only_memory<uint8_t>> blas; OptixTraversableHandle tlas_handle = 0; + // TODO(pmours): This is copied from device_cuda.cpp, so move to common code eventually + int can_map_host = 0; + size_t map_host_used = 0; + size_t map_host_limit = 0; + size_t device_working_headroom = 32 * 1024 * 1024LL; // 32MB + size_t device_texture_headroom = 128 * 1024 * 1024LL; // 128MB map<device_memory *, CUDAMem> cuda_mem_map; + bool move_texture_to_host = false; public: OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_) @@ -199,6 +206,25 @@ class OptiXDevice : public Device { // Make that CUDA context current const CUDAContextScope scope(cuda_context); + // Limit amount of host mapped memory (see init_host_memory in device_cuda.cpp) + size_t default_limit = 4 * 1024 * 1024 * 1024LL; + size_t system_ram = system_physical_ram(); + if (system_ram > 0) { + if (system_ram / 2 > default_limit) { + map_host_limit = system_ram - default_limit; + } + else { + map_host_limit = system_ram / 2; + } + } + else { + VLOG(1) << "Mapped host memory disabled, failed to get system RAM"; + } + + // Check device support for pinned host memory + check_result_cuda( + cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuda_device)); + // Create OptiX context for this device OptixDeviceContextOptions options = {}; # ifdef WITH_CYCLES_LOGGING @@ -826,6 +852,7 @@ class OptiXDevice : public Device { device_only_memory<char> temp_mem(this, "temp_build_mem"); temp_mem.alloc_to_device(sizes.tempSizeInBytes); + out_data.type = MEM_DEVICE_ONLY; out_data.data_type = TYPE_UNKNOWN; out_data.data_elements = 1; out_data.data_size = sizes.outputSizeInBytes; @@ -1168,131 +1195,162 @@ class OptiXDevice : public Device { void mem_alloc(device_memory &mem) override { - const CUDAContextScope scope(cuda_context); + if (mem.type == MEM_PIXELS && !background) { + assert(!"mem_alloc not supported for pixels."); + } + else if (mem.type == MEM_TEXTURE) { + assert(!"mem_alloc not supported for textures."); + } + else { + generic_alloc(mem); + } + } - mem.device_size = mem.memory_size(); - - if (mem.type == MEM_TEXTURE && mem.interpolation != INTERPOLATION_NONE) { - CUDAMem &cmem = cuda_mem_map[&mem]; // Lock and get associated memory information - - CUDA_TEXTURE_DESC tex_desc = {}; - tex_desc.flags = CU_TRSF_NORMALIZED_COORDINATES; - CUDA_RESOURCE_DESC res_desc = {}; - - switch (mem.extension) { - default: - assert(0); - case EXTENSION_REPEAT: - tex_desc.addressMode[0] = tex_desc.addressMode[1] = tex_desc.addressMode[2] = - CU_TR_ADDRESS_MODE_WRAP; - break; - case EXTENSION_EXTEND: - tex_desc.addressMode[0] = tex_desc.addressMode[1] = tex_desc.addressMode[2] = - CU_TR_ADDRESS_MODE_CLAMP; - break; - case EXTENSION_CLIP: - tex_desc.addressMode[0] = tex_desc.addressMode[1] = tex_desc.addressMode[2] = - 
CU_TR_ADDRESS_MODE_BORDER; - break; - } + CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0) + { + CUDAContextScope scope(cuda_context); - switch (mem.interpolation) { - default: // Default to linear for unsupported interpolation types - case INTERPOLATION_LINEAR: - tex_desc.filterMode = CU_TR_FILTER_MODE_LINEAR; - break; - case INTERPOLATION_CLOSEST: - tex_desc.filterMode = CU_TR_FILTER_MODE_POINT; - break; - } + CUdeviceptr device_pointer = 0; + size_t size = mem.memory_size() + pitch_padding; - CUarray_format format; - switch (mem.data_type) { - default: - assert(0); - case TYPE_UCHAR: - format = CU_AD_FORMAT_UNSIGNED_INT8; - break; - case TYPE_UINT16: - format = CU_AD_FORMAT_UNSIGNED_INT16; - break; - case TYPE_UINT: - format = CU_AD_FORMAT_UNSIGNED_INT32; - break; - case TYPE_INT: - format = CU_AD_FORMAT_SIGNED_INT32; - break; - case TYPE_FLOAT: - format = CU_AD_FORMAT_FLOAT; - break; - case TYPE_HALF: - format = CU_AD_FORMAT_HALF; - break; - } + CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY; + const char *status = ""; - if (mem.data_depth > 1) { /* 3D texture using array. */ - CUDA_ARRAY3D_DESCRIPTOR desc; - desc.Width = mem.data_width; - desc.Height = mem.data_height; - desc.Depth = mem.data_depth; - desc.Format = format; - desc.NumChannels = mem.data_elements; - desc.Flags = 0; + /* First try allocating in device memory, respecting headroom. We make + * an exception for texture info. It is small and frequently accessed, + * so treat it as working memory. + * + * If there is not enough room for working memory, we will try to move + * textures to host memory, assuming the performance impact would have + * been worse for working memory. */ + bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); - check_result_cuda(cuArray3DCreate(&cmem.array, &desc)); - mem.device_pointer = (device_ptr)cmem.array; + size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom; - res_desc.resType = CU_RESOURCE_TYPE_ARRAY; - res_desc.res.array.hArray = cmem.array; + size_t total = 0, free = 0; + cuMemGetInfo(&free, &total); + + /* Move textures to host memory if needed. */ + if (!move_texture_to_host && !is_image && (size + headroom) >= free) { + move_textures_to_host(size + headroom - free, is_texture); + cuMemGetInfo(&free, &total); + } + + /* Allocate in device memory. */ + if (!move_texture_to_host && (size + headroom) < free) { + mem_alloc_result = cuMemAlloc(&device_pointer, size); + if (mem_alloc_result == CUDA_SUCCESS) { + status = " in device memory"; } - else if (mem.data_height > 0) { /* 2D texture using array. */ - CUDA_ARRAY_DESCRIPTOR desc; - desc.Width = mem.data_width; - desc.Height = mem.data_height; - desc.Format = format; - desc.NumChannels = mem.data_elements; - - check_result_cuda(cuArrayCreate(&cmem.array, &desc)); - mem.device_pointer = (device_ptr)cmem.array; - - res_desc.resType = CU_RESOURCE_TYPE_ARRAY; - res_desc.res.array.hArray = cmem.array; + } + + /* Fall back to mapped host memory if needed and possible. */ + void *map_host_pointer = 0; + bool free_map_host = false; + + if (mem_alloc_result != CUDA_SUCCESS && can_map_host && + map_host_used + size < map_host_limit) { + if (mem.shared_pointer) { + /* Another device already allocated host memory. */ + mem_alloc_result = CUDA_SUCCESS; + map_host_pointer = mem.shared_pointer; } else { - check_result_cuda(cuMemAlloc((CUdeviceptr *)&mem.device_pointer, mem.device_size)); + /* Allocate host memory ourselves. 
*/ + mem_alloc_result = cuMemHostAlloc( + &map_host_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED); + mem.shared_pointer = map_host_pointer; + free_map_host = true; + } - res_desc.resType = CU_RESOURCE_TYPE_LINEAR; - res_desc.res.linear.devPtr = (CUdeviceptr)mem.device_pointer; - res_desc.res.linear.format = format; - res_desc.res.linear.numChannels = mem.data_elements; - res_desc.res.linear.sizeInBytes = mem.device_size; + if (mem_alloc_result == CUDA_SUCCESS) { + cuMemHostGetDevicePointer_v2(&device_pointer, mem.shared_pointer, 0); + map_host_used += size; + status = " in host memory"; + + /* Replace host pointer with our host allocation. Only works if + * CUDA memory layout is the same and has no pitch padding. Also + * does not work if we move textures to host during a render, + * since other devices might be using the memory. */ + if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer && + mem.host_pointer != mem.shared_pointer) { + memcpy(mem.shared_pointer, mem.host_pointer, size); + mem.host_free(); + mem.host_pointer = mem.shared_pointer; + } } + else { + status = " failed, out of host memory"; + } + } + else if (mem_alloc_result != CUDA_SUCCESS) { + status = " failed, out of device and host memory"; + } - check_result_cuda(cuTexObjectCreate(&cmem.texobject, &res_desc, &tex_desc, NULL)); + if (mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")" << status; + } - int flat_slot = 0; - if (string_startswith(mem.name, "__tex_image")) { - flat_slot = atoi(mem.name + string(mem.name).rfind("_") + 1); - } + if (mem_alloc_result != CUDA_SUCCESS) { + set_error(string_printf("Buffer allocate %s", status)); + return NULL; + } + + mem.device_pointer = (device_ptr)device_pointer; + mem.device_size = size; + stats.mem_alloc(size); + + if (!mem.device_pointer) { + return NULL; + } - if (flat_slot >= texture_info.size()) - texture_info.resize(flat_slot + 128); + /* Insert into map of allocations. 
*/ + CUDAMem *cmem = &cuda_mem_map[&mem]; + cmem->map_host_pointer = map_host_pointer; + cmem->free_map_host = free_map_host; + return cmem; + } - TextureInfo &info = texture_info[flat_slot]; - info.data = (uint64_t)cmem.texobject; - info.cl_buffer = 0; - info.interpolation = mem.interpolation; - info.extension = mem.extension; - info.width = mem.data_width; - info.height = mem.data_height; - info.depth = mem.data_depth; + void tex_alloc(device_memory &mem) + { + CUDAContextScope scope(cuda_context); + + /* General variables for both architectures */ + string bind_name = mem.name; + size_t dsize = datatype_size(mem.data_type); + size_t size = mem.memory_size(); + + CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; + switch (mem.extension) { + case EXTENSION_REPEAT: + address_mode = CU_TR_ADDRESS_MODE_WRAP; + break; + case EXTENSION_EXTEND: + address_mode = CU_TR_ADDRESS_MODE_CLAMP; + break; + case EXTENSION_CLIP: + address_mode = CU_TR_ADDRESS_MODE_BORDER; + break; + default: + assert(0); + break; + } - // Texture information has changed and needs an update, delay this to next launch - need_texture_info = true; + CUfilter_mode filter_mode; + if (mem.interpolation == INTERPOLATION_CLOSEST) { + filter_mode = CU_TR_FILTER_MODE_POINT; } else { - // This is not a texture but simple linear memory - check_result_cuda(cuMemAlloc((CUdeviceptr *)&mem.device_pointer, mem.device_size)); + filter_mode = CU_TR_FILTER_MODE_LINEAR; + } + + /* Data Storage */ + if (mem.interpolation == INTERPOLATION_NONE) { + generic_alloc(mem); + generic_copy_to(mem); // Update data storage pointers in launch parameters # define KERNEL_TEX(data_type, tex_name) \ @@ -1301,77 +1359,233 @@ class OptiXDevice : public Device { mem.name, offsetof(KernelParams, tex_name), &mem.device_pointer, sizeof(device_ptr)); # include "kernel/kernel_textures.h" # undef KERNEL_TEX + return; } - stats.mem_alloc(mem.device_size); - } + /* Image Texture Storage */ + CUarray_format_enum format; + switch (mem.data_type) { + case TYPE_UCHAR: + format = CU_AD_FORMAT_UNSIGNED_INT8; + break; + case TYPE_UINT16: + format = CU_AD_FORMAT_UNSIGNED_INT16; + break; + case TYPE_UINT: + format = CU_AD_FORMAT_UNSIGNED_INT32; + break; + case TYPE_INT: + format = CU_AD_FORMAT_SIGNED_INT32; + break; + case TYPE_FLOAT: + format = CU_AD_FORMAT_FLOAT; + break; + case TYPE_HALF: + format = CU_AD_FORMAT_HALF; + break; + default: + assert(0); + return; + } - void mem_copy_to(device_memory &mem) override - { - if (!mem.host_pointer || mem.host_pointer == mem.shared_pointer) - return; - if (!mem.device_pointer) - mem_alloc(mem); // Need to allocate memory first if it does not exist yet + CUDAMem *cmem = NULL; + CUarray array_3d = NULL; + size_t src_pitch = mem.data_width * dsize * mem.data_elements; + size_t dst_pitch = src_pitch; - const CUDAContextScope scope(cuda_context); + if (mem.data_depth > 1) { + /* 3D texture using array, there is no API for linear memory. */ + CUDA_ARRAY3D_DESCRIPTOR desc; - if (mem.type == MEM_TEXTURE && mem.interpolation != INTERPOLATION_NONE) { - const CUDAMem &cmem = cuda_mem_map[&mem]; // Lock and get associated memory information + desc.Width = mem.data_width; + desc.Height = mem.data_height; + desc.Depth = mem.data_depth; + desc.Format = format; + desc.NumChannels = mem.data_elements; + desc.Flags = 0; - size_t src_pitch = mem.data_width * datatype_size(mem.data_type) * mem.data_elements; + VLOG(1) << "Array 3D allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. 
(" + << string_human_readable_size(mem.memory_size()) << ")"; - if (mem.data_depth > 1) { - CUDA_MEMCPY3D param; - memset(¶m, 0, sizeof(param)); - param.dstMemoryType = CU_MEMORYTYPE_ARRAY; - param.dstArray = cmem.array; - param.srcMemoryType = CU_MEMORYTYPE_HOST; - param.srcHost = mem.host_pointer; - param.srcPitch = src_pitch; - param.WidthInBytes = param.srcPitch; - param.Height = mem.data_height; - param.Depth = mem.data_depth; + check_result_cuda(cuArray3DCreate(&array_3d, &desc)); - check_result_cuda(cuMemcpy3D(¶m)); + if (!array_3d) { + return; } - else if (mem.data_height > 0) { - CUDA_MEMCPY2D param; - memset(¶m, 0, sizeof(param)); - param.dstMemoryType = CU_MEMORYTYPE_ARRAY; - param.dstArray = cmem.array; - param.srcMemoryType = CU_MEMORYTYPE_HOST; - param.srcHost = mem.host_pointer; - param.srcPitch = src_pitch; - param.WidthInBytes = param.srcPitch; - param.Height = mem.data_height; - - check_result_cuda(cuMemcpy2D(¶m)); + + CUDA_MEMCPY3D param; + memset(¶m, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_ARRAY; + param.dstArray = array_3d; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + param.Depth = mem.data_depth; + + check_result_cuda(cuMemcpy3D(¶m)); + + mem.device_pointer = (device_ptr)array_3d; + mem.device_size = size; + stats.mem_alloc(size); + + cmem = &cuda_mem_map[&mem]; + cmem->texobject = 0; + cmem->array = array_3d; + } + else if (mem.data_height > 0) { + /* 2D texture, using pitch aligned linear memory. */ + int alignment = 0; + check_result_cuda(cuDeviceGetAttribute( + &alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuda_device)); + dst_pitch = align_up(src_pitch, alignment); + size_t dst_size = dst_pitch * mem.data_height; + + cmem = generic_alloc(mem, dst_size - mem.memory_size()); + if (!cmem) { + return; } - else { - check_result_cuda( - cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.device_size)); + + CUDA_MEMCPY2D param; + memset(¶m, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_DEVICE; + param.dstDevice = mem.device_pointer; + param.dstPitch = dst_pitch; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + + check_result_cuda(cuMemcpy2DUnaligned(¶m)); + } + else { + /* 1D texture, using linear memory. */ + cmem = generic_alloc(mem); + if (!cmem) { + return; } + + check_result_cuda(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size)); + } + + /* Kepler+, bindless textures. 
*/ + int flat_slot = 0; + if (string_startswith(mem.name, "__tex_image")) { + int pos = string(mem.name).rfind("_"); + flat_slot = atoi(mem.name + pos + 1); } else { - // This is not a texture but simple linear memory - check_result_cuda( - cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.device_size)); + assert(0); + } + + CUDA_RESOURCE_DESC resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + + if (array_3d) { + resDesc.resType = CU_RESOURCE_TYPE_ARRAY; + resDesc.res.array.hArray = array_3d; + resDesc.flags = 0; + } + else if (mem.data_height > 0) { + resDesc.resType = CU_RESOURCE_TYPE_PITCH2D; + resDesc.res.pitch2D.devPtr = mem.device_pointer; + resDesc.res.pitch2D.format = format; + resDesc.res.pitch2D.numChannels = mem.data_elements; + resDesc.res.pitch2D.height = mem.data_height; + resDesc.res.pitch2D.width = mem.data_width; + resDesc.res.pitch2D.pitchInBytes = dst_pitch; + } + else { + resDesc.resType = CU_RESOURCE_TYPE_LINEAR; + resDesc.res.linear.devPtr = mem.device_pointer; + resDesc.res.linear.format = format; + resDesc.res.linear.numChannels = mem.data_elements; + resDesc.res.linear.sizeInBytes = mem.device_size; } + + CUDA_TEXTURE_DESC texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.addressMode[0] = address_mode; + texDesc.addressMode[1] = address_mode; + texDesc.addressMode[2] = address_mode; + texDesc.filterMode = filter_mode; + texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; + + check_result_cuda(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL)); + + /* Resize once */ + if (flat_slot >= texture_info.size()) { + /* Allocate some slots in advance, to reduce amount + * of re-allocations. */ + texture_info.resize(flat_slot + 128); + } + + /* Set Mapping and tag that we need to (re-)upload to device */ + TextureInfo &info = texture_info[flat_slot]; + info.data = (uint64_t)cmem->texobject; + info.cl_buffer = 0; + info.interpolation = mem.interpolation; + info.extension = mem.extension; + info.width = mem.data_width; + info.height = mem.data_height; + info.depth = mem.data_depth; + need_texture_info = true; } - void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override + void mem_copy_to(device_memory &mem) override { - // Calculate linear memory offset and size - const size_t size = elem * w * h; - const size_t offset = elem * y * w; + if (mem.type == MEM_PIXELS) { + assert(!"mem_copy_to not supported for pixels."); + } + else if (mem.type == MEM_TEXTURE) { + tex_free(mem); + tex_alloc(mem); + } + else { + if (!mem.device_pointer) { + generic_alloc(mem); + } + generic_copy_to(mem); + } + } + + void generic_copy_to(device_memory &mem) + { if (mem.host_pointer && mem.device_pointer) { - const CUDAContextScope scope(cuda_context); - check_result_cuda(cuMemcpyDtoH( - (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size)); + CUDAContextScope scope(cuda_context); + + if (mem.host_pointer != mem.shared_pointer) { + check_result_cuda( + cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size())); + } + } + } + + void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override + { + if (mem.type == MEM_PIXELS && !background) { + assert(!"mem_copy_from not supported for pixels."); + } + else if (mem.type == MEM_TEXTURE) { + assert(!"mem_copy_from not supported for textures."); } - else if (mem.host_pointer) { - memset((char *)mem.host_pointer + offset, 0, size); + else { + // Calculate linear memory offset and size + const size_t size = elem * w * h; + const 
size_t offset = elem * y * w; + + if (mem.host_pointer && mem.device_pointer) { + const CUDAContextScope scope(cuda_context); + check_result_cuda(cuMemcpyDtoH( + (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size)); + } + else if (mem.host_pointer) { + memset((char *)mem.host_pointer + offset, 0, size); + } } } @@ -1391,30 +1605,145 @@ class OptiXDevice : public Device { void mem_free(device_memory &mem) override { - assert(mem.device_pointer); + if (mem.type == MEM_PIXELS && !background) { + assert(!"mem_free not supported for pixels."); + } + else if (mem.type == MEM_TEXTURE) { + tex_free(mem); + } + else { + generic_free(mem); + } + } - const CUDAContextScope scope(cuda_context); + void generic_free(device_memory &mem) + { + if (mem.device_pointer) { + CUDAContextScope scope(cuda_context); + const CUDAMem &cmem = cuda_mem_map[&mem]; + + if (cmem.map_host_pointer) { + /* Free host memory. */ + if (cmem.free_map_host) { + cuMemFreeHost(cmem.map_host_pointer); + if (mem.host_pointer == mem.shared_pointer) { + mem.host_pointer = 0; + } + mem.shared_pointer = 0; + } - if (mem.type == MEM_TEXTURE && mem.interpolation != INTERPOLATION_NONE) { - CUDAMem &cmem = cuda_mem_map[&mem]; // Lock and get associated memory information + map_host_used -= mem.device_size; + } + else { + /* Free device memory. */ + cuMemFree(mem.device_pointer); + } - if (cmem.array) - cuArrayDestroy(cmem.array); - else - cuMemFree((CUdeviceptr)mem.device_pointer); + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; - if (cmem.texobject) + cuda_mem_map.erase(cuda_mem_map.find(&mem)); + } + } + + void tex_free(device_memory &mem) + { + if (mem.device_pointer) { + CUDAContextScope scope(cuda_context); + const CUDAMem &cmem = cuda_mem_map[&mem]; + + if (cmem.texobject) { + /* Free bindless texture. */ cuTexObjectDestroy(cmem.texobject); + } + + if (cmem.array) { + /* Free array. */ + cuArrayDestroy(cmem.array); + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + cuda_mem_map.erase(cuda_mem_map.find(&mem)); + } + else { + generic_free(mem); + } } - else { - // This is not a texture but simple linear memory - cuMemFree((CUdeviceptr)mem.device_pointer); + } + + void move_textures_to_host(size_t size, bool for_texture) + { + /* Signal to reallocate textures in host memory only. */ + move_texture_to_host = true; + + while (size > 0) { + /* Find suitable memory allocation to move. */ + device_memory *max_mem = NULL; + size_t max_size = 0; + bool max_is_image = false; + + foreach (auto &pair, cuda_mem_map) { + device_memory &mem = *pair.first; + CUDAMem *cmem = &pair.second; + + bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); + + /* Can't move this type of memory. */ + if (!is_texture || cmem->array) { + continue; + } + + /* Already in host memory. */ + if (cmem->map_host_pointer) { + continue; + } + + /* For other textures, only move image textures. */ + if (for_texture && !is_image) { + continue; + } + + /* Try to move largest allocation, prefer moving images. */ + if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) { + max_is_image = is_image; + max_size = mem.device_size; + max_mem = &mem; + } + } + + /* Move to host memory. This part is mutex protected since + * multiple CUDA devices could be moving the memory. The + * first one will do it, and the rest will adopt the pointer. 
*/ + if (max_mem) { + VLOG(1) << "Move memory from device to host: " << max_mem->name; + + static thread_mutex move_mutex; + thread_scoped_lock lock(move_mutex); + + /* Preserve the original device pointer, in case of multi device + * we can't change it because the pointer mapping would break. */ + device_ptr prev_pointer = max_mem->device_pointer; + size_t prev_size = max_mem->device_size; + + tex_free(*max_mem); + tex_alloc(*max_mem); + size = (max_size >= size) ? 0 : size - max_size; + + max_mem->device_pointer = prev_pointer; + max_mem->device_size = prev_size; + } + else { + break; + } } - stats.mem_free(mem.device_size); + /* Update texture info array with new pointers. */ + update_texture_info(); - mem.device_size = 0; - mem.device_pointer = 0; + move_texture_to_host = false; } void const_copy_to(const char *name, void *host, size_t size) override diff --git a/source/blender/windowmanager/intern/wm_splash_screen.c b/source/blender/windowmanager/intern/wm_splash_screen.c index 662238b4ae1..3603ce81654 100644 --- a/source/blender/windowmanager/intern/wm_splash_screen.c +++ b/source/blender/windowmanager/intern/wm_splash_screen.c @@ -209,7 +209,7 @@ static ImBuf *wm_block_splash_image(int r_unit_size[2]) ibuf = IMB_loadiffname(splash_filepath, IB_rect, NULL); /* We could skip this check, see comment about 'x_expect' above. */ - if (ibuf->x != x_expect) { + if (ibuf && ibuf->x != x_expect) { CLOG_ERROR(WM_LOG_OPERATORS, "Splash expected %d width found %d, ignoring: %s\n", x_expect, |
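
Notes on the memory fallback scheme introduced above (illustrative sketches, not part of the patch):

The OptiXDevice constructor limits how much host RAM may ever be pinned for device mapping, mirroring init_host_memory() in device_cuda.cpp: leave roughly 4 GiB to the operating system, and never pin more than half of total RAM. A minimal standalone restatement of that heuristic (the function name is mine, not Cycles API):

#include <cstddef>

/* Sketch of the map_host_limit heuristic from the OptiXDevice constructor.
 * Assumes system_ram is total physical RAM in bytes; returns 0 to disable
 * host-mapped memory when the RAM size is unknown. */
static size_t compute_map_host_limit(size_t system_ram)
{
  const size_t reserved = size_t(4) * 1024 * 1024 * 1024; /* 4 GiB for the OS. */
  if (system_ram == 0) {
    return 0;
  }
  /* Keep the 4 GiB reserve on large machines; otherwise cap at half of RAM. */
  return (system_ram / 2 > reserved) ? system_ram - reserved : system_ram / 2;
}

So on a 32 GiB machine the limit is 28 GiB, while on a 6 GiB machine it is 3 GiB.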
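generic_alloc() then tries device memory first, keeping a headroom free (32 MB for working memory, 128 MB for textures), and falls back to pinned, device-mapped host memory once that fails. A reduced sketch of that ordering against the CUDA driver API, assuming an initialized context and omitting the move_textures_to_host() step and the map_host_used bookkeeping (the helper and struct names are hypothetical):

#include <cuda.h>
#include <cstddef>

struct Allocation {
  CUdeviceptr device_pointer = 0;
  void *host_pointer = nullptr; /* Non-null when the host fallback was used. */
};

/* Hypothetical helper mirroring the order of attempts in generic_alloc(). */
static CUresult alloc_with_host_fallback(size_t size, size_t headroom, Allocation *out)
{
  size_t free_mem = 0, total_mem = 0;
  cuMemGetInfo(&free_mem, &total_mem);

  /* 1) Device memory, but only if the headroom stays free afterwards. */
  if (size + headroom < free_mem && cuMemAlloc(&out->device_pointer, size) == CUDA_SUCCESS) {
    return CUDA_SUCCESS;
  }

  /* 2) Pinned host memory that the device can address directly. */
  CUresult result = cuMemHostAlloc(
      &out->host_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
  if (result != CUDA_SUCCESS) {
    return result; /* Out of both device and host memory. */
  }
  return cuMemHostGetDevicePointer(&out->device_pointer, out->host_pointer, 0);
}

Because the returned device pointer addresses the same pinned pages, kernels keep working after the fallback, only slower; that is why the patch prefers to evict textures rather than working memory.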
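When device memory runs short, move_textures_to_host() chooses what to evict: only textures qualify (never CUDA arrays or allocations already mapped from the host), image textures are preferred over flat data textures, and within each class the largest allocation goes first, looping until enough bytes are freed. The selection step alone, sketched with simplified stand-in types:

#include <cstddef>
#include <vector>

struct TextureAlloc {
  bool is_image;      /* 2D/3D image texture rather than flat data. */
  size_t device_size; /* Bytes currently held in device memory. */
};

/* Hypothetical restatement of the "largest first, images first" pick. */
static TextureAlloc *pick_texture_to_move(std::vector<TextureAlloc> &candidates)
{
  TextureAlloc *best = nullptr;
  bool best_is_image = false;
  size_t best_size = 0;
  for (TextureAlloc &tex : candidates) {
    /* Prefer images over data textures; break ties by size. */
    if (tex.is_image > best_is_image ||
        (tex.is_image == best_is_image && tex.device_size > best_size)) {
      best = &tex;
      best_is_image = tex.is_image;
      best_size = tex.device_size;
    }
  }
  return best;
}

The chosen texture is then freed and re-allocated while move_texture_to_host is set, which routes tex_alloc() through the host path; the original device pointer and size are restored afterwards so multi-device pointer mappings stay valid.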