Diffstat (limited to 'intern/cycles/device')
 intern/cycles/device/device_cuda.cpp   | 271
 intern/cycles/device/device_memory.cpp |   3
 intern/cycles/device/device_memory.h   |   3
 intern/cycles/device/device_multi.cpp  |  16
 4 files changed, 271 insertions, 22 deletions
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 29aabd3169c..51d9994ee26 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -37,6 +37,7 @@
 # include <cudaGL.h>
 #endif
 #include "util/util_debug.h"
+#include "util/util_foreach.h"
 #include "util/util_logging.h"
 #include "util/util_map.h"
 #include "util/util_md5.h"
@@ -128,6 +129,12 @@ public:
 	CUdevice cuDevice;
 	CUcontext cuContext;
 	CUmodule cuModule, cuFilterModule;
+	size_t device_texture_headroom;
+	size_t device_working_headroom;
+	bool move_texture_to_host;
+	size_t map_host_used;
+	size_t map_host_limit;
+	int can_map_host;
 	int cuDevId;
 	int cuDevArchitecture;
 	bool first_error;
@@ -135,12 +142,15 @@ public:
 	struct CUDAMem {
 		CUDAMem()
-		: texobject(0), array(0) {}
+		: texobject(0), array(0), map_host_pointer(0), free_map_host(false) {}
 
 		CUtexObject texobject;
 		CUarray array;
+		void *map_host_pointer;
+		bool free_map_host;
 	};
-	map<device_memory*, CUDAMem> cuda_mem_map;
+	typedef map<device_memory*, CUDAMem> CUDAMemMap;
+	CUDAMemMap cuda_mem_map;
 
 	struct PixelMem {
 		GLuint cuPBO;
@@ -240,6 +250,13 @@ public:
 
 		need_texture_info = false;
 
+		device_texture_headroom = 0;
+		device_working_headroom = 0;
+		move_texture_to_host = false;
+		map_host_limit = 0;
+		map_host_used = 0;
+		can_map_host = 0;
+
 		/* Intialize CUDA. */
 		if(cuda_error(cuInit(0)))
 			return;
@@ -248,9 +265,16 @@ public:
 		if(cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
 			return;
 
-		/* CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
+		/* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
+		 * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
 		 * so we can predict which memory to map to host. */
+		cuda_assert(cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
+
+		unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
+		if(can_map_host) {
+			ctx_flags |= CU_CTX_MAP_HOST;
+			init_host_memory();
+		}
 
 		/* Create context. */
 		CUresult result;
@@ -611,6 +635,50 @@ public:
 		VLOG(1) << "Local memory reserved "
 		        << string_human_readable_number(free_before - free_after) << " bytes. ("
 		        << string_human_readable_size(free_before - free_after) << ")";
+
+#if 0
+		/* For testing mapped host memory, fill up device memory. */
+		const size_t keep_mb = 1024;
+
+		while(free_after > keep_mb * 1024 * 1024LL) {
+			CUdeviceptr tmp;
+			cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
+			cuMemGetInfo(&free_after, &total);
+		}
+#endif
+	}
+
+	void init_host_memory()
+	{
+		/* Limit amount of host mapped memory, because allocating too much can
+		 * cause system instability. Leave at least half or 4 GB of system
+		 * memory free, whichever is smaller. */
+		size_t default_limit = 4 * 1024 * 1024 * 1024LL;
+		size_t system_ram = system_physical_ram();
+
+		if(system_ram > 0) {
+			if(system_ram / 2 > default_limit) {
+				map_host_limit = system_ram - default_limit;
+			}
+			else {
+				map_host_limit = system_ram / 2;
+			}
+		}
+		else {
+			VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
+			map_host_limit = 0;
+		}
+
+		/* Amount of device memory to keep free after texture memory
+		 * and working memory allocations respectively. We set the working
+		 * memory limit headroom lower so that some space is left after all
+		 * texture memory allocations. */
+		device_working_headroom = 32 * 1024 * 1024LL; // 32MB
+		device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
+
+		VLOG(1) << "Mapped host memory limit set to "
+		        << string_human_readable_number(map_host_limit) << " bytes. ("
+		        << string_human_readable_size(map_host_limit) << ")";
+	}
 
 	void load_texture_info()
@@ -621,20 +689,167 @@ public:
 		}
 	}
 
-	CUDAMem *generic_alloc(device_memory& mem, size_t padding = 0)
+	void move_textures_to_host(size_t size, bool for_texture)
+	{
+		/* Signal to reallocate textures in host memory only. */
+		move_texture_to_host = true;
+
+		while(size > 0) {
+			/* Find suitable memory allocation to move. */
+			device_memory *max_mem = NULL;
+			size_t max_size = 0;
+			bool max_is_image = false;
+
+			foreach(CUDAMemMap::value_type& pair, cuda_mem_map) {
+				device_memory& mem = *pair.first;
+				CUDAMem *cmem = &pair.second;
+
+				bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
+				bool is_image = is_texture && (mem.data_height > 1);
+
+				/* Can't move this type of memory. */
+				if(!is_texture || cmem->array) {
+					continue;
+				}
+
+				/* Already in host memory. */
+				if(cmem->map_host_pointer) {
+					continue;
+				}
+
+				/* For other textures, only move image textures. */
+				if(for_texture && !is_image) {
+					continue;
+				}
+
+				/* Try to move largest allocation, prefer moving images. */
+				if(is_image > max_is_image ||
+				   (is_image == max_is_image && mem.device_size > max_size)) {
+					max_is_image = is_image;
+					max_size = mem.device_size;
+					max_mem = &mem;
+				}
+			}
+
+			/* Move to host memory. This part is mutex protected since
+			 * multiple CUDA devices could be moving the memory. The
+			 * first one will do it, and the rest will adopt the pointer. */
+			if(max_mem) {
+				VLOG(1) << "Move memory from device to host: " << max_mem->name;
+
+				static thread_mutex move_mutex;
+				thread_scoped_lock lock(move_mutex);
+
+				/* Preserve the original device pointer, in case of multi device
+				 * we can't change it because the pointer mapping would break. */
+				device_ptr prev_pointer = max_mem->device_pointer;
+				size_t prev_size = max_mem->device_size;
+
+				tex_free(*max_mem);
+				tex_alloc(*max_mem);
+				size = (max_size >= size)? 0: size - max_size;
+
+				max_mem->device_pointer = prev_pointer;
+				max_mem->device_size = prev_size;
+			}
+			else {
+				break;
+			}
+		}
+
+		/* Update texture info array with new pointers. */
+		load_texture_info();
+
+		move_texture_to_host = false;
+	}
+
+	CUDAMem *generic_alloc(device_memory& mem, size_t pitch_padding = 0)
 	{
 		CUDAContextScope scope(this);
 
+		CUdeviceptr device_pointer = 0;
+		size_t size = mem.memory_size() + pitch_padding;
+
+		CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
+		const char *status = "";
+
+		/* First try allocating in device memory, respecting headroom. We make
+		 * an exception for texture info. It is small and frequently accessed,
+		 * so treat it as working memory.
+		 *
+		 * If there is not enough room for working memory, we will try to move
+		 * textures to host memory, assuming the performance impact would have
+		 * been worse for working memory. */
+		bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
+		bool is_image = is_texture && (mem.data_height > 1);
+
+		size_t headroom = (is_texture)? device_texture_headroom:
+		                                device_working_headroom;
+
+		size_t total = 0, free = 0;
+		cuMemGetInfo(&free, &total);
+
+		/* Move textures to host memory if needed. */
+		if(!move_texture_to_host && !is_image && (size + headroom) >= free) {
+			move_textures_to_host(size + headroom - free, is_texture);
+			cuMemGetInfo(&free, &total);
+		}
+
+		/* Allocate in device memory. */
+		if(!move_texture_to_host && (size + headroom) < free) {
+			mem_alloc_result = cuMemAlloc(&device_pointer, size);
+			if(mem_alloc_result == CUDA_SUCCESS) {
+				status = " in device memory";
+			}
+		}
+
+		/* Fall back to mapped host memory if needed and possible. */
+		void *map_host_pointer = 0;
+		bool free_map_host = false;
+
+		if(mem_alloc_result != CUDA_SUCCESS && can_map_host &&
+		   map_host_used + size < map_host_limit) {
+			if(mem.shared_pointer) {
+				/* Another device already allocated host memory. */
+				mem_alloc_result = CUDA_SUCCESS;
+				map_host_pointer = mem.shared_pointer;
+			}
+			else {
+				/* Allocate host memory ourselves. */
+				mem_alloc_result = cuMemHostAlloc(&map_host_pointer, size,
+				                                  CU_MEMHOSTALLOC_DEVICEMAP |
+				                                  CU_MEMHOSTALLOC_WRITECOMBINED);
+				mem.shared_pointer = map_host_pointer;
+				free_map_host = true;
+			}
+
+			if(mem_alloc_result == CUDA_SUCCESS) {
+				cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, mem.shared_pointer, 0));
+				map_host_used += size;
+				status = " in host memory";
+
+				/* Replace host pointer with our host allocation. Only works if
+				 * CUDA memory layout is the same and has no pitch padding. */
+				if(pitch_padding == 0 && mem.host_pointer &&
+				   mem.host_pointer != mem.shared_pointer) {
+					memcpy(mem.shared_pointer, mem.host_pointer, size);
+					mem.host_free();
+					mem.host_pointer = mem.shared_pointer;
+				}
+			}
+		}
+
+		if(mem_alloc_result != CUDA_SUCCESS) {
+			cuda_assert(mem_alloc_result);
+			status = " failed, out of memory";
+		}
+
 		if(mem.name) {
 			VLOG(1) << "Buffer allocate: " << mem.name << ", "
 			        << string_human_readable_number(mem.memory_size()) << " bytes. ("
-			        << string_human_readable_size(mem.memory_size()) << ")";
+			        << string_human_readable_size(mem.memory_size()) << ")"
+			        << status;
 		}
 
-		/* Allocate memory on device. */
-		CUdeviceptr device_pointer = 0;
-		size_t size = mem.memory_size();
-		cuda_assert(cuMemAlloc(&device_pointer, size + padding));
-
 		mem.device_pointer = (device_ptr)device_pointer;
 		mem.device_size = size;
 		stats.mem_alloc(size);
@@ -645,6 +860,8 @@ public:
 
 		/* Insert into map of allocations. */
 		CUDAMem *cmem = &cuda_mem_map[&mem];
+		cmem->map_host_pointer = map_host_pointer;
+		cmem->free_map_host = free_map_host;
 
 		return cmem;
 	}
@@ -652,7 +869,12 @@ public:
 	{
 		if(mem.host_pointer && mem.device_pointer) {
 			CUDAContextScope scope(this);
-			cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), mem.host_pointer, mem.memory_size()));
+
+			if(mem.host_pointer != mem.shared_pointer) {
+				cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer),
+				                         mem.host_pointer,
+				                         mem.memory_size()));
+			}
 		}
 	}
 
@@ -660,8 +882,24 @@ public:
 	{
 		if(mem.device_pointer) {
 			CUDAContextScope scope(this);
+			const CUDAMem& cmem = cuda_mem_map[&mem];
 
-			cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)));
+			if(cmem.map_host_pointer) {
+				/* Free host memory. */
+				if(cmem.free_map_host) {
+					cuMemFreeHost(cmem.map_host_pointer);
+					if(mem.host_pointer == mem.shared_pointer) {
+						mem.host_pointer = 0;
+					}
+					mem.shared_pointer = 0;
+				}
+
+				map_host_used -= mem.device_size;
+			}
+			else {
+				/* Free device memory. */
+				cuMemFree(mem.device_pointer);
+			}
 
 			stats.mem_free(mem.device_size);
 			mem.device_pointer = 0;
@@ -735,7 +973,8 @@ public:
 			memset(mem.host_pointer, 0, mem.memory_size());
 		}
 
-		if(mem.device_pointer) {
+		if(mem.device_pointer &&
+		   (!mem.host_pointer || mem.host_pointer != mem.shared_pointer)) {
 			CUDAContextScope scope(this);
 			cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()));
 		}
@@ -774,10 +1013,6 @@ public:
 	{
 		CUDAContextScope scope(this);
 
-		VLOG(1) << "Texture allocate: " << mem.name << ", "
-		        << string_human_readable_number(mem.memory_size()) << " bytes. ("
-		        << string_human_readable_size(mem.memory_size()) << ")";
-
 		/* Check if we are on sm_30 or above, for bindless textures. */
 		bool has_fermi_limits = info.has_fermi_limits;
@@ -881,6 +1116,10 @@ public:
 		desc.NumChannels = mem.data_elements;
 		desc.Flags = 0;
 
+		VLOG(1) << "Array 3D allocate: " << mem.name << ", "
+		        << string_human_readable_number(mem.memory_size()) << " bytes. ("
+		        << string_human_readable_size(mem.memory_size()) << ")";
+
 		cuda_assert(cuArray3DCreate(&array_3d, &desc));
 
 		if(!array_3d) {
diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp
index b5db76bb3df..82598007a59 100644
--- a/intern/cycles/device/device_memory.cpp
+++ b/intern/cycles/device/device_memory.cpp
@@ -35,7 +35,8 @@ device_memory::device_memory(Device *device, const char *name, MemoryType type)
   extension(EXTENSION_REPEAT),
   device(device),
   device_pointer(0),
-  host_pointer(0)
+  host_pointer(0),
+  shared_pointer(0)
 {
 }
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 453dab9bfb3..2a027917066 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -197,10 +197,13 @@ public:
 	Device *device;
 	device_ptr device_pointer;
 	void *host_pointer;
+	void *shared_pointer;
 
 	virtual ~device_memory();
 
 protected:
+	friend class CUDADevice;
+
 	/* Only create through subclasses. */
 	device_memory(Device *device, const char *name, MemoryType type);
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index fd28a9d6188..3a4c08b6eb2 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -48,11 +48,17 @@ public:
 	MultiDevice(DeviceInfo& info, Stats &stats, bool background_)
 	: Device(info, stats, background_), unique_key(1)
 	{
-		Device *device;
-
 		foreach(DeviceInfo& subinfo, info.multi_devices) {
-			device = Device::create(subinfo, sub_stats_, background);
-			devices.push_back(SubDevice(device));
+			Device *device = Device::create(subinfo, sub_stats_, background);
+
+			/* Always add CPU devices at the back since GPU devices can change
+			 * host memory pointers, which CPU uses as device pointer. */
+			if(subinfo.type == DEVICE_CPU) {
+				devices.push_back(SubDevice(device));
+			}
+			else {
+				devices.push_front(SubDevice(device));
+			}
 		}
 
 #ifdef WITH_NETWORK
@@ -63,7 +69,7 @@ public:
 		vector<string> servers = discovery.get_server_list();
 
 		foreach(string& server, servers) {
-			device = device_network_create(info, stats, server.c_str());
+			Device *device = device_network_create(info, stats, server.c_str());
 			if(device)
 				devices.push_back(SubDevice(device));
 		}
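
The limit set in init_host_memory() follows a simple policy: leave at least half of system RAM or 4 GB free, whichever is smaller. A compact equivalent of that computation is sketched below; the helper name is hypothetical and not part of the patch.

	/* Equivalent to the init_host_memory() limit computation; illustrative only. */
	static size_t mapped_host_limit(size_t system_ram)
	{
		const size_t reserve_cap = 4 * 1024 * 1024 * 1024LL;

		if(system_ram == 0) {
			return 0;  /* Unknown RAM size, mapped host memory disabled. */
		}

		/* Leave at least half of RAM or 4 GB free, whichever is smaller. */
		return (system_ram / 2 > reserve_cap)? system_ram - reserve_cap:
		                                       system_ram / 2;
	}

For example, a machine with 16 GB of RAM gets a 12 GB limit, while a 6 GB machine gets 3 GB.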
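The core pattern in generic_alloc() is: allocate with cuMemAlloc() only while a headroom of device memory stays free, otherwise fall back to pinned host memory that the GPU can address directly. The standalone sketch below shows just that driver-API pattern; it is illustrative rather than Cycles code (the function name is made up, error handling is reduced, and the shared_pointer reuse and texture eviction from the patch are omitted). It assumes a context created with CU_CTX_MAP_HOST on a device that reports CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY.

	#include <cuda.h>

	/* Illustrative sketch, not part of the patch. */
	static CUresult alloc_with_host_fallback(size_t size, size_t headroom,
	                                         CUdeviceptr *device_pointer,
	                                         void **map_host_pointer)
	{
		*map_host_pointer = NULL;

		/* Allocate in device memory only if the headroom stays free afterwards. */
		size_t free_mem = 0, total_mem = 0;
		cuMemGetInfo(&free_mem, &total_mem);

		if(size + headroom < free_mem &&
		   cuMemAlloc(device_pointer, size) == CUDA_SUCCESS) {
			return CUDA_SUCCESS;
		}

		/* Fall back to pinned, device-mappable host memory. Write-combined
		 * memory is fast for the GPU to read, slow for the CPU to read back. */
		CUresult result = cuMemHostAlloc(map_host_pointer, size,
		                                 CU_MEMHOSTALLOC_DEVICEMAP |
		                                 CU_MEMHOSTALLOC_WRITECOMBINED);
		if(result != CUDA_SUCCESS) {
			return result;  /* Out of both device and host memory. */
		}

		/* The device-side address of the mapping may differ from the host address. */
		return cuMemHostGetDevicePointer(device_pointer, *map_host_pointer, 0);
	}

Kernels then use the returned CUdeviceptr as usual, with reads going over the PCIe bus, which is why the patch prefers to evict large image textures first and keeps working memory on the device.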