Diffstat (limited to 'intern/cycles/device')
 intern/cycles/device/device_cuda.cpp   | 271
 intern/cycles/device/device_memory.cpp |   3
 intern/cycles/device/device_memory.h   |   3
 intern/cycles/device/device_multi.cpp  |  16
 4 files changed, 271 insertions, 22 deletions
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 29aabd3169c..51d9994ee26 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -37,6 +37,7 @@
 # include <cudaGL.h>
 #endif
 #include "util/util_debug.h"
+#include "util/util_foreach.h"
 #include "util/util_logging.h"
 #include "util/util_map.h"
 #include "util/util_md5.h"
@@ -128,6 +129,12 @@ public:
 	CUdevice cuDevice;
 	CUcontext cuContext;
 	CUmodule cuModule, cuFilterModule;
+	size_t device_texture_headroom;
+	size_t device_working_headroom;
+	bool move_texture_to_host;
+	size_t map_host_used;
+	size_t map_host_limit;
+	int can_map_host;
 	int cuDevId;
 	int cuDevArchitecture;
 	bool first_error;
@@ -135,12 +142,15 @@ public:
 	struct CUDAMem {
 		CUDAMem()
-		: texobject(0), array(0) {}
+		: texobject(0), array(0), map_host_pointer(0), free_map_host(false) {}
 
 		CUtexObject texobject;
 		CUarray array;
+		void *map_host_pointer;
+		bool free_map_host;
 	};
-	map<device_memory*, CUDAMem> cuda_mem_map;
+	typedef map<device_memory*, CUDAMem> CUDAMemMap;
+	CUDAMemMap cuda_mem_map;
 
 	struct PixelMem {
 		GLuint cuPBO;
@@ -240,6 +250,13 @@ public:
 
 		need_texture_info = false;
 
+		device_texture_headroom = 0;
+		device_working_headroom = 0;
+		move_texture_to_host = false;
+		map_host_limit = 0;
+		map_host_used = 0;
+		can_map_host = 0;
+
 		/* Intialize CUDA. */
 		if(cuda_error(cuInit(0)))
 			return;
@@ -248,9 +265,16 @@ public:
 		if(cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
 			return;
 
-		/* CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
+		/* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
+		 * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
 		 * so we can predict which memory to map to host. */
+		cuda_assert(cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
+
+		unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
+		if(can_map_host) {
+			ctx_flags |= CU_CTX_MAP_HOST;
+			init_host_memory();
+		}
 
 		/* Create context. */
 		CUresult result;
@@ -611,6 +635,50 @@ public:
 		VLOG(1) << "Local memory reserved "
 		        << string_human_readable_number(free_before - free_after) << " bytes. ("
 		        << string_human_readable_size(free_before - free_after) << ")";
+
+#if 0
+		/* For testing mapped host memory, fill up device memory. */
+		const size_t keep_mb = 1024;
+
+		while(free_after > keep_mb * 1024 * 1024LL) {
+			CUdeviceptr tmp;
+			cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
+			cuMemGetInfo(&free_after, &total);
+		}
+#endif
+	}
+
+	void init_host_memory()
+	{
+		/* Limit amount of host mapped memory, because allocating too much can
+		 * cause system instability. Leave at least half or 4 GB of system
+		 * memory free, whichever is smaller. */
+		size_t default_limit = 4 * 1024 * 1024 * 1024LL;
+		size_t system_ram = system_physical_ram();
+
+		if(system_ram > 0) {
+			if(system_ram / 2 > default_limit) {
+				map_host_limit = system_ram - default_limit;
+			}
+			else {
+				map_host_limit = system_ram / 2;
+			}
+		}
+		else {
+			VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
+			map_host_limit = 0;
+		}
+
+		/* Amount of device memory to keep free after texture memory
+		 * and working memory allocations respectively. We set the working
+		 * memory limit headroom lower so that some space is left after all
+		 * texture memory allocations. */
+		device_working_headroom = 32 * 1024 * 1024LL; // 32MB
+		device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
+
+		VLOG(1) << "Mapped host memory limit set to "
+		        << string_human_readable_number(map_host_limit) << " bytes. ("
+		        << string_human_readable_size(map_host_limit) << ")";
+	}
 
 	void load_texture_info()
@@ -621,20 +689,167 @@ public:
 		}
 	}
 
-	CUDAMem *generic_alloc(device_memory& mem, size_t padding = 0)
+	void move_textures_to_host(size_t size, bool for_texture)
+	{
+		/* Signal to reallocate textures in host memory only. */
+		move_texture_to_host = true;
+
+		while(size > 0) {
+			/* Find suitable memory allocation to move. */
+			device_memory *max_mem = NULL;
+			size_t max_size = 0;
+			bool max_is_image = false;
+
+			foreach(CUDAMemMap::value_type& pair, cuda_mem_map) {
+				device_memory& mem = *pair.first;
+				CUDAMem *cmem = &pair.second;
+
+				bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
+				bool is_image = is_texture && (mem.data_height > 1);
+
+				/* Can't move this type of memory. */
+				if(!is_texture || cmem->array) {
+					continue;
+				}
+
+				/* Already in host memory. */
+				if(cmem->map_host_pointer) {
+					continue;
+				}
+
+				/* For other textures, only move image textures. */
+				if(for_texture && !is_image) {
+					continue;
+				}
+
+				/* Try to move largest allocation, prefer moving images. */
+				if(is_image > max_is_image ||
+				   (is_image == max_is_image && mem.device_size > max_size)) {
+					max_is_image = is_image;
+					max_size = mem.device_size;
+					max_mem = &mem;
+				}
+			}
+
+			/* Move to host memory. This part is mutex protected since
+			 * multiple CUDA devices could be moving the memory. The
+			 * first one will do it, and the rest will adopt the pointer. */
+			if(max_mem) {
+				VLOG(1) << "Move memory from device to host: " << max_mem->name;
+
+				static thread_mutex move_mutex;
+				thread_scoped_lock lock(move_mutex);
+
+				/* Preserve the original device pointer, in case of multi device
+				 * we can't change it because the pointer mapping would break. */
+				device_ptr prev_pointer = max_mem->device_pointer;
+				size_t prev_size = max_mem->device_size;
+
+				tex_free(*max_mem);
+				tex_alloc(*max_mem);
+				size = (max_size >= size)? 0: size - max_size;
+
+				max_mem->device_pointer = prev_pointer;
+				max_mem->device_size = prev_size;
+			}
+			else {
+				break;
+			}
+		}
+
+		/* Update texture info array with new pointers. */
+		load_texture_info();
+
+		move_texture_to_host = false;
+	}
+
+	CUDAMem *generic_alloc(device_memory& mem, size_t pitch_padding = 0)
 	{
 		CUDAContextScope scope(this);
 
+		CUdeviceptr device_pointer = 0;
+		size_t size = mem.memory_size() + pitch_padding;
+
+		CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
+		const char *status = "";
+
+		/* First try allocating in device memory, respecting headroom. We make
+		 * an exception for texture info. It is small and frequently accessed,
+		 * so treat it as working memory.
+		 *
+		 * If there is not enough room for working memory, we will try to move
+		 * textures to host memory, assuming the performance impact would have
+		 * been worse for working memory. */
+		bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
+		bool is_image = is_texture && (mem.data_height > 1);
+
+		size_t headroom = (is_texture)? device_texture_headroom:
+		                                device_working_headroom;
+
+		size_t total = 0, free = 0;
+		cuMemGetInfo(&free, &total);
+
+		/* Move textures to host memory if needed. */
+		if(!move_texture_to_host && !is_image && (size + headroom) >= free) {
+			move_textures_to_host(size + headroom - free, is_texture);
+			cuMemGetInfo(&free, &total);
+		}
+
+		/* Allocate in device memory. */
+		if(!move_texture_to_host && (size + headroom) < free) {
+			mem_alloc_result = cuMemAlloc(&device_pointer, size);
+			if(mem_alloc_result == CUDA_SUCCESS) {
+				status = " in device memory";
+			}
+		}
+
+		/* Fall back to mapped host memory if needed and possible. */
+		void *map_host_pointer = 0;
+		bool free_map_host = false;
+
+		if(mem_alloc_result != CUDA_SUCCESS && can_map_host &&
+		   map_host_used + size < map_host_limit) {
+			if(mem.shared_pointer) {
+				/* Another device already allocated host memory. */
+				mem_alloc_result = CUDA_SUCCESS;
+				map_host_pointer = mem.shared_pointer;
+			}
+			else {
+				/* Allocate host memory ourselves. */
+				mem_alloc_result = cuMemHostAlloc(&map_host_pointer, size,
+				                                  CU_MEMHOSTALLOC_DEVICEMAP |
+				                                  CU_MEMHOSTALLOC_WRITECOMBINED);
+				mem.shared_pointer = map_host_pointer;
+				free_map_host = true;
+			}
+
+			if(mem_alloc_result == CUDA_SUCCESS) {
+				cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, mem.shared_pointer, 0));
+				map_host_used += size;
+				status = " in host memory";
+
+				/* Replace host pointer with our host allocation. Only works if
+				 * CUDA memory layout is the same and has no pitch padding. */
+				if(pitch_padding == 0 && mem.host_pointer &&
+				   mem.host_pointer != mem.shared_pointer) {
+					memcpy(mem.shared_pointer, mem.host_pointer, size);
+					mem.host_free();
+					mem.host_pointer = mem.shared_pointer;
+				}
+			}
+		}
+
+		if(mem_alloc_result != CUDA_SUCCESS) {
+			cuda_assert(mem_alloc_result);
+			status = " failed, out of memory";
+		}
+
 		if(mem.name) {
 			VLOG(1) << "Buffer allocate: " << mem.name << ", "
 			        << string_human_readable_number(mem.memory_size()) << " bytes. ("
-			        << string_human_readable_size(mem.memory_size()) << ")";
+			        << string_human_readable_size(mem.memory_size()) << ")"
+			        << status;
 		}
 
-		/* Allocate memory on device. */
-		CUdeviceptr device_pointer = 0;
-		size_t size = mem.memory_size();
-		cuda_assert(cuMemAlloc(&device_pointer, size + padding));
-
 		mem.device_pointer = (device_ptr)device_pointer;
 		mem.device_size = size;
 		stats.mem_alloc(size);
@@ -645,6 +860,8 @@ public:
 
 		/* Insert into map of allocations. */
 		CUDAMem *cmem = &cuda_mem_map[&mem];
+		cmem->map_host_pointer = map_host_pointer;
+		cmem->free_map_host = free_map_host;
 
 		return cmem;
 	}
@@ -652,7 +869,12 @@ public:
 	{
 		if(mem.host_pointer && mem.device_pointer) {
 			CUDAContextScope scope(this);
-			cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), mem.host_pointer, mem.memory_size()));
+
+			if(mem.host_pointer != mem.shared_pointer) {
+				cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer),
+				                         mem.host_pointer,
+				                         mem.memory_size()));
+			}
 		}
 	}
 
@@ -660,8 +882,24 @@ public:
 	{
 		if(mem.device_pointer) {
 			CUDAContextScope scope(this);
+			const CUDAMem& cmem = cuda_mem_map[&mem];
 
-			cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)));
+			if(cmem.map_host_pointer) {
+				/* Free host memory. */
+				if(cmem.free_map_host) {
+					cuMemFreeHost(cmem.map_host_pointer);
+					if(mem.host_pointer == mem.shared_pointer) {
+						mem.host_pointer = 0;
+					}
+					mem.shared_pointer = 0;
+				}
+
+				map_host_used -= mem.device_size;
+			}
+			else {
+				/* Free device memory. */
+				cuMemFree(mem.device_pointer);
+			}
 
 			stats.mem_free(mem.device_size);
 			mem.device_pointer = 0;
@@ -735,7 +973,8 @@ public:
 			memset(mem.host_pointer, 0, mem.memory_size());
 		}
 
-		if(mem.device_pointer) {
+		if(mem.device_pointer &&
+		   (!mem.host_pointer || mem.host_pointer != mem.shared_pointer)) {
 			CUDAContextScope scope(this);
 			cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()));
 		}
@@ -774,10 +1013,6 @@ public:
 	{
 		CUDAContextScope scope(this);
 
-		VLOG(1) << "Texture allocate: " << mem.name << ", "
-		        << string_human_readable_number(mem.memory_size()) << " bytes. ("
-		        << string_human_readable_size(mem.memory_size()) << ")";
-
 		/* Check if we are on sm_30 or above, for bindless textures. */
 		bool has_fermi_limits = info.has_fermi_limits;
@@ -881,6 +1116,10 @@ public:
 		desc.NumChannels = mem.data_elements;
 		desc.Flags = 0;
 
+		VLOG(1) << "Array 3D allocate: " << mem.name << ", "
+		        << string_human_readable_number(mem.memory_size()) << " bytes. ("
+		        << string_human_readable_size(mem.memory_size()) << ")";
+
 		cuda_assert(cuArray3DCreate(&array_3d, &desc));
 
 		if(!array_3d) {
diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp
index b5db76bb3df..82598007a59 100644
--- a/intern/cycles/device/device_memory.cpp
+++ b/intern/cycles/device/device_memory.cpp
@@ -35,7 +35,8 @@ device_memory::device_memory(Device *device, const char *name, MemoryType type)
   extension(EXTENSION_REPEAT),
   device(device),
   device_pointer(0),
-  host_pointer(0)
+  host_pointer(0),
+  shared_pointer(0)
 {
 }
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 453dab9bfb3..2a027917066 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -197,10 +197,13 @@ public:
 	Device *device;
 	device_ptr device_pointer;
 	void *host_pointer;
+	void *shared_pointer;
 
 	virtual ~device_memory();
 
 protected:
+	friend class CUDADevice;
+
 	/* Only create through subclasses. */
 	device_memory(Device *device, const char *name, MemoryType type);
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index fd28a9d6188..3a4c08b6eb2 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -48,11 +48,17 @@ public:
 	MultiDevice(DeviceInfo& info, Stats &stats, bool background_)
 	: Device(info, stats, background_), unique_key(1)
 	{
-		Device *device;
-
 		foreach(DeviceInfo& subinfo, info.multi_devices) {
-			device = Device::create(subinfo, sub_stats_, background);
-			devices.push_back(SubDevice(device));
+			Device *device = Device::create(subinfo, sub_stats_, background);
+
+			/* Always add CPU devices at the back since GPU devices can change
+			 * host memory pointers, which CPU uses as device pointer. */
+			if(subinfo.type == DEVICE_CPU) {
+				devices.push_back(SubDevice(device));
+			}
+			else {
+				devices.push_front(SubDevice(device));
+			}
 		}
 
 #ifdef WITH_NETWORK
@@ -63,7 +69,7 @@ public:
 		vector<string> servers = discovery.get_server_list();
 
 		foreach(string& server, servers) {
-			device = device_network_create(info, stats, server.c_str());
+			Device *device = device_network_create(info, stats, server.c_str());
 			if(device)
 				devices.push_back(SubDevice(device));
 		}
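
The limit set in init_host_memory() follows a simple policy: leave at least half of system RAM or 4 GB free, whichever is smaller. A compact equivalent of that computation is sketched below; the helper name is hypothetical and not part of the patch.

	/* Equivalent to the init_host_memory() limit computation; illustrative only. */
	static size_t mapped_host_limit(size_t system_ram)
	{
		const size_t reserve_cap = 4 * 1024 * 1024 * 1024LL;

		if(system_ram == 0) {
			return 0;  /* Unknown RAM size, mapped host memory disabled. */
		}

		/* Leave at least half of RAM or 4 GB free, whichever is smaller. */
		return (system_ram / 2 > reserve_cap)? system_ram - reserve_cap:
		                                       system_ram / 2;
	}

For example, a machine with 16 GB of RAM gets a 12 GB limit, while a 6 GB machine gets 3 GB.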
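The core pattern in generic_alloc() is: allocate with cuMemAlloc() only while a headroom of device memory stays free, otherwise fall back to pinned host memory that the GPU can address directly. The standalone sketch below shows just that driver-API pattern; it is illustrative rather than Cycles code (the function name is made up, error handling is reduced, and the shared_pointer reuse and texture eviction from the patch are omitted). It assumes a context created with CU_CTX_MAP_HOST on a device that reports CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY.

	#include <cuda.h>

	/* Illustrative sketch, not part of the patch. */
	static CUresult alloc_with_host_fallback(size_t size, size_t headroom,
	                                         CUdeviceptr *device_pointer,
	                                         void **map_host_pointer)
	{
		*map_host_pointer = NULL;

		/* Allocate in device memory only if the headroom stays free afterwards. */
		size_t free_mem = 0, total_mem = 0;
		cuMemGetInfo(&free_mem, &total_mem);

		if(size + headroom < free_mem &&
		   cuMemAlloc(device_pointer, size) == CUDA_SUCCESS) {
			return CUDA_SUCCESS;
		}

		/* Fall back to pinned, device-mappable host memory. Write-combined
		 * memory is fast for the GPU to read, slow for the CPU to read back. */
		CUresult result = cuMemHostAlloc(map_host_pointer, size,
		                                 CU_MEMHOSTALLOC_DEVICEMAP |
		                                 CU_MEMHOSTALLOC_WRITECOMBINED);
		if(result != CUDA_SUCCESS) {
			return result;  /* Out of both device and host memory. */
		}

		/* The device-side address of the mapping may differ from the host address. */
		return cuMemHostGetDevicePointer(device_pointer, *map_host_pointer, 0);
	}

Kernels then use the returned CUdeviceptr as usual, with reads going over the PCIe bus, which is why the patch prefers to evict large image textures first and keeps working memory on the device.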