From 7ad9333fad25b9a7cabea0d659eaf724f89912c8 Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brecht@blender.org>
Date: Fri, 20 Oct 2017 23:31:13 +0200
Subject: Code refactor: store device/interp/extension/type in each device_memory.

---
 intern/cycles/device/device_cuda.cpp | 58 +++++++++++++++++-------------------
 1 file changed, 28 insertions(+), 30 deletions(-)

(limited to 'intern/cycles/device/device_cuda.cpp')

diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 0f17b67c8c6..1295ec86355 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -217,7 +217,8 @@ public:
 	}
 
 	CUDADevice(DeviceInfo& info, Stats &stats, bool background_)
-	: Device(info, stats, background_)
+	: Device(info, stats, background_),
+	  texture_info(this, "__texture_info")
 	{
 		first_error = true;
 		background = background_;
@@ -548,17 +549,17 @@ public:
 	{
 		if(info.has_bindless_textures && need_texture_info) {
 			tex_free(texture_info);
-			tex_alloc("__texture_info", texture_info, INTERPOLATION_NONE, EXTENSION_REPEAT);
+			tex_alloc(texture_info);
 			need_texture_info = false;
 		}
 	}
 
-	void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/)
+	void mem_alloc(device_memory& mem)
 	{
 		CUDAContextScope scope(this);
 
-		if(name) {
-			VLOG(1) << "Buffer allocate: " << name << ", "
+		if(mem.name) {
+			VLOG(1) << "Buffer allocate: " << mem.name << ", "
 			        << string_human_readable_number(mem.memory_size()) << " bytes. ("
 			        << string_human_readable_size(mem.memory_size()) << ")";
 		}
@@ -619,7 +620,7 @@ public:
 		}
 	}
 
-	virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/)
+	virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/)
 	{
 		return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset));
 	}
@@ -635,14 +636,11 @@ public:
 		cuda_assert(cuMemcpyHtoD(mem, host, size));
 	}
 
-	void tex_alloc(const char *name,
-	               device_memory& mem,
-	               InterpolationType interpolation,
-	               ExtensionType extension)
+	void tex_alloc(device_memory& mem)
 	{
 		CUDAContextScope scope(this);
 
-		VLOG(1) << "Texture allocate: " << name << ", "
+		VLOG(1) << "Texture allocate: " << mem.name << ", "
 		        << string_human_readable_number(mem.memory_size()) << " bytes. ("
 		        << string_human_readable_size(mem.memory_size()) << ")";
 
@@ -650,12 +648,12 @@ public:
 		bool has_bindless_textures = info.has_bindless_textures;
 
 		/* General variables for both architectures */
-		string bind_name = name;
+		string bind_name = mem.name;
 		size_t dsize = datatype_size(mem.data_type);
 		size_t size = mem.memory_size();
 
 		CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
-		switch(extension) {
+		switch(mem.extension) {
 		case EXTENSION_REPEAT:
 			address_mode = CU_TR_ADDRESS_MODE_WRAP;
 			break;
@@ -671,7 +669,7 @@ public:
 		}
 
 		CUfilter_mode filter_mode;
-		if(interpolation == INTERPOLATION_CLOSEST) {
+		if(mem.interpolation == INTERPOLATION_CLOSEST) {
 			filter_mode = CU_TR_FILTER_MODE_POINT;
 		}
 		else {
@@ -681,13 +679,13 @@ public:
 		/* General variables for Fermi */
 		CUtexref texref = NULL;
 
-		if(!has_bindless_textures && interpolation != INTERPOLATION_NONE) {
+		if(!has_bindless_textures && mem.interpolation != INTERPOLATION_NONE) {
 			if(mem.data_depth > 1) {
 				/* Kernel uses different bind names for 2d and 3d float textures,
 				 * so we have to adjust couple of things here.
 				 */
 				vector<string> tokens;
-				string_split(tokens, name, "_");
+				string_split(tokens, mem.name, "_");
 				bind_name = string_printf("__tex_image_%s_3d_%s",
 				                          tokens[2].c_str(),
 				                          tokens[3].c_str());
@@ -700,9 +698,9 @@ public:
 			}
 		}
 
-		if(interpolation == INTERPOLATION_NONE) {
+		if(mem.interpolation == INTERPOLATION_NONE) {
 			/* Data Storage */
-			mem_alloc(NULL, mem, MEM_READ_ONLY);
+			mem_alloc(mem);
 			mem_copy_to(mem);
 
 			CUdeviceptr cumem;
@@ -802,9 +800,9 @@ public:
 		if(has_bindless_textures) {
 			/* Bindless Textures - Kepler */
 			int flat_slot = 0;
-			if(string_startswith(name, "__tex_image")) {
-				int pos = string(name).rfind("_");
-				flat_slot = atoi(name + pos + 1);
+			if(string_startswith(mem.name, "__tex_image")) {
+				int pos = string(mem.name).rfind("_");
+				flat_slot = atoi(mem.name + pos + 1);
 			}
 			else {
 				assert(0);
@@ -843,8 +841,8 @@ public:
 			TextureInfo& info = texture_info[flat_slot];
 			info.data = (uint64_t)tex;
 			info.cl_buffer = 0;
-			info.interpolation = interpolation;
-			info.extension = extension;
+			info.interpolation = mem.interpolation;
+			info.extension = mem.extension;
 			info.width = mem.data_width;
 			info.height = mem.data_height;
 			info.depth = mem.data_depth;
@@ -869,7 +867,7 @@ public:
 		}
 
 		/* Fermi and Kepler */
-		tex_interp_map[mem.device_pointer] = (interpolation != INTERPOLATION_NONE);
+		tex_interp_map[mem.device_pointer] = (mem.interpolation != INTERPOLATION_NONE);
 	}
 
 	void tex_free(device_memory& mem)
@@ -900,7 +898,7 @@ public:
 
 	bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task)
 	{
-		mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY);
+		mem_alloc(task->tiles_mem);
 
 		TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer;
 		for(int i = 0; i < 9; i++) {
@@ -1297,7 +1295,7 @@ public:
 		cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
 
 		/* Allocate work tile. */
-		device_vector<WorkTile> work_tiles;
+		device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
 		work_tiles.resize(1);
 
 		WorkTile *wtile = work_tiles.get_data();
@@ -1308,7 +1306,7 @@ public:
 		wtile->offset = rtile.offset;
 		wtile->stride = rtile.stride;
 		wtile->buffer = (float*)cuda_device_ptr(rtile.buffer);
-		mem_alloc("work_tiles", work_tiles, MEM_READ_ONLY);
+		mem_alloc(work_tiles);
 
 		CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer);
 
@@ -1730,7 +1728,7 @@ public:
 		while(task->acquire_tile(this, tile)) {
 			if(tile.task == RenderTile::PATH_TRACE) {
 				if(use_split_kernel()) {
-					device_memory void_buffer;
+					device_memory void_buffer(this, "void_buffer", MEM_READ_ONLY);
 					split_kernel->path_trace(task, tile, void_buffer, void_buffer);
 				}
 				else {
@@ -1885,9 +1883,9 @@ uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory
 {
 	CUDAContextScope scope(device);
 
-	device_vector<uint64_t> size_buffer;
+	device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
 	size_buffer.resize(1);
-	device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
+	device->mem_alloc(size_buffer);
 
 	uint threads = num_threads;
 	CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer);
-- 
cgit v1.2.3
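
The effect of this first refactor is easiest to see from the call sites: metadata that used to be passed into every device call now travels with the allocation itself. Below is a minimal sketch of that idea in isolation, not the actual Cycles headers; the enum values and member names beyond those visible in the diff are assumptions for illustration.

enum MemoryType { MEM_READ_ONLY, MEM_READ_WRITE };
enum InterpolationType { INTERPOLATION_NONE, INTERPOLATION_CLOSEST, INTERPOLATION_LINEAR };
enum ExtensionType { EXTENSION_REPEAT, EXTENSION_EXTEND, EXTENSION_CLIP };

/* Hypothetical shape of the refactored class, for illustration only. */
struct device_memory {
	const char *name;                /* appears in the VLOG allocation messages */
	MemoryType type;                 /* replaces the MemoryType parameter of mem_alloc() */
	InterpolationType interpolation; /* replaces the InterpolationType parameter of tex_alloc() */
	ExtensionType extension;         /* likewise consulted inside tex_alloc() */
	/* data pointers, sizes and data_type elided */
};

/* Before: tex_alloc("__texture_info", texture_info, INTERPOLATION_NONE, EXTENSION_REPEAT);
 * After:  tex_alloc(texture_info);  -- the object carries its own metadata. */
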
From aa8b4c5d8124c0379eeee9eacd1a0887a573d7d7 Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brecht@blender.org>
Date: Mon, 23 Oct 2017 19:32:59 +0200
Subject: Code refactor: use device_only_memory and device_vector in more places.

---
 intern/cycles/device/device_cuda.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'intern/cycles/device/device_cuda.cpp')

diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 1295ec86355..be606a92434 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -1728,7 +1728,7 @@ public:
 		while(task->acquire_tile(this, tile)) {
 			if(tile.task == RenderTile::PATH_TRACE) {
 				if(use_split_kernel()) {
-					device_memory void_buffer(this, "void_buffer", MEM_READ_ONLY);
+					device_only_memory<float> void_buffer(this, "void_buffer");
 					split_kernel->path_trace(task, tile, void_buffer, void_buffer);
 				}
 				else {
-- 
cgit v1.2.3
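
The one-line change above swaps an untyped placeholder for a typed, device-resident buffer. As the names read from these diffs, device_memory is the untyped base, device_vector<T> is a typed buffer with host-side storage that can be mirrored to the GPU, and device_only_memory<T> is a typed buffer that never needs a host copy, which is all the split kernel's dummy argument requires. A short sketch of the distinction, with constructors taken from the call sites in this series (only the call pattern is shown, not a standalone program):

/* Device-only: typed, no host storage ever attached. */
device_only_memory<float> void_buffer(this, "void_buffer");

/* Host-backed: filled on the CPU, then copied across; see the next commit
 * for the alloc()/copy_to_device() calls that complete the pattern. */
device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
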
(" + << string_human_readable_size(mem.memory_size()) << ")"; } CUdeviceptr device_pointer; @@ -572,31 +571,88 @@ public: stats.mem_alloc(size); } + void generic_copy_to(device_memory& mem) + { + if(mem.device_pointer) { + CUDAContextScope scope(this); + cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size())); + } + } + + void generic_free(device_memory& mem) + { + if(mem.device_pointer) { + CUDAContextScope scope(this); + + cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer))); + + mem.device_pointer = 0; + + stats.mem_free(mem.device_size); + mem.device_size = 0; + } + } + + void mem_alloc(device_memory& mem) + { + if(mem.type == MEM_PIXELS && !background) { + pixels_alloc(mem); + } + else if(mem.type == MEM_TEXTURE) { + assert(!"mem_alloc not supported for textures."); + } + else { + generic_alloc(mem); + } + } + void mem_copy_to(device_memory& mem) { - CUDAContextScope scope(this); + if(mem.type == MEM_PIXELS) { + assert(!"mem_copy_to not supported for pixels."); + } + else if(mem.type == MEM_TEXTURE) { + tex_free(mem); + tex_alloc(mem); + } + else { + if(!mem.device_pointer) { + generic_alloc(mem); + } - if(mem.device_pointer) - cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size())); + generic_copy_to(mem); + } } void mem_copy_from(device_memory& mem, int y, int w, int h, int elem) { - CUDAContextScope scope(this); - size_t offset = elem*y*w; - size_t size = elem*w*h; - - if(mem.device_pointer) { - cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset, - (CUdeviceptr)(mem.device_pointer + offset), size)); + if(mem.type == MEM_PIXELS && !background) { + pixels_copy_from(mem, y, w, h); + } + else if(mem.type == MEM_TEXTURE) { + assert(!"mem_copy_from not supported for textures."); } else { - memset((char*)mem.data_pointer + offset, 0, size); + CUDAContextScope scope(this); + size_t offset = elem*y*w; + size_t size = elem*w*h; + + if(mem.device_pointer) { + cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset, + (CUdeviceptr)(mem.device_pointer + offset), size)); + } + else { + memset((char*)mem.data_pointer + offset, 0, size); + } } } void mem_zero(device_memory& mem) { + if(!mem.device_pointer) { + mem_alloc(mem); + } + if(mem.data_pointer) { memset((void*)mem.data_pointer, 0, mem.memory_size()); } @@ -609,14 +665,14 @@ public: void mem_free(device_memory& mem) { - if(mem.device_pointer) { - CUDAContextScope scope(this); - cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer))); - - mem.device_pointer = 0; - - stats.mem_free(mem.device_size); - mem.device_size = 0; + if(mem.type == MEM_PIXELS && !background) { + pixels_free(mem); + } + else if(mem.type == MEM_TEXTURE) { + tex_free(mem); + } + else { + generic_free(mem); } } @@ -700,8 +756,8 @@ public: if(mem.interpolation == INTERPOLATION_NONE) { /* Data Storage */ - mem_alloc(mem); - mem_copy_to(mem); + generic_alloc(mem); + generic_copy_to(mem); CUdeviceptr cumem; size_t cubytes; @@ -891,21 +947,19 @@ public: } else { tex_interp_map.erase(tex_interp_map.find(mem.device_pointer)); - mem_free(mem); + generic_free(mem); } } } bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task) { - mem_alloc(task->tiles_mem); - TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer; for(int i = 0; i < 9; i++) { tiles->buffers[i] = buffers[i]; } - mem_copy_to(task->tiles_mem); + task->tiles_mem.copy_to_device(); return !have_error(); } @@ -1272,7 +1326,7 @@ public: task.unmap_neighbor_tiles(rtiles, this); } - 
-	void path_trace(DeviceTask& task, RenderTile& rtile)
+	void path_trace(DeviceTask& task, RenderTile& rtile, device_vector<WorkTile>& work_tiles)
 	{
 		if(have_error())
 			return;
@@ -1295,8 +1349,7 @@ public:
 		cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
 
 		/* Allocate work tile. */
-		device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
-		work_tiles.resize(1);
+		work_tiles.alloc(1);
 
 		WorkTile *wtile = work_tiles.get_data();
 		wtile->x = rtile.x;
@@ -1306,9 +1359,6 @@ public:
 		wtile->offset = rtile.offset;
 		wtile->stride = rtile.stride;
 		wtile->buffer = (float*)cuda_device_ptr(rtile.buffer);
-		mem_alloc(work_tiles);
-
-		CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer);
 
 		/* Prepare work size. More step samples render faster, but for now we
 		 * remain conservative for GPUs connected to a display to avoid driver
@@ -1329,8 +1379,9 @@ public:
 			/* Setup and copy work tile to device. */
 			wtile->start_sample = sample;
 			wtile->num_samples = min(step_samples, end_sample - sample);;
-			mem_copy_to(work_tiles);
+			work_tiles.copy_to_device();
 
+			CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer);
 			uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
 			uint num_blocks = divide_up(total_work_size, num_threads_per_block);
@@ -1354,8 +1405,6 @@ public:
 				break;
 			}
 		}
-
-		mem_free(work_tiles);
 	}
 
 	void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
@@ -1508,104 +1557,90 @@ public:
 
 	void pixels_alloc(device_memory& mem)
 	{
-		if(!background) {
-			PixelMem pmem;
-
-			pmem.w = mem.data_width;
-			pmem.h = mem.data_height;
+		PixelMem pmem;
 
-			CUDAContextScope scope(this);
+		pmem.w = mem.data_width;
+		pmem.h = mem.data_height;
 
-			glGenBuffers(1, &pmem.cuPBO);
-			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
-			if(mem.data_type == TYPE_HALF)
-				glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLhalf)*4, NULL, GL_DYNAMIC_DRAW);
-			else
-				glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(uint8_t)*4, NULL, GL_DYNAMIC_DRAW);
+		CUDAContextScope scope(this);
 
-			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+		glGenBuffers(1, &pmem.cuPBO);
+		glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
+		if(mem.data_type == TYPE_HALF)
+			glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLhalf)*4, NULL, GL_DYNAMIC_DRAW);
+		else
+			glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(uint8_t)*4, NULL, GL_DYNAMIC_DRAW);
 
-			glGenTextures(1, &pmem.cuTexId);
-			glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
-			if(mem.data_type == TYPE_HALF)
-				glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F_ARB, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
-			else
-				glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
-			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-			glBindTexture(GL_TEXTURE_2D, 0);
+		glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
 
-			CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
+		glGenTextures(1, &pmem.cuTexId);
+		glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
+		if(mem.data_type == TYPE_HALF)
+			glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F_ARB, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
+		else
+			glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+		glBindTexture(GL_TEXTURE_2D, 0);
 
-			if(result == CUDA_SUCCESS) {
-				mem.device_pointer = pmem.cuTexId;
-				pixel_mem_map[mem.device_pointer] = pmem;
+		CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
 
-				mem.device_size = mem.memory_size();
-				stats.mem_alloc(mem.device_size);
+		if(result == CUDA_SUCCESS) {
+			mem.device_pointer = pmem.cuTexId;
+			pixel_mem_map[mem.device_pointer] = pmem;
 
-				return;
-			}
-			else {
-				/* failed to register buffer, fallback to no interop */
-				glDeleteBuffers(1, &pmem.cuPBO);
-				glDeleteTextures(1, &pmem.cuTexId);
+			mem.device_size = mem.memory_size();
+			stats.mem_alloc(mem.device_size);
 
-				background = true;
-			}
-		}
+			return;
+		}
+		else {
+			/* failed to register buffer, fallback to no interop */
+			glDeleteBuffers(1, &pmem.cuPBO);
+			glDeleteTextures(1, &pmem.cuTexId);
 
-		Device::pixels_alloc(mem);
+			background = true;
+		}
 	}
 
 	void pixels_copy_from(device_memory& mem, int y, int w, int h)
 	{
-		if(!background) {
-			PixelMem pmem = pixel_mem_map[mem.device_pointer];
-
-			CUDAContextScope scope(this);
-
-			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
-			uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
-			size_t offset = sizeof(uchar)*4*y*w;
-			memcpy((uchar*)mem.data_pointer + offset, pixels + offset, sizeof(uchar)*4*w*h);
-			glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
-			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+		PixelMem pmem = pixel_mem_map[mem.device_pointer];
 
-			return;
-		}
+		CUDAContextScope scope(this);
 
-		Device::pixels_copy_from(mem, y, w, h);
+		glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
+		uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
+		size_t offset = sizeof(uchar)*4*y*w;
+		memcpy((uchar*)mem.data_pointer + offset, pixels + offset, sizeof(uchar)*4*w*h);
+		glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
+		glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
 	}
 
 	void pixels_free(device_memory& mem)
 	{
 		if(mem.device_pointer) {
-			if(!background) {
-				PixelMem pmem = pixel_mem_map[mem.device_pointer];
-
-				CUDAContextScope scope(this);
+			PixelMem pmem = pixel_mem_map[mem.device_pointer];
 
-				cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
-				glDeleteBuffers(1, &pmem.cuPBO);
-				glDeleteTextures(1, &pmem.cuTexId);
+			CUDAContextScope scope(this);
 
-				pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
-				mem.device_pointer = 0;
+			cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
+			glDeleteBuffers(1, &pmem.cuPBO);
+			glDeleteTextures(1, &pmem.cuTexId);
 
-				stats.mem_free(mem.device_size);
-				mem.device_size = 0;
+			pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
+			mem.device_pointer = 0;
 
-				return;
-			}
-
-			Device::pixels_free(mem);
+			stats.mem_free(mem.device_size);
+			mem.device_size = 0;
 		}
 	}
 
 	void draw_pixels(device_memory& mem, int y, int w, int h, int dx, int dy, int width, int height, bool transparent, const DeviceDrawParams &draw_params)
 	{
+		assert(mem.type == MEM_PIXELS);
+
 		if(!background) {
 			PixelMem pmem = pixel_mem_map[mem.device_pointer];
 			float *vpointer;
@@ -1724,6 +1759,8 @@ public:
 			}
 		}
 
+		device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
+
 		/* keep rendering tiles until done */
 		while(task->acquire_tile(this, tile)) {
 			if(tile.task == RenderTile::PATH_TRACE) {
@@ -1732,7 +1769,7 @@ public:
 					split_kernel->path_trace(task, tile, void_buffer, void_buffer);
 				}
 				else {
-					path_trace(*task, tile);
+					path_trace(*task, tile, work_tiles);
 				}
 			}
 			else if(tile.task == RenderTile::DENOISE) {
@@ -1750,6 +1787,8 @@ public:
 				break;
 			}
 		}
+
+		work_tiles.free();
 	}
 	else if(task->type == DeviceTask::SHADER) {
 		shader(*task);
@@ -1884,8 +1923,8 @@ uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory
 	CUDAContextScope scope(device);
 
 	device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
-	size_buffer.resize(1);
-	device->mem_alloc(size_buffer);
+	size_buffer.alloc(1);
+	size_buffer.zero_to_device();
 
 	uint threads = num_threads;
 	CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer);
@@ -1908,9 +1947,9 @@ uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory
 	                           1, 1, 1,
 	                           0, 0, (void**)&args, 0));
 
-	device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint64_t));
+	size_buffer.copy_from_device(0, 1, 1);
 	size_t size = size_buffer[0];
-	device->mem_free(size_buffer);
+	size_buffer.free();
 
 	return size;
 }
-- 
cgit v1.2.3
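
Taken together, the three commits leave callers with a single, uniform lifecycle for device buffers, with the per-type dispatch (generic, MEM_TEXTURE, MEM_PIXELS) hidden behind the mem_* entry points. The fragment below restates what state_buffer_size does after this series, lifted out of the diff into plain code; the kernel launch in the middle is elided and only the call pattern visible above is shown:

device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
size_buffer.alloc(1);                  /* reserve one element of host storage */
size_buffer.zero_to_device();          /* allocate the device copy and clear it */

/* ... launch the kernel that writes the state size into size_buffer ... */

size_buffer.copy_from_device(0, 1, 1); /* read the single element back to the host */
size_t size = size_buffer[0];
size_buffer.free();                    /* release host and device storage */
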