Code refactor: add WorkTile struct for passing work to kernel.

This makes it a bit easier to share some code between the mega/split kernels in the following commits, and also paves the way for rendering multiple tiles later.
author: Brecht Van Lommel <brechtvanlommel@gmail.com> 2017-09-27 00:42:36 +0300
committer: Brecht Van Lommel <brechtvanlommel@gmail.com> 2017-10-04 22:11:14 +0300
commit: 5b7d6ea54b2fc35b8b12c667f5bf9a1c9c46d5c2 (patch)
tree: 99a9ca07d5366b164dfdf267ad1ed3691d2d7d57 /intern/cycles/device/device_cuda.cpp
parent: 660e8e59e7b4265324a8fba7ae716f84a73c6c64 (diff)
1 files changed, 32 insertions, 26 deletions
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 29b5bd70789..7ee74e9a512 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -1293,8 +1293,6 @@ public:
 		CUDAContextScope scope(this);
 
 		CUfunction cuPathTrace;
-		CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer);
-		CUdeviceptr d_rng_state = cuda_device_ptr(rtile.rng_state);
 
 		/* get kernel function */
 		if(branched) {
@@ -1308,40 +1306,48 @@ public:
 			return;
 		}
 
-		/* pass in parameters */
-		void *args[] = {&d_buffer,
-		                &d_rng_state,
-		                &sample,
-		                &rtile.x,
-		                &rtile.y,
-		                &rtile.w,
-		                &rtile.h,
-		                &rtile.offset,
-		                &rtile.stride};
+		cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
 
-		/* launch kernel */
-		int threads_per_block;
-		cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuPathTrace));
+		/* allocate work tile */
+		device_vector<WorkTile> work_tiles;
+		work_tiles.resize(1);
 
-		/*int num_registers;
-		cuda_assert(cuFuncGetAttribute(&num_registers, CU_FUNC_ATTRIBUTE_NUM_REGS, cuPathTrace));
+		WorkTile *wtile = work_tiles.get_data();
+		wtile->x = rtile.x;
+		wtile->y = rtile.y;
+		wtile->w = rtile.w;
+		wtile->h = rtile.h;
+		wtile->offset = rtile.offset;
+		wtile->stride = rtile.stride;
+		wtile->start_sample = sample;
+		wtile->num_samples = 1;
+		wtile->buffer = (float*)cuda_device_ptr(rtile.buffer);
+		wtile->rng_state = (uint*)cuda_device_ptr(rtile.rng_state);
 
-		printf("threads_per_block %d\n", threads_per_block);
-		printf("num_registers %d\n", num_registers);*/
+		mem_alloc("work_tiles", work_tiles, MEM_READ_ONLY);
+		mem_copy_to(work_tiles);
 
-		int xthreads = (int)sqrt(threads_per_block);
-		int ythreads = (int)sqrt(threads_per_block);
-		int xblocks = (rtile.w + xthreads - 1)/xthreads;
-		int yblocks = (rtile.h + ythreads - 1)/ythreads;
+		CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer);
 
-		cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
+		uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
+
+		/* pass in parameters */
+		void *args[] = {&d_work_tiles,
+		                &total_work_size};
+
+		/* launch kernel */
+		int num_threads_per_block;
+		cuda_assert(cuFuncGetAttribute(&num_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuPathTrace));
+		int num_blocks = divide_up(total_work_size, num_threads_per_block);
 
 		cuda_assert(cuLaunchKernel(cuPathTrace,
-		                           xblocks , yblocks, 1, /* blocks */
-		                           xthreads, ythreads, 1, /* threads */
+		                           num_blocks, 1, 1,
+		                           num_threads_per_block, 1, 1,
 		                           0, 0, args, 0));
 
 		cuda_assert(cuCtxSynchronize());
+
+		mem_free(work_tiles);
 	}
 
 	void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
author	Brecht Van Lommel <brechtvanlommel@gmail.com>	2017-09-27 00:42:36 +0300
committer	Brecht Van Lommel <brechtvanlommel@gmail.com>	2017-10-04 22:11:14 +0300
commit	5b7d6ea54b2fc35b8b12c667f5bf9a1c9c46d5c2 (patch)
tree	99a9ca07d5366b164dfdf267ad1ed3691d2d7d57 /intern/cycles/device/device_cuda.cpp
parent	660e8e59e7b4265324a8fba7ae716f84a73c6c64 (diff)