diff options
author | Brecht Van Lommel <brechtvanlommel@gmail.com> | 2017-09-27 00:42:36 +0300 |
---|---|---|
committer | Brecht Van Lommel <brechtvanlommel@gmail.com> | 2017-10-04 22:11:14 +0300 |
commit | 5b7d6ea54b2fc35b8b12c667f5bf9a1c9c46d5c2 (patch) | |
tree | 99a9ca07d5366b164dfdf267ad1ed3691d2d7d57 /intern/cycles/device/device_cuda.cpp | |
parent | 660e8e59e7b4265324a8fba7ae716f84a73c6c64 (diff) |
Code refactor: add WorkTile struct for passing work to kernel.
This makes sharing some code between the megakernel and the split kernel in the following commits a bit
easier, and also paves the way for rendering multiple tiles later.
Diffstat (limited to 'intern/cycles/device/device_cuda.cpp')
-rw-r--r-- | intern/cycles/device/device_cuda.cpp | 58 |
1 file changed, 32 insertions, 26 deletions
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 29b5bd70789..7ee74e9a512 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -1293,8 +1293,6 @@ public: CUDAContextScope scope(this); CUfunction cuPathTrace; - CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer); - CUdeviceptr d_rng_state = cuda_device_ptr(rtile.rng_state); /* get kernel function */ if(branched) { @@ -1308,40 +1306,48 @@ public: return; } - /* pass in parameters */ - void *args[] = {&d_buffer, - &d_rng_state, - &sample, - &rtile.x, - &rtile.y, - &rtile.w, - &rtile.h, - &rtile.offset, - &rtile.stride}; + cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)); - /* launch kernel */ - int threads_per_block; - cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuPathTrace)); + /* allocate work tile */ + device_vector<WorkTile> work_tiles; + work_tiles.resize(1); - /*int num_registers; - cuda_assert(cuFuncGetAttribute(&num_registers, CU_FUNC_ATTRIBUTE_NUM_REGS, cuPathTrace)); + WorkTile *wtile = work_tiles.get_data(); + wtile->x = rtile.x; + wtile->y = rtile.y; + wtile->w = rtile.w; + wtile->h = rtile.h; + wtile->offset = rtile.offset; + wtile->stride = rtile.stride; + wtile->start_sample = sample; + wtile->num_samples = 1; + wtile->buffer = (float*)cuda_device_ptr(rtile.buffer); + wtile->rng_state = (uint*)cuda_device_ptr(rtile.rng_state); - printf("threads_per_block %d\n", threads_per_block); - printf("num_registers %d\n", num_registers);*/ + mem_alloc("work_tiles", work_tiles, MEM_READ_ONLY); + mem_copy_to(work_tiles); - int xthreads = (int)sqrt(threads_per_block); - int ythreads = (int)sqrt(threads_per_block); - int xblocks = (rtile.w + xthreads - 1)/xthreads; - int yblocks = (rtile.h + ythreads - 1)/ythreads; + CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer); - cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)); 
+ uint total_work_size = wtile->w * wtile->h * wtile->num_samples; + + /* pass in parameters */ + void *args[] = {&d_work_tiles, + &total_work_size}; + + /* launch kernel */ + int num_threads_per_block; + cuda_assert(cuFuncGetAttribute(&num_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuPathTrace)); + int num_blocks = divide_up(total_work_size, num_threads_per_block); cuda_assert(cuLaunchKernel(cuPathTrace, - xblocks , yblocks, 1, /* blocks */ - xthreads, ythreads, 1, /* threads */ + num_blocks, 1, 1, + num_threads_per_block, 1, 1, 0, 0, args, 0)); cuda_assert(cuCtxSynchronize()); + + mem_free(work_tiles); } void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half) |