From 5b7d6ea54b2fc35b8b12c667f5bf9a1c9c46d5c2 Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel
Date: Tue, 26 Sep 2017 23:42:36 +0200
Subject: Code refactor: add WorkTile struct for passing work to kernel.

This makes sharing some code between mega/split in following commits a bit
easier, and also paves the way for rendering multiple tiles later.
---
 intern/cycles/device/device_cuda.cpp               | 58 ++++++++++++----------
 intern/cycles/device/device_memory.h               |  6 ++-
 intern/cycles/kernel/kernel_types.h                | 15 ++++++
 intern/cycles/kernel/kernel_work_stealing.h        | 34 ++++++-------
 intern/cycles/kernel/kernels/cuda/kernel.cu        | 26 ++++++----
 .../kernel/kernels/opencl/kernel_split_function.h  |  4 +-
 intern/cycles/kernel/split/kernel_buffer_update.h  | 20 ++++----
 intern/cycles/kernel/split/kernel_data_init.h      | 24 ++++-----
 ..._holdout_emission_blurring_pathtermination_ao.h |  2 +-
 intern/cycles/kernel/split/kernel_path_init.h      | 16 +++---
 .../cycles/kernel/split/kernel_split_data_types.h  | 17 +------
 11 files changed, 122 insertions(+), 100 deletions(-)

diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 29b5bd70789..7ee74e9a512 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -1293,8 +1293,6 @@ public:
 		CUDAContextScope scope(this);

 		CUfunction cuPathTrace;
-		CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer);
-		CUdeviceptr d_rng_state = cuda_device_ptr(rtile.rng_state);

 		/* get kernel function */
 		if(branched) {
@@ -1308,40 +1306,48 @@ public:
 			return;
 		}

-		/* pass in parameters */
-		void *args[] = {&d_buffer,
-		                &d_rng_state,
-		                &sample,
-		                &rtile.x,
-		                &rtile.y,
-		                &rtile.w,
-		                &rtile.h,
-		                &rtile.offset,
-		                &rtile.stride};
+		cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));

-		/* launch kernel */
-		int threads_per_block;
-		cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuPathTrace));
+		/* allocate work tile */
+		device_vector<WorkTile> work_tiles;
+		work_tiles.resize(1);

-		/*int num_registers;
-		cuda_assert(cuFuncGetAttribute(&num_registers, CU_FUNC_ATTRIBUTE_NUM_REGS, cuPathTrace));
+		WorkTile *wtile = work_tiles.get_data();
+		wtile->x = rtile.x;
+		wtile->y = rtile.y;
+		wtile->w = rtile.w;
+		wtile->h = rtile.h;
+		wtile->offset = rtile.offset;
+		wtile->stride = rtile.stride;
+		wtile->start_sample = sample;
+		wtile->num_samples = 1;
+		wtile->buffer = (float*)cuda_device_ptr(rtile.buffer);
+		wtile->rng_state = (uint*)cuda_device_ptr(rtile.rng_state);

-		printf("threads_per_block %d\n", threads_per_block);
-		printf("num_registers %d\n", num_registers);*/
+		mem_alloc("work_tiles", work_tiles, MEM_READ_ONLY);
+		mem_copy_to(work_tiles);

-		int xthreads = (int)sqrt(threads_per_block);
-		int ythreads = (int)sqrt(threads_per_block);
-		int xblocks = (rtile.w + xthreads - 1)/xthreads;
-		int yblocks = (rtile.h + ythreads - 1)/ythreads;
+		CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer);

-		cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
+		uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
+
+		/* pass in parameters */
+		void *args[] = {&d_work_tiles,
+		                &total_work_size};
+
+		/* launch kernel */
+		int num_threads_per_block;
+		cuda_assert(cuFuncGetAttribute(&num_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuPathTrace));
+		int num_blocks = divide_up(total_work_size, num_threads_per_block);

 		cuda_assert(cuLaunchKernel(cuPathTrace,
-		                           xblocks , yblocks, 1, /* blocks */
-		                           xthreads, ythreads, 1, /* threads */
+		                           num_blocks, 1, 1,
+		                           num_threads_per_block, 1, 1,
 		                           0, 0, args, 0));

 		cuda_assert(cuCtxSynchronize());
+
+		mem_free(work_tiles);
 	}

 	void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index b63dd00068b..20707ad04c9 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -46,6 +46,7 @@ enum MemoryType {
 /* Supported Data Types */

 enum DataType {
+	TYPE_UNKNOWN,
 	TYPE_UCHAR,
 	TYPE_UINT,
 	TYPE_INT,
@@ -57,6 +58,7 @@ static inline size_t datatype_size(DataType datatype)
 {
 	switch(datatype) {
+		case TYPE_UNKNOWN: return 1;
 		case TYPE_UCHAR: return sizeof(uchar);
 		case TYPE_FLOAT: return sizeof(float);
 		case TYPE_UINT: return sizeof(uint);
@@ -70,8 +72,8 @@ static inline size_t datatype_size(DataType datatype)
 /* Traits for data types */

 template<typename T> struct device_type_traits {
-	static const DataType data_type = TYPE_UCHAR;
-	static const int num_elements = 0;
+	static const DataType data_type = TYPE_UNKNOWN;
+	static const int num_elements = sizeof(T);
 };

 template<> struct device_type_traits<uchar> {
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 6c5b6ca3b2d..bf3a2881666 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -1448,6 +1448,21 @@ enum RayState {
 #define PATCH_MAP_NODE_IS_LEAF (1u << 31)
 #define PATCH_MAP_NODE_INDEX_MASK (~(PATCH_MAP_NODE_IS_SET | PATCH_MAP_NODE_IS_LEAF))

+/* Work Tiles */
+
+typedef struct WorkTile {
+	uint x, y, w, h;
+
+	uint start_sample;
+	uint num_samples;
+
+	uint offset;
+	uint stride;
+
+	ccl_global float *buffer;
+	ccl_global uint *rng_state;
+} WorkTile;
+
 CCL_NAMESPACE_END

 #endif /* __KERNEL_TYPES_H__ */
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
index 0c11158e8da..0c2d9379b63 100644
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -27,29 +27,28 @@ CCL_NAMESPACE_BEGIN
 # pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
 #endif

+#ifdef __SPLIT_KERNEL__
 /* Returns true if there is work */
 ccl_device bool get_next_work(KernelGlobals *kg,
-                              uint thread_index,
+                              ccl_global uint *work_pools,
+                              uint total_work_size,
+                              uint ray_index,
                               ccl_private uint *global_work_index)
 {
-	uint total_work_size = kernel_split_params.w
-	                     * kernel_split_params.h
-	                     * kernel_split_params.num_samples;
-
 	/* With a small amount of work there may be more threads than work due to
 	 * rounding up of global size, stop such threads immediately. */
-	if(thread_index >= total_work_size) {
+	if(ray_index >= total_work_size) {
 		return false;
 	}

 	/* Increase atomic work index counter in pool. */
-	uint pool = thread_index / WORK_POOL_SIZE;
-	uint work_index = atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[pool]);
+	uint pool = ray_index / WORK_POOL_SIZE;
+	uint work_index = atomic_fetch_and_inc_uint32(&work_pools[pool]);

 	/* Map per-pool work index to a global work index. */
 	uint global_size = ccl_global_size(0) * ccl_global_size(1);
 	kernel_assert(global_size % WORK_POOL_SIZE == 0);
-	kernel_assert(thread_index < global_size);
+	kernel_assert(ray_index < global_size);

 	*global_work_index = (work_index / WORK_POOL_SIZE) * global_size
 	                   + (pool * WORK_POOL_SIZE)
@@ -58,23 +57,24 @@ ccl_device bool get_next_work(KernelGlobals *kg,
 	/* Test if all work for this pool is done. */
 	return (*global_work_index < total_work_size);
 }
+#endif

-/* Map global work index to pixel X/Y and sample. */
-ccl_device_inline void get_work_pixel(KernelGlobals *kg,
+/* Map global work index to tile, pixel X/Y and sample. */
+ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
                                       uint global_work_index,
                                       ccl_private uint *x,
                                       ccl_private uint *y,
                                       ccl_private uint *sample)
 {
-	uint tile_pixels = kernel_split_params.w * kernel_split_params.h;
+	uint tile_pixels = tile->w * tile->h;
 	uint sample_offset = global_work_index / tile_pixels;
 	uint pixel_offset = global_work_index - sample_offset * tile_pixels;
-	uint y_offset = pixel_offset / kernel_split_params.w;
-	uint x_offset = pixel_offset - y_offset * kernel_split_params.w;
+	uint y_offset = pixel_offset / tile->w;
+	uint x_offset = pixel_offset - y_offset * tile->w;

-	*x = kernel_split_params.x + x_offset;
-	*y = kernel_split_params.y + y_offset;
-	*sample = kernel_split_params.start_sample + sample_offset;
+	*x = tile->x + x_offset;
+	*y = tile->y + y_offset;
+	*sample = tile->start_sample + sample_offset;
 }

 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index dc343cb387a..4d100634421 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -20,6 +20,7 @@
 #include "kernel/kernel_compat_cuda.h"
 #include "kernel_config.h"
+
 #include "kernel/kernel_math.h"
 #include "kernel/kernel_types.h"
 #include "kernel/kernel_globals.h"
@@ -27,32 +28,37 @@
 #include "kernel/kernel_path.h"
 #include "kernel/kernel_path_branched.h"
 #include "kernel/kernel_bake.h"
+#include "kernel/kernel_work_stealing.h"

 /* kernels */

 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
+kernel_cuda_path_trace(WorkTile *tile, uint total_work_size)
 {
-	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
-	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+	int work_index = ccl_global_id(0);
+
+	if(work_index < total_work_size) {
+		uint x, y, sample;
+		get_work_pixel(tile, work_index, &x, &y, &sample);

-	if(x < sx + sw && y < sy + sh) {
 		KernelGlobals kg;
-		kernel_path_trace(&kg, buffer, rng_state, sample, x, y, offset, stride);
+		kernel_path_trace(&kg, tile->buffer, tile->rng_state, sample, x, y, tile->offset, tile->stride);
 	}
 }

 #ifdef __BRANCHED_PATH__
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS)
-kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
+kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size)
 {
-	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
-	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+	int work_index = ccl_global_id(0);
+
+	if(work_index < total_work_size) {
+		uint x, y, sample;
+		get_work_pixel(tile, work_index, &x, &y, &sample);

-	if(x < sx + sw && y < sy + sh) {
 		KernelGlobals kg;
-		kernel_branched_path_trace(&kg, buffer, rng_state, sample, x, y, offset, stride);
+		kernel_branched_path_trace(&kg, tile->buffer, tile->rng_state, sample, x, y, tile->offset, tile->stride);
 	}
 }
 #endif
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
index 591c3846ef2..499138b5581 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
+++ b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
@@ -42,11 +42,11 @@ __kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)(

 	if(ccl_local_id(0) + ccl_local_id(1) == 0) {
 		kg->data = data;
-		kernel_split_params.rng_state = rng_state;
+		kernel_split_params.tile.rng_state = rng_state;
 		kernel_split_params.queue_index = queue_index;
 		kernel_split_params.use_queues_flag = use_queues_flag;
 		kernel_split_params.work_pools = work_pools;
-		kernel_split_params.buffer = buffer;
+		kernel_split_params.tile.buffer = buffer;

 		split_data_init(kg, &kernel_split_state, ccl_global_size(0)*ccl_global_size(1), split_data_buffer, ray_state);
diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h
index c9e7deddafa..e8547767480 100644
--- a/intern/cycles/kernel/split/kernel_buffer_update.h
+++ b/intern/cycles/kernel/split/kernel_buffer_update.h
@@ -75,8 +75,6 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
 	if(ray_index != QUEUE_EMPTY_SLOT) {
 #endif

-	int stride = kernel_split_params.stride;
-
 	ccl_global char *ray_state = kernel_split_state.ray_state;
 	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
 	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
@@ -86,7 +84,7 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
 	if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
 		uint sample = state->sample;
 		uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
-		ccl_global float *buffer = kernel_split_params.buffer + buffer_offset;
+		ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;

 		/* accumulate result in output buffer */
 		kernel_write_result(kg, buffer, sample, L);
@@ -96,22 +94,27 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,

 	if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
 		/* We have completed current work; So get next work */
+		ccl_global uint *work_pools = kernel_split_params.work_pools;
+		uint total_work_size = kernel_split_params.total_work_size;
 		uint work_index;
-		if(!get_next_work(kg, ray_index, &work_index)) {
+
+		if(!get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) {
 			/* If work is invalid, this means no more work is available and the thread may exit */
 			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
 		}

 		if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
+			ccl_global WorkTile *tile = &kernel_split_params.tile;
 			uint x, y, sample;
-			get_work_pixel(kg, work_index, &x, &y, &sample);
+			get_work_pixel(tile, work_index, &x, &y, &sample);

 			/* Remap rng_state to current pixel. */
-			ccl_global uint *rng_state = kernel_split_params.rng_state;
-			rng_state += kernel_split_params.offset + x + y*stride;
+			ccl_global uint *rng_state = kernel_split_params.tile.rng_state;
+			rng_state += tile->offset + x + y*tile->stride;

 			/* Store buffer offset for writing to passes. */
-			uint buffer_offset = (kernel_split_params.offset + x + y*stride) * kernel_data.film.pass_stride;
+			uint buffer_offset = (tile->offset + x + y*tile->stride) * kernel_data.film.pass_stride;
+			ccl_global float *buffer = tile->buffer + buffer_offset;
 			kernel_split_state.buffer_offset[ray_index] = buffer_offset;

 			/* Initialize random numbers and ray. */
@@ -135,7 +138,6 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
 				/* These rays do not participate in path-iteration. */
 				float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 				/* Accumulate result in output buffer. */
-				ccl_global float *buffer = kernel_split_params.buffer + buffer_offset;
 				kernel_write_pass_float4(buffer, sample, L_rad);

 				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
index 2c042dfde6f..2da3ca47466 100644
--- a/intern/cycles/kernel/split/kernel_data_init.h
+++ b/intern/cycles/kernel/split/kernel_data_init.h
@@ -73,28 +73,28 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
 	kg->data = data;
 #endif

-	kernel_split_params.x = sx;
-	kernel_split_params.y = sy;
-	kernel_split_params.w = sw;
-	kernel_split_params.h = sh;
+	kernel_split_params.tile.x = sx;
+	kernel_split_params.tile.y = sy;
+	kernel_split_params.tile.w = sw;
+	kernel_split_params.tile.h = sh;

-	kernel_split_params.offset = offset;
-	kernel_split_params.stride = stride;
+	kernel_split_params.tile.start_sample = start_sample;
+	kernel_split_params.tile.num_samples = num_samples;

-	kernel_split_params.rng_state = rng_state;
+	kernel_split_params.tile.offset = offset;
+	kernel_split_params.tile.stride = stride;

-	kernel_split_params.start_sample = start_sample;
-	kernel_split_params.end_sample = end_sample;
+	kernel_split_params.tile.rng_state = rng_state;
+	kernel_split_params.tile.buffer = buffer;
+
+	kernel_split_params.total_work_size = sw * sh * num_samples;

 	kernel_split_params.work_pools = work_pools;
-	kernel_split_params.num_samples = num_samples;

 	kernel_split_params.queue_index = Queue_index;
 	kernel_split_params.queue_size = queuesize;
 	kernel_split_params.use_queues_flag = use_queues_flag;

-	kernel_split_params.buffer = buffer;
-
 	split_data_init(kg, &kernel_split_state, num_elements, split_data_buffer, ray_state);

 #ifdef __KERNEL_OPENCL__
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
index dffd291012d..906bad8ceb6 100644
--- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
+++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
@@ -98,7 +98,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(

 	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
 		uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
-		ccl_global float *buffer = kernel_split_params.buffer + buffer_offset;
+		ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;

 		ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
 		ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h
index 0ab2289348b..701d39403ad 100644
--- a/intern/cycles/kernel/split/kernel_path_init.h
+++ b/intern/cycles/kernel/split/kernel_path_init.h
@@ -30,23 +30,28 @@ ccl_device void kernel_path_init(KernelGlobals *kg) {
 	kernel_split_state.ray_state[ray_index] = RAY_ACTIVE;

 	/* Get work. */
+	ccl_global uint *work_pools = kernel_split_params.work_pools;
+	uint total_work_size = kernel_split_params.total_work_size;
 	uint work_index;
-	if(!get_next_work(kg, ray_index, &work_index)) {
+
+	if(!get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) {
 		/* No more work, mark ray as inactive */
 		kernel_split_state.ray_state[ray_index] = RAY_INACTIVE;

 		return;
 	}

+	ccl_global WorkTile *tile = &kernel_split_params.tile;
 	uint x, y, sample;
-	get_work_pixel(kg, work_index, &x, &y, &sample);
+	get_work_pixel(tile, work_index, &x, &y, &sample);

 	/* Remap rng_state and buffer to current pixel. */
-	ccl_global uint *rng_state = kernel_split_params.rng_state;
-	rng_state += kernel_split_params.offset + x + y*kernel_split_params.stride;
+	ccl_global uint *rng_state = kernel_split_params.tile.rng_state;
+	rng_state += tile->offset + x + y*tile->stride;

 	/* Store buffer offset for writing to passes. */
-	uint buffer_offset = (kernel_split_params.offset + x + y*kernel_split_params.stride) * kernel_data.film.pass_stride;
+	uint buffer_offset = (tile->offset + x + y*tile->stride) * kernel_data.film.pass_stride;
+	ccl_global float *buffer = tile->buffer + buffer_offset;
 	kernel_split_state.buffer_offset[ray_index] = buffer_offset;

 	/* Initialize random numbers and ray. */
@@ -78,7 +83,6 @@ ccl_device void kernel_path_init(KernelGlobals *kg) {
 		/* These rays do not participate in path-iteration. */
 		float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 		/* Accumulate result in output buffer. */
-		ccl_global float *buffer = kernel_split_params.buffer + buffer_offset;
 		kernel_write_pass_float4(buffer, sample, L_rad);
 		ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE);
 	}
diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h
index c58c8463f5c..b0e6e5f5250 100644
--- a/intern/cycles/kernel/split/kernel_split_data_types.h
+++ b/intern/cycles/kernel/split/kernel_split_data_types.h
@@ -22,28 +22,15 @@ CCL_NAMESPACE_BEGIN

 /* parameters used by the split kernels, we use a single struct to avoid passing these to each kernel */

 typedef struct SplitParams {
-	int x;
-	int y;
-	int w;
-	int h;
-
-	int offset;
-	int stride;
-
-	ccl_global uint *rng_state;
-
-	int start_sample;
-	int end_sample;
+	WorkTile tile;
+	uint total_work_size;

 	ccl_global unsigned int *work_pools;
-	unsigned int num_samples;

 	ccl_global int *queue_index;
 	int queue_size;
 	ccl_global char *use_queues_flag;

-	ccl_global float *buffer;
-
 	/* Place for storing sd->flag. AMD GPU OpenCL compiler workaround */
 	int dummy_sd_flag;
 } SplitParams;
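
Note (not part of the patch): the refactor replaces the 2D (x, y) CUDA launch
grid with one flat work index per thread, which get_work_pixel() decodes into
a pixel and a sample. The standalone C++ sketch below mirrors that arithmetic
and the 1D launch sizing so the mapping can be sanity-checked on the host; the
WorkTile here is simplified to its geometry fields, the test values and the
block size of 256 are arbitrary placeholders for what device_cuda.cpp queries
via cuFuncGetAttribute(), and divide_up() is inlined as a round-up division.

    #include <cassert>
    #include <cstdio>

    /* Simplified stand-in for the WorkTile in kernel_types.h
     * (device buffer/rng_state pointers omitted). */
    struct WorkTile {
    	unsigned x, y, w, h;
    	unsigned start_sample, num_samples;
    };

    /* Same arithmetic as get_work_pixel(): split the flat index into a
     * sample offset and a pixel offset inside the tile. */
    static void work_pixel(const WorkTile &tile, unsigned index,
                           unsigned *x, unsigned *y, unsigned *sample)
    {
    	unsigned tile_pixels = tile.w * tile.h;
    	unsigned sample_offset = index / tile_pixels;
    	unsigned pixel_offset = index - sample_offset * tile_pixels;
    	unsigned y_offset = pixel_offset / tile.w;
    	unsigned x_offset = pixel_offset - y_offset * tile.w;

    	*x = tile.x + x_offset;
    	*y = tile.y + y_offset;
    	*sample = tile.start_sample + sample_offset;
    }

    int main()
    {
    	WorkTile tile = {16, 32, 128, 64, 10, 4};  /* hypothetical tile */
    	unsigned total_work_size = tile.w * tile.h * tile.num_samples;

    	/* Host-side sizing as in device_cuda.cpp:
    	 * num_blocks = divide_up(total_work_size, num_threads_per_block). */
    	unsigned num_threads_per_block = 256;  /* placeholder block size */
    	unsigned num_blocks = (total_work_size + num_threads_per_block - 1)
    	                      / num_threads_per_block;
    	printf("%u blocks x %u threads for %u work items\n",
    	       num_blocks, num_threads_per_block, total_work_size);

    	/* Every index must land inside the tile and the sample range. */
    	for(unsigned i = 0; i < total_work_size; i++) {
    		unsigned x, y, sample;
    		work_pixel(tile, i, &x, &y, &sample);
    		assert(x >= tile.x && x < tile.x + tile.w);
    		assert(y >= tile.y && y < tile.y + tile.h);
    		assert(sample >= tile.start_sample &&
    		       sample < tile.start_sample + tile.num_samples);
    	}
    	return 0;
    }

Consecutive indices sweep all pixels of one sample before advancing to the
next, so a tile's buffer writes for a given sample stay contiguous in the
flat ordering.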