diff options
author | Mai Lavelle <mai.lavelle@gmail.com> | 2017-01-26 09:41:48 +0300 |
---|---|---|
committer | Mai Lavelle <mai.lavelle@gmail.com> | 2017-03-08 08:52:41 +0300 |
commit | 520b53364c73c75c4ff400d639dad13630f0e6fc (patch) | |
tree | 8ecdc8d47cb3d604b42d955840b9a3d38738946b /intern | |
parent | dfd6055eb0798c8cba245d47cd4a3fe51270a52a (diff) |
Cycles: Add OpenCL kernel for zeroing memory buffers
Transferring memory to the device was very slow and there's really no
need when only zeroing a buffer.
Diffstat (limited to 'intern')
-rw-r--r-- | intern/cycles/device/opencl/opencl_base.cpp | 54 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/opencl/kernel.cl | 16 |
2 files changed, 69 insertions, 1 deletions
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index c3f099b795e..7fa14eee70c 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -206,6 +206,7 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea base_program.add_kernel(ustring("convert_to_half_float")); base_program.add_kernel(ustring("shader")); base_program.add_kernel(ustring("bake")); + base_program.add_kernel(ustring("zero_buffer")); vector<OpenCLProgram*> programs; programs.push_back(&base_program); @@ -311,10 +312,61 @@ void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, in void OpenCLDeviceBase::mem_zero(device_memory& mem) { if(mem.device_pointer) { + if(base_program.is_loaded()) { + cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); + + size_t global_size[] = {1024, 1024}; + size_t num_threads = global_size[0] * global_size[1]; + + cl_mem d_buffer = CL_MEM_PTR(mem.device_pointer); + unsigned long long d_offset = 0; + unsigned long long d_size = 0; + + while(d_offset < mem.memory_size()) { + d_size = std::min<unsigned long long>(num_threads*sizeof(float4), mem.memory_size() - d_offset); + + kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); + + ciErr = clEnqueueNDRangeKernel(cqCommandQueue, + ckZeroBuffer, + 2, + NULL, + global_size, + NULL, + 0, + NULL, + NULL); + opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); + + d_offset += d_size; + } + } + if(mem.data_pointer) { memset((void*)mem.data_pointer, 0, mem.memory_size()); } - mem_copy_to(mem); + + if(!base_program.is_loaded()) { + void* zero = (void*)mem.data_pointer; + + if(!mem.data_pointer) { + zero = util_aligned_malloc(mem.memory_size(), 16); + memset(zero, 0, mem.memory_size()); + } + + opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, + CL_MEM_PTR(mem.device_pointer), + CL_TRUE, + 0, + mem.memory_size(), + zero, + 0, + NULL, NULL)); + + if(!mem.data_pointer) { + util_aligned_free(zero); + } + } } } diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl index a68f97857b6..e501fd4f015 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel.cl @@ -193,4 +193,20 @@ __kernel void kernel_ocl_convert_to_half_float( kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride); } +__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, ulong size, ulong offset) +{ + size_t i = get_global_id(0) + get_global_id(1) * get_global_size(0); + + if(i < size / sizeof(float4)) { + buffer[i+offset/sizeof(float4)] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + else if(i == size / sizeof(float4)) { + ccl_global uchar *b = (ccl_global uchar*)&buffer[i+offset/sizeof(float4)]; + + for(i = 0; i < size % sizeof(float4); i++) { + *(b++) = 0; + } + } +} + #endif /* __COMPILE_ONLY_MEGAKERNEL__ */ |