Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'intern/cycles/device/opencl/opencl_base.cpp')
-rw-r--r--intern/cycles/device/opencl/opencl_base.cpp465
1 files changed, 440 insertions, 25 deletions
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp
index 22aeaddcde8..ae1a7b917c3 100644
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -213,8 +213,23 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
base_program.add_kernel(ustring("bake"));
base_program.add_kernel(ustring("zero_buffer"));
+ denoising_program = OpenCLProgram(this, "denoising", "filter.cl", "");
+ denoising_program.add_kernel(ustring("filter_divide_shadow"));
+ denoising_program.add_kernel(ustring("filter_get_feature"));
+ denoising_program.add_kernel(ustring("filter_combine_halves"));
+ denoising_program.add_kernel(ustring("filter_construct_transform"));
+ denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
+ denoising_program.add_kernel(ustring("filter_nlm_blur"));
+ denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
+ denoising_program.add_kernel(ustring("filter_nlm_update_output"));
+ denoising_program.add_kernel(ustring("filter_nlm_normalize"));
+ denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
+ denoising_program.add_kernel(ustring("filter_finalize"));
+ denoising_program.add_kernel(ustring("filter_set_tiles"));
+
vector<OpenCLProgram*> programs;
programs.push_back(&base_program);
+ programs.push_back(&denoising_program);
/* Call actual class to fill the vector with its programs. */
if(!load_kernels(requested_features, programs)) {
return false;
@@ -322,37 +337,42 @@ void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, in
NULL, NULL));
}
-void OpenCLDeviceBase::mem_zero(device_memory& mem)
+void OpenCLDeviceBase::mem_zero_kernel(device_ptr mem, size_t size)
{
- if(mem.device_pointer) {
- if(base_program.is_loaded()) {
- cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
+ cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
- size_t global_size[] = {1024, 1024};
- size_t num_threads = global_size[0] * global_size[1];
+ size_t global_size[] = {1024, 1024};
+ size_t num_threads = global_size[0] * global_size[1];
- cl_mem d_buffer = CL_MEM_PTR(mem.device_pointer);
- cl_ulong d_offset = 0;
- cl_ulong d_size = 0;
+ cl_mem d_buffer = CL_MEM_PTR(mem);
+ cl_ulong d_offset = 0;
+ cl_ulong d_size = 0;
- while(d_offset < mem.memory_size()) {
- d_size = std::min<cl_ulong>(num_threads*sizeof(float4), mem.memory_size() - d_offset);
+ while(d_offset < size) {
+ d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset);
- kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
+ kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
- ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
- ckZeroBuffer,
- 2,
- NULL,
- global_size,
- NULL,
- 0,
- NULL,
- NULL);
- opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
+ ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
+ ckZeroBuffer,
+ 2,
+ NULL,
+ global_size,
+ NULL,
+ 0,
+ NULL,
+ NULL);
+ opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
- d_offset += d_size;
- }
+ d_offset += d_size;
+ }
+}
+
+void OpenCLDeviceBase::mem_zero(device_memory& mem)
+{
+ if(mem.device_pointer) {
+ if(base_program.is_loaded()) {
+ mem_zero_kernel(mem.device_pointer, mem.memory_size());
}
if(mem.data_pointer) {
@@ -396,6 +416,41 @@ void OpenCLDeviceBase::mem_free(device_memory& mem)
}
}
+int OpenCLDeviceBase::mem_address_alignment()
+{
+ return OpenCLInfo::mem_address_alignment(cdDevice);
+}
+
+device_ptr OpenCLDeviceBase::mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type)
+{
+ cl_mem_flags mem_flag;
+ if(type == MEM_READ_ONLY)
+ mem_flag = CL_MEM_READ_ONLY;
+ else if(type == MEM_WRITE_ONLY)
+ mem_flag = CL_MEM_WRITE_ONLY;
+ else
+ mem_flag = CL_MEM_READ_WRITE;
+
+ cl_buffer_region info;
+ info.origin = mem.memory_elements_size(offset);
+ info.size = mem.memory_elements_size(size);
+
+ device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer),
+ mem_flag,
+ CL_BUFFER_CREATE_TYPE_REGION,
+ &info,
+ &ciErr);
+ opencl_assert_err(ciErr, "clCreateSubBuffer");
+ return sub_buf;
+}
+
+void OpenCLDeviceBase::mem_free_sub_ptr(device_ptr device_pointer)
+{
+ if(device_pointer && device_pointer != null_mem) {
+ opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer)));
+ }
+}
+
void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size)
{
ConstMemMap::iterator i = const_mem_map.find(name);
@@ -449,7 +504,7 @@ size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size)
return global_size + ((r == 0)? 0: group_size - r);
}
-void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
+void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size)
{
size_t workgroup_size, max_work_items[3];
@@ -458,6 +513,10 @@ void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
clGetDeviceInfo(cdDevice,
CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
+ if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) {
+ workgroup_size = max_workgroup_size;
+ }
+
/* Try to divide evenly over 2 dimensions. */
size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
@@ -543,6 +602,362 @@ set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name);
enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
}
+bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
+ device_ptr guide_ptr,
+ device_ptr variance_ptr,
+ device_ptr out_ptr,
+ DenoisingTask *task)
+{
+ int4 rect = task->rect;
+ int w = rect.z-rect.x;
+ int h = rect.w-rect.y;
+ int r = task->nlm_state.r;
+ int f = task->nlm_state.f;
+ float a = task->nlm_state.a;
+ float k_2 = task->nlm_state.k_2;
+
+ cl_mem difference = CL_MEM_PTR(task->nlm_state.temporary_1_ptr);
+ cl_mem blurDifference = CL_MEM_PTR(task->nlm_state.temporary_2_ptr);
+ cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr);
+
+ cl_mem image_mem = CL_MEM_PTR(image_ptr);
+ cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
+ cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
+ cl_mem out_mem = CL_MEM_PTR(out_ptr);
+
+ mem_zero_kernel(task->nlm_state.temporary_3_ptr, sizeof(float)*w*h);
+ mem_zero_kernel(out_ptr, sizeof(float)*w*h);
+
+ cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
+ cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
+ cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
+ cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output"));
+ cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize"));
+
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ int dy = i / (2*r+1) - r;
+ int dx = i % (2*r+1) - r;
+ int4 local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
+ kernel_set_args(ckNLMCalcDifference, 0,
+ dx, dy, guide_mem, variance_mem,
+ difference, local_rect, w, 0, a, k_2);
+ kernel_set_args(ckNLMBlur, 0,
+ difference, blurDifference, local_rect, w, f);
+ kernel_set_args(ckNLMCalcWeight, 0,
+ blurDifference, difference, local_rect, w, f);
+ kernel_set_args(ckNLMUpdateOutput, 0,
+ dx, dy, blurDifference, image_mem,
+ out_mem, weightAccum, local_rect, w, f);
+
+ enqueue_kernel(ckNLMCalcDifference, w, h);
+ enqueue_kernel(ckNLMBlur, w, h);
+ enqueue_kernel(ckNLMCalcWeight, w, h);
+ enqueue_kernel(ckNLMBlur, w, h);
+ enqueue_kernel(ckNLMUpdateOutput, w, h);
+ }
+
+ int4 local_rect = make_int4(0, 0, w, h);
+ kernel_set_args(ckNLMNormalize, 0,
+ out_mem, weightAccum, local_rect, w);
+ enqueue_kernel(ckNLMNormalize, w, h);
+
+ return true;
+}
+
+bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
+{
+ cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
+ cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
+ cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
+
+ cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
+
+ kernel_set_args(ckFilterConstructTransform, 0,
+ buffer_mem,
+ transform_mem,
+ rank_mem,
+ task->filter_area,
+ task->rect,
+ task->buffer.pass_stride,
+ task->radius,
+ task->pca_threshold);
+
+ enqueue_kernel(ckFilterConstructTransform,
+ task->storage.w,
+ task->storage.h,
+ 256);
+
+ return true;
+}
+
+bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr guide_ptr,
+ device_ptr guide_variance_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task)
+{
+ mem_zero(task->storage.XtWX);
+ mem_zero(task->storage.XtWY);
+
+ cl_mem color_mem = CL_MEM_PTR(color_ptr);
+ cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
+ cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
+ cl_mem guide_variance_mem = CL_MEM_PTR(guide_variance_ptr);
+ cl_mem output_mem = CL_MEM_PTR(output_ptr);
+
+ cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
+ cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
+ cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
+ cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
+ cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
+
+ cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
+ cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
+ cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
+ cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
+ cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
+
+ cl_mem difference = CL_MEM_PTR(task->reconstruction_state.temporary_1_ptr);
+ cl_mem blurDifference = CL_MEM_PTR(task->reconstruction_state.temporary_2_ptr);
+
+ int r = task->radius;
+ int f = 4;
+ float a = 1.0f;
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ int dy = i / (2*r+1) - r;
+ int dx = i % (2*r+1) - r;
+
+ int local_rect[4] = {max(0, -dx), max(0, -dy),
+ task->reconstruction_state.source_w - max(0, dx),
+ task->reconstruction_state.source_h - max(0, dy)};
+
+ kernel_set_args(ckNLMCalcDifference, 0,
+ dx, dy,
+ guide_mem,
+ guide_variance_mem,
+ difference,
+ local_rect,
+ task->buffer.w,
+ task->buffer.pass_stride,
+ a, task->nlm_k_2);
+ enqueue_kernel(ckNLMCalcDifference,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+
+ kernel_set_args(ckNLMBlur, 0,
+ difference,
+ blurDifference,
+ local_rect,
+ task->buffer.w,
+ f);
+ enqueue_kernel(ckNLMBlur,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+
+ kernel_set_args(ckNLMCalcWeight, 0,
+ blurDifference,
+ difference,
+ local_rect,
+ task->buffer.w,
+ f);
+ enqueue_kernel(ckNLMCalcWeight,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+
+ /* Reuse previous arguments. */
+ enqueue_kernel(ckNLMBlur,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+
+ kernel_set_args(ckNLMConstructGramian, 0,
+ dx, dy,
+ blurDifference,
+ buffer_mem,
+ color_mem,
+ color_variance_mem,
+ transform_mem,
+ rank_mem,
+ XtWX_mem,
+ XtWY_mem,
+ local_rect,
+ task->reconstruction_state.filter_rect,
+ task->buffer.w,
+ task->buffer.h,
+ f,
+ task->buffer.pass_stride);
+ enqueue_kernel(ckNLMConstructGramian,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h,
+ 256);
+ }
+
+ kernel_set_args(ckFinalize, 0,
+ task->buffer.w,
+ task->buffer.h,
+ output_mem,
+ rank_mem,
+ XtWX_mem,
+ XtWY_mem,
+ task->filter_area,
+ task->reconstruction_state.buffer_params,
+ task->render_buffer.samples);
+ enqueue_kernel(ckFinalize,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+
+ return true;
+}
+
+bool OpenCLDeviceBase::denoising_combine_halves(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ int r, int4 rect,
+ DenoisingTask *task)
+{
+ (void) task;
+
+ cl_mem a_mem = CL_MEM_PTR(a_ptr);
+ cl_mem b_mem = CL_MEM_PTR(b_ptr);
+ cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
+ cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
+
+ cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves"));
+
+ kernel_set_args(ckFilterCombineHalves, 0,
+ mean_mem,
+ variance_mem,
+ a_mem,
+ b_mem,
+ rect,
+ r);
+ enqueue_kernel(ckFilterCombineHalves,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ return true;
+}
+
+bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr sample_variance_ptr,
+ device_ptr sv_variance_ptr,
+ device_ptr buffer_variance_ptr,
+ DenoisingTask *task)
+{
+ (void) task;
+
+ cl_mem a_mem = CL_MEM_PTR(a_ptr);
+ cl_mem b_mem = CL_MEM_PTR(b_ptr);
+ cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr);
+ cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr);
+ cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr);
+
+ cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
+
+ cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
+
+ char split_kernel = is_split_kernel()? 1 : 0;
+ kernel_set_args(ckFilterDivideShadow, 0,
+ task->render_buffer.samples,
+ tiles_mem,
+ a_mem,
+ b_mem,
+ sample_variance_mem,
+ sv_variance_mem,
+ buffer_variance_mem,
+ task->rect,
+ task->render_buffer.pass_stride,
+ task->render_buffer.denoising_data_offset,
+ split_kernel);
+ enqueue_kernel(ckFilterDivideShadow,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ return true;
+}
+
+bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
+ int variance_offset,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ DenoisingTask *task)
+{
+ cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
+ cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
+
+ cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
+
+ cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
+
+ char split_kernel = is_split_kernel()? 1 : 0;
+ kernel_set_args(ckFilterGetFeature, 0,
+ task->render_buffer.samples,
+ tiles_mem,
+ mean_offset,
+ variance_offset,
+ mean_mem,
+ variance_mem,
+ task->rect,
+ task->render_buffer.pass_stride,
+ task->render_buffer.denoising_data_offset,
+ split_kernel);
+ enqueue_kernel(ckFilterGetFeature,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ return true;
+}
+
+bool OpenCLDeviceBase::denoising_set_tiles(device_ptr *buffers,
+ DenoisingTask *task)
+{
+ mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_WRITE);
+ mem_copy_to(task->tiles_mem);
+
+ cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
+
+ cl_kernel ckFilterSetTiles = denoising_program(ustring("filter_set_tiles"));
+
+ kernel_set_args(ckFilterSetTiles, 0, tiles_mem);
+ for(int i = 0; i < 9; i++) {
+ cl_mem buffer_mem = CL_MEM_PTR(buffers[i]);
+ kernel_set_args(ckFilterSetTiles, i+1, buffer_mem);
+ }
+
+ enqueue_kernel(ckFilterSetTiles, 1, 1);
+
+ return true;
+}
+
+void OpenCLDeviceBase::denoise(RenderTile &rtile, const DeviceTask &task)
+{
+ DenoisingTask denoising(this);
+
+ denoising.functions.set_tiles = function_bind(&OpenCLDeviceBase::denoising_set_tiles, this, _1, &denoising);
+ denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising);
+ denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+ denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+
+ denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
+ denoising.render_buffer.samples = rtile.sample;
+
+ RenderTile rtiles[9];
+ rtiles[4] = rtile;
+ task.map_neighbor_tiles(rtiles, this);
+ denoising.tiles_from_rendertiles(rtiles);
+
+ denoising.init_from_devicetask(task);
+
+ denoising.run_denoising();
+
+ task.unmap_neighbor_tiles(rtiles, this);
+}
+
void OpenCLDeviceBase::shader(DeviceTask& task)
{
/* cast arguments to cl types */