Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'intern/cycles/device')
-rw-r--r--intern/cycles/device/device_cpu.cpp21
-rw-r--r--intern/cycles/device/device_cuda.cpp244
-rw-r--r--intern/cycles/device/device_denoising.cpp24
-rw-r--r--intern/cycles/device/device_denoising.h5
-rw-r--r--intern/cycles/device/opencl/opencl.h4
-rw-r--r--intern/cycles/device/opencl/opencl_base.cpp225
6 files changed, 271 insertions, 252 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 999b9230d29..2d28ccd2b49 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -190,9 +190,9 @@ public:
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel;
KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel;
- KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel;
- KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int)> filter_nlm_construct_gramian_kernel;
- KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel;
+ KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel;
+ KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel;
KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*,
int, int, int, int, int, int, int, int, ccl_global int*, int,
@@ -565,13 +565,13 @@ public:
(float*) color_variance_ptr,
difference,
local_rect,
- task->buffer.w,
+ task->buffer.stride,
task->buffer.pass_stride,
1.0f,
task->nlm_k_2);
- filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4);
- filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.w, 4);
- filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4);
+ filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
+ filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.stride, 4);
+ filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
filter_nlm_construct_gramian_kernel()(dx, dy,
blurDifference,
(float*) task->buffer.mem.device_pointer,
@@ -580,9 +580,8 @@ public:
(float*) task->storage.XtWX.device_pointer,
(float3*) task->storage.XtWY.device_pointer,
local_rect,
- &task->reconstruction_state.filter_rect.x,
- task->buffer.w,
- task->buffer.h,
+ &task->reconstruction_state.filter_window.x,
+ task->buffer.stride,
4,
task->buffer.pass_stride);
}
@@ -591,8 +590,6 @@ public:
filter_finalize_kernel()(x,
y,
y*task->filter_area.z + x,
- task->buffer.w,
- task->buffer.h,
(float*) output_ptr,
(int*) task->storage.rank.device_pointer,
(float*) task->storage.XtWX.device_pointer,
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index d8d787ba706..a663da748df 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -1087,6 +1087,19 @@ public:
threads, threads, 1, \
0, 0, args, 0));
+/* Similar as above, but for 1-dimensional blocks. */
+#define CUDA_GET_BLOCKSIZE_1D(func, w, h) \
+ int threads_per_block; \
+ cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+ int xblocks = ((w) + threads_per_block - 1)/threads_per_block; \
+ int yblocks = h;
+
+#define CUDA_LAUNCH_KERNEL_1D(func, args) \
+ cuda_assert(cuLaunchKernel(func, \
+ xblocks, yblocks, 1, \
+ threads_per_block, 1, 1, \
+ 0, 0, args, 0));
+
bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
DenoisingTask *task)
{
@@ -1095,60 +1108,65 @@ public:
CUDAContextScope scope(this);
- int4 rect = task->rect;
- int w = align_up(rect.z-rect.x, 4);
- int h = rect.w-rect.y;
+ int stride = task->buffer.stride;
+ int w = task->buffer.width;
+ int h = task->buffer.h;
int r = task->nlm_state.r;
int f = task->nlm_state.f;
float a = task->nlm_state.a;
float k_2 = task->nlm_state.k_2;
- CUdeviceptr difference = task->nlm_state.temporary_1_ptr;
- CUdeviceptr blurDifference = task->nlm_state.temporary_2_ptr;
- CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr;
+ int shift_stride = stride*h;
+ int num_shifts = (2*r+1)*(2*r+1);
+ int mem_size = sizeof(float)*shift_stride*2*num_shifts;
+ int channel_offset = 0;
- cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h));
- cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*w*h));
+ CUdeviceptr temporary_mem;
+ cuda_assert(cuMemAlloc(&temporary_mem, mem_size));
+ CUdeviceptr difference = temporary_mem;
+ CUdeviceptr blurDifference = temporary_mem + sizeof(float)*shift_stride * num_shifts;
- CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput, cuNLMNormalize;
- cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
- cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
- cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
- cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
- cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
+ CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr;
+ cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*shift_stride));
+ cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*shift_stride));
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
+ {
+ CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+ cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+ cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
- CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, rect.z-rect.x, rect.w-rect.y);
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
- int dx, dy;
- int4 local_rect;
- int channel_offset = 0;
- void *calc_difference_args[] = {&dx, &dy, &guide_ptr, &variance_ptr, &difference, &local_rect, &w, &channel_offset, &a, &k_2};
- void *blur_args[] = {&difference, &blurDifference, &local_rect, &w, &f};
- void *calc_weight_args[] = {&blurDifference, &difference, &local_rect, &w, &f};
- void *update_output_args[] = {&dx, &dy, &blurDifference, &image_ptr, &out_ptr, &weightAccum, &local_rect, &w, &f};
-
- for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
- dy = i / (2*r+1) - r;
- dx = i % (2*r+1) - r;
- local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
-
- CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
- CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
- CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL(cuNLMUpdateOutput, update_output_args);
- }
-
- local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y);
- void *normalize_args[] = {&out_ptr, &weightAccum, &local_rect, &w};
- CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
- cuda_assert(cuCtxSynchronize());
+ CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts);
+
+ void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &difference, &w, &h, &stride, &shift_stride, &r, &channel_offset, &a, &k_2};
+ void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &shift_stride, &r, &f};
+ void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &shift_stride, &r, &f};
+ void *update_output_args[] = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &shift_stride, &r, &f};
+
+ CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
+ }
+
+ cuMemFree(temporary_mem);
+
+ {
+ CUfunction cuNLMNormalize;
+ cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
+ void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
+ CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
+ CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
+ cuda_assert(cuCtxSynchronize());
+ }
return !have_error();
}
@@ -1194,91 +1212,81 @@ public:
mem_zero(task->storage.XtWX);
mem_zero(task->storage.XtWY);
- CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize;
- cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
- cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
- cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
- cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
- cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
+ int r = task->radius;
+ int f = 4;
+ float a = 1.0f;
+ float k_2 = task->nlm_k_2;
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
- cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
+ int w = task->reconstruction_state.source_w;
+ int h = task->reconstruction_state.source_h;
+ int stride = task->buffer.stride;
- CUDA_GET_BLOCKSIZE(cuNLMCalcDifference,
- task->reconstruction_state.source_w,
- task->reconstruction_state.source_h);
+ int shift_stride = stride*h;
+ int num_shifts = (2*r+1)*(2*r+1);
+ int mem_size = sizeof(float)*shift_stride*num_shifts;
- CUdeviceptr difference = task->reconstruction_state.temporary_1_ptr;
- CUdeviceptr blurDifference = task->reconstruction_state.temporary_2_ptr;
+ CUdeviceptr temporary_mem;
+ cuda_assert(cuMemAlloc(&temporary_mem, 2*mem_size));
+ CUdeviceptr difference = temporary_mem;
+ CUdeviceptr blurDifference = temporary_mem + mem_size;
- int r = task->radius;
- int f = 4;
- float a = 1.0f;
- for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
- int dy = i / (2*r+1) - r;
- int dx = i % (2*r+1) - r;
-
- int local_rect[4] = {max(0, -dx), max(0, -dy),
- task->reconstruction_state.source_w - max(0, dx),
- task->reconstruction_state.source_h - max(0, dy)};
-
- void *calc_difference_args[] = {&dx, &dy,
- &color_ptr,
- &color_variance_ptr,
- &difference,
- &local_rect,
- &task->buffer.w,
- &task->buffer.pass_stride,
- &a,
- &task->nlm_k_2};
- CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
-
- void *blur_args[] = {&difference,
- &blurDifference,
- &local_rect,
- &task->buffer.w,
- &f};
- CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
-
- void *calc_weight_args[] = {&blurDifference,
- &difference,
- &local_rect,
- &task->buffer.w,
- &f};
- CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
-
- /* Reuse previous arguments. */
- CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
-
- void *construct_gramian_args[] = {&dx, &dy,
- &blurDifference,
+ {
+ CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+ cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+ cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
+
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
+
+ CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
+ task->reconstruction_state.source_w * task->reconstruction_state.source_h,
+ num_shifts);
+
+ void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &shift_stride, &r, &task->buffer.pass_stride, &a, &k_2};
+ void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &shift_stride, &r, &f};
+ void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &shift_stride, &r, &f};
+ void *construct_gramian_args[] = {&blurDifference,
&task->buffer.mem.device_pointer,
&task->storage.transform.device_pointer,
&task->storage.rank.device_pointer,
&task->storage.XtWX.device_pointer,
&task->storage.XtWY.device_pointer,
- &local_rect,
- &task->reconstruction_state.filter_rect,
- &task->buffer.w,
- &task->buffer.h,
+ &task->reconstruction_state.filter_window,
+ &w, &h, &stride,
+ &shift_stride, &r,
&f,
&task->buffer.pass_stride};
- CUDA_LAUNCH_KERNEL(cuNLMConstructGramian, construct_gramian_args);
- }
-
- void *finalize_args[] = {&task->buffer.w,
- &task->buffer.h,
- &output_ptr,
- &task->storage.rank.device_pointer,
- &task->storage.XtWX.device_pointer,
- &task->storage.XtWY.device_pointer,
- &task->filter_area,
- &task->reconstruction_state.buffer_params.x,
- &task->render_buffer.samples};
- CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
+
+ CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
+ }
+
+ cuMemFree(temporary_mem);
+
+ {
+ CUfunction cuFinalize;
+ cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
+ cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
+ void *finalize_args[] = {&output_ptr,
+ &task->storage.rank.device_pointer,
+ &task->storage.XtWX.device_pointer,
+ &task->storage.XtWY.device_pointer,
+ &task->filter_area,
+ &task->reconstruction_state.buffer_params.x,
+ &task->render_buffer.samples};
+ CUDA_GET_BLOCKSIZE(cuFinalize,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+ CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
+ }
+
cuda_assert(cuCtxSynchronize());
return !have_error();
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
index 69c43e4a8cf..1862deb9a61 100644
--- a/intern/cycles/device/device_denoising.cpp
+++ b/intern/cycles/device/device_denoising.cpp
@@ -57,10 +57,9 @@ void DenoisingTask::init_from_devicetask(const DeviceTask &task)
render_buffer.denoising_clean_offset = task.pass_denoising_clean;
/* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */
- rect = make_int4(max(tiles->x[0], filter_area.x - radius),
- max(tiles->y[0], filter_area.y - radius),
- min(tiles->x[3], filter_area.x + filter_area.z + radius),
- min(tiles->y[3], filter_area.y + filter_area.w + radius));
+ rect = rect_from_shape(filter_area.x, filter_area.y, filter_area.z, filter_area.w);
+ rect = rect_expand(rect, radius);
+ rect = rect_clip(rect, make_int4(tiles->x[0], tiles->y[0], tiles->x[3], tiles->y[3]));
}
void DenoisingTask::tiles_from_rendertiles(RenderTile *rtiles)
@@ -93,9 +92,10 @@ bool DenoisingTask::run_denoising()
{
/* Allocate denoising buffer. */
buffer.passes = 14;
- buffer.w = align_up(rect.z - rect.x, 4);
+ buffer.width = rect.z - rect.x;
+ buffer.stride = align_up(buffer.width, 4);
buffer.h = rect.w - rect.y;
- buffer.pass_stride = align_up(buffer.w * buffer.h, divide_up(device->mem_address_alignment(), sizeof(float)));
+ buffer.pass_stride = align_up(buffer.stride * buffer.h, divide_up(device->mem_address_alignment(), sizeof(float)));
buffer.mem.alloc_to_device(buffer.pass_stride * buffer.passes, false);
device_ptr null_ptr = (device_ptr) 0;
@@ -203,15 +203,17 @@ bool DenoisingTask::run_denoising()
functions.construct_transform();
- storage.temporary_1.alloc_to_device(buffer.w*buffer.h, false);
- storage.temporary_2.alloc_to_device(buffer.w*buffer.h, false);
- reconstruction_state.temporary_1_ptr = storage.temporary_1.device_pointer;
- reconstruction_state.temporary_2_ptr = storage.temporary_2.device_pointer;
+ device_only_memory<float> temporary_1(device, "Denoising NLM temporary 1");
+ device_only_memory<float> temporary_2(device, "Denoising NLM temporary 2");
+ temporary_1.alloc_to_device(buffer.pass_stride, false);
+ temporary_2.alloc_to_device(buffer.pass_stride, false);
+ reconstruction_state.temporary_1_ptr = temporary_1.device_pointer;
+ reconstruction_state.temporary_2_ptr = temporary_2.device_pointer;
storage.XtWX.alloc_to_device(storage.w*storage.h*XTWX_SIZE, false);
storage.XtWY.alloc_to_device(storage.w*storage.h*XTWY_SIZE, false);
- reconstruction_state.filter_rect = make_int4(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h);
+ reconstruction_state.filter_window = rect_from_shape(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h);
int tile_coordinate_offset = filter_area.y*render_buffer.stride + filter_area.x;
reconstruction_state.buffer_params = make_int4(render_buffer.offset + tile_coordinate_offset,
render_buffer.stride,
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
index ec4e7933cdc..77a82d0ad04 100644
--- a/intern/cycles/device/device_denoising.h
+++ b/intern/cycles/device/device_denoising.h
@@ -94,7 +94,7 @@ public:
device_ptr temporary_1_ptr; /* There two images are used as temporary storage. */
device_ptr temporary_2_ptr;
- int4 filter_rect;
+ int4 filter_window;
int4 buffer_params;
int source_w;
@@ -148,8 +148,9 @@ public:
struct DenoiseBuffers {
int pass_stride;
int passes;
- int w;
+ int stride;
int h;
+ int width;
device_only_memory<float> mem;
DenoiseBuffers(Device *device)
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index c02f8ffafe6..f38c2f65c1e 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -353,7 +353,9 @@ public:
void tex_free(device_memory& mem);
size_t global_size_round_up(int group_size, int global_size);
- void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size = -1);
+ void enqueue_kernel(cl_kernel kernel, size_t w, size_t h,
+ bool x_workgroups = false,
+ size_t max_workgroup_size = -1);
void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp
index f43177247ef..fe084edc90e 100644
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -560,7 +560,7 @@ size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size)
return global_size + ((r == 0)? 0: group_size - r);
}
-void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size)
+void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size)
{
size_t workgroup_size, max_work_items[3];
@@ -574,8 +574,15 @@ void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size
}
/* Try to divide evenly over 2 dimensions. */
- size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
- size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
+ size_t local_size[2];
+ if(x_workgroups) {
+ local_size[0] = workgroup_size;
+ local_size[1] = 1;
+ }
+ else {
+ size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
+ local_size[0] = local_size[1] = sqrt_workgroup_size;
+ }
/* Some implementations have max size 1 on 2nd dimension. */
if(local_size[1] > max_work_items[1]) {
@@ -731,17 +738,25 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
device_ptr out_ptr,
DenoisingTask *task)
{
- int4 rect = task->rect;
- int w = rect.z-rect.x;
- int h = rect.w-rect.y;
+
+ int stride = task->buffer.stride;
+ int w = task->buffer.width;
+ int h = task->buffer.h;
int r = task->nlm_state.r;
int f = task->nlm_state.f;
float a = task->nlm_state.a;
float k_2 = task->nlm_state.k_2;
- cl_mem difference = CL_MEM_PTR(task->nlm_state.temporary_1_ptr);
- cl_mem blurDifference = CL_MEM_PTR(task->nlm_state.temporary_2_ptr);
- cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr);
+ int shift_stride = stride*h;
+ int num_shifts = (2*r+1)*(2*r+1);
+ int mem_size = sizeof(float)*shift_stride*num_shifts;
+
+ cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr);
+
+ cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
+ opencl_assert_err(ciErr, "clCreateBuffer denoising_non_local_means");
+ cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
+ opencl_assert_err(ciErr, "clCreateBuffer denoising_non_local_means");
cl_mem image_mem = CL_MEM_PTR(image_ptr);
cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
@@ -757,31 +772,45 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output"));
cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize"));
- for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
- int dy = i / (2*r+1) - r;
- int dx = i % (2*r+1) - r;
- int4 local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
- kernel_set_args(ckNLMCalcDifference, 0,
- dx, dy, guide_mem, variance_mem,
- difference, local_rect, w, 0, a, k_2);
- kernel_set_args(ckNLMBlur, 0,
- difference, blurDifference, local_rect, w, f);
- kernel_set_args(ckNLMCalcWeight, 0,
- blurDifference, difference, local_rect, w, f);
- kernel_set_args(ckNLMUpdateOutput, 0,
- dx, dy, blurDifference, image_mem,
- out_mem, weightAccum, local_rect, w, f);
-
- enqueue_kernel(ckNLMCalcDifference, w, h);
- enqueue_kernel(ckNLMBlur, w, h);
- enqueue_kernel(ckNLMCalcWeight, w, h);
- enqueue_kernel(ckNLMBlur, w, h);
- enqueue_kernel(ckNLMUpdateOutput, w, h);
- }
+ kernel_set_args(ckNLMCalcDifference, 0,
+ guide_mem,
+ variance_mem,
+ difference,
+ w, h, stride,
+ shift_stride,
+ r, 0, a, k_2);
+ kernel_set_args(ckNLMBlur, 0,
+ difference,
+ blurDifference,
+ w, h, stride,
+ shift_stride,
+ r, f);
+ kernel_set_args(ckNLMCalcWeight, 0,
+ blurDifference,
+ difference,
+ w, h, stride,
+ shift_stride,
+ r, f);
+ kernel_set_args(ckNLMUpdateOutput, 0,
+ blurDifference,
+ image_mem,
+ out_mem,
+ weightAccum,
+ w, h, stride,
+ shift_stride,
+ r, f);
+
+ enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true);
+ enqueue_kernel(ckNLMBlur, w*h, num_shifts, true);
+ enqueue_kernel(ckNLMCalcWeight, w*h, num_shifts, true);
+ enqueue_kernel(ckNLMBlur, w*h, num_shifts, true);
+ enqueue_kernel(ckNLMUpdateOutput, w*h, num_shifts, true);
+
+ opencl_assert(clReleaseMemObject(difference));
+ opencl_assert(clReleaseMemObject(blurDifference));
- int4 local_rect = make_int4(0, 0, w, h);
kernel_set_args(ckNLMNormalize, 0,
- out_mem, weightAccum, local_rect, w);
+ out_mem, weightAccum, w, h, stride);
enqueue_kernel(ckNLMNormalize, w, h);
return true;
@@ -837,81 +866,63 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
- cl_mem difference = CL_MEM_PTR(task->reconstruction_state.temporary_1_ptr);
- cl_mem blurDifference = CL_MEM_PTR(task->reconstruction_state.temporary_2_ptr);
-
- int r = task->radius;
- int f = 4;
- float a = 1.0f;
- for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
- int dy = i / (2*r+1) - r;
- int dx = i % (2*r+1) - r;
-
- int local_rect[4] = {max(0, -dx), max(0, -dy),
- task->reconstruction_state.source_w - max(0, dx),
- task->reconstruction_state.source_h - max(0, dy)};
-
- kernel_set_args(ckNLMCalcDifference, 0,
- dx, dy,
- color_mem,
- color_variance_mem,
- difference,
- local_rect,
- task->buffer.w,
- task->buffer.pass_stride,
- a, task->nlm_k_2);
- enqueue_kernel(ckNLMCalcDifference,
- task->reconstruction_state.source_w,
- task->reconstruction_state.source_h);
-
- kernel_set_args(ckNLMBlur, 0,
- difference,
- blurDifference,
- local_rect,
- task->buffer.w,
- f);
- enqueue_kernel(ckNLMBlur,
- task->reconstruction_state.source_w,
- task->reconstruction_state.source_h);
-
- kernel_set_args(ckNLMCalcWeight, 0,
- blurDifference,
- difference,
- local_rect,
- task->buffer.w,
- f);
- enqueue_kernel(ckNLMCalcWeight,
- task->reconstruction_state.source_w,
- task->reconstruction_state.source_h);
-
- /* Reuse previous arguments. */
- enqueue_kernel(ckNLMBlur,
- task->reconstruction_state.source_w,
- task->reconstruction_state.source_h);
-
- kernel_set_args(ckNLMConstructGramian, 0,
- dx, dy,
- blurDifference,
- buffer_mem,
- transform_mem,
- rank_mem,
- XtWX_mem,
- XtWY_mem,
- local_rect,
- task->reconstruction_state.filter_rect,
- task->buffer.w,
- task->buffer.h,
- f,
- task->buffer.pass_stride);
- enqueue_kernel(ckNLMConstructGramian,
- task->reconstruction_state.source_w,
- task->reconstruction_state.source_h,
- 256);
- }
+ int w = task->reconstruction_state.source_w;
+ int h = task->reconstruction_state.source_h;
+ int stride = task->buffer.stride;
+
+ int shift_stride = stride*h;
+ int num_shifts = (2*task->radius + 1)*(2*task->radius + 1);
+ int mem_size = sizeof(float)*shift_stride*num_shifts;
+
+ cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
+ opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct");
+ cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
+ opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct");
+
+ kernel_set_args(ckNLMCalcDifference, 0,
+ color_mem,
+ color_variance_mem,
+ difference,
+ w, h, stride,
+ shift_stride,
+ task->radius,
+ task->buffer.pass_stride,
+ 1.0f, task->nlm_k_2);
+ kernel_set_args(ckNLMBlur, 0,
+ difference,
+ blurDifference,
+ w, h, stride,
+ shift_stride,
+ task->radius, 4);
+ kernel_set_args(ckNLMCalcWeight, 0,
+ blurDifference,
+ difference,
+ w, h, stride,
+ shift_stride,
+ task->radius, 4);
+ kernel_set_args(ckNLMConstructGramian, 0,
+ blurDifference,
+ buffer_mem,
+ transform_mem,
+ rank_mem,
+ XtWX_mem,
+ XtWY_mem,
+ task->reconstruction_state.filter_window,
+ w, h, stride,
+ shift_stride,
+ task->radius, 4,
+ task->buffer.pass_stride);
+
+ enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true);
+ enqueue_kernel(ckNLMBlur, w*h, num_shifts, true);
+ enqueue_kernel(ckNLMCalcWeight, w*h, num_shifts, true);
+ enqueue_kernel(ckNLMBlur, w*h, num_shifts, true);
+ enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256);
+
+ opencl_assert(clReleaseMemObject(difference));
+ opencl_assert(clReleaseMemObject(blurDifference));
kernel_set_args(ckFinalize, 0,
- task->buffer.w,
- task->buffer.h,
output_mem,
rank_mem,
XtWX_mem,
@@ -919,9 +930,7 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
task->filter_area,
task->reconstruction_state.buffer_params,
task->render_buffer.samples);
- enqueue_kernel(ckFinalize,
- task->reconstruction_state.source_w,
- task->reconstruction_state.source_h);
+ enqueue_kernel(ckFinalize, w, h);
return true;
}