diff options
author | Campbell Barton <ideasman42@gmail.com> | 2018-07-05 08:54:47 +0300 |
---|---|---|
committer | Campbell Barton <ideasman42@gmail.com> | 2018-07-05 08:54:47 +0300 |
commit | 49b86bcfec6885952458b3791cfc599463f02a35 (patch) | |
tree | 902cce1e99b2f5182c3a87640d36a310748dc62b /intern | |
parent | 53c63db2ee1bd544384840915c7f562819a7dbbc (diff) | |
parent | cd17b3258327522b8c6f56a3ee7239a91f2be149 (diff) |
Merge branch 'master' into blender2.8
Diffstat (limited to 'intern')
19 files changed, 352 insertions, 358 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 6be60f8bbb6..be0dcc20755 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -179,8 +179,8 @@ public: KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel; KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)> shader_kernel; - KernelFunctions<void(*)(int, TilesInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_divide_shadow_kernel; - KernelFunctions<void(*)(int, TilesInfo*, int, int, int, int, float*, float*, int*, int, int)> filter_get_feature_kernel; + KernelFunctions<void(*)(int, TileInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_divide_shadow_kernel; + KernelFunctions<void(*)(int, TileInfo*, int, int, int, int, float*, float*, int*, int, int)> filter_get_feature_kernel; KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_detect_outliers_kernel; KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel; @@ -459,18 +459,6 @@ public: } }; - bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task) - { - TilesInfo *tiles = (TilesInfo*) task->tiles_mem.host_pointer; - for(int i = 0; i < 9; i++) { - tiles->buffers[i] = buffers[i]; - } - - task->tiles_mem.copy_to_device(); - - return true; - } - bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr, DenoisingTask *task) { @@ -626,7 +614,7 @@ public: for(int y = task->rect.y; y < task->rect.w; y++) { for(int x = task->rect.x; x < task->rect.z; x++) { filter_divide_shadow_kernel()(task->render_buffer.samples, - task->tiles, + task->tile_info, x, y, (float*) a_ptr, (float*) b_ptr, @@ -635,7 +623,7 @@ public: (float*) buffer_variance_ptr, &task->rect.x, task->render_buffer.pass_stride, - task->render_buffer.denoising_data_offset); + task->render_buffer.offset); } } return true; @@ -650,7 +638,7 @@ public: for(int y = task->rect.y; y < task->rect.w; y++) { for(int x = task->rect.x; x < task->rect.z; x++) { filter_get_feature_kernel()(task->render_buffer.samples, - task->tiles, + task->tile_info, mean_offset, variance_offset, x, y, @@ -658,7 +646,7 @@ public: (float*) variance_ptr, &task->rect.x, task->render_buffer.pass_stride, - task->render_buffer.denoising_data_offset); + task->render_buffer.offset); } } return true; @@ -711,7 +699,7 @@ public: } } - void denoise(DeviceTask &task, DenoisingTask& denoising, RenderTile &tile) + void denoise(DenoisingTask& denoising, RenderTile &tile) { tile.sample = tile.start_sample + tile.num_samples; @@ -722,23 +710,11 @@ public: denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising); denoising.functions.detect_outliers = function_bind(&CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); - denoising.functions.set_tiles = function_bind(&CPUDevice::denoising_set_tiles, this, _1, &denoising); denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h); denoising.render_buffer.samples = tile.sample; - RenderTile rtiles[9]; - rtiles[4] = tile; - task.map_neighbor_tiles(rtiles, this); - denoising.tiles_from_rendertiles(rtiles); - - denoising.init_from_devicetask(task); - - denoising.run_denoising(); - - task.unmap_neighbor_tiles(rtiles, this); - - task.update_progress(&tile, tile.w*tile.h); + denoising.run_denoising(&tile); } void thread_render(DeviceTask& task) @@ -766,7 +742,7 @@ public: } RenderTile tile; - DenoisingTask denoising(this); + DenoisingTask denoising(this, task); while(task.acquire_tile(this, tile)) { if(tile.task == RenderTile::PATH_TRACE) { @@ -779,7 +755,9 @@ public: } } else if(tile.task == RenderTile::DENOISE) { - denoise(task, denoising, tile); + denoise(denoising, tile); + + task.update_progress(&tile, tile.w*tile.h); } task.release_tile(tile); diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 656ad07d657..209fe84b789 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -1251,18 +1251,6 @@ public: } } - bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task) - { - TilesInfo *tiles = (TilesInfo*) task->tiles_mem.host_pointer; - for(int i = 0; i < 9; i++) { - tiles->buffers[i] = buffers[i]; - } - - task->tiles_mem.copy_to_device(); - - return !have_error(); - } - #define CUDA_GET_BLOCKSIZE(func, w, h) \ int threads_per_block; \ cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ @@ -1534,7 +1522,7 @@ public: task->rect.w-task->rect.y); void *args[] = {&task->render_buffer.samples, - &task->tiles_mem.device_pointer, + &task->tile_info_mem.device_pointer, &a_ptr, &b_ptr, &sample_variance_ptr, @@ -1542,7 +1530,7 @@ public: &buffer_variance_ptr, &task->rect, &task->render_buffer.pass_stride, - &task->render_buffer.denoising_data_offset}; + &task->render_buffer.offset}; CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args); cuda_assert(cuCtxSynchronize()); @@ -1568,14 +1556,14 @@ public: task->rect.w-task->rect.y); void *args[] = {&task->render_buffer.samples, - &task->tiles_mem.device_pointer, + &task->tile_info_mem.device_pointer, &mean_offset, &variance_offset, &mean_ptr, &variance_ptr, &task->rect, &task->render_buffer.pass_stride, - &task->render_buffer.denoising_data_offset}; + &task->render_buffer.offset}; CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args); cuda_assert(cuCtxSynchronize()); @@ -1613,7 +1601,7 @@ public: return !have_error(); } - void denoise(RenderTile &rtile, DenoisingTask& denoising, const DeviceTask &task) + void denoise(RenderTile &rtile, DenoisingTask& denoising) { denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising); denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, &denoising); @@ -1622,21 +1610,11 @@ public: denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising); denoising.functions.detect_outliers = function_bind(&CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); - denoising.functions.set_tiles = function_bind(&CUDADevice::denoising_set_tiles, this, _1, &denoising); denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); denoising.render_buffer.samples = rtile.sample; - RenderTile rtiles[9]; - rtiles[4] = rtile; - task.map_neighbor_tiles(rtiles, this); - denoising.tiles_from_rendertiles(rtiles); - - denoising.init_from_devicetask(task); - - denoising.run_denoising(); - - task.unmap_neighbor_tiles(rtiles, this); + denoising.run_denoising(&rtile); } void path_trace(DeviceTask& task, RenderTile& rtile, device_vector<WorkTile>& work_tiles) @@ -2092,7 +2070,7 @@ public: /* keep rendering tiles until done */ RenderTile tile; - DenoisingTask denoising(this); + DenoisingTask denoising(this, *task); while(task->acquire_tile(this, tile)) { if(tile.task == RenderTile::PATH_TRACE) { @@ -2107,7 +2085,7 @@ public: else if(tile.task == RenderTile::DENOISE) { tile.sample = tile.start_sample + tile.num_samples; - denoise(tile, denoising, *task); + denoise(tile, denoising); task->update_progress(&tile, tile.w*tile.h); } diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp index 644cf6cd10e..c0d4634262d 100644 --- a/intern/cycles/device/device_denoising.cpp +++ b/intern/cycles/device/device_denoising.cpp @@ -20,12 +20,29 @@ CCL_NAMESPACE_BEGIN -DenoisingTask::DenoisingTask(Device *device) -: tiles_mem(device, "denoising tiles_mem", MEM_READ_WRITE), +DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task) +: tile_info_mem(device, "denoising tile info mem", MEM_READ_WRITE), storage(device), buffer(device), device(device) { + radius = task.denoising_radius; + nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising_strength)); + if(task.denoising_relative_pca) { + pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising_feature_strength)); + } + else { + pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising_feature_strength)); + } + + render_buffer.pass_stride = task.pass_stride; + render_buffer.offset = task.pass_denoising_data; + + target_buffer.pass_stride = task.pass_stride; + target_buffer.denoising_clean_offset = task.pass_denoising_clean; + + functions.map_neighbor_tiles = function_bind(task.map_neighbor_tiles, _1, device); + functions.unmap_neighbor_tiles = function_bind(task.unmap_neighbor_tiles, _1, device); } DenoisingTask::~DenoisingTask() @@ -38,170 +55,170 @@ DenoisingTask::~DenoisingTask() storage.temporary_2.free(); storage.temporary_color.free(); buffer.mem.free(); - tiles_mem.free(); -} - -void DenoisingTask::init_from_devicetask(const DeviceTask &task) -{ - radius = task.denoising_radius; - nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising_strength)); - if(task.denoising_relative_pca) { - pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising_feature_strength)); - } - else { - pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising_feature_strength)); - } - - render_buffer.pass_stride = task.pass_stride; - render_buffer.denoising_data_offset = task.pass_denoising_data; - render_buffer.denoising_clean_offset = task.pass_denoising_clean; - - /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */ - rect = rect_from_shape(filter_area.x, filter_area.y, filter_area.z, filter_area.w); - rect = rect_expand(rect, radius); - rect = rect_clip(rect, make_int4(tiles->x[0], tiles->y[0], tiles->x[3], tiles->y[3])); + tile_info_mem.free(); } -void DenoisingTask::tiles_from_rendertiles(RenderTile *rtiles) +void DenoisingTask::set_render_buffer(RenderTile *rtiles) { - tiles = (TilesInfo*) tiles_mem.alloc(sizeof(TilesInfo)/sizeof(int)); + tile_info = (TileInfo*) tile_info_mem.alloc(sizeof(TileInfo)/sizeof(int)); - device_ptr buffers[9]; for(int i = 0; i < 9; i++) { - buffers[i] = rtiles[i].buffer; - tiles->offsets[i] = rtiles[i].offset; - tiles->strides[i] = rtiles[i].stride; + tile_info->offsets[i] = rtiles[i].offset; + tile_info->strides[i] = rtiles[i].stride; + tile_info->buffers[i] = rtiles[i].buffer; } - tiles->x[0] = rtiles[3].x; - tiles->x[1] = rtiles[4].x; - tiles->x[2] = rtiles[5].x; - tiles->x[3] = rtiles[5].x + rtiles[5].w; - tiles->y[0] = rtiles[1].y; - tiles->y[1] = rtiles[4].y; - tiles->y[2] = rtiles[7].y; - tiles->y[3] = rtiles[7].y + rtiles[7].h; - - render_buffer.offset = rtiles[4].offset; - render_buffer.stride = rtiles[4].stride; - render_buffer.ptr = rtiles[4].buffer; - - functions.set_tiles(buffers); + tile_info->x[0] = rtiles[3].x; + tile_info->x[1] = rtiles[4].x; + tile_info->x[2] = rtiles[5].x; + tile_info->x[3] = rtiles[5].x + rtiles[5].w; + tile_info->y[0] = rtiles[1].y; + tile_info->y[1] = rtiles[4].y; + tile_info->y[2] = rtiles[7].y; + tile_info->y[3] = rtiles[7].y + rtiles[7].h; + + target_buffer.offset = rtiles[9].offset; + target_buffer.stride = rtiles[9].stride; + target_buffer.ptr = rtiles[9].buffer; + + tile_info_mem.copy_to_device(); } -bool DenoisingTask::run_denoising() +void DenoisingTask::setup_denoising_buffer() { - /* Allocate denoising buffer. */ + /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */ + rect = rect_from_shape(filter_area.x, filter_area.y, filter_area.z, filter_area.w); + rect = rect_expand(rect, radius); + rect = rect_clip(rect, make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3])); + buffer.passes = 14; buffer.width = rect.z - rect.x; buffer.stride = align_up(buffer.width, 4); buffer.h = rect.w - rect.y; - buffer.pass_stride = align_up(buffer.stride * buffer.h, divide_up(device->mem_sub_ptr_alignment(), sizeof(float))); - buffer.mem.alloc_to_device(buffer.pass_stride * buffer.passes, false); + int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float)); + buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats); + /* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */ + int mem_size = align_up(buffer.pass_stride * buffer.passes + 4, alignment_floats); + buffer.mem.alloc_to_device(mem_size, false); +} +void DenoisingTask::prefilter_shadowing() +{ device_ptr null_ptr = (device_ptr) 0; - /* Prefilter shadow feature. */ - { - device_sub_ptr unfiltered_a (buffer.mem, 0, buffer.pass_stride); - device_sub_ptr unfiltered_b (buffer.mem, 1*buffer.pass_stride, buffer.pass_stride); - device_sub_ptr sample_var (buffer.mem, 2*buffer.pass_stride, buffer.pass_stride); - device_sub_ptr sample_var_var (buffer.mem, 3*buffer.pass_stride, buffer.pass_stride); - device_sub_ptr buffer_var (buffer.mem, 5*buffer.pass_stride, buffer.pass_stride); - device_sub_ptr filtered_var (buffer.mem, 6*buffer.pass_stride, buffer.pass_stride); - device_sub_ptr nlm_temporary_1(buffer.mem, 7*buffer.pass_stride, buffer.pass_stride); - device_sub_ptr nlm_temporary_2(buffer.mem, 8*buffer.pass_stride, buffer.pass_stride); - device_sub_ptr nlm_temporary_3(buffer.mem, 9*buffer.pass_stride, buffer.pass_stride); - - nlm_state.temporary_1_ptr = *nlm_temporary_1; - nlm_state.temporary_2_ptr = *nlm_temporary_2; - nlm_state.temporary_3_ptr = *nlm_temporary_3; - - /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */ - functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var); - - /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */ - nlm_state.set_parameters(6, 3, 4.0f, 1.0f); - functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var); - - /* Reuse memory, the previous data isn't needed anymore. */ - device_ptr filtered_a = *buffer_var, - filtered_b = *sample_var; - /* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */ - nlm_state.set_parameters(5, 3, 1.0f, 0.25f); - functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a); - functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b); - - device_ptr residual_var = *sample_var_var; - /* Estimate the residual variance between the two filtered halves. */ - functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect); - - device_ptr final_a = *unfiltered_a, - final_b = *unfiltered_b; - /* Use the residual variance for a second filter pass. */ - nlm_state.set_parameters(4, 2, 1.0f, 0.5f); - functions.non_local_means(filtered_a, filtered_b, residual_var, final_a); - functions.non_local_means(filtered_b, filtered_a, residual_var, final_b); - - /* Combine the two double-filtered halves to a final shadow feature. */ - device_sub_ptr shadow_pass(buffer.mem, 4*buffer.pass_stride, buffer.pass_stride); - functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect); - } + device_sub_ptr unfiltered_a (buffer.mem, 0, buffer.pass_stride); + device_sub_ptr unfiltered_b (buffer.mem, 1*buffer.pass_stride, buffer.pass_stride); + device_sub_ptr sample_var (buffer.mem, 2*buffer.pass_stride, buffer.pass_stride); + device_sub_ptr sample_var_var (buffer.mem, 3*buffer.pass_stride, buffer.pass_stride); + device_sub_ptr buffer_var (buffer.mem, 5*buffer.pass_stride, buffer.pass_stride); + device_sub_ptr filtered_var (buffer.mem, 6*buffer.pass_stride, buffer.pass_stride); + device_sub_ptr nlm_temporary_1(buffer.mem, 7*buffer.pass_stride, buffer.pass_stride); + device_sub_ptr nlm_temporary_2(buffer.mem, 8*buffer.pass_stride, buffer.pass_stride); + device_sub_ptr nlm_temporary_3(buffer.mem, 9*buffer.pass_stride, buffer.pass_stride); + + nlm_state.temporary_1_ptr = *nlm_temporary_1; + nlm_state.temporary_2_ptr = *nlm_temporary_2; + nlm_state.temporary_3_ptr = *nlm_temporary_3; + + /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */ + functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var); + + /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */ + nlm_state.set_parameters(6, 3, 4.0f, 1.0f); + functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var); + + /* Reuse memory, the previous data isn't needed anymore. */ + device_ptr filtered_a = *buffer_var, + filtered_b = *sample_var; + /* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */ + nlm_state.set_parameters(5, 3, 1.0f, 0.25f); + functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a); + functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b); + + device_ptr residual_var = *sample_var_var; + /* Estimate the residual variance between the two filtered halves. */ + functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect); + + device_ptr final_a = *unfiltered_a, + final_b = *unfiltered_b; + /* Use the residual variance for a second filter pass. */ + nlm_state.set_parameters(4, 2, 1.0f, 0.5f); + functions.non_local_means(filtered_a, filtered_b, residual_var, final_a); + functions.non_local_means(filtered_b, filtered_a, residual_var, final_b); + + /* Combine the two double-filtered halves to a final shadow feature. */ + device_sub_ptr shadow_pass(buffer.mem, 4*buffer.pass_stride, buffer.pass_stride); + functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect); +} - /* Prefilter general features. */ - { - device_sub_ptr unfiltered (buffer.mem, 8*buffer.pass_stride, buffer.pass_stride); - device_sub_ptr variance (buffer.mem, 9*buffer.pass_stride, buffer.pass_stride); - device_sub_ptr nlm_temporary_1(buffer.mem, 10*buffer.pass_stride, buffer.pass_stride); - device_sub_ptr nlm_temporary_2(buffer.mem, 11*buffer.pass_stride, buffer.pass_stride); - device_sub_ptr nlm_temporary_3(buffer.mem, 12*buffer.pass_stride, buffer.pass_stride); - - nlm_state.temporary_1_ptr = *nlm_temporary_1; - nlm_state.temporary_2_ptr = *nlm_temporary_2; - nlm_state.temporary_3_ptr = *nlm_temporary_3; - - int mean_from[] = { 0, 1, 2, 12, 6, 7, 8 }; - int variance_from[] = { 3, 4, 5, 13, 9, 10, 11}; - int pass_to[] = { 1, 2, 3, 0, 5, 6, 7}; - for(int pass = 0; pass < 7; pass++) { - device_sub_ptr feature_pass(buffer.mem, pass_to[pass]*buffer.pass_stride, buffer.pass_stride); - /* Get the unfiltered pass and its variance from the RenderBuffers. */ - functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance); - /* Smooth the pass and store the result in the denoising buffers. */ - nlm_state.set_parameters(2, 2, 1.0f, 0.25f); - functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass); - } +void DenoisingTask::prefilter_features() +{ + device_sub_ptr unfiltered (buffer.mem, 8*buffer.pass_stride, buffer.pass_stride); + device_sub_ptr variance (buffer.mem, 9*buffer.pass_stride, buffer.pass_stride); + device_sub_ptr nlm_temporary_1(buffer.mem, 10*buffer.pass_stride, buffer.pass_stride); + device_sub_ptr nlm_temporary_2(buffer.mem, 11*buffer.pass_stride, buffer.pass_stride); + device_sub_ptr nlm_temporary_3(buffer.mem, 12*buffer.pass_stride, buffer.pass_stride); + + nlm_state.temporary_1_ptr = *nlm_temporary_1; + nlm_state.temporary_2_ptr = *nlm_temporary_2; + nlm_state.temporary_3_ptr = *nlm_temporary_3; + + int mean_from[] = { 0, 1, 2, 12, 6, 7, 8 }; + int variance_from[] = { 3, 4, 5, 13, 9, 10, 11}; + int pass_to[] = { 1, 2, 3, 0, 5, 6, 7}; + for(int pass = 0; pass < 7; pass++) { + device_sub_ptr feature_pass(buffer.mem, pass_to[pass]*buffer.pass_stride, buffer.pass_stride); + /* Get the unfiltered pass and its variance from the RenderBuffers. */ + functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance); + /* Smooth the pass and store the result in the denoising buffers. */ + nlm_state.set_parameters(2, 2, 1.0f, 0.25f); + functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass); } +} - /* Copy color passes. */ - { - int mean_from[] = {20, 21, 22}; - int variance_from[] = {23, 24, 25}; - int mean_to[] = { 8, 9, 10}; - int variance_to[] = {11, 12, 13}; - int num_color_passes = 3; - - storage.temporary_color.alloc_to_device(3*buffer.pass_stride, false); - - for(int pass = 0; pass < num_color_passes; pass++) { - device_sub_ptr color_pass(storage.temporary_color, pass*buffer.pass_stride, buffer.pass_stride); - device_sub_ptr color_var_pass(buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride); - functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass); - } - - { - device_sub_ptr depth_pass (buffer.mem, 0, buffer.pass_stride); - device_sub_ptr color_var_pass(buffer.mem, variance_to[0]*buffer.pass_stride, 3*buffer.pass_stride); - device_sub_ptr output_pass (buffer.mem, mean_to[0]*buffer.pass_stride, 3*buffer.pass_stride); - functions.detect_outliers(storage.temporary_color.device_pointer, *color_var_pass, *depth_pass, *output_pass); - } +void DenoisingTask::prefilter_color() +{ + int mean_from[] = {20, 21, 22}; + int variance_from[] = {23, 24, 25}; + int mean_to[] = { 8, 9, 10}; + int variance_to[] = {11, 12, 13}; + int num_color_passes = 3; + + storage.temporary_color.alloc_to_device(3*buffer.pass_stride, false); + device_sub_ptr nlm_temporary_1(storage.temporary_color, 0*buffer.pass_stride, buffer.pass_stride); + device_sub_ptr nlm_temporary_2(storage.temporary_color, 1*buffer.pass_stride, buffer.pass_stride); + device_sub_ptr nlm_temporary_3(storage.temporary_color, 2*buffer.pass_stride, buffer.pass_stride); + + nlm_state.temporary_1_ptr = *nlm_temporary_1; + nlm_state.temporary_2_ptr = *nlm_temporary_2; + nlm_state.temporary_3_ptr = *nlm_temporary_3; + + for(int pass = 0; pass < num_color_passes; pass++) { + device_sub_ptr color_pass(storage.temporary_color, pass*buffer.pass_stride, buffer.pass_stride); + device_sub_ptr color_var_pass(buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride); + functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass); } + device_sub_ptr depth_pass (buffer.mem, 0, buffer.pass_stride); + device_sub_ptr color_var_pass(buffer.mem, variance_to[0]*buffer.pass_stride, 3*buffer.pass_stride); + device_sub_ptr output_pass (buffer.mem, mean_to[0]*buffer.pass_stride, 3*buffer.pass_stride); + functions.detect_outliers(storage.temporary_color.device_pointer, *color_var_pass, *depth_pass, *output_pass); + + storage.temporary_color.free(); +} + +void DenoisingTask::construct_transform() +{ storage.w = filter_area.z; storage.h = filter_area.w; + storage.transform.alloc_to_device(storage.w*storage.h*TRANSFORM_SIZE, false); storage.rank.alloc_to_device(storage.w*storage.h, false); functions.construct_transform(); +} + +void DenoisingTask::reconstruct() +{ device_only_memory<float> temporary_1(device, "Denoising NLM temporary 1"); device_only_memory<float> temporary_2(device, "Denoising NLM temporary 2"); @@ -214,21 +231,36 @@ bool DenoisingTask::run_denoising() storage.XtWY.alloc_to_device(storage.w*storage.h*XTWY_SIZE, false); reconstruction_state.filter_window = rect_from_shape(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h); - int tile_coordinate_offset = filter_area.y*render_buffer.stride + filter_area.x; - reconstruction_state.buffer_params = make_int4(render_buffer.offset + tile_coordinate_offset, - render_buffer.stride, - render_buffer.pass_stride, - render_buffer.denoising_clean_offset); + int tile_coordinate_offset = filter_area.y*target_buffer.stride + filter_area.x; + reconstruction_state.buffer_params = make_int4(target_buffer.offset + tile_coordinate_offset, + target_buffer.stride, + target_buffer.pass_stride, + target_buffer.denoising_clean_offset); reconstruction_state.source_w = rect.z-rect.x; reconstruction_state.source_h = rect.w-rect.y; - { - device_sub_ptr color_ptr (buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride); - device_sub_ptr color_var_ptr(buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride); - functions.reconstruct(*color_ptr, *color_var_ptr, render_buffer.ptr); - } + device_sub_ptr color_ptr (buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride); + device_sub_ptr color_var_ptr(buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride); + functions.reconstruct(*color_ptr, *color_var_ptr, target_buffer.ptr); +} + +void DenoisingTask::run_denoising(RenderTile *tile) +{ + RenderTile rtiles[10]; + rtiles[4] = *tile; + functions.map_neighbor_tiles(rtiles); + set_render_buffer(rtiles); + + setup_denoising_buffer(); + + prefilter_shadowing(); + prefilter_features(); + prefilter_color(); + + construct_transform(); + reconstruct(); - return true; + functions.unmap_neighbor_tiles(rtiles); } CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h index 77a82d0ad04..e0da52867f1 100644 --- a/intern/cycles/device/device_denoising.h +++ b/intern/cycles/device/device_denoising.h @@ -32,20 +32,24 @@ public: float nlm_k_2; float pca_threshold; - /* Pointer and parameters of the RenderBuffers. */ + /* Parameters of the RenderBuffers. */ struct RenderBuffers { - int denoising_data_offset; - int denoising_clean_offset; + int offset; int pass_stride; + int samples; + } render_buffer; + + /* Pointer and parameters of the target buffer. */ + struct TargetBuffer { int offset; int stride; + int pass_stride; + int denoising_clean_offset; device_ptr ptr; - int samples; - } render_buffer; + } target_buffer; - TilesInfo *tiles; - device_vector<int> tiles_mem; - void tiles_from_rendertiles(RenderTile *rtiles); + TileInfo *tile_info; + device_vector<int> tile_info_mem; int4 rect; int4 filter_area; @@ -85,7 +89,8 @@ public: device_ptr depth_ptr, device_ptr output_ptr )> detect_outliers; - function<bool(device_ptr*)> set_tiles; + function<void(RenderTile *rtiles)> map_neighbor_tiles; + function<void(RenderTile *rtiles)> unmap_neighbor_tiles; } functions; /* Stores state of the current Reconstruction operation, @@ -138,12 +143,10 @@ public: {} } storage; - DenoisingTask(Device *device); + DenoisingTask(Device *device, const DeviceTask &task); ~DenoisingTask(); - void init_from_devicetask(const DeviceTask &task); - - bool run_denoising(); + void run_denoising(RenderTile *tile); struct DenoiseBuffers { int pass_stride; @@ -160,6 +163,14 @@ public: protected: Device *device; + + void set_render_buffer(RenderTile *rtiles); + void setup_denoising_buffer(); + void prefilter_shadowing(); + void prefilter_features(); + void prefilter_color(); + void construct_transform(); + void reconstruct(); }; CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp index c6248fcf88b..482af685011 100644 --- a/intern/cycles/device/device_memory.cpp +++ b/intern/cycles/device/device_memory.cpp @@ -104,6 +104,26 @@ void device_memory::device_zero() } } +void device_memory::swap_device(Device *new_device, + size_t new_device_size, + device_ptr new_device_ptr) +{ + original_device = device; + original_device_size = device_size; + original_device_ptr = device_pointer; + + device = new_device; + device_size = new_device_size; + device_pointer = new_device_ptr; +} + +void device_memory::restore_device() +{ + device = original_device; + device_size = original_device_size; + device_pointer = original_device_ptr; +} + /* Device Sub Ptr */ device_sub_ptr::device_sub_ptr(device_memory& mem, int offset, int size) diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h index d8fe41e78bb..1138964f18c 100644 --- a/intern/cycles/device/device_memory.h +++ b/intern/cycles/device/device_memory.h @@ -200,6 +200,9 @@ public: virtual ~device_memory(); + void swap_device(Device *new_device, size_t new_device_size, device_ptr new_device_ptr); + void restore_device(); + protected: friend class CUDADevice; @@ -222,6 +225,10 @@ protected: void device_copy_to(); void device_copy_from(int y, int w, int h, int elem); void device_zero(); + + device_ptr original_device_ptr; + size_t original_device_size; + Device *original_device; }; /* Device Only Memory diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp index 91507e6be0c..65102c3f20f 100644 --- a/intern/cycles/device/device_multi.cpp +++ b/intern/cycles/device/device_multi.cpp @@ -285,26 +285,27 @@ public: mem.copy_from_device(0, mem.data_size, 1); } - Device *original_device = mem.device; - device_ptr original_ptr = mem.device_pointer; - size_t original_size = mem.device_size; - - mem.device = sub_device; - mem.device_pointer = 0; - mem.device_size = 0; + mem.swap_device(sub_device, 0, 0); mem.copy_to_device(); tiles[i].buffer = mem.device_pointer; + tiles[i].device_size = mem.device_size; - mem.device = original_device; - mem.device_pointer = original_ptr; - mem.device_size = original_size; + mem.restore_device(); } } } void unmap_neighbor_tiles(Device * sub_device, RenderTile * tiles) { + /* Copy denoised result back to the host. */ + device_vector<float> &mem = tiles[9].buffers->buffer; + mem.swap_device(sub_device, tiles[9].device_size, tiles[9].buffer); + mem.copy_from_device(0, mem.data_size, 1); + mem.restore_device(); + /* Copy denoised result to the original device. */ + mem.copy_to_device(); + for(int i = 0; i < 9; i++) { if(!tiles[i].buffers) { continue; @@ -312,28 +313,9 @@ public: device_vector<float> &mem = tiles[i].buffers->buffer; if(mem.device != sub_device) { - Device *original_device = mem.device; - device_ptr original_ptr = mem.device_pointer; - size_t original_size = mem.device_size; - - mem.device = sub_device; - mem.device_pointer = tiles[i].buffer; - - /* Copy denoised tile to the host. */ - if(i == 4) { - mem.copy_from_device(0, mem.data_size, 1); - } - + mem.swap_device(sub_device, tiles[i].device_size, tiles[i].buffer); sub_device->mem_free(mem); - - mem.device = original_device; - mem.device_pointer = original_ptr; - mem.device_size = original_size; - - /* Copy denoised tile to the original device. */ - if(i == 4) { - mem.copy_to_device(); - } + mem.restore_device(); } } } diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h index 85ef14ee29a..22e0503365c 100644 --- a/intern/cycles/device/opencl/opencl.h +++ b/intern/cycles/device/opencl/opencl.h @@ -362,7 +362,7 @@ public: void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half); void shader(DeviceTask& task); - void denoise(RenderTile& tile, DenoisingTask& denoising, const DeviceTask& task); + void denoise(RenderTile& tile, DenoisingTask& denoising); class OpenCLDeviceTask : public DeviceTask { public: @@ -436,8 +436,6 @@ protected: device_ptr depth_ptr, device_ptr output_ptr, DenoisingTask *task); - bool denoising_set_tiles(device_ptr *buffers, - DenoisingTask *task); device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int size); void mem_free_sub_ptr(device_ptr ptr); diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index bfa2702ad62..9a50d217321 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -246,7 +246,6 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea denoising_program.add_kernel(ustring("filter_nlm_normalize")); denoising_program.add_kernel(ustring("filter_nlm_construct_gramian")); denoising_program.add_kernel(ustring("filter_finalize")); - denoising_program.add_kernel(ustring("filter_set_tiles")); vector<OpenCLProgram*> programs; programs.push_back(&base_program); @@ -977,13 +976,20 @@ bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr, cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr); cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr); - cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer); + cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow")); - kernel_set_args(ckFilterDivideShadow, 0, - task->render_buffer.samples, - tiles_mem, + int arg_ofs = kernel_set_args(ckFilterDivideShadow, 0, + task->render_buffer.samples, + tile_info_mem); + cl_mem buffers[9]; + for(int i = 0; i < 9; i++) { + buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); + arg_ofs += kernel_set_args(ckFilterDivideShadow, arg_ofs, + buffers[i]); + } + kernel_set_args(ckFilterDivideShadow, arg_ofs, a_mem, b_mem, sample_variance_mem, @@ -991,7 +997,7 @@ bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr, buffer_variance_mem, task->rect, task->render_buffer.pass_stride, - task->render_buffer.denoising_data_offset); + task->render_buffer.offset); enqueue_kernel(ckFilterDivideShadow, task->rect.z-task->rect.x, task->rect.w-task->rect.y); @@ -1008,20 +1014,27 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset, cl_mem mean_mem = CL_MEM_PTR(mean_ptr); cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer); + cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature")); - kernel_set_args(ckFilterGetFeature, 0, - task->render_buffer.samples, - tiles_mem, + int arg_ofs = kernel_set_args(ckFilterGetFeature, 0, + task->render_buffer.samples, + tile_info_mem); + cl_mem buffers[9]; + for(int i = 0; i < 9; i++) { + buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); + arg_ofs += kernel_set_args(ckFilterGetFeature, arg_ofs, + buffers[i]); + } + kernel_set_args(ckFilterGetFeature, arg_ofs, mean_offset, variance_offset, mean_mem, variance_mem, task->rect, task->render_buffer.pass_stride, - task->render_buffer.denoising_data_offset); + task->render_buffer.offset); enqueue_kernel(ckFilterGetFeature, task->rect.z-task->rect.x, task->rect.w-task->rect.y); @@ -1056,29 +1069,8 @@ bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr, return true; } -bool OpenCLDeviceBase::denoising_set_tiles(device_ptr *buffers, - DenoisingTask *task) -{ - task->tiles_mem.copy_to_device(); - - cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer); - - cl_kernel ckFilterSetTiles = denoising_program(ustring("filter_set_tiles")); - - kernel_set_args(ckFilterSetTiles, 0, tiles_mem); - for(int i = 0; i < 9; i++) { - cl_mem buffer_mem = CL_MEM_PTR(buffers[i]); - kernel_set_args(ckFilterSetTiles, i+1, buffer_mem); - } - - enqueue_kernel(ckFilterSetTiles, 1, 1); - - return true; -} - -void OpenCLDeviceBase::denoise(RenderTile &rtile, DenoisingTask& denoising, const DeviceTask &task) +void OpenCLDeviceBase::denoise(RenderTile &rtile, DenoisingTask& denoising) { - denoising.functions.set_tiles = function_bind(&OpenCLDeviceBase::denoising_set_tiles, this, _1, &denoising); denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising); denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, &denoising); denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); @@ -1090,16 +1082,7 @@ void OpenCLDeviceBase::denoise(RenderTile &rtile, DenoisingTask& denoising, cons denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); denoising.render_buffer.samples = rtile.sample; - RenderTile rtiles[9]; - rtiles[4] = rtile; - task.map_neighbor_tiles(rtiles, this); - denoising.tiles_from_rendertiles(rtiles); - - denoising.init_from_devicetask(task); - - denoising.run_denoising(); - - task.unmap_neighbor_tiles(rtiles, this); + denoising.run_denoising(&rtile); } void OpenCLDeviceBase::shader(DeviceTask& task) diff --git a/intern/cycles/device/opencl/opencl_mega.cpp b/intern/cycles/device/opencl/opencl_mega.cpp index ef39cfb5f7d..e004c0b44f4 100644 --- a/intern/cycles/device/opencl/opencl_mega.cpp +++ b/intern/cycles/device/opencl/opencl_mega.cpp @@ -107,7 +107,7 @@ public: } else if(task->type == DeviceTask::RENDER) { RenderTile tile; - DenoisingTask denoising(this); + DenoisingTask denoising(this, *task); /* Keep rendering tiles until done. */ while(task->acquire_tile(this, tile)) { @@ -141,7 +141,7 @@ public: } else if(tile.task == RenderTile::DENOISE) { tile.sample = tile.start_sample + tile.num_samples; - denoise(tile, denoising, *task); + denoise(tile, denoising); task->update_progress(&tile, tile.w*tile.h); } diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp index 51d3c7bb10f..66a4aa7e891 100644 --- a/intern/cycles/device/opencl/opencl_split.cpp +++ b/intern/cycles/device/opencl/opencl_split.cpp @@ -129,7 +129,7 @@ public: } else if(task->type == DeviceTask::RENDER) { RenderTile tile; - DenoisingTask denoising(this); + DenoisingTask denoising(this, *task); /* Allocate buffer for kernel globals */ device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals"); @@ -159,7 +159,7 @@ public: } else if(tile.task == RenderTile::DENOISE) { tile.sample = tile.start_sample + tile.num_samples; - denoise(tile, denoising, *task); + denoise(tile, denoising); task->update_progress(&tile, tile.w*tile.h); } diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h index ce96f733aff..d48ea3ac1d6 100644 --- a/intern/cycles/kernel/filter/filter_defines.h +++ b/intern/cycles/kernel/filter/filter_defines.h @@ -22,7 +22,7 @@ #define XTWX_SIZE (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2) #define XTWY_SIZE (DENOISE_FEATURES+1) -typedef struct TilesInfo { +typedef struct TileInfo { int offsets[9]; int strides[9]; int x[4]; @@ -33,6 +33,31 @@ typedef struct TilesInfo { #else long long int buffers[9]; #endif -} TilesInfo; +} TileInfo; + +#ifdef __KERNEL_OPENCL__ +# define CCL_FILTER_TILE_INFO ccl_global TileInfo* tile_info, \ + ccl_global float *tile_buffer_1, \ + ccl_global float *tile_buffer_2, \ + ccl_global float *tile_buffer_3, \ + ccl_global float *tile_buffer_4, \ + ccl_global float *tile_buffer_5, \ + ccl_global float *tile_buffer_6, \ + ccl_global float *tile_buffer_7, \ + ccl_global float *tile_buffer_8, \ + ccl_global float *tile_buffer_9 +# define CCL_FILTER_TILE_INFO_ARG tile_info, \ + tile_buffer_1, tile_buffer_2, tile_buffer_3, \ + tile_buffer_4, tile_buffer_5, tile_buffer_6, \ + tile_buffer_7, tile_buffer_8, tile_buffer_9 +# define ccl_get_tile_buffer(id) (tile_buffer_ ## id) +#else +# ifdef __KERNEL_CUDA__ +# define CCL_FILTER_TILE_INFO ccl_global TileInfo* tile_info +# else +# define CCL_FILTER_TILE_INFO TileInfo* tile_info +# endif +# define ccl_get_tile_buffer(id) (tile_info->buffers[id]) +#endif #endif /* __FILTER_DEFINES_H__*/ diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h index 4af209341f6..3507f80df46 100644 --- a/intern/cycles/kernel/filter/filter_prefilter.h +++ b/intern/cycles/kernel/filter/filter_prefilter.h @@ -26,7 +26,7 @@ CCL_NAMESPACE_BEGIN * bufferVariance: The buffer-based variance of the shadow feature. Unbiased, but quite noisy. */ ccl_device void kernel_filter_divide_shadow(int sample, - ccl_global TilesInfo *tiles, + CCL_FILTER_TILE_INFO, int x, int y, ccl_global float *unfilteredA, ccl_global float *unfilteredB, @@ -37,13 +37,13 @@ ccl_device void kernel_filter_divide_shadow(int sample, int buffer_pass_stride, int buffer_denoising_offset) { - int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2); - int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2); + int xtile = (x < tile_info->x[1])? 0: ((x < tile_info->x[2])? 1: 2); + int ytile = (y < tile_info->y[1])? 0: ((y < tile_info->y[2])? 1: 2); int tile = ytile*3+xtile; - int offset = tiles->offsets[tile]; - int stride = tiles->strides[tile]; - const ccl_global float *ccl_restrict center_buffer = (ccl_global float*) tiles->buffers[tile]; + int offset = tile_info->offsets[tile]; + int stride = tile_info->strides[tile]; + const ccl_global float *ccl_restrict center_buffer = (ccl_global float*) ccl_get_tile_buffer(tile); center_buffer += (y*stride + x + offset)*buffer_pass_stride; center_buffer += buffer_denoising_offset + 14; @@ -79,7 +79,7 @@ ccl_device void kernel_filter_divide_shadow(int sample, * - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive). */ ccl_device void kernel_filter_get_feature(int sample, - ccl_global TilesInfo *tiles, + CCL_FILTER_TILE_INFO, int m_offset, int v_offset, int x, int y, ccl_global float *mean, @@ -87,10 +87,10 @@ ccl_device void kernel_filter_get_feature(int sample, int4 rect, int buffer_pass_stride, int buffer_denoising_offset) { - int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2); - int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2); + int xtile = (x < tile_info->x[1])? 0: ((x < tile_info->x[2])? 1: 2); + int ytile = (y < tile_info->y[1])? 0: ((y < tile_info->y[2])? 1: 2); int tile = ytile*3+xtile; - ccl_global float *center_buffer = ((ccl_global float*) tiles->buffers[tile]) + (tiles->offsets[tile] + y*tiles->strides[tile] + x)*buffer_pass_stride + buffer_denoising_offset; + ccl_global float *center_buffer = ((ccl_global float*) ccl_get_tile_buffer(tile)) + (tile_info->offsets[tile] + y*tile_info->strides[tile] + x)*buffer_pass_stride + buffer_denoising_offset; int buffer_w = align_up(rect.z - rect.x, 4); int idx = (y-rect.y)*buffer_w + (x - rect.x); diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h index 4231aba88d7..b62aa9663ec 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h @@ -17,7 +17,7 @@ /* Templated common declaration part of all CPU kernels. */ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, - TilesInfo *tiles, + TileInfo *tile_info, int x, int y, float *unfilteredA, @@ -30,7 +30,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, int buffer_denoising_offset); void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, - TilesInfo *tiles, + TileInfo *tile_info, int m_offset, int v_offset, int x, diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h index 504622ecfd9..26777fdabb2 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h @@ -34,7 +34,7 @@ CCL_NAMESPACE_BEGIN /* Denoise filter */ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, - TilesInfo *tiles, + TileInfo *tile_info, int x, int y, float *unfilteredA, @@ -49,7 +49,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow); #else - kernel_filter_divide_shadow(sample, tiles, + kernel_filter_divide_shadow(sample, tile_info, x, y, unfilteredA, unfilteredB, @@ -63,7 +63,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, } void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, - TilesInfo *tiles, + TileInfo *tile_info, int m_offset, int v_offset, int x, @@ -76,7 +76,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_get_feature); #else - kernel_filter_get_feature(sample, tiles, + kernel_filter_get_feature(sample, tile_info, m_offset, v_offset, x, y, mean, variance, diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu index 035f0484488..0561c40e6b1 100644 --- a/intern/cycles/kernel/kernels/cuda/filter.cu +++ b/intern/cycles/kernel/kernels/cuda/filter.cu @@ -29,7 +29,7 @@ extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_divide_shadow(int sample, - TilesInfo *tiles, + TileInfo *tile_info, float *unfilteredA, float *unfilteredB, float *sampleVariance, @@ -43,7 +43,7 @@ kernel_cuda_filter_divide_shadow(int sample, int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; if(x < prefilter_rect.z && y < prefilter_rect.w) { kernel_filter_divide_shadow(sample, - tiles, + tile_info, x, y, unfilteredA, unfilteredB, @@ -59,7 +59,7 @@ kernel_cuda_filter_divide_shadow(int sample, extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_get_feature(int sample, - TilesInfo *tiles, + TileInfo *tile_info, int m_offset, int v_offset, float *mean, @@ -72,7 +72,7 @@ kernel_cuda_filter_get_feature(int sample, int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; if(x < prefilter_rect.z && y < prefilter_rect.w) { kernel_filter_get_feature(sample, - tiles, + tile_info, m_offset, v_offset, x, y, mean, variance, diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl index 2b77807c38b..3c75754fb39 100644 --- a/intern/cycles/kernel/kernels/opencl/filter.cl +++ b/intern/cycles/kernel/kernels/opencl/filter.cl @@ -23,7 +23,7 @@ /* kernels */ __kernel void kernel_ocl_filter_divide_shadow(int sample, - ccl_global TilesInfo *tiles, + CCL_FILTER_TILE_INFO, ccl_global float *unfilteredA, ccl_global float *unfilteredB, ccl_global float *sampleVariance, @@ -37,7 +37,7 @@ __kernel void kernel_ocl_filter_divide_shadow(int sample, int y = prefilter_rect.y + get_global_id(1); if(x < prefilter_rect.z && y < prefilter_rect.w) { kernel_filter_divide_shadow(sample, - tiles, + CCL_FILTER_TILE_INFO_ARG, x, y, unfilteredA, unfilteredB, @@ -51,7 +51,7 @@ __kernel void kernel_ocl_filter_divide_shadow(int sample, } __kernel void kernel_ocl_filter_get_feature(int sample, - ccl_global TilesInfo *tiles, + CCL_FILTER_TILE_INFO, int m_offset, int v_offset, ccl_global float *mean, @@ -64,7 +64,7 @@ __kernel void kernel_ocl_filter_get_feature(int sample, int y = prefilter_rect.y + get_global_id(1); if(x < prefilter_rect.z && y < prefilter_rect.w) { kernel_filter_get_feature(sample, - tiles, + CCL_FILTER_TILE_INFO_ARG, m_offset, v_offset, x, y, mean, variance, @@ -276,27 +276,3 @@ __kernel void kernel_ocl_filter_finalize(ccl_global float *buffer, buffer_params, sample); } } - -__kernel void kernel_ocl_filter_set_tiles(ccl_global TilesInfo* tiles, - ccl_global float *buffer_1, - ccl_global float *buffer_2, - ccl_global float *buffer_3, - ccl_global float *buffer_4, - ccl_global float *buffer_5, - ccl_global float *buffer_6, - ccl_global float *buffer_7, - ccl_global float *buffer_8, - ccl_global float *buffer_9) -{ - if((get_global_id(0) == 0) && (get_global_id(1) == 0)) { - tiles->buffers[0] = buffer_1; - tiles->buffers[1] = buffer_2; - tiles->buffers[2] = buffer_3; - tiles->buffers[3] = buffer_4; - tiles->buffers[4] = buffer_5; - tiles->buffers[5] = buffer_6; - tiles->buffers[6] = buffer_7; - tiles->buffers[7] = buffer_8; - tiles->buffers[8] = buffer_9; - } -} diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h index dfc98fe2061..ee7c0490dfd 100644 --- a/intern/cycles/render/buffers.h +++ b/intern/cycles/render/buffers.h @@ -137,6 +137,7 @@ public: int tile_index; device_ptr buffer; + int device_size; RenderBuffers *buffers; diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp index bb636dd962a..3106ee53832 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -502,6 +502,9 @@ void Session::map_neighbor_tiles(RenderTile *tiles, Device *tile_device) assert(tiles[4].buffers); device->map_neighbor_tiles(tile_device, tiles); + + /* The denoised result is written back to the original tile. */ + tiles[9] = tiles[4]; } void Session::unmap_neighbor_tiles(RenderTile *tiles, Device *tile_device) |