From 2f6db0e227d8835bc4b2ec5d0e181c5cf29da7dc Mon Sep 17 00:00:00 2001 From: Lukas Stockner Date: Wed, 8 Feb 2017 16:59:38 +0100 Subject: Cycles Denoising: Use device-independent denoising in the CPUDevice --- intern/cycles/device/device_cpu.cpp | 592 ++++++++------------- intern/cycles/device/device_cuda.cpp | 9 +- intern/cycles/filter/filter_features.h | 8 +- intern/cycles/filter/filter_nlm_cpu.h | 12 +- intern/cycles/filter/filter_nlm_gpu.h | 9 +- intern/cycles/filter/filter_prefilter.h | 22 +- intern/cycles/filter/filter_reconstruction.h | 28 +- intern/cycles/filter/kernels/cpu/filter_cpu.h | 8 +- intern/cycles/filter/kernels/cpu/filter_cpu_impl.h | 23 +- intern/cycles/filter/kernels/cuda/filter.cu | 33 +- 10 files changed, 312 insertions(+), 432 deletions(-) diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index ebd4acb1e59..bd5630ae958 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -26,6 +26,7 @@ #include "device.h" #include "device_intern.h" +#include "device_denoising.h" #include "kernel.h" #include "kernel_compat_cpu.h" @@ -136,10 +137,10 @@ public: KernelFunctions convert_to_byte_kernel; KernelFunctions shader_kernel; - KernelFunctions filter_divide_shadow_kernel; - KernelFunctions filter_get_feature_kernel; - KernelFunctions filter_combine_halves_kernel; - KernelFunctions filter_divide_combined_kernel; + KernelFunctions filter_divide_shadow_kernel; + KernelFunctions filter_get_feature_kernel; + KernelFunctions filter_combine_halves_kernel; + KernelFunctions filter_divide_combined_kernel; KernelFunctions filter_nlm_calc_difference_kernel; KernelFunctions filter_nlm_blur_kernel; @@ -147,9 +148,9 @@ public: KernelFunctions filter_nlm_update_output_kernel; KernelFunctions filter_nlm_normalize_kernel; - KernelFunctions filter_construct_transform_kernel; - KernelFunctions filter_nlm_construct_gramian_kernel; - KernelFunctions filter_finalize_kernel; + KernelFunctions filter_construct_transform_kernel; + KernelFunctions filter_nlm_construct_gramian_kernel; + KernelFunctions filter_finalize_kernel; #define KERNEL_FUNCTIONS(name) \ KERNEL_NAME_EVAL(cpu, name), \ @@ -221,12 +222,20 @@ public: void mem_free(device_memory& mem) { if(mem.device_pointer) { + if(!mem.data_pointer) { + delete[] (char*) mem.device_pointer; + } mem.device_pointer = 0; stats.mem_free(mem.device_size); mem.device_size = 0; } } + virtual device_ptr mem_get_offset_ptr(device_memory& mem, int offset) + { + return (device_ptr) (((char*) mem.device_pointer) + mem.memory_offset(offset)); + } + void const_copy_to(const char *name, void *host, size_t size) { kernel_const_copy(&kernel_globals, name, host, size); @@ -290,368 +299,216 @@ public: } }; - void non_local_means(int4 rect, float *image, float *weight, float *out, float *variance, float *difference, float *blurDifference, float *weightAccum, int r, int f, float a, float k_2, int channel_ofs_in = 0, int channel_ofs_out = 0) + bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr, + DenoisingTask *task) { + int4 rect = task->rect; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + int w = align_up(rect.z-rect.x, 4); int h = rect.w-rect.y; - int channels = channel_ofs_in? 3: 1; - memset(weightAccum, 0, sizeof(float)*w*h*channels); - memset(out, 0, sizeof(float)*w*h*channels); + float *blurDifference = (float*) task->nlm_state.temporary_1_ptr; + float *difference = (float*) task->nlm_state.temporary_2_ptr; + float *weightAccum = (float*) task->nlm_state.temporary_3_ptr; + + memset(weightAccum, 0, sizeof(float)*w*h); + memset((float*) out_ptr, 0, sizeof(float)*w*h); for(int i = 0; i < (2*r+1)*(2*r+1); i++) { int dy = i / (2*r+1) - r; int dx = i % (2*r+1) - r; int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)}; - filter_nlm_calc_difference_kernel()(dx, dy, weight, variance, difference, local_rect, w, channel_ofs_in, a, k_2); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f); + filter_nlm_calc_difference_kernel()(dx, dy, + (float*) guide_ptr, + (float*) variance_ptr, + difference, + local_rect, + 0, + w, a, k_2); + + filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f); filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f); - for(int c = 0; c < channels; c++) { - filter_nlm_update_output_kernel()(dx, dy, blurDifference, image + channel_ofs_in*c, out + channel_ofs_out*c, weightAccum + w*h*c, local_rect, w, f); - } + filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f); + + filter_nlm_update_output_kernel()(dx, dy, + blurDifference, + (float*) image_ptr, + (float*) out_ptr, + weightAccum, + local_rect, + w, f); } int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y}; - for(int c = 0; c < channels; c++) { - filter_nlm_normalize_kernel()(out + channel_ofs_out*c, weightAccum + w*h*c, local_rect, w); - } + filter_nlm_normalize_kernel()((float*) out_ptr, weightAccum, local_rect, w); + + return true; } - float* denoise_fill_buffer(KernelGlobals *kg, int sample, int4 rect, float** buffers, int* tile_x, int* tile_y, int *offsets, int *strides, int frames, int *frame_strides) + bool denoising_construct_transform(DenoisingTask *task) { - bool use_cross_denoising = kg->__data.film.denoise_cross; - bool use_gradients = kg->__data.integrator.use_gradients; - int buffer_pass_stride = kg->__data.film.pass_stride; - int buffer_denoising_offset = kg->__data.film.pass_denoising; - int num_frames = 1; - - int w = align_up(rect.z - rect.x, 4), h = (rect.w - rect.y); - int pass_stride = w*h*frames; - int passes = use_cross_denoising? 20 : 14; - float *filter_buffers = new float[passes*pass_stride]; - memset(filter_buffers, 0, sizeof(float)*passes*pass_stride); - - /* Denoising Buffer Pass allocation: - * 0: Normal X - * 1: Normal Y - * 2: Normal Z - * 3: Depth - * 4: Shadowing - * 5: Albedo R - * 6: Albedo G - * 7: Albedo B - * 8: Color R - * 9: Color G - * 10: Color B - * 11: Color Variance R - * 12: Color Variance G - * 13: Color Variance B - * With Cross-denoising passes, this list is essentially repeated two times. */ - - for(int frame = 0; frame < frames; frame++) { - float *filter_buffer = filter_buffers + w*h*frame; - float *buffer[9]; - for(int i = 0; i < 9; i++) { - buffer[i] = buffers[i] + frame_strides[i]*frame; - } - DebugPasses debug((rect.z - rect.x), h, 42, 1, w); - -#define PASSPTR(i) (filter_buffer + (i)*pass_stride) - - /* ==== Step 1: Prefilter shadow feature. ==== */ - { - /* Reuse some passes of the filter_buffer for temporary storage. */ - float *sampleV = PASSPTR(0), *sampleVV = PASSPTR(1), *bufferV = PASSPTR(2), *cleanV = PASSPTR(3); - float *unfilteredA = PASSPTR(5), *unfilteredB = PASSPTR(6); - float *nlm_temp1 = PASSPTR(7), *nlm_temp2 = PASSPTR(8), *nlm_temp3 = PASSPTR(9); - - /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */ - for(int y = rect.y; y < rect.w; y++) { - for(int x = rect.x; x < rect.z; x++) { - filter_divide_shadow_kernel()(sample, buffer, x, y, tile_x, tile_y, offsets, strides, unfilteredA, sampleV, sampleVV, bufferV, &rect.x, buffer_pass_stride, buffer_denoising_offset, num_frames, use_gradients); - } - } - debug.add_pass("shadowUnfilteredA", unfilteredA); - debug.add_pass("shadowUnfilteredB", unfilteredB); - debug.add_pass("shadowBufferV", bufferV); - debug.add_pass("shadowSampleV", sampleV); - debug.add_pass("shadowSampleVV", sampleVV); - - /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */ - non_local_means(rect, bufferV, sampleV, cleanV, sampleVV, nlm_temp1, nlm_temp2, nlm_temp3, 6, 3, 4.0f, 1.0f); - debug.add_pass("shadowCleanV", cleanV); - - /* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */ - non_local_means(rect, unfilteredA, unfilteredB, sampleV, cleanV, nlm_temp1, nlm_temp2, nlm_temp3, 5, 3, 1.0f, 0.25f); - non_local_means(rect, unfilteredB, unfilteredA, bufferV, cleanV, nlm_temp1, nlm_temp2, nlm_temp3, 5, 3, 1.0f, 0.25f); - debug.add_pass("shadowFilteredA", sampleV); - debug.add_pass("shadowFilteredB", bufferV); - - /* Estimate the residual variance between the two filtered halves. */ - for(int y = rect.y; y < rect.w; y++) { - for(int x = rect.x; x < rect.z; x++) { - filter_combine_halves_kernel()(x, y, NULL, sampleVV, sampleV, bufferV, &rect.x, 2); - } - } - debug.add_pass("shadowResidualV", sampleVV); - - /* Use the residual variance for a second filter pass. */ - non_local_means(rect, sampleV, bufferV, unfilteredA, sampleVV, nlm_temp1, nlm_temp2, nlm_temp3, 4, 2, 1.0f, 0.5f); - non_local_means(rect, bufferV, sampleV, unfilteredB, sampleVV, nlm_temp1, nlm_temp2, nlm_temp3, 4, 2, 1.0f, 0.5f); - debug.add_pass("shadowFinalA", unfilteredA); - debug.add_pass("shadowFinalB", unfilteredB); - - /* Combine the two double-filtered halves to a final shadow feature image and associated variance. */ - for(int y = rect.y; y < rect.w; y++) { - for(int x = rect.x; x < rect.z; x++) { - filter_combine_halves_kernel()(x, y, PASSPTR(4), NULL, unfilteredA, unfilteredB, &rect.x, 0); - } - } - debug.add_pass("shadowFinal", PASSPTR(4)); - } - - /* ==== Step 2: Prefilter general features. ==== */ - { - - float *unfiltered = PASSPTR(8), *variance = PASSPTR(9); - float *nlm_temp1 = PASSPTR(10), *nlm_temp2 = PASSPTR(11), *nlm_temp3 = PASSPTR(12); - /* Order in render buffers: - * Normal[X, Y, Z] NormalVar[X, Y, Z] Albedo[R, G, B] AlbedoVar[R, G, B ] Depth DepthVar - * 0 1 2 3 4 5 6 7 8 9 10 11 12 13 - * - * Order of processing: |NormalXYZ|Depth|AlbedoXYZ | - * | | | | */ - int mean_from[] = { 0, 1, 2, 6, 7, 8, 12 }; - int variance_from[] = { 3, 4, 5, 9, 10, 11, 13 }; - int mean_to[] = { 1, 2, 3, 0, 5, 6, 7 }; - for(int i = 0; i < 7; i++) { - for(int y = rect.y; y < rect.w; y++) { - for(int x = rect.x; x < rect.z; x++) { - filter_get_feature_kernel()(sample, buffer, mean_from[i], variance_from[i], x, y, tile_x, tile_y, offsets, strides, unfiltered, variance, &rect.x, buffer_pass_stride, buffer_denoising_offset, use_cross_denoising); - } - } - non_local_means(rect, unfiltered, unfiltered, PASSPTR(mean_to[i]), variance, nlm_temp1, nlm_temp2, nlm_temp3, 2, 2, 1, 0.25f); - debug.add_pass(string_printf("feature%dUnfiltered", i), unfiltered); - debug.add_pass(string_printf("feature%dFiltered", i), PASSPTR(mean_to[i])); - debug.add_pass(string_printf("feature%dVariance", i), variance); - } + for(int y = 0; y < task->filter_area.w; y++) { + for(int x = 0; x < task->filter_area.z; x++) { + filter_construct_transform_kernel()(task->render_buffer.samples, + (float*) task->buffer.mem.device_pointer, + x + task->filter_area.x, + y + task->filter_area.y, + y*task->filter_area.z + x, + (float*) task->storage.transform.device_pointer, + (int*) task->storage.rank.device_pointer, + &task->rect.x, + task->half_window, + task->pca_threshold, + 1, + 0); } + } + return true; + } + bool denoising_reconstruct(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr guide_ptr, + device_ptr guide_variance_ptr, + device_ptr output_ptr, + DenoisingTask *task) + { + mem_zero(task->storage.XtWX); + mem_zero(task->storage.XtWY); + float *difference = (float*) task->reconstruction_state.temporary_1_ptr; + float *blurDifference = (float*) task->reconstruction_state.temporary_2_ptr; - /* ==== Step 3: Copy combined color pass. ==== */ - { - int mean_from[] = {20, 21, 22, 26, 27, 28}; - int variance_from[] = {23, 24, 25, 29, 30, 31}; - int mean_to[] = { 8, 9, 10, 14, 15, 16}; - int variance_to[] = {11, 12, 13, 17, 18, 19}; - for(int i = 0; i < (use_cross_denoising? 6 : 3); i++) { - for(int y = rect.y; y < rect.w; y++) { - for(int x = rect.x; x < rect.z; x++) { - filter_get_feature_kernel()(sample, buffer, mean_from[i], variance_from[i], x, y, tile_x, tile_y, offsets, strides, PASSPTR(mean_to[i]), PASSPTR(variance_to[i]), &rect.x, buffer_pass_stride, buffer_denoising_offset, use_cross_denoising); - } - } - } - } + int r = task->half_window; + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; -#ifdef WITH_CYCLES_DEBUG_FILTER - { - float *temp1 = new float[pass_stride], *temp2 = new float[pass_stride], *temp3 = new float[3*pass_stride], *out = new float[3*pass_stride]; - non_local_means(rect, PASSPTR(8), PASSPTR(8), out, PASSPTR(11), temp1, temp2, temp3, 8, 4, 1, 0.5f, pass_stride, pass_stride); - debug.add_pass("input0Filtered", out); - debug.add_pass("input1Filtered", out+pass_stride); - debug.add_pass("input2Filtered", out+2*pass_stride); - debug.add_pass("input0Unfiltered", PASSPTR(8)); - debug.add_pass("input1Unfiltered", PASSPTR(9)); - debug.add_pass("input2Unfiltered", PASSPTR(10)); - debug.add_pass("input0Variance", PASSPTR(11)); - debug.add_pass("input1Variance", PASSPTR(12)); - debug.add_pass("input2Variance", PASSPTR(13)); - delete[] temp1; - delete[] temp2; - delete[] temp3; - delete[] out; + int local_rect[4] = {max(0, -dx), max(0, -dy), + task->reconstruction_state.source_w - max(0, dx), + task->reconstruction_state.source_h - max(0, dy)}; + filter_nlm_calc_difference_kernel()(dx, dy, + (float*) guide_ptr, + (float*) guide_variance_ptr, + difference, + local_rect, + task->buffer.w, + task->buffer.pass_stride, + 1.0f, + task->nlm_k_2); + filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4); + filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.w, 4); + filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4); + filter_nlm_construct_gramian_kernel()(dx, dy, + blurDifference, + (float*) task->buffer.mem.device_pointer, + (float*) color_ptr, + (float*) color_variance_ptr, + (float*) task->storage.transform.device_pointer, + (int*) task->storage.rank.device_pointer, + (float*) task->storage.XtWX.device_pointer, + (float3*) task->storage.XtWY.device_pointer, + local_rect, + &task->reconstruction_state.filter_rect.x, + task->buffer.w, + task->buffer.h, + 4); + } + for(int y = 0; y < task->filter_area.w; y++) { + for(int x = 0; x < task->filter_area.z; x++) { + filter_finalize_kernel()(x, + y, + y*task->filter_area.z + x, + task->buffer.w, + task->buffer.h, + (float*) output_ptr, + (int*) task->storage.rank.device_pointer, + (float*) task->storage.XtWX.device_pointer, + (float3*) task->storage.XtWY.device_pointer, + &task->reconstruction_state.buffer_params.x, + task->render_buffer.samples); } -#endif - - debug.write(string_printf("debug_tile_%d_%d.exr", rect.x, rect.y)); } - - return filter_buffers; + return true; } - void denoise_run(KernelGlobals *kg, int sample, float *filter_buffer, int4 filter_area, int4 rect, int offset, int stride, float *buffers) + bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr, + device_ptr mean_ptr, device_ptr variance_ptr, + int r, int4 rect, DenoisingTask *task) { -#ifdef WITH_CYCLES_DEBUG_FPE - scoped_fpe fpe(FPE_ENABLED); -#endif - - bool use_cross_denoising = kg->__data.film.denoise_cross; - int half_window = kg->__data.integrator.half_window; - float pca_threshold = kg->__data.integrator.filter_strength; - int num_frames = 1; /* TODO(lukas) */ - int prev_frames = 0; - - int w = align_up(rect.z - rect.x, 4), h = (rect.w - rect.y); - int pass_stride = w*h; - - int storage_num = filter_area.z*filter_area.w; - float *XtWX = new float[XTWX_SIZE*storage_num]; - float3 *XtWY = new float3[XTWY_SIZE*storage_num]; - float *transform = new float[TRANSFORM_SIZE*storage_num]; - int *rank = new int[storage_num]; - - for(int y = 0; y < filter_area.w; y++) { - for(int x = 0; x < filter_area.z; x++) { - filter_construct_transform_kernel()(sample, filter_buffer, x + filter_area.x, y + filter_area.y, y*filter_area.z + x, transform, rank, &rect.x, half_window, pca_threshold, num_frames, prev_frames); + (void) task; + for(int y = rect.y; y < rect.w; y++) { + for(int x = rect.x; x < rect.z; x++) { + filter_combine_halves_kernel()(x, y, + (float*) mean_ptr, + (float*) variance_ptr, + (float*) a_ptr, + (float*) b_ptr, + &rect.x, + r); } } + return true; + } - if(use_cross_denoising) - { - int f = 4; - float a = 1.0f; - float k_2 = kg->__data.integrator.weighting_adjust; - float *weight = filter_buffer + 8*pass_stride; - float *variance = filter_buffer + 11*pass_stride; - float *difference = new float[pass_stride]; - float *blurDifference = new float[pass_stride]; - float *outA = new float[3*pass_stride]; - float *outB = new float[3*pass_stride]; - int first_filter_rect[4] = {filter_area.x-rect.x, filter_area.y-rect.y, filter_area.z, filter_area.w}; - int first_buffer_params[4] = {-rect.x, -rect.y, 0, pass_stride}; - int simple_rect[4] = {0, 0, w, h}; - DebugPasses debug(filter_area.z, filter_area.w, 34, 1, w); - - memset(XtWX, 0, sizeof(float)*XTWX_SIZE*storage_num); - memset(XtWY, 0, sizeof(float3)*XTWY_SIZE*storage_num); - for(int i = 0; i < (2*half_window+1)*(2*half_window+1); i++) { - int dy = i / (2*half_window+1) - half_window; - int dx = i % (2*half_window+1) - half_window; - - int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)}; - filter_nlm_calc_difference_kernel()(dx, dy, weight, variance, difference, local_rect, w, pass_stride, a, k_2); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f); - filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f); - filter_nlm_construct_gramian_kernel()(dx, dy, blurDifference, filter_buffer, 14, 17, transform, rank, XtWX, XtWY, local_rect, first_filter_rect, w, h, 4); - } - for(int y = 0; y < filter_area.w; y++) { - for(int x = 0; x < filter_area.z; x++) { - filter_finalize_kernel()(x, y, y*filter_area.z + x, w, h, outA, rank, XtWX, XtWY, first_buffer_params, sample); - } - } - debug.add_pass("passAColor0", outA+0*pass_stride); - debug.add_pass("passAColor1", outA+1*pass_stride); - debug.add_pass("passAColor2", outA+2*pass_stride); - - memset(XtWX, 0, sizeof(float)*XTWX_SIZE*storage_num); - memset(XtWY, 0, sizeof(float3)*XTWY_SIZE*storage_num); - weight = filter_buffer + 14*pass_stride; - variance = filter_buffer + 17*pass_stride; - for(int i = 0; i < (2*half_window+1)*(2*half_window+1); i++) { - int dy = i / (2*half_window+1) - half_window; - int dx = i % (2*half_window+1) - half_window; - - int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)}; - filter_nlm_calc_difference_kernel()(dx, dy, weight, variance, difference, local_rect, w, pass_stride, a, k_2); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f); - filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f); - filter_nlm_construct_gramian_kernel()(dx, dy, blurDifference, filter_buffer, 8, 11, transform, rank, XtWX, XtWY, local_rect, first_filter_rect, w, h, 4); - } - for(int y = 0; y < filter_area.w; y++) { - for(int x = 0; x < filter_area.z; x++) { - filter_finalize_kernel()(x, y, y*filter_area.z + x, w, h, outB, rank, XtWX, XtWY, first_buffer_params, sample); - } - } - debug.add_pass("passBColor0", outB+0*pass_stride); - debug.add_pass("passBColor1", outB+1*pass_stride); - debug.add_pass("passBColor2", outB+2*pass_stride); - - weight = filter_buffer + 8*pass_stride; - variance = filter_buffer + 11*pass_stride; - for(int c = 0; c < 3; c++) { - for(int y = 0; y < filter_area.w; y++) { - for(int x = 0; x < filter_area.z; x++) { - filter_combine_halves_kernel()(x, y, weight + c*pass_stride, variance + c*pass_stride, outA + c*pass_stride, outB + c*pass_stride, simple_rect, 0); - } - } - } - delete[] outA; - delete[] outB; - debug.add_pass("combinedColor0", weight+0*pass_stride); - debug.add_pass("combinedColor1", weight+1*pass_stride); - debug.add_pass("combinedColor2", weight+2*pass_stride); - debug.add_pass("combinedVariance0", variance+0*pass_stride); - debug.add_pass("combinedVariance1", variance+1*pass_stride); - debug.add_pass("combinedVariance2", variance+2*pass_stride); - - memset(XtWX, 0, sizeof(float)*XTWX_SIZE*storage_num); - memset(XtWY, 0, sizeof(float3)*XTWY_SIZE*storage_num); - int second_filter_rect[4] = {0, 0, filter_area.z, filter_area.w}; - int second_buffer_params[4] = {offset, stride, kg->__data.film.pass_stride, kg->__data.film.pass_no_denoising}; - for(int i = 0; i < (2*half_window+1)*(2*half_window+1); i++) { - int dy = i / (2*half_window+1) - half_window; - int dx = i % (2*half_window+1) - half_window; - - int local_rect[4] = {max(0, -dx), max(0, -dy), filter_area.z - max(0, dx), filter_area.w - max(0, dy)}; - filter_nlm_calc_difference_kernel()(dx, dy, weight, variance, difference, local_rect, w, pass_stride, a, k_2); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f); - filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f); - filter_nlm_construct_gramian_kernel()(dx, dy, blurDifference, filter_buffer, 8, 11, transform, rank, XtWX, XtWY, local_rect, second_filter_rect, w, h, 4); - } - delete[] difference; - delete[] blurDifference; - for(int y = 0; y < filter_area.w; y++) { - for(int x = 0; x < filter_area.z; x++) { - filter_finalize_kernel()(x + filter_area.x, y + filter_area.y, y*filter_area.z + x, w, h, buffers, rank, XtWX, XtWY, second_buffer_params, sample); - } + bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr, + device_ptr sample_variance_ptr, device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, DenoisingTask *task) + { + for(int y = task->rect.y; y < task->rect.w; y++) { + for(int x = task->rect.x; x < task->rect.z; x++) { + filter_divide_shadow_kernel()(task->render_buffer.samples, + (float**) task->neighbors.buffers, + x, y, + task->neighbors.tile_x, + task->neighbors.tile_y, + task->neighbors.offsets, + task->neighbors.strides, + (float*) a_ptr, + (float*) b_ptr, + (float*) sample_variance_ptr, + (float*) sv_variance_ptr, + (float*) buffer_variance_ptr, + &task->rect.x, + task->render_buffer.pass_stride, + task->render_buffer.denoising_offset, + task->use_gradients); } - - debug.write(string_printf("filter_%d_%d.exr", filter_area.x, filter_area.y)); } - else { - int f = 4; - float a = 1.0f; - float k_2 = kg->__data.integrator.weighting_adjust; - float *weight = filter_buffer + 8*pass_stride; - float *variance = filter_buffer + 11*pass_stride; - float *difference = new float[pass_stride]; - float *blurDifference = new float[pass_stride]; - int filter_rect[4] = {filter_area.x-rect.x, filter_area.y-rect.y, filter_area.z, filter_area.w}; - int buffer_params[4] = {offset, stride, kg->__data.film.pass_stride, kg->__data.film.pass_no_denoising}; - - memset(XtWX, 0, sizeof(float)*XTWX_SIZE*storage_num); - memset(XtWY, 0, sizeof(float3)*XTWY_SIZE*storage_num); - for(int i = 0; i < (2*half_window+1)*(2*half_window+1); i++) { - int dy = i / (2*half_window+1) - half_window; - int dx = i % (2*half_window+1) - half_window; - - int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)}; - filter_nlm_calc_difference_kernel()(dx, dy, weight, variance, difference, local_rect, w, pass_stride, a, k_2); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f); - filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f); - filter_nlm_construct_gramian_kernel()(dx, dy, blurDifference, filter_buffer, 8, 11, transform, rank, XtWX, XtWY, local_rect, filter_rect, w, h, 4); - } - delete[] difference; - delete[] blurDifference; - for(int y = 0; y < filter_area.w; y++) { - for(int x = 0; x < filter_area.z; x++) { - filter_finalize_kernel()(x + filter_area.x, y + filter_area.y, y*filter_area.z + x, w, h, buffers, rank, XtWX, XtWY, buffer_params, sample); - } - } + return true; + } + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + DenoisingTask *task) + { + for(int y = task->rect.y; y < task->rect.w; y++) { + for(int x = task->rect.x; x < task->rect.z; x++) { + filter_get_feature_kernel()(task->render_buffer.samples, + (float**) task->neighbors.buffers, + mean_offset, + variance_offset, + x, y, + task->neighbors.tile_x, + task->neighbors.tile_y, + task->neighbors.offsets, + task->neighbors.strides, + (float*) mean_ptr, + (float*) variance_ptr, + &task->rect.x, + task->render_buffer.pass_stride, + task->render_buffer.denoising_offset, + task->use_cross_denoising); + } } - - delete[] transform; - delete[] rank; - delete[] XtWX; - delete[] XtWY; + return true; } void thread_render(DeviceTask& task) @@ -697,56 +554,47 @@ public: } if(tile.buffers->params.overscan && !task.get_cancel()) { - int tile_x[4] = {tile.x, tile.x, tile.x+tile.w, tile.x+tile.w}; - int tile_y[4] = {tile.y, tile.y, tile.y+tile.h, tile.y+tile.h}; - int offsets[9] = {0, 0, 0, 0, tile.offset, 0, 0, 0, 0}; - int strides[9] = {0, 0, 0, 0, tile.stride, 0, 0, 0, 0}; - float *buffers[9] = {NULL, NULL, NULL, NULL, (float*) tile.buffer, NULL, NULL, NULL, NULL}; - BufferParams ¶ms = tile.buffers->params; - int frame_stride[9] = {0, 0, 0, 0, params.width * params.height * params.get_passes_size(), 0, 0, 0, 0}; + DenoisingTask denoising(this); int overscan = tile.buffers->params.overscan; - int4 filter_area = make_int4(tile.x + overscan, tile.y + overscan, tile.w - 2*overscan, tile.h - 2*overscan); - int4 rect = make_int4(tile.x, tile.y, tile.x + tile.w, tile.y + tile.h); + denoising.filter_area = make_int4(tile.x + overscan, tile.y + overscan, tile.w - 2*overscan, tile.h - 2*overscan); + denoising.render_buffer.samples = end_sample; + + denoising.neighbors.init_from_single_tile(tile); + denoising.init_from_kerneldata(&kg.__data); + + denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising); + denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising); - float* filter_buffer = denoise_fill_buffer(&kg, end_sample, rect, buffers, tile_x, tile_y, offsets, strides, tile.buffers->params.frames, frame_stride); - denoise_run(&kg, end_sample, filter_buffer, filter_area, rect, tile.offset, tile.stride, (float*) tile.buffer); - delete[] filter_buffer; + denoising.run_denoising(); } } else if(tile.task == RenderTile::DENOISE) { - int sample = tile.start_sample + tile.num_samples; + tile.sample = tile.start_sample + tile.num_samples; + + DenoisingTask denoising(this); + denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h); + denoising.render_buffer.samples = tile.sample; RenderTile rtiles[9]; rtiles[4] = tile; task.get_neighbor_tiles(rtiles); - float *buffers[9]; - int offsets[9], strides[9]; - int frame_stride[9]; - for(int i = 0; i < 9; i++) { - buffers[i] = (float*) rtiles[i].buffer; - offsets[i] = rtiles[i].offset; - strides[i] = rtiles[i].stride; - if(rtiles[i].buffers) { - BufferParams ¶ms = rtiles[i].buffers->params; - frame_stride[i] = params.width * params.height * params.get_passes_size(); - } - else { - frame_stride[i] = 0; - } - } - int tile_x[4] = {rtiles[3].x, rtiles[4].x, rtiles[5].x, rtiles[5].x+rtiles[5].w}; - int tile_y[4] = {rtiles[1].y, rtiles[4].y, rtiles[7].y, rtiles[7].y+rtiles[7].h}; + denoising.neighbors.init_from_rendertiles(rtiles); - int half_window = kg.__data.integrator.half_window; - int4 filter_area = make_int4(tile.x, tile.y, tile.w, tile.h); - int4 rect = make_int4(max(tile.x - half_window, tile_x[0]), max(tile.y - half_window, tile_y[0]), min(tile.x + tile.w + half_window+1, tile_x[3]), min(tile.y + tile.h + half_window+1, tile_y[3])); + denoising.init_from_kerneldata(&kg.__data); - float* filter_buffer = denoise_fill_buffer(&kg, sample, rect, buffers, tile_x, tile_y, offsets, strides, tile.buffers->params.frames, frame_stride); - denoise_run(&kg, sample, filter_buffer, filter_area, rect, tile.offset, tile.stride, (float*) tile.buffer); - delete[] filter_buffer; + denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising); + denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising); - tile.sample = sample; + denoising.run_denoising(); task.update_progress(&tile, tile.w*tile.h); } diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index b63f7b5f84a..ba4424f844b 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -1003,9 +1003,10 @@ public: void *divide_args[] = {&sample, &d_buffer, &buffer_area, &rtile.offset, &rtile.stride, - &d_unfilteredA, &d_sampleV, &d_sampleVV, &d_bufferV, + &d_unfilteredA, &d_unfilteredB, + &d_sampleV, &d_sampleVV, &d_bufferV, &rect, &buffer_pass_stride, &buffer_denoising_offset, - &num_frames, &use_gradients}; + &use_gradients}; cuda_assert(cuLaunchKernel(cuFilterDivideShadow, xblocks , yblocks, 1, /* blocks */ xthreads, ythreads, 1, /* threads */ @@ -1147,8 +1148,6 @@ public: int f = 4; float a = 1.0f; float k_2 = kernel_globals.integrator.weighting_adjust; - int color_pass = 8; - int variance_pass = 11; CUdeviceptr color_buffer = CUDA_PTR_ADD(d_denoise_buffers, 8*pass_stride); CUdeviceptr variance_buffer = CUDA_PTR_ADD(d_denoise_buffers, 11*pass_stride); @@ -1164,7 +1163,7 @@ public: void *calc_difference_args[] = {&dx, &dy, &color_buffer, &variance_buffer, &d_difference, &local_rect, &w, &a, &k_2}; void *blur_args[] = {&d_difference, &d_blurDifference, &local_rect, &w, &f}; void *calc_weight_args[] = {&d_blurDifference, &d_difference, &local_rect, &w, &f}; - void *construct_gramian_args[] = {&dx, &dy, &d_blurDifference, &d_denoise_buffers, &color_pass, &variance_pass, &d_transform, &d_rank, &d_XtWX, &d_XtWY, &local_rect, &local_filter_rect, &w, &h, &f}; + void *construct_gramian_args[] = {&dx, &dy, &d_blurDifference, &d_denoise_buffers, &color_buffer, &variance_buffer, &d_transform, &d_rank, &d_XtWX, &d_XtWY, &local_rect, &local_filter_rect, &w, &h, &f}; for(int i = 0; i < (2*half_window+1)*(2*half_window+1); i++) { dy = i / (2*half_window+1) - half_window; diff --git a/intern/cycles/filter/filter_features.h b/intern/cycles/filter/filter_features.h index 07c69ed7081..96b2db960d1 100644 --- a/intern/cycles/filter/filter_features.h +++ b/intern/cycles/filter/filter_features.h @@ -117,14 +117,14 @@ ccl_device_inline void filter_calculate_scale(float *scale) scale[7] = 1.0f/max(sqrtf(scale[7]), 0.01f); //AlbedoB } -ccl_device_inline float3 filter_get_pixel_color(float ccl_readonly_ptr buffer, int channel, int pass_stride) +ccl_device_inline float3 filter_get_pixel_color(float ccl_readonly_ptr buffer, int pass_stride) { - return make_float3(ccl_get_feature(channel), ccl_get_feature(channel+1), ccl_get_feature(channel+2)); + return make_float3(ccl_get_feature(0), ccl_get_feature(1), ccl_get_feature(2)); } -ccl_device_inline float filter_get_pixel_variance(float ccl_readonly_ptr buffer, int channel, int pass_stride) +ccl_device_inline float filter_get_pixel_variance(float ccl_readonly_ptr buffer, int pass_stride) { - return average(make_float3(ccl_get_feature(channel), ccl_get_feature(channel+1), ccl_get_feature(channel+2))); + return average(make_float3(ccl_get_feature(0), ccl_get_feature(1), ccl_get_feature(2))); } ccl_device_inline bool filter_firefly_rejection(float3 pixel_color, float pixel_variance, float3 center_color, float sqrt_center_variance) diff --git a/intern/cycles/filter/filter_nlm_cpu.h b/intern/cycles/filter/filter_nlm_cpu.h index d66a743a437..3b03865a7f5 100644 --- a/intern/cycles/filter/filter_nlm_cpu.h +++ b/intern/cycles/filter/filter_nlm_cpu.h @@ -112,10 +112,14 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, float ccl ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, float ccl_readonly_ptr differenceImage, float ccl_readonly_ptr buffer, - int color_pass, int variance_pass, - float *transform, int *rank, - float *XtWX, float3 *XtWY, - int4 rect, int4 filter_rect, + float *color_pass, + float *variance_pass, + float *transform, + int *rank, + float *XtWX, + float3 *XtWY, + int4 rect, + int4 filter_rect, int w, int h, int f) { /* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */ diff --git a/intern/cycles/filter/filter_nlm_gpu.h b/intern/cycles/filter/filter_nlm_gpu.h index c904032a7cc..195fa15ed9c 100644 --- a/intern/cycles/filter/filter_nlm_gpu.h +++ b/intern/cycles/filter/filter_nlm_gpu.h @@ -84,11 +84,14 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy, int dx, int dy, float ccl_readonly_ptr differenceImage, float ccl_readonly_ptr buffer, - int color_pass, int variance_pass, + float *color_pass, + float *variance_pass, float ccl_readonly_ptr transform, int *rank, - float *XtWX, float3 *XtWY, - int4 rect, int4 filter_rect, + float *XtWX, + float3 *XtWY, + int4 rect, + int4 filter_rect, int w, int h, int f) { int y = fy + filter_rect.y; diff --git a/intern/cycles/filter/filter_prefilter.h b/intern/cycles/filter/filter_prefilter.h index 58f58a86e7b..b2eeea28fd8 100644 --- a/intern/cycles/filter/filter_prefilter.h +++ b/intern/cycles/filter/filter_prefilter.h @@ -25,14 +25,19 @@ CCL_NAMESPACE_BEGIN * sampleVarianceV: Variance of the sample variance estimation, quite noisy (since it's essentially the buffer variance of the two variance halves) * bufferVariance: The buffer-based variance of the shadow feature. Unbiased, but quite noisy. */ -ccl_device void kernel_filter_divide_shadow(int sample, float **buffers, +ccl_device void kernel_filter_divide_shadow(int sample, + float **buffers, int x, int y, int *tile_x, int *tile_y, int *offset, int *stride, - float *unfiltered, float *sampleVariance, - float *sampleVarianceV, float *bufferVariance, - int4 rect, int buffer_pass_stride, - int buffer_denoising_offset, int num_frames, + float *unfilteredA, + float *unfilteredB, + float *sampleVariance, + float *sampleVarianceV, + float *bufferVariance, + int4 rect, + int buffer_pass_stride, + int buffer_denoising_offset, bool use_gradients) { int xtile = (x < tile_x[1])? 0: ((x < tile_x[2])? 1: 2); @@ -47,13 +52,12 @@ ccl_device void kernel_filter_divide_shadow(int sample, float **buffers, int buffer_w = align_up(rect.z - rect.x, 4); int idx = (y-rect.y)*buffer_w + (x - rect.x); - int Bofs = (rect.w - rect.y)*buffer_w*num_frames; - unfiltered[idx] = center_buffer[15] / max(center_buffer[14], 1e-7f); - unfiltered[idx+Bofs] = center_buffer[18] / max(center_buffer[17], 1e-7f); + unfilteredA[idx] = center_buffer[15] / max(center_buffer[14], 1e-7f); + unfilteredB[idx] = center_buffer[18] / max(center_buffer[17], 1e-7f); float varFac = 1.0f / (sample * (sample-1)); sampleVariance[idx] = (center_buffer[16] + center_buffer[19]) * varFac; sampleVarianceV[idx] = 0.5f * (center_buffer[16] - center_buffer[19]) * (center_buffer[16] - center_buffer[19]) * varFac * varFac; - bufferVariance[idx] = 0.5f * (unfiltered[idx] - unfiltered[idx+Bofs]) * (unfiltered[idx] - unfiltered[idx+Bofs]); + bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) * (unfilteredA[idx] - unfilteredB[idx]); } /* Load a regular feature from the render buffers into the denoise buffer. diff --git a/intern/cycles/filter/filter_reconstruction.h b/intern/cycles/filter/filter_reconstruction.h index 15e84b5d054..a308964fb4f 100644 --- a/intern/cycles/filter/filter_reconstruction.h +++ b/intern/cycles/filter/filter_reconstruction.h @@ -18,17 +18,21 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void kernel_filter_construct_gramian(int x, int y, int storage_stride, - int dx, int dy, int w, int h, + int dx, int dy, + int w, int h, float ccl_readonly_ptr buffer, - int color_pass, int variance_pass, + float *color_pass, + float *variance_pass, float ccl_readonly_ptr transform, - int *rank, float weight, - float *XtWX, float3 *XtWY) + int *rank, + float weight, + float *XtWX, + float3 *XtWY) { const int pass_stride = w*h; - float ccl_readonly_ptr p_buffer = buffer + y*w + x; - float ccl_readonly_ptr q_buffer = buffer + (y+dy)*w + (x+dx); + int p_offset = y *w + x; + int q_offset = (y+dy)*w + (x+dx); #ifdef __KERNEL_CPU__ const int stride = 1; @@ -37,21 +41,21 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y, const int stride = storage_stride; #endif - float3 p_color = filter_get_pixel_color(p_buffer, color_pass, pass_stride); - float3 q_color = filter_get_pixel_color(q_buffer, color_pass, pass_stride); + float3 p_color = filter_get_pixel_color(color_pass + p_offset, pass_stride); + float3 q_color = filter_get_pixel_color(color_pass + q_offset, pass_stride); - float p_std_dev = sqrtf(filter_get_pixel_variance(p_buffer, variance_pass, pass_stride)); - float q_std_dev = sqrtf(filter_get_pixel_variance(q_buffer, variance_pass, pass_stride)); + float p_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + p_offset, pass_stride)); + float q_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + q_offset, pass_stride)); if(average(fabs(p_color - q_color)) > 3.0f*(p_std_dev + q_std_dev + 1e-3f)) { return; } float feature_means[DENOISE_FEATURES], features[DENOISE_FEATURES]; - filter_get_features(make_int3(x, y, 0), p_buffer, feature_means, NULL, pass_stride); + filter_get_features(make_int3(x, y, 0), buffer + p_offset, feature_means, NULL, pass_stride); float design_row[DENOISE_FEATURES+1]; - filter_get_design_row_transform(make_int3(x+dx, y+dy, 0), q_buffer, feature_means, pass_stride, features, *rank, design_row, transform, stride); + filter_get_design_row_transform(make_int3(x+dx, y+dy, 0), buffer + q_offset, feature_means, pass_stride, features, *rank, design_row, transform, stride); math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride); math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride); diff --git a/intern/cycles/filter/kernels/cpu/filter_cpu.h b/intern/cycles/filter/kernels/cpu/filter_cpu.h index b26e12c1245..6a0b58b214c 100644 --- a/intern/cycles/filter/kernels/cpu/filter_cpu.h +++ b/intern/cycles/filter/kernels/cpu/filter_cpu.h @@ -24,14 +24,14 @@ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, int *tile_y, int *offset, int *stride, - float *unfiltered, + float *unfilteredA, + float *unfilteredB, float *sampleV, float *sampleVV, float *bufferV, int* prefilter_rect, int buffer_pass_stride, int buffer_denoising_offset, - int num_frames, bool use_gradients); void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, @@ -119,8 +119,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, int dy, float *differenceImage, float *buffer, - int color_pass, - int variance_pass, + float *color_pass, + float *variance_pass, float *transform, int *rank, float *XtWX, diff --git a/intern/cycles/filter/kernels/cpu/filter_cpu_impl.h b/intern/cycles/filter/kernels/cpu/filter_cpu_impl.h index 79697e4d48f..586c30cfa69 100644 --- a/intern/cycles/filter/kernels/cpu/filter_cpu_impl.h +++ b/intern/cycles/filter/kernels/cpu/filter_cpu_impl.h @@ -42,11 +42,14 @@ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, int *tile_y, int *offset, int *stride, - float *unfiltered, float *sampleVariance, float *sampleVarianceV, float *bufferVariance, + float *unfilteredA, + float *unfilteredB, + float *sampleVariance, + float *sampleVarianceV, + float *bufferVariance, int* prefilter_rect, int buffer_pass_stride, int buffer_denoising_offset, - int num_frames, bool use_gradients) { #ifdef KERNEL_STUB @@ -55,10 +58,14 @@ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, kernel_filter_divide_shadow(sample, buffers, x, y, tile_x, tile_y, offset, stride, - unfiltered, sampleVariance, - sampleVarianceV, bufferVariance, - load_int4(prefilter_rect), buffer_pass_stride, - buffer_denoising_offset, num_frames, + unfilteredA, + unfilteredB, + sampleVariance, + sampleVarianceV, + bufferVariance, + load_int4(prefilter_rect), + buffer_pass_stride, + buffer_denoising_offset, use_gradients); #endif } @@ -213,8 +220,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, int dy, float *differenceImage, float *buffer, - int color_pass, - int variance_pass, + float *color_pass, + float *variance_pass, float *transform, int *rank, float *XtWX, diff --git a/intern/cycles/filter/kernels/cuda/filter.cu b/intern/cycles/filter/kernels/cuda/filter.cu index 3dabafddee8..c62953c1fcb 100644 --- a/intern/cycles/filter/kernels/cuda/filter.cu +++ b/intern/cycles/filter/kernels/cuda/filter.cu @@ -31,10 +31,14 @@ CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_divide_shadow(int sample, float* buffers, int4 buffer_rect, int offset, int stride, - float *unfiltered, float *sampleVariance, - float *sampleVarianceV, float *bufferVariance, - int4 prefilter_rect, int buffer_pass_stride, - int buffer_denoising_offset, int num_frames, + float *unfilteredA, + float *unfilteredB, + float *sampleVariance, + float *sampleVarianceV, + float *bufferVariance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, bool use_gradients) { int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; @@ -48,10 +52,14 @@ kernel_cuda_filter_divide_shadow(int sample, float* buffers, kernel_filter_divide_shadow(sample, tile_buffers, x, y, tile_x, tile_y, tile_offset, tile_stride, - unfiltered, sampleVariance, - sampleVarianceV, bufferVariance, - prefilter_rect, buffer_pass_stride, - buffer_denoising_offset, num_frames, + unfilteredA, + unfilteredB, + sampleVariance, + sampleVarianceV, + bufferVariance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset, use_gradients); } } @@ -198,11 +206,14 @@ CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_nlm_construct_gramian(int dx, int dy, float ccl_readonly_ptr differenceImage, float ccl_readonly_ptr buffer, - int color_pass, int variance_pass, + float *color_pass, + float *variance_pass, float const* __restrict__ transform, int *rank, - float *XtWX, float3 *XtWY, - int4 rect, int4 filter_rect, + float *XtWX, + float3 *XtWY, + int4 rect, + int4 filter_rect, int w, int h, int f) { int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x); int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y); -- cgit v1.2.3