From 2f6db0e227d8835bc4b2ec5d0e181c5cf29da7dc Mon Sep 17 00:00:00 2001 From: Lukas Stockner Date: Wed, 8 Feb 2017 16:59:38 +0100 Subject: Cycles Denoising: Use device-independent denoising in the CPUDevice --- intern/cycles/filter/filter_features.h | 8 +++--- intern/cycles/filter/filter_nlm_cpu.h | 12 +++++--- intern/cycles/filter/filter_nlm_gpu.h | 9 ++++-- intern/cycles/filter/filter_prefilter.h | 22 +++++++++------ intern/cycles/filter/filter_reconstruction.h | 28 ++++++++++-------- intern/cycles/filter/kernels/cpu/filter_cpu.h | 8 +++--- intern/cycles/filter/kernels/cpu/filter_cpu_impl.h | 23 +++++++++------ intern/cycles/filter/kernels/cuda/filter.cu | 33 ++++++++++++++-------- 8 files changed, 88 insertions(+), 55 deletions(-) (limited to 'intern/cycles/filter') diff --git a/intern/cycles/filter/filter_features.h b/intern/cycles/filter/filter_features.h index 07c69ed7081..96b2db960d1 100644 --- a/intern/cycles/filter/filter_features.h +++ b/intern/cycles/filter/filter_features.h @@ -117,14 +117,14 @@ ccl_device_inline void filter_calculate_scale(float *scale) scale[7] = 1.0f/max(sqrtf(scale[7]), 0.01f); //AlbedoB } -ccl_device_inline float3 filter_get_pixel_color(float ccl_readonly_ptr buffer, int channel, int pass_stride) +ccl_device_inline float3 filter_get_pixel_color(float ccl_readonly_ptr buffer, int pass_stride) { - return make_float3(ccl_get_feature(channel), ccl_get_feature(channel+1), ccl_get_feature(channel+2)); + return make_float3(ccl_get_feature(0), ccl_get_feature(1), ccl_get_feature(2)); } -ccl_device_inline float filter_get_pixel_variance(float ccl_readonly_ptr buffer, int channel, int pass_stride) +ccl_device_inline float filter_get_pixel_variance(float ccl_readonly_ptr buffer, int pass_stride) { - return average(make_float3(ccl_get_feature(channel), ccl_get_feature(channel+1), ccl_get_feature(channel+2))); + return average(make_float3(ccl_get_feature(0), ccl_get_feature(1), ccl_get_feature(2))); } ccl_device_inline bool filter_firefly_rejection(float3 pixel_color, float pixel_variance, float3 center_color, float sqrt_center_variance) diff --git a/intern/cycles/filter/filter_nlm_cpu.h b/intern/cycles/filter/filter_nlm_cpu.h index d66a743a437..3b03865a7f5 100644 --- a/intern/cycles/filter/filter_nlm_cpu.h +++ b/intern/cycles/filter/filter_nlm_cpu.h @@ -112,10 +112,14 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, float ccl ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, float ccl_readonly_ptr differenceImage, float ccl_readonly_ptr buffer, - int color_pass, int variance_pass, - float *transform, int *rank, - float *XtWX, float3 *XtWY, - int4 rect, int4 filter_rect, + float *color_pass, + float *variance_pass, + float *transform, + int *rank, + float *XtWX, + float3 *XtWY, + int4 rect, + int4 filter_rect, int w, int h, int f) { /* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */ diff --git a/intern/cycles/filter/filter_nlm_gpu.h b/intern/cycles/filter/filter_nlm_gpu.h index c904032a7cc..195fa15ed9c 100644 --- a/intern/cycles/filter/filter_nlm_gpu.h +++ b/intern/cycles/filter/filter_nlm_gpu.h @@ -84,11 +84,14 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy, int dx, int dy, float ccl_readonly_ptr differenceImage, float ccl_readonly_ptr buffer, - int color_pass, int variance_pass, + float *color_pass, + float *variance_pass, float ccl_readonly_ptr transform, int *rank, - float *XtWX, float3 *XtWY, - int4 rect, int4 filter_rect, + float *XtWX, + float3 *XtWY, + int4 rect, + int4 filter_rect, int w, int h, int f) { int y = fy + filter_rect.y; diff --git a/intern/cycles/filter/filter_prefilter.h b/intern/cycles/filter/filter_prefilter.h index 58f58a86e7b..b2eeea28fd8 100644 --- a/intern/cycles/filter/filter_prefilter.h +++ b/intern/cycles/filter/filter_prefilter.h @@ -25,14 +25,19 @@ CCL_NAMESPACE_BEGIN * sampleVarianceV: Variance of the sample variance estimation, quite noisy (since it's essentially the buffer variance of the two variance halves) * bufferVariance: The buffer-based variance of the shadow feature. Unbiased, but quite noisy. */ -ccl_device void kernel_filter_divide_shadow(int sample, float **buffers, +ccl_device void kernel_filter_divide_shadow(int sample, + float **buffers, int x, int y, int *tile_x, int *tile_y, int *offset, int *stride, - float *unfiltered, float *sampleVariance, - float *sampleVarianceV, float *bufferVariance, - int4 rect, int buffer_pass_stride, - int buffer_denoising_offset, int num_frames, + float *unfilteredA, + float *unfilteredB, + float *sampleVariance, + float *sampleVarianceV, + float *bufferVariance, + int4 rect, + int buffer_pass_stride, + int buffer_denoising_offset, bool use_gradients) { int xtile = (x < tile_x[1])? 0: ((x < tile_x[2])? 1: 2); @@ -47,13 +52,12 @@ ccl_device void kernel_filter_divide_shadow(int sample, float **buffers, int buffer_w = align_up(rect.z - rect.x, 4); int idx = (y-rect.y)*buffer_w + (x - rect.x); - int Bofs = (rect.w - rect.y)*buffer_w*num_frames; - unfiltered[idx] = center_buffer[15] / max(center_buffer[14], 1e-7f); - unfiltered[idx+Bofs] = center_buffer[18] / max(center_buffer[17], 1e-7f); + unfilteredA[idx] = center_buffer[15] / max(center_buffer[14], 1e-7f); + unfilteredB[idx] = center_buffer[18] / max(center_buffer[17], 1e-7f); float varFac = 1.0f / (sample * (sample-1)); sampleVariance[idx] = (center_buffer[16] + center_buffer[19]) * varFac; sampleVarianceV[idx] = 0.5f * (center_buffer[16] - center_buffer[19]) * (center_buffer[16] - center_buffer[19]) * varFac * varFac; - bufferVariance[idx] = 0.5f * (unfiltered[idx] - unfiltered[idx+Bofs]) * (unfiltered[idx] - unfiltered[idx+Bofs]); + bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) * (unfilteredA[idx] - unfilteredB[idx]); } /* Load a regular feature from the render buffers into the denoise buffer. diff --git a/intern/cycles/filter/filter_reconstruction.h b/intern/cycles/filter/filter_reconstruction.h index 15e84b5d054..a308964fb4f 100644 --- a/intern/cycles/filter/filter_reconstruction.h +++ b/intern/cycles/filter/filter_reconstruction.h @@ -18,17 +18,21 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void kernel_filter_construct_gramian(int x, int y, int storage_stride, - int dx, int dy, int w, int h, + int dx, int dy, + int w, int h, float ccl_readonly_ptr buffer, - int color_pass, int variance_pass, + float *color_pass, + float *variance_pass, float ccl_readonly_ptr transform, - int *rank, float weight, - float *XtWX, float3 *XtWY) + int *rank, + float weight, + float *XtWX, + float3 *XtWY) { const int pass_stride = w*h; - float ccl_readonly_ptr p_buffer = buffer + y*w + x; - float ccl_readonly_ptr q_buffer = buffer + (y+dy)*w + (x+dx); + int p_offset = y *w + x; + int q_offset = (y+dy)*w + (x+dx); #ifdef __KERNEL_CPU__ const int stride = 1; @@ -37,21 +41,21 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y, const int stride = storage_stride; #endif - float3 p_color = filter_get_pixel_color(p_buffer, color_pass, pass_stride); - float3 q_color = filter_get_pixel_color(q_buffer, color_pass, pass_stride); + float3 p_color = filter_get_pixel_color(color_pass + p_offset, pass_stride); + float3 q_color = filter_get_pixel_color(color_pass + q_offset, pass_stride); - float p_std_dev = sqrtf(filter_get_pixel_variance(p_buffer, variance_pass, pass_stride)); - float q_std_dev = sqrtf(filter_get_pixel_variance(q_buffer, variance_pass, pass_stride)); + float p_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + p_offset, pass_stride)); + float q_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + q_offset, pass_stride)); if(average(fabs(p_color - q_color)) > 3.0f*(p_std_dev + q_std_dev + 1e-3f)) { return; } float feature_means[DENOISE_FEATURES], features[DENOISE_FEATURES]; - filter_get_features(make_int3(x, y, 0), p_buffer, feature_means, NULL, pass_stride); + filter_get_features(make_int3(x, y, 0), buffer + p_offset, feature_means, NULL, pass_stride); float design_row[DENOISE_FEATURES+1]; - filter_get_design_row_transform(make_int3(x+dx, y+dy, 0), q_buffer, feature_means, pass_stride, features, *rank, design_row, transform, stride); + filter_get_design_row_transform(make_int3(x+dx, y+dy, 0), buffer + q_offset, feature_means, pass_stride, features, *rank, design_row, transform, stride); math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride); math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride); diff --git a/intern/cycles/filter/kernels/cpu/filter_cpu.h b/intern/cycles/filter/kernels/cpu/filter_cpu.h index b26e12c1245..6a0b58b214c 100644 --- a/intern/cycles/filter/kernels/cpu/filter_cpu.h +++ b/intern/cycles/filter/kernels/cpu/filter_cpu.h @@ -24,14 +24,14 @@ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, int *tile_y, int *offset, int *stride, - float *unfiltered, + float *unfilteredA, + float *unfilteredB, float *sampleV, float *sampleVV, float *bufferV, int* prefilter_rect, int buffer_pass_stride, int buffer_denoising_offset, - int num_frames, bool use_gradients); void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, @@ -119,8 +119,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, int dy, float *differenceImage, float *buffer, - int color_pass, - int variance_pass, + float *color_pass, + float *variance_pass, float *transform, int *rank, float *XtWX, diff --git a/intern/cycles/filter/kernels/cpu/filter_cpu_impl.h b/intern/cycles/filter/kernels/cpu/filter_cpu_impl.h index 79697e4d48f..586c30cfa69 100644 --- a/intern/cycles/filter/kernels/cpu/filter_cpu_impl.h +++ b/intern/cycles/filter/kernels/cpu/filter_cpu_impl.h @@ -42,11 +42,14 @@ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, int *tile_y, int *offset, int *stride, - float *unfiltered, float *sampleVariance, float *sampleVarianceV, float *bufferVariance, + float *unfilteredA, + float *unfilteredB, + float *sampleVariance, + float *sampleVarianceV, + float *bufferVariance, int* prefilter_rect, int buffer_pass_stride, int buffer_denoising_offset, - int num_frames, bool use_gradients) { #ifdef KERNEL_STUB @@ -55,10 +58,14 @@ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample, kernel_filter_divide_shadow(sample, buffers, x, y, tile_x, tile_y, offset, stride, - unfiltered, sampleVariance, - sampleVarianceV, bufferVariance, - load_int4(prefilter_rect), buffer_pass_stride, - buffer_denoising_offset, num_frames, + unfilteredA, + unfilteredB, + sampleVariance, + sampleVarianceV, + bufferVariance, + load_int4(prefilter_rect), + buffer_pass_stride, + buffer_denoising_offset, use_gradients); #endif } @@ -213,8 +220,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, int dy, float *differenceImage, float *buffer, - int color_pass, - int variance_pass, + float *color_pass, + float *variance_pass, float *transform, int *rank, float *XtWX, diff --git a/intern/cycles/filter/kernels/cuda/filter.cu b/intern/cycles/filter/kernels/cuda/filter.cu index 3dabafddee8..c62953c1fcb 100644 --- a/intern/cycles/filter/kernels/cuda/filter.cu +++ b/intern/cycles/filter/kernels/cuda/filter.cu @@ -31,10 +31,14 @@ CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_divide_shadow(int sample, float* buffers, int4 buffer_rect, int offset, int stride, - float *unfiltered, float *sampleVariance, - float *sampleVarianceV, float *bufferVariance, - int4 prefilter_rect, int buffer_pass_stride, - int buffer_denoising_offset, int num_frames, + float *unfilteredA, + float *unfilteredB, + float *sampleVariance, + float *sampleVarianceV, + float *bufferVariance, + int4 prefilter_rect, + int buffer_pass_stride, + int buffer_denoising_offset, bool use_gradients) { int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; @@ -48,10 +52,14 @@ kernel_cuda_filter_divide_shadow(int sample, float* buffers, kernel_filter_divide_shadow(sample, tile_buffers, x, y, tile_x, tile_y, tile_offset, tile_stride, - unfiltered, sampleVariance, - sampleVarianceV, bufferVariance, - prefilter_rect, buffer_pass_stride, - buffer_denoising_offset, num_frames, + unfilteredA, + unfilteredB, + sampleVariance, + sampleVarianceV, + bufferVariance, + prefilter_rect, + buffer_pass_stride, + buffer_denoising_offset, use_gradients); } } @@ -198,11 +206,14 @@ CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_nlm_construct_gramian(int dx, int dy, float ccl_readonly_ptr differenceImage, float ccl_readonly_ptr buffer, - int color_pass, int variance_pass, + float *color_pass, + float *variance_pass, float const* __restrict__ transform, int *rank, - float *XtWX, float3 *XtWY, - int4 rect, int4 filter_rect, + float *XtWX, + float3 *XtWY, + int4 rect, + int4 filter_rect, int w, int h, int f) { int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x); int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y); -- cgit v1.2.3