diff options
Diffstat (limited to 'intern/cycles/kernel')
-rw-r--r-- | intern/cycles/kernel/filter/filter_defines.h | 1 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_nlm_cpu.h | 19 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_nlm_gpu.h | 36 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_prefilter.h | 39 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_reconstruction.h | 12 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_types.h | 9 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/cpu/filter_cpu.h | 14 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h | 56 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/cuda/filter.cu | 36 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/opencl/filter.cl | 34 |
10 files changed, 220 insertions, 36 deletions
diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h index 67f4e62ac0f..9ac7c3db23d 100644 --- a/intern/cycles/kernel/filter/filter_defines.h +++ b/intern/cycles/kernel/filter/filter_defines.h @@ -27,6 +27,7 @@ typedef struct TileInfo { int strides[9]; int x[4]; int y[4]; + int from_render; /* TODO(lukas): CUDA doesn't have uint64_t... */ #ifdef __KERNEL_OPENCL__ ccl_global float *buffers[9]; diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h index af73c0dadf2..0c4387af540 100644 --- a/intern/cycles/kernel/filter/filter_nlm_cpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h @@ -22,6 +22,7 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, const float *ccl_restrict weight_image, const float *ccl_restrict variance_image, + const float *ccl_restrict scale_image, float *difference_image, int4 rect, int stride, @@ -41,13 +42,21 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, int idx_q = (y+dy)*stride + aligned_lowx + dx; for(int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) { float4 diff = make_float4(0.0f); + float4 scale_fac; + if(scale_image) { + scale_fac = clamp(load4_a(scale_image, idx_p) / load4_u(scale_image, idx_q), + make_float4(0.25f), make_float4(4.0f)); + } + else { + scale_fac = make_float4(1.0f); + } for(int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) { /* idx_p is guaranteed to be aligned, but idx_q isn't. */ float4 color_p = load4_a(weight_image, idx_p + chan_ofs); - float4 color_q = load4_u(weight_image, idx_q + chan_ofs); + float4 color_q = scale_fac*load4_u(weight_image, idx_q + chan_ofs); float4 cdiff = color_p - color_q; float4 var_p = load4_a(variance_image, idx_p + chan_ofs); - float4 var_q = load4_u(variance_image, idx_q + chan_ofs); + float4 var_q = sqr(scale_fac)*load4_u(variance_image, idx_q + chan_ofs); diff += (cdiff*cdiff - a*(var_p + min(var_p, var_q))) / (make_float4(1e-8f) + k_2*(var_p+var_q)); } load4_a(difference_image, idx_p) = diff*channel_fac; @@ -143,6 +152,7 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, float *out_image, float *accum_image, int4 rect, + int channel_offset, int stride, int f) { @@ -160,6 +170,11 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, load4_a(accum_image, idx_p) += mask(active, weight); float4 val = load4_u(image, idx_q); + if(channel_offset) { + val += load4_u(image, idx_q + channel_offset); + val += load4_u(image, idx_q + 2*channel_offset); + val *= 1.0f/3.0f; + } load4_a(out_image, idx_p) += mask(active, weight*val); } diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h index 058afb34a92..d8e2e4d08aa 100644 --- a/intern/cycles/kernel/filter/filter_nlm_gpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h @@ -78,17 +78,25 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y, int dx, int dy, const ccl_global float *ccl_restrict weight_image, const ccl_global float *ccl_restrict variance_image, + const ccl_global float *ccl_restrict scale_image, ccl_global float *difference_image, int4 rect, int stride, int channel_offset, float a, float k_2) { - float diff = 0.0f; + int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx); int numChannels = channel_offset? 3 : 1; - for(int c = 0; c < numChannels; c++) { - float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)]; - float pvar = variance_image[c*channel_offset + y*stride + x]; - float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)]; + + float diff = 0.0f; + float scale_fac = 1.0f; + if(scale_image) { + scale_fac = clamp(scale_image[idx_p] / scale_image[idx_q], 0.25f, 4.0f); + } + + for(int c = 0; c < numChannels; c++, idx_p += channel_offset, idx_q += channel_offset) { + float cdiff = weight_image[idx_p] - scale_fac*weight_image[idx_q]; + float pvar = variance_image[idx_p]; + float qvar = sqr(scale_fac)*variance_image[idx_q]; diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar)); } if(numChannels > 1) { @@ -133,7 +141,8 @@ ccl_device_inline void kernel_filter_nlm_update_output(int x, int y, const ccl_global float *ccl_restrict image, ccl_global float *out_image, ccl_global float *accum_image, - int4 rect, int stride, int f) + int4 rect, int channel_offset, + int stride, int f) { float sum = 0.0f; const int low = max(rect.x, x-f); @@ -142,12 +151,21 @@ ccl_device_inline void kernel_filter_nlm_update_output(int x, int y, sum += difference_image[y*stride + x1]; } sum *= 1.0f/(high-low); + + int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx); if(out_image) { - atomic_add_and_fetch_float(accum_image + y*stride + x, sum); - atomic_add_and_fetch_float(out_image + y*stride + x, sum*image[(y+dy)*stride + (x+dx)]); + atomic_add_and_fetch_float(accum_image + idx_p, sum); + + float val = image[idx_q]; + if(channel_offset) { + val += image[idx_q + channel_offset]; + val += image[idx_q + 2*channel_offset]; + val *= 1.0f/3.0f; + } + atomic_add_and_fetch_float(out_image + idx_p, sum*val); } else { - accum_image[y*stride + x] = sum; + accum_image[idx_p] = sum; } } diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h index 3507f80df46..41be4dbea49 100644 --- a/intern/cycles/kernel/filter/filter_prefilter.h +++ b/intern/cycles/kernel/filter/filter_prefilter.h @@ -84,6 +84,7 @@ ccl_device void kernel_filter_get_feature(int sample, int x, int y, ccl_global float *mean, ccl_global float *variance, + float scale, int4 rect, int buffer_pass_stride, int buffer_denoising_offset) { @@ -95,18 +96,38 @@ ccl_device void kernel_filter_get_feature(int sample, int buffer_w = align_up(rect.z - rect.x, 4); int idx = (y-rect.y)*buffer_w + (x - rect.x); - mean[idx] = center_buffer[m_offset] / sample; - if(sample > 1) { - /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance - * update does not work efficiently with atomics in the kernel. */ - variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1))); - } - else { - /* Can't compute variance with single sample, just set it very high. */ - variance[idx] = 1e10f; + float val = scale * center_buffer[m_offset]; + mean[idx] = val; + + if(v_offset >= 0) { + if(sample > 1) { + /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance + * update does not work efficiently with atomics in the kernel. */ + variance[idx] = max(0.0f, (center_buffer[v_offset] - val*val*sample) / (sample * (sample-1))); + } + else { + /* Can't compute variance with single sample, just set it very high. */ + variance[idx] = 1e10f; + } } } +ccl_device void kernel_filter_write_feature(int sample, + int x, int y, + int4 buffer_params, + ccl_global float *from, + ccl_global float *buffer, + int out_offset, + int4 rect) +{ + ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z; + + int buffer_w = align_up(rect.z - rect.x, 4); + int idx = (y-rect.y)*buffer_w + (x - rect.x); + + combined_buffer[out_offset] = from[idx]; +} + ccl_device void kernel_filter_detect_outliers(int x, int y, ccl_global float *image, ccl_global float *variance, diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h index 58740d5b06a..e5d3b0da835 100644 --- a/intern/cycles/kernel/filter/filter_reconstruction.h +++ b/intern/cycles/kernel/filter/filter_reconstruction.h @@ -108,11 +108,13 @@ ccl_device_inline void kernel_filter_finalize(int x, int y, final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f)); ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z; - final_color *= sample; - if(buffer_params.w) { - final_color.x += combined_buffer[buffer_params.w+0]; - final_color.y += combined_buffer[buffer_params.w+1]; - final_color.z += combined_buffer[buffer_params.w+2]; + if(buffer_params.w >= 0) { + final_color *= sample; + if(buffer_params.w > 0) { + final_color.x += combined_buffer[buffer_params.w+0]; + final_color.y += combined_buffer[buffer_params.w+1]; + final_color.z += combined_buffer[buffer_params.w+2]; + } } combined_buffer[0] = final_color.x; combined_buffer[1] = final_color.y; diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 864aa7c470a..caa0057d997 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -472,8 +472,17 @@ typedef enum DenoisingPassOffsets { DENOISING_PASS_COLOR_VAR = 23, DENOISING_PASS_CLEAN = 26, + DENOISING_PASS_PREFILTERED_DEPTH = 0, + DENOISING_PASS_PREFILTERED_NORMAL = 1, + DENOISING_PASS_PREFILTERED_SHADOWING = 4, + DENOISING_PASS_PREFILTERED_ALBEDO = 5, + DENOISING_PASS_PREFILTERED_COLOR = 8, + DENOISING_PASS_PREFILTERED_VARIANCE = 11, + DENOISING_PASS_PREFILTERED_INTENSITY = 14, + DENOISING_PASS_SIZE_BASE = 26, DENOISING_PASS_SIZE_CLEAN = 3, + DENOISING_PASS_SIZE_PREFILTERED = 15, } DenoisingPassOffsets; typedef enum eBakePassFilter { diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h index e036b53b810..08333c7a455 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h @@ -37,10 +37,20 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, int y, float *mean, float *variance, + float scale, int* prefilter_rect, int buffer_pass_stride, int buffer_denoising_offset); +void KERNEL_FUNCTION_FULL_NAME(filter_write_feature)(int sample, + int x, + int y, + int *buffer_params, + float *from, + float *buffer, + int out_offset, + int* prefilter_rect); + void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y, ccl_global float *image, ccl_global float *variance, @@ -71,7 +81,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, int dy, float *weight_image, - float *variance, + float *variance_image, + float *scale_image, float *difference_image, int* rect, int stride, @@ -99,6 +110,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, float *out_image, float *accum_image, int* rect, + int channel_offset, int stride, int f); diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h index 4c758711481..b792367e3ab 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h @@ -69,6 +69,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, int x, int y, float *mean, float *variance, + float scale, int* prefilter_rect, int buffer_pass_stride, int buffer_denoising_offset) @@ -80,12 +81,29 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample, m_offset, v_offset, x, y, mean, variance, + scale, load_int4(prefilter_rect), buffer_pass_stride, buffer_denoising_offset); #endif } +void KERNEL_FUNCTION_FULL_NAME(filter_write_feature)(int sample, + int x, + int y, + int *buffer_params, + float *from, + float *buffer, + int out_offset, + int* prefilter_rect) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, filter_write_feature); +#else + kernel_filter_write_feature(sample, x, y, load_int4(buffer_params), from, buffer, out_offset, load_int4(prefilter_rect)); +#endif +} + void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y, ccl_global float *image, ccl_global float *variance, @@ -130,8 +148,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_construct_transform); #else - rank += storage_ofs; - transform += storage_ofs*TRANSFORM_SIZE; + rank += storage_ofs; + transform += storage_ofs*TRANSFORM_SIZE; kernel_filter_construct_transform(buffer, x, y, load_int4(prefilter_rect), @@ -146,7 +164,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer, void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, int dy, float *weight_image, - float *variance, + float *variance_image, + float *scale_image, float *difference_image, int *rect, int stride, @@ -157,7 +176,15 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference); #else - kernel_filter_nlm_calc_difference(dx, dy, weight_image, variance, difference_image, load_int4(rect), stride, channel_offset, a, k_2); + kernel_filter_nlm_calc_difference(dx, dy, + weight_image, + variance_image, + scale_image, + difference_image, + load_int4(rect), + stride, + channel_offset, + a, k_2); #endif } @@ -195,13 +222,22 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, float *out_image, float *accum_image, int *rect, + int channel_offset, int stride, int f) { #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output); #else - kernel_filter_nlm_update_output(dx, dy, difference_image, image, temp_image, out_image, accum_image, load_int4(rect), stride, f); + kernel_filter_nlm_update_output(dx, dy, + difference_image, + image, + temp_image, + out_image, + accum_image, + load_int4(rect), + channel_offset, + stride, f); #endif } @@ -222,7 +258,15 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian); #else - kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_window), stride, f, pass_stride); + kernel_filter_nlm_construct_gramian(dx, dy, + difference_image, + buffer, + transform, rank, + XtWX, XtWY, + load_int4(rect), + load_int4(filter_window), + stride, f, + pass_stride); #endif } diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu index b856cbde45c..3b51bb41aed 100644 --- a/intern/cycles/kernel/kernels/cuda/filter.cu +++ b/intern/cycles/kernel/kernels/cuda/filter.cu @@ -64,6 +64,7 @@ kernel_cuda_filter_get_feature(int sample, int v_offset, float *mean, float *variance, + float scale, int4 prefilter_rect, int buffer_pass_stride, int buffer_denoising_offset) @@ -76,6 +77,7 @@ kernel_cuda_filter_get_feature(int sample, m_offset, v_offset, x, y, mean, variance, + scale, prefilter_rect, buffer_pass_stride, buffer_denoising_offset); @@ -84,6 +86,30 @@ kernel_cuda_filter_get_feature(int sample, extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_filter_write_feature(int sample, + int4 buffer_params, + int4 filter_area, + float *from, + float *buffer, + int out_offset, + int4 prefilter_rect) +{ + int x = blockDim.x*blockIdx.x + threadIdx.x; + int y = blockDim.y*blockIdx.y + threadIdx.y; + if(x < filter_area.z && y < filter_area.w) { + kernel_filter_write_feature(sample, + x + filter_area.x, + y + filter_area.y, + buffer_params, + from, + buffer, + out_offset, + prefilter_rect); + } +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_detect_outliers(float *image, float *variance, float *depth, @@ -136,6 +162,7 @@ extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image, const float *ccl_restrict variance_image, + const float *ccl_restrict scale_image, float *difference_image, int w, int h, @@ -152,9 +179,11 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image, kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w, weight_image, variance_image, + scale_image, difference_image + ofs, rect, stride, - channel_offset, a, k_2); + channel_offset, + a, k_2); } } @@ -210,6 +239,7 @@ kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image, int h, int stride, int pass_stride, + int channel_offset, int r, int f) { @@ -221,7 +251,9 @@ kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image, image, out_image, accum_image, - rect, stride, f); + rect, + channel_offset, + stride, f); } } diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl index a550f97f4eb..8a821ee281d 100644 --- a/intern/cycles/kernel/kernels/opencl/filter.cl +++ b/intern/cycles/kernel/kernels/opencl/filter.cl @@ -56,6 +56,7 @@ __kernel void kernel_ocl_filter_get_feature(int sample, int v_offset, ccl_global float *mean, ccl_global float *variance, + float scale, int4 prefilter_rect, int buffer_pass_stride, int buffer_denoising_offset) @@ -68,12 +69,35 @@ __kernel void kernel_ocl_filter_get_feature(int sample, m_offset, v_offset, x, y, mean, variance, + scale, prefilter_rect, buffer_pass_stride, buffer_denoising_offset); } } +__kernel void kernel_ocl_filter_write_feature(int sample, + int4 buffer_params, + int4 filter_area, + ccl_global float *from, + ccl_global float *buffer, + int out_offset, + int4 prefilter_rect) +{ + int x = get_global_id(0); + int y = get_global_id(1); + if(x < filter_area.z && y < filter_area.w) { + kernel_filter_write_feature(sample, + x + filter_area.x, + y + filter_area.y, + buffer_params, + from, + buffer, + out_offset, + prefilter_rect); + } +} + __kernel void kernel_ocl_filter_detect_outliers(ccl_global float *image, ccl_global float *variance, ccl_global float *depth, @@ -128,6 +152,7 @@ __kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_restrict weight_image, const ccl_global float *ccl_restrict variance_image, + const ccl_global float *ccl_restrict scale_image, ccl_global float *difference_image, int w, int h, @@ -144,9 +169,11 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_ kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w, weight_image, variance_image, + scale_image, difference_image + ofs, rect, stride, - channel_offset, a, k_2); + channel_offset, + a, k_2); } } @@ -196,6 +223,7 @@ __kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_re int h, int stride, int pass_stride, + int channel_offset, int r, int f) { @@ -207,7 +235,9 @@ __kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_re image, out_image, accum_image, - rect, stride, f); + rect, + channel_offset, + stride, f); } } |