diff options
author | Lukas Stockner <lukas.stockner@freenet.de> | 2018-10-06 21:39:01 +0300 |
---|---|---|
committer | Lukas Stockner <lukas.stockner@freenet.de> | 2018-10-06 22:49:54 +0300 |
commit | a0cc7bd961ef2cd501468dce08998992a88b3bed (patch) | |
tree | 90a26871e42e7aafc7d17c0172b4513a501896f8 /intern/cycles/kernel | |
parent | 6e08b01bd9ab87f63d8176bf2ec0b4f073a92ee3 (diff) |
Cycles: Implement vectorized NLM kernels for faster CPU denoising
Diffstat (limited to 'intern/cycles/kernel')
-rw-r--r-- | intern/cycles/kernel/filter/filter_nlm_cpu.h | 139 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/cpu/filter_cpu.h | 1 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h | 3 |
3 files changed, 95 insertions, 48 deletions
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h index e2da0fd872b..af73c0dadf2 100644 --- a/intern/cycles/kernel/filter/filter_nlm_cpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h @@ -16,6 +16,9 @@ CCL_NAMESPACE_BEGIN +#define load4_a(buf, ofs) (*((float4*) ((buf) + (ofs)))) +#define load4_u(buf, ofs) load_float4((buf)+(ofs)) + ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, const float *ccl_restrict weight_image, const float *ccl_restrict variance_image, @@ -26,20 +29,28 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, float a, float k_2) { + /* Strides need to be aligned to 16 bytes. */ + kernel_assert((stride % 4) == 0 && (channel_offset % 4) == 0); + + int aligned_lowx = rect.x & (~3); + const int numChannels = (channel_offset > 0)? 3 : 1; + const float4 channel_fac = make_float4(1.0f / numChannels); + for(int y = rect.y; y < rect.w; y++) { - for(int x = rect.x; x < rect.z; x++) { - float diff = 0.0f; - int numChannels = channel_offset? 3 : 1; - for(int c = 0; c < numChannels; c++) { - float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)]; - float pvar = variance_image[c*channel_offset + y*stride + x]; - float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)]; - diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar)); - } - if(numChannels > 1) { - diff *= 1.0f/numChannels; + int idx_p = y*stride + aligned_lowx; + int idx_q = (y+dy)*stride + aligned_lowx + dx; + for(int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) { + float4 diff = make_float4(0.0f); + for(int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) { + /* idx_p is guaranteed to be aligned, but idx_q isn't. */ + float4 color_p = load4_a(weight_image, idx_p + chan_ofs); + float4 color_q = load4_u(weight_image, idx_q + chan_ofs); + float4 cdiff = color_p - color_q; + float4 var_p = load4_a(variance_image, idx_p + chan_ofs); + float4 var_q = load4_u(variance_image, idx_q + chan_ofs); + diff += (cdiff*cdiff - a*(var_p + min(var_p, var_q))) / (make_float4(1e-8f) + k_2*(var_p+var_q)); } - difference_image[y*stride + x] = diff; + load4_a(difference_image, idx_p) = diff*channel_fac; } } } @@ -50,52 +61,77 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen int stride, int f) { - int aligned_lowx = rect.x / 4; - int aligned_highx = (rect.z + 3) / 4; + int aligned_lowx = round_down(rect.x, 4); for(int y = rect.y; y < rect.w; y++) { const int low = max(rect.y, y-f); const int high = min(rect.w, y+f+1); - for(int x = rect.x; x < rect.z; x++) { - out_image[y*stride + x] = 0.0f; + for(int x = aligned_lowx; x < rect.z; x += 4) { + load4_a(out_image, y*stride + x) = make_float4(0.0f); } for(int y1 = low; y1 < high; y1++) { - float4* out_image4 = (float4*)(out_image + y*stride); - float4* difference_image4 = (float4*)(difference_image + y1*stride); - for(int x = aligned_lowx; x < aligned_highx; x++) { - out_image4[x] += difference_image4[x]; + for(int x = aligned_lowx; x < rect.z; x += 4) { + load4_a(out_image, y*stride + x) += load4_a(difference_image, y1*stride + x); } } - for(int x = rect.x; x < rect.z; x++) { - out_image[y*stride + x] *= 1.0f/(high - low); + float fac = 1.0f/(high - low); + for(int x = aligned_lowx; x < rect.z; x += 4) { + load4_a(out_image, y*stride + x) *= fac; } } } -ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image, - float *out_image, - int4 rect, - int stride, - int f) +ccl_device_inline void nlm_blur_horizontal(const float *ccl_restrict difference_image, + float *out_image, + int4 rect, + int stride, + int f) { + int aligned_lowx = round_down(rect.x, 4); for(int y = rect.y; y < rect.w; y++) { - for(int x = rect.x; x < rect.z; x++) { - out_image[y*stride + x] = 0.0f; + for(int x = aligned_lowx; x < rect.z; x += 4) { + load4_a(out_image, y*stride + x) = make_float4(0.0f); } } + for(int dx = -f; dx <= f; dx++) { - int pos_dx = max(0, dx); - int neg_dx = min(0, dx); + aligned_lowx = round_down(rect.x - min(0, dx), 4); + int highx = rect.z - max(0, dx); + int4 lowx4 = make_int4(rect.x - min(0, dx)); + int4 highx4 = make_int4(rect.z - max(0, dx)); for(int y = rect.y; y < rect.w; y++) { - for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) { - out_image[y*stride + x] += difference_image[y*stride + x+dx]; + for(int x = aligned_lowx; x < highx; x += 4) { + int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3); + int4 active = (x4 >= lowx4) & (x4 < highx4); + + float4 diff = load4_u(difference_image, y*stride + x + dx); + load4_a(out_image, y*stride + x) += mask(active, diff); } } } + + aligned_lowx = round_down(rect.x, 4); for(int y = rect.y; y < rect.w; y++) { - for(int x = rect.x; x < rect.z; x++) { - const int low = max(rect.x, x-f); - const int high = min(rect.z, x+f+1); - out_image[y*stride + x] = fast_expf(-max(out_image[y*stride + x] * (1.0f/(high - low)), 0.0f)); + for(int x = aligned_lowx; x < rect.z; x += 4) { + float4 x4 = make_float4(x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); + float4 low = max(make_float4(rect.x), x4 - make_float4(f)); + float4 high = min(make_float4(rect.z), x4 + make_float4(f+1)); + load4_a(out_image, y*stride + x) *= rcp(high - low); + } + } +} + +ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image, + float *out_image, + int4 rect, + int stride, + int f) +{ + nlm_blur_horizontal(difference_image, out_image, rect, stride, f); + + int aligned_lowx = round_down(rect.x, 4); + for(int y = rect.y; y < rect.w; y++) { + for(int x = aligned_lowx; x < rect.z; x += 4) { + load4_a(out_image, y*stride + x) = fast_expf4(-max(load4_a(out_image, y*stride + x), make_float4(0.0f))); } } } @@ -103,23 +139,29 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict d ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, const float *ccl_restrict difference_image, const float *ccl_restrict image, + float *temp_image, float *out_image, float *accum_image, int4 rect, int stride, int f) { + nlm_blur_horizontal(difference_image, temp_image, rect, stride, f); + + int aligned_lowx = round_down(rect.x, 4); for(int y = rect.y; y < rect.w; y++) { - for(int x = rect.x; x < rect.z; x++) { - const int low = max(rect.x, x-f); - const int high = min(rect.z, x+f+1); - float sum = 0.0f; - for(int x1 = low; x1 < high; x1++) { - sum += difference_image[y*stride + x1]; - } - float weight = sum * (1.0f/(high - low)); - accum_image[y*stride + x] += weight; - out_image[y*stride + x] += weight*image[(y+dy)*stride + (x+dx)]; + for(int x = aligned_lowx; x < rect.z; x += 4) { + int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3); + int4 active = (x4 >= make_int4(rect.x)) & (x4 < make_int4(rect.z)); + + int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx); + + float4 weight = load4_a(temp_image, idx_p); + load4_a(accum_image, idx_p) += mask(active, weight); + + float4 val = load4_u(image, idx_q); + + load4_a(out_image, idx_p) += mask(active, weight*val); } } } @@ -177,4 +219,7 @@ ccl_device_inline void kernel_filter_nlm_normalize(float *out_image, } } +#undef load4_a +#undef load4_u + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h index b62aa9663ec..e036b53b810 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h @@ -95,6 +95,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, int dy, float *difference_image, float *image, + float *temp_image, float *out_image, float *accum_image, int* rect, diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h index 26777fdabb2..4c758711481 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h @@ -191,6 +191,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, int dy, float *difference_image, float *image, + float *temp_image, float *out_image, float *accum_image, int *rect, @@ -200,7 +201,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output); #else - kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), stride, f); + kernel_filter_nlm_update_output(dx, dy, difference_image, image, temp_image, out_image, accum_image, load_int4(rect), stride, f); #endif } |