From e12c08e8d170b7ca40f204a5b0423c23a9fbc2c1 Mon Sep 17 00:00:00 2001
From: Campbell Barton
Date: Wed, 17 Apr 2019 06:17:24 +0200
Subject: ClangFormat: apply to source, most of intern

Apply clang format as proposed in T53211.

For details on usage and instructions for migrating branches
without conflicts, see:
https://wiki.blender.org/wiki/Tools/ClangFormat
---
 intern/cycles/kernel/filter/filter_nlm_cpu.h | 334 ++++++++++++++-------------
 1 file changed, 171 insertions(+), 163 deletions(-)

(limited to 'intern/cycles/kernel/filter/filter_nlm_cpu.h')

diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
index 9eb3c603a4a..a94266a8786 100644
--- a/intern/cycles/kernel/filter/filter_nlm_cpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h
@@ -16,10 +16,11 @@
 
 CCL_NAMESPACE_BEGIN
 
-#define load4_a(buf, ofs) (*((float4*) ((buf) + (ofs))))
-#define load4_u(buf, ofs) load_float4((buf)+(ofs))
+#define load4_a(buf, ofs) (*((float4 *)((buf) + (ofs))))
+#define load4_u(buf, ofs) load_float4((buf) + (ofs))
 
-ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
+ccl_device_inline void kernel_filter_nlm_calc_difference(int dx,
+                                                         int dy,
                                                          const float *ccl_restrict weight_image,
                                                          const float *ccl_restrict variance_image,
                                                          const float *ccl_restrict scale_image,
@@ -31,122 +32,117 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
                                                          float a,
                                                          float k_2)
 {
-	/* Strides need to be aligned to 16 bytes. */
-	kernel_assert((stride % 4) == 0 && (channel_offset % 4) == 0);
-
-	int aligned_lowx = rect.x & (~3);
-	const int numChannels = (channel_offset > 0)? 3 : 1;
-	const float4 channel_fac = make_float4(1.0f / numChannels);
-
-	for(int y = rect.y; y < rect.w; y++) {
-		int idx_p = y*stride + aligned_lowx;
-		int idx_q = (y+dy)*stride + aligned_lowx + dx + frame_offset;
-		for(int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) {
-			float4 diff = make_float4(0.0f);
-			float4 scale_fac;
-			if(scale_image) {
-				scale_fac = clamp(load4_a(scale_image, idx_p) / load4_u(scale_image, idx_q),
-				                  make_float4(0.25f), make_float4(4.0f));
-			}
-			else {
-				scale_fac = make_float4(1.0f);
-			}
-			for(int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) {
-				/* idx_p is guaranteed to be aligned, but idx_q isn't. */
-				float4 color_p = load4_a(weight_image, idx_p + chan_ofs);
-				float4 color_q = scale_fac*load4_u(weight_image, idx_q + chan_ofs);
-				float4 cdiff = color_p - color_q;
-				float4 var_p = load4_a(variance_image, idx_p + chan_ofs);
-				float4 var_q = sqr(scale_fac)*load4_u(variance_image, idx_q + chan_ofs);
-				diff += (cdiff*cdiff - a*(var_p + min(var_p, var_q))) / (make_float4(1e-8f) + k_2*(var_p+var_q));
-			}
-			load4_a(difference_image, idx_p) = diff*channel_fac;
-		}
-	}
+  /* Strides need to be aligned to 16 bytes. */
+  kernel_assert((stride % 4) == 0 && (channel_offset % 4) == 0);
+
+  int aligned_lowx = rect.x & (~3);
+  const int numChannels = (channel_offset > 0) ? 3 : 1;
+  const float4 channel_fac = make_float4(1.0f / numChannels);
+
+  for (int y = rect.y; y < rect.w; y++) {
+    int idx_p = y * stride + aligned_lowx;
+    int idx_q = (y + dy) * stride + aligned_lowx + dx + frame_offset;
+    for (int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) {
+      float4 diff = make_float4(0.0f);
+      float4 scale_fac;
+      if (scale_image) {
+        scale_fac = clamp(load4_a(scale_image, idx_p) / load4_u(scale_image, idx_q),
+                          make_float4(0.25f),
+                          make_float4(4.0f));
+      }
+      else {
+        scale_fac = make_float4(1.0f);
+      }
+      for (int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) {
+        /* idx_p is guaranteed to be aligned, but idx_q isn't. */
+        float4 color_p = load4_a(weight_image, idx_p + chan_ofs);
+        float4 color_q = scale_fac * load4_u(weight_image, idx_q + chan_ofs);
+        float4 cdiff = color_p - color_q;
+        float4 var_p = load4_a(variance_image, idx_p + chan_ofs);
+        float4 var_q = sqr(scale_fac) * load4_u(variance_image, idx_q + chan_ofs);
+        diff += (cdiff * cdiff - a * (var_p + min(var_p, var_q))) /
+                (make_float4(1e-8f) + k_2 * (var_p + var_q));
+      }
+      load4_a(difference_image, idx_p) = diff * channel_fac;
+    }
+  }
 }
 
-ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict difference_image,
-                                              float *out_image,
-                                              int4 rect,
-                                              int stride,
-                                              int f)
+ccl_device_inline void kernel_filter_nlm_blur(
+    const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f)
 {
-	int aligned_lowx = round_down(rect.x, 4);
-	for(int y = rect.y; y < rect.w; y++) {
-		const int low = max(rect.y, y-f);
-		const int high = min(rect.w, y+f+1);
-		for(int x = aligned_lowx; x < rect.z; x += 4) {
-			load4_a(out_image, y*stride + x) = make_float4(0.0f);
-		}
-		for(int y1 = low; y1 < high; y1++) {
-			for(int x = aligned_lowx; x < rect.z; x += 4) {
-				load4_a(out_image, y*stride + x) += load4_a(difference_image, y1*stride + x);
-			}
-		}
-		float fac = 1.0f/(high - low);
-		for(int x = aligned_lowx; x < rect.z; x += 4) {
-			load4_a(out_image, y*stride + x) *= fac;
-		}
-	}
+  int aligned_lowx = round_down(rect.x, 4);
+  for (int y = rect.y; y < rect.w; y++) {
+    const int low = max(rect.y, y - f);
+    const int high = min(rect.w, y + f + 1);
+    for (int x = aligned_lowx; x < rect.z; x += 4) {
+      load4_a(out_image, y * stride + x) = make_float4(0.0f);
+    }
+    for (int y1 = low; y1 < high; y1++) {
+      for (int x = aligned_lowx; x < rect.z; x += 4) {
+        load4_a(out_image, y * stride + x) += load4_a(difference_image, y1 * stride + x);
+      }
+    }
+    float fac = 1.0f / (high - low);
+    for (int x = aligned_lowx; x < rect.z; x += 4) {
+      load4_a(out_image, y * stride + x) *= fac;
+    }
+  }
 }
 
-ccl_device_inline void nlm_blur_horizontal(const float *ccl_restrict difference_image,
-                                           float *out_image,
-                                           int4 rect,
-                                           int stride,
-                                           int f)
+ccl_device_inline void nlm_blur_horizontal(
+    const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f)
 {
-	int aligned_lowx = round_down(rect.x, 4);
-	for(int y = rect.y; y < rect.w; y++) {
-		for(int x = aligned_lowx; x < rect.z; x += 4) {
-			load4_a(out_image, y*stride + x) = make_float4(0.0f);
-		}
-	}
-
-	for(int dx = -f; dx <= f; dx++) {
-		aligned_lowx = round_down(rect.x - min(0, dx), 4);
-		int highx = rect.z - max(0, dx);
-		int4 lowx4 = make_int4(rect.x - min(0, dx));
-		int4 highx4 = make_int4(rect.z - max(0, dx));
-		for(int y = rect.y; y < rect.w; y++) {
-			for(int x = aligned_lowx; x < highx; x += 4) {
-				int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
-				int4 active = (x4 >= lowx4) & (x4 < highx4);
-
-				float4 diff = load4_u(difference_image, y*stride + x + dx);
-				load4_a(out_image, y*stride + x) += mask(active, diff);
-			}
-		}
-	}
-
-	aligned_lowx = round_down(rect.x, 4);
-	for(int y = rect.y; y < rect.w; y++) {
-		for(int x = aligned_lowx; x < rect.z; x += 4) {
-			float4 x4 = make_float4(x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f);
-			float4 low = max(make_float4(rect.x), x4 - make_float4(f));
-			float4 high = min(make_float4(rect.z), x4 + make_float4(f+1));
-			load4_a(out_image, y*stride + x) *= rcp(high - low);
-		}
-	}
+  int aligned_lowx = round_down(rect.x, 4);
+  for (int y = rect.y; y < rect.w; y++) {
+    for (int x = aligned_lowx; x < rect.z; x += 4) {
+      load4_a(out_image, y * stride + x) = make_float4(0.0f);
+    }
+  }
+
+  for (int dx = -f; dx <= f; dx++) {
+    aligned_lowx = round_down(rect.x - min(0, dx), 4);
+    int highx = rect.z - max(0, dx);
+    int4 lowx4 = make_int4(rect.x - min(0, dx));
+    int4 highx4 = make_int4(rect.z - max(0, dx));
+    for (int y = rect.y; y < rect.w; y++) {
+      for (int x = aligned_lowx; x < highx; x += 4) {
+        int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
+        int4 active = (x4 >= lowx4) & (x4 < highx4);
+
+        float4 diff = load4_u(difference_image, y * stride + x + dx);
+        load4_a(out_image, y * stride + x) += mask(active, diff);
+      }
+    }
+  }
+
+  aligned_lowx = round_down(rect.x, 4);
+  for (int y = rect.y; y < rect.w; y++) {
+    for (int x = aligned_lowx; x < rect.z; x += 4) {
+      float4 x4 = make_float4(x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f);
+      float4 low = max(make_float4(rect.x), x4 - make_float4(f));
+      float4 high = min(make_float4(rect.z), x4 + make_float4(f + 1));
+      load4_a(out_image, y * stride + x) *= rcp(high - low);
+    }
+  }
 }
 
-ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
-                                                     float *out_image,
-                                                     int4 rect,
-                                                     int stride,
-                                                     int f)
+ccl_device_inline void kernel_filter_nlm_calc_weight(
+    const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f)
 {
-	nlm_blur_horizontal(difference_image, out_image, rect, stride, f);
-
-	int aligned_lowx = round_down(rect.x, 4);
-	for(int y = rect.y; y < rect.w; y++) {
-		for(int x = aligned_lowx; x < rect.z; x += 4) {
-			load4_a(out_image, y*stride + x) = fast_expf4(-max(load4_a(out_image, y*stride + x), make_float4(0.0f)));
-		}
-	}
+  nlm_blur_horizontal(difference_image, out_image, rect, stride, f);
+
+  int aligned_lowx = round_down(rect.x, 4);
+  for (int y = rect.y; y < rect.w; y++) {
+    for (int x = aligned_lowx; x < rect.z; x += 4) {
+      load4_a(out_image, y * stride + x) = fast_expf4(
+          -max(load4_a(out_image, y * stride + x), make_float4(0.0f)));
+    }
+  }
 }
 
-ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
+ccl_device_inline void kernel_filter_nlm_update_output(int dx,
+                                                       int dy,
                                                        const float *ccl_restrict difference_image,
                                                        const float *ccl_restrict image,
                                                        float *temp_image,
@@ -157,33 +153,36 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
                                                        int stride,
                                                        int f)
 {
-	nlm_blur_horizontal(difference_image, temp_image, rect, stride, f);
+  nlm_blur_horizontal(difference_image, temp_image, rect, stride, f);
 
-	int aligned_lowx = round_down(rect.x, 4);
-	for(int y = rect.y; y < rect.w; y++) {
-		for(int x = aligned_lowx; x < rect.z; x += 4) {
-			int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
-			int4 active = (x4 >= make_int4(rect.x)) & (x4 < make_int4(rect.z));
+  int aligned_lowx = round_down(rect.x, 4);
+  for (int y = rect.y; y < rect.w; y++) {
+    for (int x = aligned_lowx; x < rect.z; x += 4) {
+      int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
+      int4 active = (x4 >= make_int4(rect.x)) & (x4 < make_int4(rect.z));
 
-			int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx);
+      int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx);
 
-			float4 weight = load4_a(temp_image, idx_p);
-			load4_a(accum_image, idx_p) += mask(active, weight);
+      float4 weight = load4_a(temp_image, idx_p);
+      load4_a(accum_image, idx_p) += mask(active, weight);
 
-			float4 val = load4_u(image, idx_q);
-			if(channel_offset) {
-				val += load4_u(image, idx_q + channel_offset);
-				val += load4_u(image, idx_q + 2*channel_offset);
-				val *= 1.0f/3.0f;
-			}
+      float4 val = load4_u(image, idx_q);
+      if (channel_offset) {
+        val += load4_u(image, idx_q + channel_offset);
+        val += load4_u(image, idx_q + 2 * channel_offset);
+        val *= 1.0f / 3.0f;
+      }
 
-			load4_a(out_image, idx_p) += mask(active, weight*val);
-		}
-	}
+      load4_a(out_image, idx_p) += mask(active, weight * val);
+    }
+  }
 }
 
-ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, int t,
-                                                           const float *ccl_restrict difference_image,
+ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx,
+                                                           int dy,
+                                                           int t,
+                                                           const float *ccl_restrict
+                                                               difference_image,
                                                            const float *ccl_restrict buffer,
                                                            float *transform,
                                                            int *rank,
@@ -191,40 +190,49 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, int t
                                                            float3 *XtWY,
                                                            int4 rect,
                                                            int4 filter_window,
-                                                           int stride, int f,
+                                                           int stride,
+                                                           int f,
                                                            int pass_stride,
                                                            int frame_offset,
                                                            bool use_time)
 {
-	int4 clip_area = rect_clip(rect, filter_window);
-	/* fx and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
-	for(int y = clip_area.y; y < clip_area.w; y++) {
-		for(int x = clip_area.x; x < clip_area.z; x++) {
-			const int low = max(rect.x, x-f);
-			const int high = min(rect.z, x+f+1);
-			float sum = 0.0f;
-			for(int x1 = low; x1 < high; x1++) {
-				sum += difference_image[y*stride + x1];
-			}
-			float weight = sum * (1.0f/(high - low));
-
-			int storage_ofs = coord_to_local_index(filter_window, x, y);
-			float *l_transform = transform + storage_ofs*TRANSFORM_SIZE;
-			float *l_XtWX = XtWX + storage_ofs*XTWX_SIZE;
-			float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE;
-			int *l_rank = rank + storage_ofs;
-
-			kernel_filter_construct_gramian(x, y, 1,
-			                                dx, dy, t,
-			                                stride,
-			                                pass_stride,
-			                                frame_offset,
-			                                use_time,
-			                                buffer,
-			                                l_transform, l_rank,
-			                                weight, l_XtWX, l_XtWY, 0);
-		}
-	}
+  int4 clip_area = rect_clip(rect, filter_window);
+  /* fx and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
+  for (int y = clip_area.y; y < clip_area.w; y++) {
+    for (int x = clip_area.x; x < clip_area.z; x++) {
+      const int low = max(rect.x, x - f);
+      const int high = min(rect.z, x + f + 1);
+      float sum = 0.0f;
+      for (int x1 = low; x1 < high; x1++) {
+        sum += difference_image[y * stride + x1];
+      }
+      float weight = sum * (1.0f / (high - low));
+
+      int storage_ofs = coord_to_local_index(filter_window, x, y);
+      float *l_transform = transform + storage_ofs * TRANSFORM_SIZE;
+      float *l_XtWX = XtWX + storage_ofs * XTWX_SIZE;
+      float3 *l_XtWY = XtWY + storage_ofs * XTWY_SIZE;
+      int *l_rank = rank + storage_ofs;
+
+      kernel_filter_construct_gramian(x,
+                                      y,
+                                      1,
+                                      dx,
+                                      dy,
+                                      t,
+                                      stride,
+                                      pass_stride,
+                                      frame_offset,
+                                      use_time,
+                                      buffer,
+                                      l_transform,
+                                      l_rank,
+                                      weight,
+                                      l_XtWX,
+                                      l_XtWY,
+                                      0);
+    }
+  }
 }
 
 ccl_device_inline void kernel_filter_nlm_normalize(float *out_image,
@@ -232,11 +240,11 @@ ccl_device_inline void kernel_filter_nlm_normalize(float *out_image,
                                                    int4 rect,
                                                    int w)
 {
-	for(int y = rect.y; y < rect.w; y++) {
-		for(int x = rect.x; x < rect.z; x++) {
-			out_image[y*w+x] /= accum_image[y*w+x];
-		}
-	}
+  for (int y = rect.y; y < rect.w; y++) {
+    for (int x = rect.x; x < rect.z; x++) {
+      out_image[y * w + x] /= accum_image[y * w + x];
+    }
+  }
 }
 
 #undef load4_a
--
cgit v1.2.3