diff options
author | Campbell Barton <ideasman42@gmail.com> | 2019-04-17 07:17:24 +0300 |
---|---|---|
committer | Campbell Barton <ideasman42@gmail.com> | 2019-04-17 07:21:24 +0300 |
commit | e12c08e8d170b7ca40f204a5b0423c23a9fbc2c1 (patch) | |
tree | 8cf3453d12edb177a218ef8009357518ec6cab6a /intern/cycles/kernel/filter | |
parent | b3dabc200a4b0399ec6b81f2ff2730d07b44fcaa (diff) |
ClangFormat: apply to source, most of intern
Apply clang format as proposed in T53211.
For details on usage and instructions for migrating branches
without conflicts, see:
https://wiki.blender.org/wiki/Tools/ClangFormat
Diffstat (limited to 'intern/cycles/kernel/filter')
-rw-r--r-- | intern/cycles/kernel/filter/filter.h | 6 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_defines.h | 75 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_features.h | 168 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_features_sse.h | 129 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_nlm_cpu.h | 334 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_nlm_gpu.h | 365 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_prefilter.h | 325 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_reconstruction.h | 142 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_transform.h | 177 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_transform_gpu.h | 186 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_transform_sse.h | 192 |
11 files changed, 1101 insertions, 998 deletions
diff --git a/intern/cycles/kernel/filter/filter.h b/intern/cycles/kernel/filter/filter.h index 4209d69ee73..b067e53a8bf 100644 --- a/intern/cycles/kernel/filter/filter.h +++ b/intern/cycles/kernel/filter/filter.h @@ -25,8 +25,8 @@ CCL_NAMESPACE_BEGIN -#define KERNEL_NAME_JOIN(x, y, z) x ## _ ## y ## _ ## z -#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name) +#define KERNEL_NAME_JOIN(x, y, z) x##_##y##_##z +#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name) #define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name) #define KERNEL_ARCH cpu @@ -49,4 +49,4 @@ CCL_NAMESPACE_BEGIN CCL_NAMESPACE_END -#endif /* __FILTER_H__ */ +#endif /* __FILTER_H__ */ diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h index cb04aac35f4..0e51eeef92f 100644 --- a/intern/cycles/kernel/filter/filter_defines.h +++ b/intern/cycles/kernel/filter/filter_defines.h @@ -18,59 +18,56 @@ #define __FILTER_DEFINES_H__ #define DENOISE_FEATURES 11 -#define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES) -#define XTWX_SIZE (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2) -#define XTWY_SIZE (DENOISE_FEATURES+1) +#define TRANSFORM_SIZE (DENOISE_FEATURES * DENOISE_FEATURES) +#define XTWX_SIZE (((DENOISE_FEATURES + 1) * (DENOISE_FEATURES + 2)) / 2) +#define XTWY_SIZE (DENOISE_FEATURES + 1) #define DENOISE_MAX_FRAMES 16 typedef struct TileInfo { - int offsets[9]; - int strides[9]; - int x[4]; - int y[4]; - int from_render; - int frames[DENOISE_MAX_FRAMES]; - int num_frames; - /* TODO(lukas): CUDA doesn't have uint64_t... */ + int offsets[9]; + int strides[9]; + int x[4]; + int y[4]; + int from_render; + int frames[DENOISE_MAX_FRAMES]; + int num_frames; + /* TODO(lukas): CUDA doesn't have uint64_t... */ #ifdef __KERNEL_OPENCL__ - ccl_global float *buffers[9]; + ccl_global float *buffers[9]; #else - long long int buffers[9]; + long long int buffers[9]; #endif } TileInfo; #ifdef __KERNEL_OPENCL__ -# define CCL_FILTER_TILE_INFO ccl_global TileInfo* tile_info, \ - ccl_global float *tile_buffer_1, \ - ccl_global float *tile_buffer_2, \ - ccl_global float *tile_buffer_3, \ - ccl_global float *tile_buffer_4, \ - ccl_global float *tile_buffer_5, \ - ccl_global float *tile_buffer_6, \ - ccl_global float *tile_buffer_7, \ - ccl_global float *tile_buffer_8, \ - ccl_global float *tile_buffer_9 -# define CCL_FILTER_TILE_INFO_ARG tile_info, \ - tile_buffer_1, tile_buffer_2, tile_buffer_3, \ - tile_buffer_4, tile_buffer_5, tile_buffer_6, \ - tile_buffer_7, tile_buffer_8, tile_buffer_9 -# define ccl_get_tile_buffer(id) (id == 0 ? tile_buffer_1 \ - : id == 1 ? tile_buffer_2 \ - : id == 2 ? tile_buffer_3 \ - : id == 3 ? tile_buffer_4 \ - : id == 4 ? tile_buffer_5 \ - : id == 5 ? tile_buffer_6 \ - : id == 6 ? tile_buffer_7 \ - : id == 7 ? tile_buffer_8 \ - : tile_buffer_9) +# define CCL_FILTER_TILE_INFO \ + ccl_global TileInfo *tile_info, ccl_global float *tile_buffer_1, \ + ccl_global float *tile_buffer_2, ccl_global float *tile_buffer_3, \ + ccl_global float *tile_buffer_4, ccl_global float *tile_buffer_5, \ + ccl_global float *tile_buffer_6, ccl_global float *tile_buffer_7, \ + ccl_global float *tile_buffer_8, ccl_global float *tile_buffer_9 +# define CCL_FILTER_TILE_INFO_ARG \ + tile_info, tile_buffer_1, tile_buffer_2, tile_buffer_3, tile_buffer_4, tile_buffer_5, \ + tile_buffer_6, tile_buffer_7, tile_buffer_8, tile_buffer_9 +# define ccl_get_tile_buffer(id) \ + (id == 0 ? tile_buffer_1 : \ + id == 1 ? \ + tile_buffer_2 : \ + id == 2 ? \ + tile_buffer_3 : \ + id == 3 ? tile_buffer_4 : \ + id == 4 ? tile_buffer_5 : \ + id == 5 ? tile_buffer_6 : \ + id == 6 ? tile_buffer_7 : \ + id == 7 ? tile_buffer_8 : tile_buffer_9) #else # ifdef __KERNEL_CUDA__ -# define CCL_FILTER_TILE_INFO ccl_global TileInfo* tile_info +# define CCL_FILTER_TILE_INFO ccl_global TileInfo *tile_info # else -# define CCL_FILTER_TILE_INFO TileInfo* tile_info +# define CCL_FILTER_TILE_INFO TileInfo *tile_info # endif # define ccl_get_tile_buffer(id) (tile_info->buffers[id]) #endif -#endif /* __FILTER_DEFINES_H__*/ +#endif /* __FILTER_DEFINES_H__*/ diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h index e1ea6487aa9..809ccfe8be6 100644 --- a/intern/cycles/kernel/filter/filter_features.h +++ b/intern/cycles/kernel/filter/filter_features.h @@ -14,22 +14,25 @@ * limitations under the License. */ - CCL_NAMESPACE_BEGIN +CCL_NAMESPACE_BEGIN #define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride] /* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).+ * pixel_buffer always points to the current pixel in the first pass. * Repeat the loop for every secondary frame if there are any. */ -#define FOR_PIXEL_WINDOW for(int frame = 0; frame < tile_info->num_frames; frame++) { \ - pixel.z = tile_info->frames[frame]; \ - pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x) + frame*frame_stride; \ - for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ - for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) { +#define FOR_PIXEL_WINDOW \ + for (int frame = 0; frame < tile_info->num_frames; frame++) { \ + pixel.z = tile_info->frames[frame]; \ + pixel_buffer = buffer + (low.y - rect.y) * buffer_w + (low.x - rect.x) + \ + frame * frame_stride; \ + for (pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ + for (pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) { -#define END_FOR_PIXEL_WINDOW } \ - pixel_buffer += buffer_w - (high.x - low.x); \ - } \ - } +#define END_FOR_PIXEL_WINDOW \ + } \ + pixel_buffer += buffer_w - (high.x - low.x); \ + } \ + } ccl_device_inline void filter_get_features(int3 pixel, const ccl_global float *ccl_restrict buffer, @@ -38,24 +41,24 @@ ccl_device_inline void filter_get_features(int3 pixel, const float *ccl_restrict mean, int pass_stride) { - features[0] = pixel.x; - features[1] = pixel.y; - features[2] = fabsf(ccl_get_feature(buffer, 0)); - features[3] = ccl_get_feature(buffer, 1); - features[4] = ccl_get_feature(buffer, 2); - features[5] = ccl_get_feature(buffer, 3); - features[6] = ccl_get_feature(buffer, 4); - features[7] = ccl_get_feature(buffer, 5); - features[8] = ccl_get_feature(buffer, 6); - features[9] = ccl_get_feature(buffer, 7); - if(use_time) { - features[10] = pixel.z; - } - if(mean) { - for(int i = 0; i < (use_time? 11 : 10); i++) { - features[i] -= mean[i]; - } - } + features[0] = pixel.x; + features[1] = pixel.y; + features[2] = fabsf(ccl_get_feature(buffer, 0)); + features[3] = ccl_get_feature(buffer, 1); + features[4] = ccl_get_feature(buffer, 2); + features[5] = ccl_get_feature(buffer, 3); + features[6] = ccl_get_feature(buffer, 4); + features[7] = ccl_get_feature(buffer, 5); + features[8] = ccl_get_feature(buffer, 6); + features[9] = ccl_get_feature(buffer, 7); + if (use_time) { + features[10] = pixel.z; + } + if (mean) { + for (int i = 0; i < (use_time ? 11 : 10); i++) { + features[i] -= mean[i]; + } + } } ccl_device_inline void filter_get_feature_scales(int3 pixel, @@ -65,38 +68,39 @@ ccl_device_inline void filter_get_feature_scales(int3 pixel, const float *ccl_restrict mean, int pass_stride) { - scales[0] = fabsf(pixel.x - mean[0]); - scales[1] = fabsf(pixel.y - mean[1]); - scales[2] = fabsf(fabsf(ccl_get_feature(buffer, 0)) - mean[2]); - scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3], - ccl_get_feature(buffer, 2) - mean[4], - ccl_get_feature(buffer, 3) - mean[5])); - scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]); - scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7], - ccl_get_feature(buffer, 6) - mean[8], - ccl_get_feature(buffer, 7) - mean[9])); - if(use_time) { - scales[6] = fabsf(pixel.z - mean[10]); - } + scales[0] = fabsf(pixel.x - mean[0]); + scales[1] = fabsf(pixel.y - mean[1]); + scales[2] = fabsf(fabsf(ccl_get_feature(buffer, 0)) - mean[2]); + scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3], + ccl_get_feature(buffer, 2) - mean[4], + ccl_get_feature(buffer, 3) - mean[5])); + scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]); + scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7], + ccl_get_feature(buffer, 6) - mean[8], + ccl_get_feature(buffer, 7) - mean[9])); + if (use_time) { + scales[6] = fabsf(pixel.z - mean[10]); + } } ccl_device_inline void filter_calculate_scale(float *scale, bool use_time) { - scale[0] = 1.0f/max(scale[0], 0.01f); - scale[1] = 1.0f/max(scale[1], 0.01f); - scale[2] = 1.0f/max(scale[2], 0.01f); - if(use_time) { - scale[10] = 1.0f/max(scale[6], 0.01f); - } - scale[6] = 1.0f/max(scale[4], 0.01f); - scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f); - scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f); + scale[0] = 1.0f / max(scale[0], 0.01f); + scale[1] = 1.0f / max(scale[1], 0.01f); + scale[2] = 1.0f / max(scale[2], 0.01f); + if (use_time) { + scale[10] = 1.0f / max(scale[6], 0.01f); + } + scale[6] = 1.0f / max(scale[4], 0.01f); + scale[7] = scale[8] = scale[9] = 1.0f / max(sqrtf(scale[5]), 0.01f); + scale[3] = scale[4] = scale[5] = 1.0f / max(sqrtf(scale[3]), 0.01f); } ccl_device_inline float3 filter_get_color(const ccl_global float *ccl_restrict buffer, int pass_stride) { - return make_float3(ccl_get_feature(buffer, 8), ccl_get_feature(buffer, 9), ccl_get_feature(buffer, 10)); + return make_float3( + ccl_get_feature(buffer, 8), ccl_get_feature(buffer, 9), ccl_get_feature(buffer, 10)); } ccl_device_inline void design_row_add(float *design_row, @@ -107,42 +111,44 @@ ccl_device_inline void design_row_add(float *design_row, float feature, int transform_row_stride) { - for(int i = 0; i < rank; i++) { - design_row[1+i] += transform[(row*transform_row_stride + i)*stride]*feature; - } + for (int i = 0; i < rank; i++) { + design_row[1 + i] += transform[(row * transform_row_stride + i) * stride] * feature; + } } /* Fill the design row. */ -ccl_device_inline void filter_get_design_row_transform(int3 p_pixel, - const ccl_global float *ccl_restrict p_buffer, - int3 q_pixel, - const ccl_global float *ccl_restrict q_buffer, - int pass_stride, - int rank, - float *design_row, - const ccl_global float *ccl_restrict transform, - int stride, - bool use_time) +ccl_device_inline void filter_get_design_row_transform( + int3 p_pixel, + const ccl_global float *ccl_restrict p_buffer, + int3 q_pixel, + const ccl_global float *ccl_restrict q_buffer, + int pass_stride, + int rank, + float *design_row, + const ccl_global float *ccl_restrict transform, + int stride, + bool use_time) { - int num_features = use_time? 11 : 10; + int num_features = use_time ? 11 : 10; - design_row[0] = 1.0f; - math_vector_zero(design_row+1, rank); + design_row[0] = 1.0f; + math_vector_zero(design_row + 1, rank); -#define DESIGN_ROW_ADD(I, F) design_row_add(design_row, rank, transform, stride, I, F, num_features); - DESIGN_ROW_ADD(0, q_pixel.x - p_pixel.x); - DESIGN_ROW_ADD(1, q_pixel.y - p_pixel.y); - DESIGN_ROW_ADD(2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0))); - DESIGN_ROW_ADD(3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1)); - DESIGN_ROW_ADD(4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2)); - DESIGN_ROW_ADD(5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3)); - DESIGN_ROW_ADD(6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4)); - DESIGN_ROW_ADD(7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5)); - DESIGN_ROW_ADD(8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6)); - DESIGN_ROW_ADD(9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7)); - if(use_time) { - DESIGN_ROW_ADD(10, q_pixel.z - p_pixel.z) - } +#define DESIGN_ROW_ADD(I, F) \ + design_row_add(design_row, rank, transform, stride, I, F, num_features); + DESIGN_ROW_ADD(0, q_pixel.x - p_pixel.x); + DESIGN_ROW_ADD(1, q_pixel.y - p_pixel.y); + DESIGN_ROW_ADD(2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0))); + DESIGN_ROW_ADD(3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1)); + DESIGN_ROW_ADD(4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2)); + DESIGN_ROW_ADD(5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3)); + DESIGN_ROW_ADD(6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4)); + DESIGN_ROW_ADD(7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5)); + DESIGN_ROW_ADD(8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6)); + DESIGN_ROW_ADD(9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7)); + if (use_time) { + DESIGN_ROW_ADD(10, q_pixel.z - p_pixel.z) + } #undef DESIGN_ROW_ADD } diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h index 5dd001ffb93..1e0d6e93453 100644 --- a/intern/cycles/kernel/filter/filter_features_sse.h +++ b/intern/cycles/kernel/filter/filter_features_sse.h @@ -22,22 +22,27 @@ CCL_NAMESPACE_BEGIN * pixel_buffer always points to the first of the 4 current pixel in the first pass. * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. * Repeat the loop for every secondary frame if there are any. */ -#define FOR_PIXEL_WINDOW_SSE for(int frame = 0; frame < tile_info->num_frames; frame++) { \ - pixel.z = tile_info->frames[frame]; \ - pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x) + frame*frame_stride; \ - float4 t4 = make_float4(pixel.z); \ - for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ - float4 y4 = make_float4(pixel.y); \ - for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \ - float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \ - int4 active_pixels = x4 < make_float4(high.x); +#define FOR_PIXEL_WINDOW_SSE \ + for (int frame = 0; frame < tile_info->num_frames; frame++) { \ + pixel.z = tile_info->frames[frame]; \ + pixel_buffer = buffer + (low.y - rect.y) * buffer_w + (low.x - rect.x) + \ + frame * frame_stride; \ + float4 t4 = make_float4(pixel.z); \ + for (pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ + float4 y4 = make_float4(pixel.y); \ + for (pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \ + float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \ + int4 active_pixels = x4 < make_float4(high.x); -#define END_FOR_PIXEL_WINDOW_SSE } \ - pixel_buffer += buffer_w - (high.x - low.x); \ - } \ - } +#define END_FOR_PIXEL_WINDOW_SSE \ + } \ + pixel_buffer += buffer_w - (high.x - low.x); \ + } \ + } -ccl_device_inline void filter_get_features_sse(float4 x, float4 y, float4 t, +ccl_device_inline void filter_get_features_sse(float4 x, + float4 y, + float4 t, int4 active_pixels, const float *ccl_restrict buffer, float4 *features, @@ -45,33 +50,35 @@ ccl_device_inline void filter_get_features_sse(float4 x, float4 y, float4 t, const float4 *ccl_restrict mean, int pass_stride) { - int num_features = use_time? 11 : 10; + int num_features = use_time ? 11 : 10; - features[0] = x; - features[1] = y; - features[2] = fabs(ccl_get_feature_sse(0)); - features[3] = ccl_get_feature_sse(1); - features[4] = ccl_get_feature_sse(2); - features[5] = ccl_get_feature_sse(3); - features[6] = ccl_get_feature_sse(4); - features[7] = ccl_get_feature_sse(5); - features[8] = ccl_get_feature_sse(6); - features[9] = ccl_get_feature_sse(7); - if(use_time) { - features[10] = t; - } + features[0] = x; + features[1] = y; + features[2] = fabs(ccl_get_feature_sse(0)); + features[3] = ccl_get_feature_sse(1); + features[4] = ccl_get_feature_sse(2); + features[5] = ccl_get_feature_sse(3); + features[6] = ccl_get_feature_sse(4); + features[7] = ccl_get_feature_sse(5); + features[8] = ccl_get_feature_sse(6); + features[9] = ccl_get_feature_sse(7); + if (use_time) { + features[10] = t; + } - if(mean) { - for(int i = 0; i < num_features; i++) { - features[i] = features[i] - mean[i]; - } - } - for(int i = 0; i < num_features; i++) { - features[i] = mask(active_pixels, features[i]); - } + if (mean) { + for (int i = 0; i < num_features; i++) { + features[i] = features[i] - mean[i]; + } + } + for (int i = 0; i < num_features; i++) { + features[i] = mask(active_pixels, features[i]); + } } -ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, float4 t, +ccl_device_inline void filter_get_feature_scales_sse(float4 x, + float4 y, + float4 t, int4 active_pixels, const float *ccl_restrict buffer, float4 *scales, @@ -79,36 +86,34 @@ ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, float4 const float4 *ccl_restrict mean, int pass_stride) { - scales[0] = fabs(x - mean[0]); - scales[1] = fabs(y - mean[1]); - scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]); - scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) + - sqr(ccl_get_feature_sse(2) - mean[4]) + - sqr(ccl_get_feature_sse(3) - mean[5]); - scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]); - scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) + - sqr(ccl_get_feature_sse(6) - mean[8]) + - sqr(ccl_get_feature_sse(7) - mean[9]); - if(use_time) { - scales[6] = fabs(t - mean[10]); - } + scales[0] = fabs(x - mean[0]); + scales[1] = fabs(y - mean[1]); + scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]); + scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) + sqr(ccl_get_feature_sse(2) - mean[4]) + + sqr(ccl_get_feature_sse(3) - mean[5]); + scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]); + scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) + sqr(ccl_get_feature_sse(6) - mean[8]) + + sqr(ccl_get_feature_sse(7) - mean[9]); + if (use_time) { + scales[6] = fabs(t - mean[10]); + } - for(int i = 0; i < (use_time? 7 : 6); i++) - scales[i] = mask(active_pixels, scales[i]); + for (int i = 0; i < (use_time ? 7 : 6); i++) + scales[i] = mask(active_pixels, scales[i]); } ccl_device_inline void filter_calculate_scale_sse(float4 *scale, bool use_time) { - scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f))); - scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f))); - scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f))); - if(use_time) { - scale[10] = rcp(max(reduce_max(scale[6]), make_float4(0.01f)));; - } - scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f))); - scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f))); - scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f))); + scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f))); + scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f))); + scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f))); + if (use_time) { + scale[10] = rcp(max(reduce_max(scale[6]), make_float4(0.01f))); + ; + } + scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f))); + scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f))); + scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f))); } - CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h index 9eb3c603a4a..a94266a8786 100644 --- a/intern/cycles/kernel/filter/filter_nlm_cpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h @@ -16,10 +16,11 @@ CCL_NAMESPACE_BEGIN -#define load4_a(buf, ofs) (*((float4*) ((buf) + (ofs)))) -#define load4_u(buf, ofs) load_float4((buf)+(ofs)) +#define load4_a(buf, ofs) (*((float4 *)((buf) + (ofs)))) +#define load4_u(buf, ofs) load_float4((buf) + (ofs)) -ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, +ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, + int dy, const float *ccl_restrict weight_image, const float *ccl_restrict variance_image, const float *ccl_restrict scale_image, @@ -31,122 +32,117 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, float a, float k_2) { - /* Strides need to be aligned to 16 bytes. */ - kernel_assert((stride % 4) == 0 && (channel_offset % 4) == 0); - - int aligned_lowx = rect.x & (~3); - const int numChannels = (channel_offset > 0)? 3 : 1; - const float4 channel_fac = make_float4(1.0f / numChannels); - - for(int y = rect.y; y < rect.w; y++) { - int idx_p = y*stride + aligned_lowx; - int idx_q = (y+dy)*stride + aligned_lowx + dx + frame_offset; - for(int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) { - float4 diff = make_float4(0.0f); - float4 scale_fac; - if(scale_image) { - scale_fac = clamp(load4_a(scale_image, idx_p) / load4_u(scale_image, idx_q), - make_float4(0.25f), make_float4(4.0f)); - } - else { - scale_fac = make_float4(1.0f); - } - for(int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) { - /* idx_p is guaranteed to be aligned, but idx_q isn't. */ - float4 color_p = load4_a(weight_image, idx_p + chan_ofs); - float4 color_q = scale_fac*load4_u(weight_image, idx_q + chan_ofs); - float4 cdiff = color_p - color_q; - float4 var_p = load4_a(variance_image, idx_p + chan_ofs); - float4 var_q = sqr(scale_fac)*load4_u(variance_image, idx_q + chan_ofs); - diff += (cdiff*cdiff - a*(var_p + min(var_p, var_q))) / (make_float4(1e-8f) + k_2*(var_p+var_q)); - } - load4_a(difference_image, idx_p) = diff*channel_fac; - } - } + /* Strides need to be aligned to 16 bytes. */ + kernel_assert((stride % 4) == 0 && (channel_offset % 4) == 0); + + int aligned_lowx = rect.x & (~3); + const int numChannels = (channel_offset > 0) ? 3 : 1; + const float4 channel_fac = make_float4(1.0f / numChannels); + + for (int y = rect.y; y < rect.w; y++) { + int idx_p = y * stride + aligned_lowx; + int idx_q = (y + dy) * stride + aligned_lowx + dx + frame_offset; + for (int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) { + float4 diff = make_float4(0.0f); + float4 scale_fac; + if (scale_image) { + scale_fac = clamp(load4_a(scale_image, idx_p) / load4_u(scale_image, idx_q), + make_float4(0.25f), + make_float4(4.0f)); + } + else { + scale_fac = make_float4(1.0f); + } + for (int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) { + /* idx_p is guaranteed to be aligned, but idx_q isn't. */ + float4 color_p = load4_a(weight_image, idx_p + chan_ofs); + float4 color_q = scale_fac * load4_u(weight_image, idx_q + chan_ofs); + float4 cdiff = color_p - color_q; + float4 var_p = load4_a(variance_image, idx_p + chan_ofs); + float4 var_q = sqr(scale_fac) * load4_u(variance_image, idx_q + chan_ofs); + diff += (cdiff * cdiff - a * (var_p + min(var_p, var_q))) / + (make_float4(1e-8f) + k_2 * (var_p + var_q)); + } + load4_a(difference_image, idx_p) = diff * channel_fac; + } + } } -ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict difference_image, - float *out_image, - int4 rect, - int stride, - int f) +ccl_device_inline void kernel_filter_nlm_blur( + const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f) { - int aligned_lowx = round_down(rect.x, 4); - for(int y = rect.y; y < rect.w; y++) { - const int low = max(rect.y, y-f); - const int high = min(rect.w, y+f+1); - for(int x = aligned_lowx; x < rect.z; x += 4) { - load4_a(out_image, y*stride + x) = make_float4(0.0f); - } - for(int y1 = low; y1 < high; y1++) { - for(int x = aligned_lowx; x < rect.z; x += 4) { - load4_a(out_image, y*stride + x) += load4_a(difference_image, y1*stride + x); - } - } - float fac = 1.0f/(high - low); - for(int x = aligned_lowx; x < rect.z; x += 4) { - load4_a(out_image, y*stride + x) *= fac; - } - } + int aligned_lowx = round_down(rect.x, 4); + for (int y = rect.y; y < rect.w; y++) { + const int low = max(rect.y, y - f); + const int high = min(rect.w, y + f + 1); + for (int x = aligned_lowx; x < rect.z; x += 4) { + load4_a(out_image, y * stride + x) = make_float4(0.0f); + } + for (int y1 = low; y1 < high; y1++) { + for (int x = aligned_lowx; x < rect.z; x += 4) { + load4_a(out_image, y * stride + x) += load4_a(difference_image, y1 * stride + x); + } + } + float fac = 1.0f / (high - low); + for (int x = aligned_lowx; x < rect.z; x += 4) { + load4_a(out_image, y * stride + x) *= fac; + } + } } -ccl_device_inline void nlm_blur_horizontal(const float *ccl_restrict difference_image, - float *out_image, - int4 rect, - int stride, - int f) +ccl_device_inline void nlm_blur_horizontal( + const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f) { - int aligned_lowx = round_down(rect.x, 4); - for(int y = rect.y; y < rect.w; y++) { - for(int x = aligned_lowx; x < rect.z; x += 4) { - load4_a(out_image, y*stride + x) = make_float4(0.0f); - } - } - - for(int dx = -f; dx <= f; dx++) { - aligned_lowx = round_down(rect.x - min(0, dx), 4); - int highx = rect.z - max(0, dx); - int4 lowx4 = make_int4(rect.x - min(0, dx)); - int4 highx4 = make_int4(rect.z - max(0, dx)); - for(int y = rect.y; y < rect.w; y++) { - for(int x = aligned_lowx; x < highx; x += 4) { - int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3); - int4 active = (x4 >= lowx4) & (x4 < highx4); - - float4 diff = load4_u(difference_image, y*stride + x + dx); - load4_a(out_image, y*stride + x) += mask(active, diff); - } - } - } - - aligned_lowx = round_down(rect.x, 4); - for(int y = rect.y; y < rect.w; y++) { - for(int x = aligned_lowx; x < rect.z; x += 4) { - float4 x4 = make_float4(x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); - float4 low = max(make_float4(rect.x), x4 - make_float4(f)); - float4 high = min(make_float4(rect.z), x4 + make_float4(f+1)); - load4_a(out_image, y*stride + x) *= rcp(high - low); - } - } + int aligned_lowx = round_down(rect.x, 4); + for (int y = rect.y; y < rect.w; y++) { + for (int x = aligned_lowx; x < rect.z; x += 4) { + load4_a(out_image, y * stride + x) = make_float4(0.0f); + } + } + + for (int dx = -f; dx <= f; dx++) { + aligned_lowx = round_down(rect.x - min(0, dx), 4); + int highx = rect.z - max(0, dx); + int4 lowx4 = make_int4(rect.x - min(0, dx)); + int4 highx4 = make_int4(rect.z - max(0, dx)); + for (int y = rect.y; y < rect.w; y++) { + for (int x = aligned_lowx; x < highx; x += 4) { + int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3); + int4 active = (x4 >= lowx4) & (x4 < highx4); + + float4 diff = load4_u(difference_image, y * stride + x + dx); + load4_a(out_image, y * stride + x) += mask(active, diff); + } + } + } + + aligned_lowx = round_down(rect.x, 4); + for (int y = rect.y; y < rect.w; y++) { + for (int x = aligned_lowx; x < rect.z; x += 4) { + float4 x4 = make_float4(x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); + float4 low = max(make_float4(rect.x), x4 - make_float4(f)); + float4 high = min(make_float4(rect.z), x4 + make_float4(f + 1)); + load4_a(out_image, y * stride + x) *= rcp(high - low); + } + } } -ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image, - float *out_image, - int4 rect, - int stride, - int f) +ccl_device_inline void kernel_filter_nlm_calc_weight( + const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f) { - nlm_blur_horizontal(difference_image, out_image, rect, stride, f); - - int aligned_lowx = round_down(rect.x, 4); - for(int y = rect.y; y < rect.w; y++) { - for(int x = aligned_lowx; x < rect.z; x += 4) { - load4_a(out_image, y*stride + x) = fast_expf4(-max(load4_a(out_image, y*stride + x), make_float4(0.0f))); - } - } + nlm_blur_horizontal(difference_image, out_image, rect, stride, f); + + int aligned_lowx = round_down(rect.x, 4); + for (int y = rect.y; y < rect.w; y++) { + for (int x = aligned_lowx; x < rect.z; x += 4) { + load4_a(out_image, y * stride + x) = fast_expf4( + -max(load4_a(out_image, y * stride + x), make_float4(0.0f))); + } + } } -ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, +ccl_device_inline void kernel_filter_nlm_update_output(int dx, + int dy, const float *ccl_restrict difference_image, const float *ccl_restrict image, float *temp_image, @@ -157,33 +153,36 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, int stride, int f) { - nlm_blur_horizontal(difference_image, temp_image, rect, stride, f); + nlm_blur_horizontal(difference_image, temp_image, rect, stride, f); - int aligned_lowx = round_down(rect.x, 4); - for(int y = rect.y; y < rect.w; y++) { - for(int x = aligned_lowx; x < rect.z; x += 4) { - int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3); - int4 active = (x4 >= make_int4(rect.x)) & (x4 < make_int4(rect.z)); + int aligned_lowx = round_down(rect.x, 4); + for (int y = rect.y; y < rect.w; y++) { + for (int x = aligned_lowx; x < rect.z; x += 4) { + int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3); + int4 active = (x4 >= make_int4(rect.x)) & (x4 < make_int4(rect.z)); - int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx); + int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx); - float4 weight = load4_a(temp_image, idx_p); - load4_a(accum_image, idx_p) += mask(active, weight); + float4 weight = load4_a(temp_image, idx_p); + load4_a(accum_image, idx_p) += mask(active, weight); - float4 val = load4_u(image, idx_q); - if(channel_offset) { - val += load4_u(image, idx_q + channel_offset); - val += load4_u(image, idx_q + 2*channel_offset); - val *= 1.0f/3.0f; - } + float4 val = load4_u(image, idx_q); + if (channel_offset) { + val += load4_u(image, idx_q + channel_offset); + val += load4_u(image, idx_q + 2 * channel_offset); + val *= 1.0f / 3.0f; + } - load4_a(out_image, idx_p) += mask(active, weight*val); - } - } + load4_a(out_image, idx_p) += mask(active, weight * val); + } + } } -ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, int t, - const float *ccl_restrict difference_image, +ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, + int dy, + int t, + const float *ccl_restrict + difference_image, const float *ccl_restrict buffer, float *transform, int *rank, @@ -191,40 +190,49 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, int t float3 *XtWY, int4 rect, int4 filter_window, - int stride, int f, + int stride, + int f, int pass_stride, int frame_offset, bool use_time) { - int4 clip_area = rect_clip(rect, filter_window); - /* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */ - for(int y = clip_area.y; y < clip_area.w; y++) { - for(int x = clip_area.x; x < clip_area.z; x++) { - const int low = max(rect.x, x-f); - const int high = min(rect.z, x+f+1); - float sum = 0.0f; - for(int x1 = low; x1 < high; x1++) { - sum += difference_image[y*stride + x1]; - } - float weight = sum * (1.0f/(high - low)); - - int storage_ofs = coord_to_local_index(filter_window, x, y); - float *l_transform = transform + storage_ofs*TRANSFORM_SIZE; - float *l_XtWX = XtWX + storage_ofs*XTWX_SIZE; - float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE; - int *l_rank = rank + storage_ofs; - - kernel_filter_construct_gramian(x, y, 1, - dx, dy, t, - stride, - pass_stride, - frame_offset, - use_time, - buffer, - l_transform, l_rank, - weight, l_XtWX, l_XtWY, 0); - } - } + int4 clip_area = rect_clip(rect, filter_window); + /* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */ + for (int y = clip_area.y; y < clip_area.w; y++) { + for (int x = clip_area.x; x < clip_area.z; x++) { + const int low = max(rect.x, x - f); + const int high = min(rect.z, x + f + 1); + float sum = 0.0f; + for (int x1 = low; x1 < high; x1++) { + sum += difference_image[y * stride + x1]; + } + float weight = sum * (1.0f / (high - low)); + + int storage_ofs = coord_to_local_index(filter_window, x, y); + float *l_transform = transform + storage_ofs * TRANSFORM_SIZE; + float *l_XtWX = XtWX + storage_ofs * XTWX_SIZE; + float3 *l_XtWY = XtWY + storage_ofs * XTWY_SIZE; + int *l_rank = rank + storage_ofs; + + kernel_filter_construct_gramian(x, + y, + 1, + dx, + dy, + t, + stride, + pass_stride, + frame_offset, + use_time, + buffer, + l_transform, + l_rank, + weight, + l_XtWX, + l_XtWY, + 0); + } + } } ccl_device_inline void kernel_filter_nlm_normalize(float *out_image, @@ -232,11 +240,11 @@ ccl_device_inline void kernel_filter_nlm_normalize(float *out_image, int4 rect, int w) { - for(int y = rect.y; y < rect.w; y++) { - for(int x = rect.x; x < rect.z; x++) { - out_image[y*w+x] /= accum_image[y*w+x]; - } - } + for (int y = rect.y; y < rect.w; y++) { + for (int x = rect.x; x < rect.z; x++) { + out_image[y * w + x] /= accum_image[y * w + x]; + } + } } #undef load4_a diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h index 12636393243..650c743f34f 100644 --- a/intern/cycles/kernel/filter/filter_nlm_gpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h @@ -24,203 +24,232 @@ CCL_NAMESPACE_BEGIN * Window is the rect that should be processed. * co is filled with (x, y, dx, dy). */ -ccl_device_inline bool get_nlm_coords_window(int w, int h, int r, int stride, - int4 *rect, int4 *co, int *ofs, - int4 window) +ccl_device_inline bool get_nlm_coords_window( + int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs, int4 window) { - /* Determine the pixel offset that this thread should apply. */ - int s = 2*r+1; - int si = ccl_global_id(1); - int sx = si % s; - int sy = si / s; - if(sy >= s) { - return false; - } - - /* Pixels still need to lie inside the denoising buffer after applying the offset, - * so determine the area for which this is the case. */ - int dx = sx - r; - int dy = sy - r; - - *rect = make_int4(max(0, -dx), max(0, -dy), - w - max(0, dx), h - max(0, dy)); - - /* Find the intersection of the area that we want to process (window) and the area - * that can be processed (rect) to get the final area for this offset. */ - int4 clip_area = rect_clip(window, *rect); - - /* If the radius is larger than one of the sides of the window, - * there will be shifts for which there is no usable pixel at all. */ - if(!rect_is_valid(clip_area)) { - return false; - } - - /* Map the linear thread index to pixels inside the clip area. */ - int x, y; - if(!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) { - return false; - } - - *co = make_int4(x, y, dx, dy); - - *ofs = (sy*s + sx) * stride; - - return true; + /* Determine the pixel offset that this thread should apply. */ + int s = 2 * r + 1; + int si = ccl_global_id(1); + int sx = si % s; + int sy = si / s; + if (sy >= s) { + return false; + } + + /* Pixels still need to lie inside the denoising buffer after applying the offset, + * so determine the area for which this is the case. */ + int dx = sx - r; + int dy = sy - r; + + *rect = make_int4(max(0, -dx), max(0, -dy), w - max(0, dx), h - max(0, dy)); + + /* Find the intersection of the area that we want to process (window) and the area + * that can be processed (rect) to get the final area for this offset. */ + int4 clip_area = rect_clip(window, *rect); + + /* If the radius is larger than one of the sides of the window, + * there will be shifts for which there is no usable pixel at all. */ + if (!rect_is_valid(clip_area)) { + return false; + } + + /* Map the linear thread index to pixels inside the clip area. */ + int x, y; + if (!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) { + return false; + } + + *co = make_int4(x, y, dx, dy); + + *ofs = (sy * s + sx) * stride; + + return true; } -ccl_device_inline bool get_nlm_coords(int w, int h, int r, int stride, - int4 *rect, int4 *co, int *ofs) +ccl_device_inline bool get_nlm_coords( + int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs) { - return get_nlm_coords_window(w, h, r, stride, rect, co, ofs, make_int4(0, 0, w, h)); + return get_nlm_coords_window(w, h, r, stride, rect, co, ofs, make_int4(0, 0, w, h)); } -ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y, - int dx, int dy, - const ccl_global float *ccl_restrict weight_image, - const ccl_global float *ccl_restrict variance_image, - const ccl_global float *ccl_restrict scale_image, - ccl_global float *difference_image, - int4 rect, int stride, - int channel_offset, - int frame_offset, - float a, float k_2) +ccl_device_inline void kernel_filter_nlm_calc_difference( + int x, + int y, + int dx, + int dy, + const ccl_global float *ccl_restrict weight_image, + const ccl_global float *ccl_restrict variance_image, + const ccl_global float *ccl_restrict scale_image, + ccl_global float *difference_image, + int4 rect, + int stride, + int channel_offset, + int frame_offset, + float a, + float k_2) { - int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx) + frame_offset; - int numChannels = channel_offset? 3 : 1; - - float diff = 0.0f; - float scale_fac = 1.0f; - if(scale_image) { - scale_fac = clamp(scale_image[idx_p] / scale_image[idx_q], 0.25f, 4.0f); - } - - for(int c = 0; c < numChannels; c++, idx_p += channel_offset, idx_q += channel_offset) { - float cdiff = weight_image[idx_p] - scale_fac*weight_image[idx_q]; - float pvar = variance_image[idx_p]; - float qvar = sqr(scale_fac)*variance_image[idx_q]; - diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar)); - } - if(numChannels > 1) { - diff *= 1.0f/numChannels; - } - difference_image[y*stride + x] = diff; + int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx) + frame_offset; + int numChannels = channel_offset ? 3 : 1; + + float diff = 0.0f; + float scale_fac = 1.0f; + if (scale_image) { + scale_fac = clamp(scale_image[idx_p] / scale_image[idx_q], 0.25f, 4.0f); + } + + for (int c = 0; c < numChannels; c++, idx_p += channel_offset, idx_q += channel_offset) { + float cdiff = weight_image[idx_p] - scale_fac * weight_image[idx_q]; + float pvar = variance_image[idx_p]; + float qvar = sqr(scale_fac) * variance_image[idx_q]; + diff += (cdiff * cdiff - a * (pvar + min(pvar, qvar))) / (1e-8f + k_2 * (pvar + qvar)); + } + if (numChannels > 1) { + diff *= 1.0f / numChannels; + } + difference_image[y * stride + x] = diff; } -ccl_device_inline void kernel_filter_nlm_blur(int x, int y, - const ccl_global float *ccl_restrict difference_image, +ccl_device_inline void kernel_filter_nlm_blur(int x, + int y, + const ccl_global float *ccl_restrict + difference_image, ccl_global float *out_image, - int4 rect, int stride, int f) + int4 rect, + int stride, + int f) { - float sum = 0.0f; - const int low = max(rect.y, y-f); - const int high = min(rect.w, y+f+1); - for(int y1 = low; y1 < high; y1++) { - sum += difference_image[y1*stride + x]; - } - sum *= 1.0f/(high-low); - out_image[y*stride + x] = sum; + float sum = 0.0f; + const int low = max(rect.y, y - f); + const int high = min(rect.w, y + f + 1); + for (int y1 = low; y1 < high; y1++) { + sum += difference_image[y1 * stride + x]; + } + sum *= 1.0f / (high - low); + out_image[y * stride + x] = sum; } -ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y, - const ccl_global float *ccl_restrict difference_image, +ccl_device_inline void kernel_filter_nlm_calc_weight(int x, + int y, + const ccl_global float *ccl_restrict + difference_image, ccl_global float *out_image, - int4 rect, int stride, int f) + int4 rect, + int stride, + int f) { - float sum = 0.0f; - const int low = max(rect.x, x-f); - const int high = min(rect.z, x+f+1); - for(int x1 = low; x1 < high; x1++) { - sum += difference_image[y*stride + x1]; - } - sum *= 1.0f/(high-low); - out_image[y*stride + x] = fast_expf(-max(sum, 0.0f)); + float sum = 0.0f; + const int low = max(rect.x, x - f); + const int high = min(rect.z, x + f + 1); + for (int x1 = low; x1 < high; x1++) { + sum += difference_image[y * stride + x1]; + } + sum *= 1.0f / (high - low); + out_image[y * stride + x] = fast_expf(-max(sum, 0.0f)); } -ccl_device_inline void kernel_filter_nlm_update_output(int x, int y, - int dx, int dy, - const ccl_global float *ccl_restrict difference_image, +ccl_device_inline void kernel_filter_nlm_update_output(int x, + int y, + int dx, + int dy, + const ccl_global float *ccl_restrict + difference_image, const ccl_global float *ccl_restrict image, ccl_global float *out_image, ccl_global float *accum_image, - int4 rect, int channel_offset, - int stride, int f) + int4 rect, + int channel_offset, + int stride, + int f) { - float sum = 0.0f; - const int low = max(rect.x, x-f); - const int high = min(rect.z, x+f+1); - for(int x1 = low; x1 < high; x1++) { - sum += difference_image[y*stride + x1]; - } - sum *= 1.0f/(high-low); - - int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx); - if(out_image) { - atomic_add_and_fetch_float(accum_image + idx_p, sum); - - float val = image[idx_q]; - if(channel_offset) { - val += image[idx_q + channel_offset]; - val += image[idx_q + 2*channel_offset]; - val *= 1.0f/3.0f; - } - atomic_add_and_fetch_float(out_image + idx_p, sum*val); - } - else { - accum_image[idx_p] = sum; - } + float sum = 0.0f; + const int low = max(rect.x, x - f); + const int high = min(rect.z, x + f + 1); + for (int x1 = low; x1 < high; x1++) { + sum += difference_image[y * stride + x1]; + } + sum *= 1.0f / (high - low); + + int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx); + if (out_image) { + atomic_add_and_fetch_float(accum_image + idx_p, sum); + + float val = image[idx_q]; + if (channel_offset) { + val += image[idx_q + channel_offset]; + val += image[idx_q + 2 * channel_offset]; + val *= 1.0f / 3.0f; + } + atomic_add_and_fetch_float(out_image + idx_p, sum * val); + } + else { + accum_image[idx_p] = sum; + } } -ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y, - int dx, int dy, int t, - const ccl_global float *ccl_restrict difference_image, - const ccl_global float *ccl_restrict buffer, - const ccl_global float *ccl_restrict transform, - ccl_global int *rank, - ccl_global float *XtWX, - ccl_global float3 *XtWY, - int4 rect, - int4 filter_window, - int stride, int f, - int pass_stride, - int frame_offset, - bool use_time, - int localIdx) +ccl_device_inline void kernel_filter_nlm_construct_gramian( + int x, + int y, + int dx, + int dy, + int t, + const ccl_global float *ccl_restrict difference_image, + const ccl_global float *ccl_restrict buffer, + const ccl_global float *ccl_restrict transform, + ccl_global int *rank, + ccl_global float *XtWX, + ccl_global float3 *XtWY, + int4 rect, + int4 filter_window, + int stride, + int f, + int pass_stride, + int frame_offset, + bool use_time, + int localIdx) { - const int low = max(rect.x, x-f); - const int high = min(rect.z, x+f+1); - float sum = 0.0f; - for(int x1 = low; x1 < high; x1++) { - sum += difference_image[y*stride + x1]; - } - float weight = sum * (1.0f/(high - low)); - - /* Reconstruction data is only stored for pixels inside the filter window, - * so compute the pixels's index in there. */ - int storage_ofs = coord_to_local_index(filter_window, x, y); - transform += storage_ofs; - rank += storage_ofs; - XtWX += storage_ofs; - XtWY += storage_ofs; - - kernel_filter_construct_gramian(x, y, - rect_size(filter_window), - dx, dy, t, - stride, - pass_stride, - frame_offset, - use_time, - buffer, - transform, rank, - weight, XtWX, XtWY, - localIdx); + const int low = max(rect.x, x - f); + const int high = min(rect.z, x + f + 1); + float sum = 0.0f; + for (int x1 = low; x1 < high; x1++) { + sum += difference_image[y * stride + x1]; + } + float weight = sum * (1.0f / (high - low)); + + /* Reconstruction data is only stored for pixels inside the filter window, + * so compute the pixels's index in there. */ + int storage_ofs = coord_to_local_index(filter_window, x, y); + transform += storage_ofs; + rank += storage_ofs; + XtWX += storage_ofs; + XtWY += storage_ofs; + + kernel_filter_construct_gramian(x, + y, + rect_size(filter_window), + dx, + dy, + t, + stride, + pass_stride, + frame_offset, + use_time, + buffer, + transform, + rank, + weight, + XtWX, + XtWY, + localIdx); } -ccl_device_inline void kernel_filter_nlm_normalize(int x, int y, +ccl_device_inline void kernel_filter_nlm_normalize(int x, + int y, ccl_global float *out_image, - const ccl_global float *ccl_restrict accum_image, + const ccl_global float *ccl_restrict + accum_image, int stride) { - out_image[y*stride + x] /= accum_image[y*stride + x]; + out_image[y * stride + x] /= accum_image[y * stride + x]; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h index e24f4feb28d..8211311313d 100644 --- a/intern/cycles/kernel/filter/filter_prefilter.h +++ b/intern/cycles/kernel/filter/filter_prefilter.h @@ -27,7 +27,8 @@ CCL_NAMESPACE_BEGIN */ ccl_device void kernel_filter_divide_shadow(int sample, CCL_FILTER_TILE_INFO, - int x, int y, + int x, + int y, ccl_global float *unfilteredA, ccl_global float *unfilteredB, ccl_global float *sampleVariance, @@ -37,37 +38,39 @@ ccl_device void kernel_filter_divide_shadow(int sample, int buffer_pass_stride, int buffer_denoising_offset) { - int xtile = (x < tile_info->x[1])? 0: ((x < tile_info->x[2])? 1: 2); - int ytile = (y < tile_info->y[1])? 0: ((y < tile_info->y[2])? 1: 2); - int tile = ytile*3+xtile; + int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2); + int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2); + int tile = ytile * 3 + xtile; - int offset = tile_info->offsets[tile]; - int stride = tile_info->strides[tile]; - const ccl_global float *ccl_restrict center_buffer = (ccl_global float*) ccl_get_tile_buffer(tile); - center_buffer += (y*stride + x + offset)*buffer_pass_stride; - center_buffer += buffer_denoising_offset + 14; + int offset = tile_info->offsets[tile]; + int stride = tile_info->strides[tile]; + const ccl_global float *ccl_restrict center_buffer = (ccl_global float *)ccl_get_tile_buffer( + tile); + center_buffer += (y * stride + x + offset) * buffer_pass_stride; + center_buffer += buffer_denoising_offset + 14; - int buffer_w = align_up(rect.z - rect.x, 4); - int idx = (y-rect.y)*buffer_w + (x - rect.x); - unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f); - unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f); + int buffer_w = align_up(rect.z - rect.x, 4); + int idx = (y - rect.y) * buffer_w + (x - rect.x); + unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f); + unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f); - float varA = center_buffer[2]; - float varB = center_buffer[5]; - int odd_sample = (sample+1)/2; - int even_sample = sample/2; + float varA = center_buffer[2]; + float varB = center_buffer[5]; + int odd_sample = (sample + 1) / 2; + int even_sample = sample / 2; - /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance - * update does not work efficiently with atomics in the kernel. */ - varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample); - varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample); + /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance + * update does not work efficiently with atomics in the kernel. */ + varA = max(0.0f, varA - unfilteredA[idx] * unfilteredA[idx] * odd_sample); + varB = max(0.0f, varB - unfilteredB[idx] * unfilteredB[idx] * even_sample); - varA /= max(odd_sample - 1, 1); - varB /= max(even_sample - 1, 1); + varA /= max(odd_sample - 1, 1); + varB /= max(even_sample - 1, 1); - sampleVariance[idx] = 0.5f*(varA + varB) / sample; - sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample*sample); - bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) * (unfilteredA[idx] - unfilteredB[idx]); + sampleVariance[idx] = 0.5f * (varA + varB) / sample; + sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample * sample); + bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) * + (unfilteredA[idx] - unfilteredB[idx]); } /* Load a regular feature from the render buffers into the denoise buffer. @@ -80,55 +83,65 @@ ccl_device void kernel_filter_divide_shadow(int sample, */ ccl_device void kernel_filter_get_feature(int sample, CCL_FILTER_TILE_INFO, - int m_offset, int v_offset, - int x, int y, + int m_offset, + int v_offset, + int x, + int y, ccl_global float *mean, ccl_global float *variance, float scale, - int4 rect, int buffer_pass_stride, + int4 rect, + int buffer_pass_stride, int buffer_denoising_offset) { - int xtile = (x < tile_info->x[1])? 0: ((x < tile_info->x[2])? 1: 2); - int ytile = (y < tile_info->y[1])? 0: ((y < tile_info->y[2])? 1: 2); - int tile = ytile*3+xtile; - ccl_global float *center_buffer = ((ccl_global float*) ccl_get_tile_buffer(tile)) + (tile_info->offsets[tile] + y*tile_info->strides[tile] + x)*buffer_pass_stride + buffer_denoising_offset; + int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2); + int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2); + int tile = ytile * 3 + xtile; + ccl_global float *center_buffer = ((ccl_global float *)ccl_get_tile_buffer(tile)) + + (tile_info->offsets[tile] + y * tile_info->strides[tile] + x) * + buffer_pass_stride + + buffer_denoising_offset; - int buffer_w = align_up(rect.z - rect.x, 4); - int idx = (y-rect.y)*buffer_w + (x - rect.x); + int buffer_w = align_up(rect.z - rect.x, 4); + int idx = (y - rect.y) * buffer_w + (x - rect.x); - float val = scale * center_buffer[m_offset]; - mean[idx] = val; + float val = scale * center_buffer[m_offset]; + mean[idx] = val; - if(v_offset >= 0) { - if(sample > 1) { - /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance - * update does not work efficiently with atomics in the kernel. */ - variance[idx] = max(0.0f, (center_buffer[v_offset] - val*val*sample) / (sample * (sample-1))); - } - else { - /* Can't compute variance with single sample, just set it very high. */ - variance[idx] = 1e10f; - } - } + if (v_offset >= 0) { + if (sample > 1) { + /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance + * update does not work efficiently with atomics in the kernel. */ + variance[idx] = max( + 0.0f, (center_buffer[v_offset] - val * val * sample) / (sample * (sample - 1))); + } + else { + /* Can't compute variance with single sample, just set it very high. */ + variance[idx] = 1e10f; + } + } } ccl_device void kernel_filter_write_feature(int sample, - int x, int y, + int x, + int y, int4 buffer_params, ccl_global float *from, ccl_global float *buffer, int out_offset, int4 rect) { - ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z; + ccl_global float *combined_buffer = buffer + (y * buffer_params.y + x + buffer_params.x) * + buffer_params.z; - int buffer_w = align_up(rect.z - rect.x, 4); - int idx = (y-rect.y)*buffer_w + (x - rect.x); + int buffer_w = align_up(rect.z - rect.x, 4); + int idx = (y - rect.y) * buffer_w + (x - rect.x); - combined_buffer[out_offset] = from[idx]; + combined_buffer[out_offset] = from[idx]; } -ccl_device void kernel_filter_detect_outliers(int x, int y, +ccl_device void kernel_filter_detect_outliers(int x, + int y, ccl_global float *image, ccl_global float *variance, ccl_global float *depth, @@ -136,123 +149,131 @@ ccl_device void kernel_filter_detect_outliers(int x, int y, int4 rect, int pass_stride) { - int buffer_w = align_up(rect.z - rect.x, 4); + int buffer_w = align_up(rect.z - rect.x, 4); - int n = 0; - float values[25]; - float pixel_variance, max_variance = 0.0f; - for(int y1 = max(y-2, rect.y); y1 < min(y+3, rect.w); y1++) { - for(int x1 = max(x-2, rect.x); x1 < min(x+3, rect.z); x1++) { - int idx = (y1-rect.y)*buffer_w + (x1-rect.x); - float3 color = make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]); - color = max(color, make_float3(0.0f, 0.0f, 0.0f)); - float L = average(color); + int n = 0; + float values[25]; + float pixel_variance, max_variance = 0.0f; + for (int y1 = max(y - 2, rect.y); y1 < min(y + 3, rect.w); y1++) { + for (int x1 = max(x - 2, rect.x); x1 < min(x + 3, rect.z); x1++) { + int idx = (y1 - rect.y) * buffer_w + (x1 - rect.x); + float3 color = make_float3( + image[idx], image[idx + pass_stride], image[idx + 2 * pass_stride]); + color = max(color, make_float3(0.0f, 0.0f, 0.0f)); + float L = average(color); - /* Find the position of L. */ - int i; - for(i = 0; i < n; i++) { - if(values[i] > L) break; - } - /* Make space for L by shifting all following values to the right. */ - for(int j = n; j > i; j--) { - values[j] = values[j-1]; - } - /* Insert L. */ - values[i] = L; - n++; + /* Find the position of L. */ + int i; + for (i = 0; i < n; i++) { + if (values[i] > L) + break; + } + /* Make space for L by shifting all following values to the right. */ + for (int j = n; j > i; j--) { + values[j] = values[j - 1]; + } + /* Insert L. */ + values[i] = L; + n++; - float3 pixel_var = make_float3(variance[idx], variance[idx+pass_stride], variance[idx+2*pass_stride]); - float var = average(pixel_var); - if((x1 == x) && (y1 == y)) { - pixel_variance = (pixel_var.x < 0.0f || pixel_var.y < 0.0f || pixel_var.z < 0.0f)? -1.0f : var; - } - else { - max_variance = max(max_variance, var); - } - } - } + float3 pixel_var = make_float3( + variance[idx], variance[idx + pass_stride], variance[idx + 2 * pass_stride]); + float var = average(pixel_var); + if ((x1 == x) && (y1 == y)) { + pixel_variance = (pixel_var.x < 0.0f || pixel_var.y < 0.0f || pixel_var.z < 0.0f) ? -1.0f : + var; + } + else { + max_variance = max(max_variance, var); + } + } + } - max_variance += 1e-4f; + max_variance += 1e-4f; - int idx = (y-rect.y)*buffer_w + (x-rect.x); - float3 color = make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]); - color = max(color, make_float3(0.0f, 0.0f, 0.0f)); - float L = average(color); + int idx = (y - rect.y) * buffer_w + (x - rect.x); + float3 color = make_float3(image[idx], image[idx + pass_stride], image[idx + 2 * pass_stride]); + color = max(color, make_float3(0.0f, 0.0f, 0.0f)); + float L = average(color); - float ref = 2.0f*values[(int)(n*0.75f)]; + float ref = 2.0f * values[(int)(n * 0.75f)]; - /* Slightly offset values to avoid false positives in (almost) black areas. */ - max_variance += 1e-5f; - ref -= 1e-5f; + /* Slightly offset values to avoid false positives in (almost) black areas. */ + max_variance += 1e-5f; + ref -= 1e-5f; - if(L > ref) { - /* The pixel appears to be an outlier. - * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is that the pixel - * should actually be at the reference value: - * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier. - * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight. - */ + if (L > ref) { + /* The pixel appears to be an outlier. + * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is that the pixel + * should actually be at the reference value: + * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier. + * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight. + */ - if(pixel_variance < 0.0f || pixel_variance > 9.0f * max_variance) { - depth[idx] = -depth[idx]; - color *= ref/L; - variance[idx] = variance[idx + pass_stride] = variance[idx + 2*pass_stride] = max_variance; - } - else { - float stddev = sqrtf(pixel_variance); - if(L - 3*stddev < ref) { - /* The pixel is an outlier, so negate the depth value to mark it as one. - * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */ - depth[idx] = -depth[idx]; - float fac = ref/L; - color *= fac; - variance[idx ] *= fac*fac; - variance[idx + pass_stride] *= fac*fac; - variance[idx+2*pass_stride] *= fac*fac; - } - } - } - out[idx ] = color.x; - out[idx + pass_stride] = color.y; - out[idx+2*pass_stride] = color.z; + if (pixel_variance < 0.0f || pixel_variance > 9.0f * max_variance) { + depth[idx] = -depth[idx]; + color *= ref / L; + variance[idx] = variance[idx + pass_stride] = variance[idx + 2 * pass_stride] = max_variance; + } + else { + float stddev = sqrtf(pixel_variance); + if (L - 3 * stddev < ref) { + /* The pixel is an outlier, so negate the depth value to mark it as one. + * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */ + depth[idx] = -depth[idx]; + float fac = ref / L; + color *= fac; + variance[idx] *= fac * fac; + variance[idx + pass_stride] *= fac * fac; + variance[idx + 2 * pass_stride] *= fac * fac; + } + } + } + out[idx] = color.x; + out[idx + pass_stride] = color.y; + out[idx + 2 * pass_stride] = color.z; } /* Combine A/B buffers. * Calculates the combined mean and the buffer variance. */ -ccl_device void kernel_filter_combine_halves(int x, int y, +ccl_device void kernel_filter_combine_halves(int x, + int y, ccl_global float *mean, ccl_global float *variance, ccl_global float *a, ccl_global float *b, - int4 rect, int r) + int4 rect, + int r) { - int buffer_w = align_up(rect.z - rect.x, 4); - int idx = (y-rect.y)*buffer_w + (x - rect.x); + int buffer_w = align_up(rect.z - rect.x, 4); + int idx = (y - rect.y) * buffer_w + (x - rect.x); - if(mean) mean[idx] = 0.5f * (a[idx]+b[idx]); - if(variance) { - if(r == 0) variance[idx] = 0.25f * (a[idx]-b[idx])*(a[idx]-b[idx]); - else { - variance[idx] = 0.0f; - float values[25]; - int numValues = 0; - for(int py = max(y-r, rect.y); py < min(y+r+1, rect.w); py++) { - for(int px = max(x-r, rect.x); px < min(x+r+1, rect.z); px++) { - int pidx = (py-rect.y)*buffer_w + (px-rect.x); - values[numValues++] = 0.25f * (a[pidx]-b[pidx])*(a[pidx]-b[pidx]); - } - } - /* Insertion-sort the variances (fast enough for 25 elements). */ - for(int i = 1; i < numValues; i++) { - float v = values[i]; - int j; - for(j = i-1; j >= 0 && values[j] > v; j--) - values[j+1] = values[j]; - values[j+1] = v; - } - variance[idx] = values[(7*numValues)/8]; - } - } + if (mean) + mean[idx] = 0.5f * (a[idx] + b[idx]); + if (variance) { + if (r == 0) + variance[idx] = 0.25f * (a[idx] - b[idx]) * (a[idx] - b[idx]); + else { + variance[idx] = 0.0f; + float values[25]; + int numValues = 0; + for (int py = max(y - r, rect.y); py < min(y + r + 1, rect.w); py++) { + for (int px = max(x - r, rect.x); px < min(x + r + 1, rect.z); px++) { + int pidx = (py - rect.y) * buffer_w + (px - rect.x); + values[numValues++] = 0.25f * (a[pidx] - b[pidx]) * (a[pidx] - b[pidx]); + } + } + /* Insertion-sort the variances (fast enough for 25 elements). */ + for (int i = 1; i < numValues; i++) { + float v = values[i]; + int j; + for (j = i - 1; j >= 0 && values[j] > v; j--) + values[j + 1] = values[j]; + values[j + 1] = v; + } + variance[idx] = values[(7 * numValues) / 8]; + } + } } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h index ceda8f71f98..850f20584da 100644 --- a/intern/cycles/kernel/filter/filter_reconstruction.h +++ b/intern/cycles/kernel/filter/filter_reconstruction.h @@ -16,63 +16,75 @@ CCL_NAMESPACE_BEGIN -ccl_device_inline void kernel_filter_construct_gramian(int x, int y, +ccl_device_inline void kernel_filter_construct_gramian(int x, + int y, int storage_stride, - int dx, int dy, int t, + int dx, + int dy, + int t, int buffer_stride, int pass_stride, int frame_offset, bool use_time, const ccl_global float *ccl_restrict buffer, - const ccl_global float *ccl_restrict transform, + const ccl_global float *ccl_restrict + transform, ccl_global int *rank, float weight, ccl_global float *XtWX, ccl_global float3 *XtWY, int localIdx) { - if(weight < 1e-3f) { - return; - } + if (weight < 1e-3f) { + return; + } - int p_offset = y * buffer_stride + x; - int q_offset = (y+dy) * buffer_stride + (x+dx) + frame_offset; + int p_offset = y * buffer_stride + x; + int q_offset = (y + dy) * buffer_stride + (x + dx) + frame_offset; #ifdef __KERNEL_GPU__ - const int stride = storage_stride; + const int stride = storage_stride; #else - const int stride = 1; - (void) storage_stride; + const int stride = 1; + (void)storage_stride; #endif #ifdef __KERNEL_CUDA__ - ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE]; - ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1); + ccl_local float shared_design_row[(DENOISE_FEATURES + 1) * CCL_MAX_LOCAL_SIZE]; + ccl_local_param float *design_row = shared_design_row + localIdx * (DENOISE_FEATURES + 1); #else - float design_row[DENOISE_FEATURES+1]; + float design_row[DENOISE_FEATURES + 1]; #endif - float3 q_color = filter_get_color(buffer + q_offset, pass_stride); + float3 q_color = filter_get_color(buffer + q_offset, pass_stride); - /* If the pixel was flagged as an outlier during prefiltering, skip it. */ - if(ccl_get_feature(buffer + q_offset, 0) < 0.0f) { - return; - } + /* If the pixel was flagged as an outlier during prefiltering, skip it. */ + if (ccl_get_feature(buffer + q_offset, 0) < 0.0f) { + return; + } - filter_get_design_row_transform(make_int3(x, y, t), buffer + p_offset, - make_int3(x+dx, y+dy, t), buffer + q_offset, - pass_stride, *rank, design_row, transform, stride, use_time); + filter_get_design_row_transform(make_int3(x, y, t), + buffer + p_offset, + make_int3(x + dx, y + dy, t), + buffer + q_offset, + pass_stride, + *rank, + design_row, + transform, + stride, + use_time); #ifdef __KERNEL_GPU__ - math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride); - math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride); + math_trimatrix_add_gramian_strided(XtWX, (*rank) + 1, design_row, weight, stride); + math_vec3_add_strided(XtWY, (*rank) + 1, design_row, weight * q_color, stride); #else - math_trimatrix_add_gramian(XtWX, (*rank)+1, design_row, weight); - math_vec3_add(XtWY, (*rank)+1, design_row, weight * q_color); + math_trimatrix_add_gramian(XtWX, (*rank) + 1, design_row, weight); + math_vec3_add(XtWY, (*rank) + 1, design_row, weight * q_color); #endif } -ccl_device_inline void kernel_filter_finalize(int x, int y, +ccl_device_inline void kernel_filter_finalize(int x, + int y, ccl_global float *buffer, ccl_global int *rank, int storage_stride, @@ -82,47 +94,47 @@ ccl_device_inline void kernel_filter_finalize(int x, int y, int sample) { #ifdef __KERNEL_GPU__ - const int stride = storage_stride; + const int stride = storage_stride; #else - const int stride = 1; - (void) storage_stride; + const int stride = 1; + (void)storage_stride; #endif - if(XtWX[0] < 1e-3f) { - /* There is not enough information to determine a denoised result. - * As a fallback, keep the original value of the pixel. */ - return; - } - - /* The weighted average of pixel colors (essentially, the NLM-filtered image). - * In case the solution of the linear model fails due to numerical issues or - * returns non-sensical negative values, fall back to this value. */ - float3 mean_color = XtWY[0]/XtWX[0]; - - math_trimatrix_vec3_solve(XtWX, XtWY, (*rank)+1, stride); - - float3 final_color = XtWY[0]; - if(!isfinite3_safe(final_color) || - (final_color.x < -0.01f || final_color.y < -0.01f || final_color.z < -0.01f)) - { - final_color = mean_color; - } - - /* Clamp pixel value to positive values. */ - final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f)); - - ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z; - if(buffer_params.w >= 0) { - final_color *= sample; - if(buffer_params.w > 0) { - final_color.x += combined_buffer[buffer_params.w+0]; - final_color.y += combined_buffer[buffer_params.w+1]; - final_color.z += combined_buffer[buffer_params.w+2]; - } - } - combined_buffer[0] = final_color.x; - combined_buffer[1] = final_color.y; - combined_buffer[2] = final_color.z; + if (XtWX[0] < 1e-3f) { + /* There is not enough information to determine a denoised result. + * As a fallback, keep the original value of the pixel. */ + return; + } + + /* The weighted average of pixel colors (essentially, the NLM-filtered image). + * In case the solution of the linear model fails due to numerical issues or + * returns non-sensical negative values, fall back to this value. */ + float3 mean_color = XtWY[0] / XtWX[0]; + + math_trimatrix_vec3_solve(XtWX, XtWY, (*rank) + 1, stride); + + float3 final_color = XtWY[0]; + if (!isfinite3_safe(final_color) || + (final_color.x < -0.01f || final_color.y < -0.01f || final_color.z < -0.01f)) { + final_color = mean_color; + } + + /* Clamp pixel value to positive values. */ + final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f)); + + ccl_global float *combined_buffer = buffer + (y * buffer_params.y + x + buffer_params.x) * + buffer_params.z; + if (buffer_params.w >= 0) { + final_color *= sample; + if (buffer_params.w > 0) { + final_color.x += combined_buffer[buffer_params.w + 0]; + final_color.y += combined_buffer[buffer_params.w + 1]; + final_color.z += combined_buffer[buffer_params.w + 2]; + } + } + combined_buffer[0] = final_color.x; + combined_buffer[1] = final_color.y; + combined_buffer[2] = final_color.z; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h index 94e27bb02fd..69e3c7c458d 100644 --- a/intern/cycles/kernel/filter/filter_transform.h +++ b/intern/cycles/kernel/filter/filter_transform.h @@ -18,92 +18,101 @@ CCL_NAMESPACE_BEGIN ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer, CCL_FILTER_TILE_INFO, - int x, int y, int4 rect, - int pass_stride, int frame_stride, + int x, + int y, + int4 rect, + int pass_stride, + int frame_stride, bool use_time, - float *transform, int *rank, - int radius, float pca_threshold) + float *transform, + int *rank, + int radius, + float pca_threshold) { - int buffer_w = align_up(rect.z - rect.x, 4); - - float features[DENOISE_FEATURES]; - - const float *ccl_restrict pixel_buffer; - int3 pixel; - - int num_features = use_time? 11 : 10; - - /* === Calculate denoising window. === */ - int2 low = make_int2(max(rect.x, x - radius), - max(rect.y, y - radius)); - int2 high = make_int2(min(rect.z, x + radius + 1), - min(rect.w, y + radius + 1)); - int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames; - - /* === Shift feature passes to have mean 0. === */ - float feature_means[DENOISE_FEATURES]; - math_vector_zero(feature_means, num_features); - FOR_PIXEL_WINDOW { - filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride); - math_vector_add(feature_means, features, num_features); - } END_FOR_PIXEL_WINDOW - - math_vector_scale(feature_means, 1.0f / num_pixels, num_features); - - /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */ - float feature_scale[DENOISE_FEATURES]; - math_vector_zero(feature_scale, num_features); - - FOR_PIXEL_WINDOW { - filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); - math_vector_max(feature_scale, features, num_features); - } END_FOR_PIXEL_WINDOW - - filter_calculate_scale(feature_scale, use_time); - - /* === Generate the feature transformation. === - * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space - * which generally has fewer dimensions. This mainly helps to prevent overfitting. */ - float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; - math_matrix_zero(feature_matrix, num_features); - FOR_PIXEL_WINDOW { - filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); - math_vector_mul(features, feature_scale, num_features); - math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f); - } END_FOR_PIXEL_WINDOW - - math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1); - *rank = 0; - /* Prevent overfitting when a small window is used. */ - int max_rank = min(num_features, num_pixels/3); - if(pca_threshold < 0.0f) { - float threshold_energy = 0.0f; - for(int i = 0; i < num_features; i++) { - threshold_energy += feature_matrix[i*num_features+i]; - } - threshold_energy *= 1.0f - (-pca_threshold); - - float reduced_energy = 0.0f; - for(int i = 0; i < max_rank; i++, (*rank)++) { - if(i >= 2 && reduced_energy >= threshold_energy) - break; - float s = feature_matrix[i*num_features+i]; - reduced_energy += s; - } - } - else { - for(int i = 0; i < max_rank; i++, (*rank)++) { - float s = feature_matrix[i*num_features+i]; - if(i >= 2 && sqrtf(s) < pca_threshold) - break; - } - } - - /* Bake the feature scaling into the transformation matrix. */ - for(int i = 0; i < (*rank); i++) { - math_vector_mul(transform + i*num_features, feature_scale, num_features); - } - math_matrix_transpose(transform, num_features, 1); + int buffer_w = align_up(rect.z - rect.x, 4); + + float features[DENOISE_FEATURES]; + + const float *ccl_restrict pixel_buffer; + int3 pixel; + + int num_features = use_time ? 11 : 10; + + /* === Calculate denoising window. === */ + int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius)); + int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1)); + int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames; + + /* === Shift feature passes to have mean 0. === */ + float feature_means[DENOISE_FEATURES]; + math_vector_zero(feature_means, num_features); + FOR_PIXEL_WINDOW + { + filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride); + math_vector_add(feature_means, features, num_features); + } + END_FOR_PIXEL_WINDOW + + math_vector_scale(feature_means, 1.0f / num_pixels, num_features); + + /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */ + float feature_scale[DENOISE_FEATURES]; + math_vector_zero(feature_scale, num_features); + + FOR_PIXEL_WINDOW + { + filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); + math_vector_max(feature_scale, features, num_features); + } + END_FOR_PIXEL_WINDOW + + filter_calculate_scale(feature_scale, use_time); + + /* === Generate the feature transformation. === + * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space + * which generally has fewer dimensions. This mainly helps to prevent overfitting. */ + float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES]; + math_matrix_zero(feature_matrix, num_features); + FOR_PIXEL_WINDOW + { + filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); + math_vector_mul(features, feature_scale, num_features); + math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f); + } + END_FOR_PIXEL_WINDOW + + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1); + *rank = 0; + /* Prevent overfitting when a small window is used. */ + int max_rank = min(num_features, num_pixels / 3); + if (pca_threshold < 0.0f) { + float threshold_energy = 0.0f; + for (int i = 0; i < num_features; i++) { + threshold_energy += feature_matrix[i * num_features + i]; + } + threshold_energy *= 1.0f - (-pca_threshold); + + float reduced_energy = 0.0f; + for (int i = 0; i < max_rank; i++, (*rank)++) { + if (i >= 2 && reduced_energy >= threshold_energy) + break; + float s = feature_matrix[i * num_features + i]; + reduced_energy += s; + } + } + else { + for (int i = 0; i < max_rank; i++, (*rank)++) { + float s = feature_matrix[i * num_features + i]; + if (i >= 2 && sqrtf(s) < pca_threshold) + break; + } + } + + /* Bake the feature scaling into the transformation matrix. */ + for (int i = 0; i < (*rank); i++) { + math_vector_mul(transform + i * num_features, feature_scale, num_features); + } + math_matrix_transpose(transform, num_features, 1); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h index ed8ddcb49b1..89cddfd927f 100644 --- a/intern/cycles/kernel/filter/filter_transform_gpu.h +++ b/intern/cycles/kernel/filter/filter_transform_gpu.h @@ -18,106 +18,110 @@ CCL_NAMESPACE_BEGIN ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer, CCL_FILTER_TILE_INFO, - int x, int y, int4 rect, - int pass_stride, int frame_stride, + int x, + int y, + int4 rect, + int pass_stride, + int frame_stride, bool use_time, ccl_global float *transform, ccl_global int *rank, - int radius, float pca_threshold, - int transform_stride, int localIdx) + int radius, + float pca_threshold, + int transform_stride, + int localIdx) { - int buffer_w = align_up(rect.z - rect.x, 4); + int buffer_w = align_up(rect.z - rect.x, 4); #ifdef __KERNEL_CUDA__ - ccl_local float shared_features[DENOISE_FEATURES*CCL_MAX_LOCAL_SIZE]; - ccl_local_param float *features = shared_features + localIdx*DENOISE_FEATURES; + ccl_local float shared_features[DENOISE_FEATURES * CCL_MAX_LOCAL_SIZE]; + ccl_local_param float *features = shared_features + localIdx * DENOISE_FEATURES; #else - float features[DENOISE_FEATURES]; + float features[DENOISE_FEATURES]; #endif - int num_features = use_time? 11 : 10; - - /* === Calculate denoising window. === */ - int2 low = make_int2(max(rect.x, x - radius), - max(rect.y, y - radius)); - int2 high = make_int2(min(rect.z, x + radius + 1), - min(rect.w, y + radius + 1)); - int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames; - const ccl_global float *ccl_restrict pixel_buffer; - int3 pixel; - - - - - /* === Shift feature passes to have mean 0. === */ - float feature_means[DENOISE_FEATURES]; - math_vector_zero(feature_means, num_features); - FOR_PIXEL_WINDOW { - filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride); - math_vector_add(feature_means, features, num_features); - } END_FOR_PIXEL_WINDOW - - math_vector_scale(feature_means, 1.0f / num_pixels, num_features); - - /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */ - float feature_scale[DENOISE_FEATURES]; - math_vector_zero(feature_scale, num_features); - - FOR_PIXEL_WINDOW { - filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); - math_vector_max(feature_scale, features, num_features); - } END_FOR_PIXEL_WINDOW - - filter_calculate_scale(feature_scale, use_time); - - - - /* === Generate the feature transformation. === - * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space - * which generally has fewer dimensions. This mainly helps to prevent overfitting. */ - float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; - math_matrix_zero(feature_matrix, num_features); - FOR_PIXEL_WINDOW { - filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); - math_vector_mul(features, feature_scale, num_features); - math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f); - } END_FOR_PIXEL_WINDOW - - math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, transform_stride); - *rank = 0; - /* Prevent overfitting when a small window is used. */ - int max_rank = min(num_features, num_pixels/3); - if(pca_threshold < 0.0f) { - float threshold_energy = 0.0f; - for(int i = 0; i < num_features; i++) { - threshold_energy += feature_matrix[i*num_features+i]; - } - threshold_energy *= 1.0f - (-pca_threshold); - - float reduced_energy = 0.0f; - for(int i = 0; i < max_rank; i++, (*rank)++) { - if(i >= 2 && reduced_energy >= threshold_energy) - break; - float s = feature_matrix[i*num_features+i]; - reduced_energy += s; - } - } - else { - for(int i = 0; i < max_rank; i++, (*rank)++) { - float s = feature_matrix[i*num_features+i]; - if(i >= 2 && sqrtf(s) < pca_threshold) - break; - } - } - - math_matrix_transpose(transform, num_features, transform_stride); - - /* Bake the feature scaling into the transformation matrix. */ - for(int i = 0; i < num_features; i++) { - for(int j = 0; j < (*rank); j++) { - transform[(i*num_features + j)*transform_stride] *= feature_scale[i]; - } - } + int num_features = use_time ? 11 : 10; + + /* === Calculate denoising window. === */ + int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius)); + int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1)); + int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames; + const ccl_global float *ccl_restrict pixel_buffer; + int3 pixel; + + /* === Shift feature passes to have mean 0. === */ + float feature_means[DENOISE_FEATURES]; + math_vector_zero(feature_means, num_features); + FOR_PIXEL_WINDOW + { + filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride); + math_vector_add(feature_means, features, num_features); + } + END_FOR_PIXEL_WINDOW + + math_vector_scale(feature_means, 1.0f / num_pixels, num_features); + + /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */ + float feature_scale[DENOISE_FEATURES]; + math_vector_zero(feature_scale, num_features); + + FOR_PIXEL_WINDOW + { + filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); + math_vector_max(feature_scale, features, num_features); + } + END_FOR_PIXEL_WINDOW + + filter_calculate_scale(feature_scale, use_time); + + /* === Generate the feature transformation. === + * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space + * which generally has fewer dimensions. This mainly helps to prevent overfitting. */ + float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES]; + math_matrix_zero(feature_matrix, num_features); + FOR_PIXEL_WINDOW + { + filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); + math_vector_mul(features, feature_scale, num_features); + math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f); + } + END_FOR_PIXEL_WINDOW + + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, transform_stride); + *rank = 0; + /* Prevent overfitting when a small window is used. */ + int max_rank = min(num_features, num_pixels / 3); + if (pca_threshold < 0.0f) { + float threshold_energy = 0.0f; + for (int i = 0; i < num_features; i++) { + threshold_energy += feature_matrix[i * num_features + i]; + } + threshold_energy *= 1.0f - (-pca_threshold); + + float reduced_energy = 0.0f; + for (int i = 0; i < max_rank; i++, (*rank)++) { + if (i >= 2 && reduced_energy >= threshold_energy) + break; + float s = feature_matrix[i * num_features + i]; + reduced_energy += s; + } + } + else { + for (int i = 0; i < max_rank; i++, (*rank)++) { + float s = feature_matrix[i * num_features + i]; + if (i >= 2 && sqrtf(s) < pca_threshold) + break; + } + } + + math_matrix_transpose(transform, num_features, transform_stride); + + /* Bake the feature scaling into the transformation matrix. */ + for (int i = 0; i < num_features; i++) { + for (int j = 0; j < (*rank); j++) { + transform[(i * num_features + j) * transform_stride] *= feature_scale[i]; + } + } } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h index 10bd3e477e9..22397b292db 100644 --- a/intern/cycles/kernel/filter/filter_transform_sse.h +++ b/intern/cycles/kernel/filter/filter_transform_sse.h @@ -18,98 +18,110 @@ CCL_NAMESPACE_BEGIN ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer, CCL_FILTER_TILE_INFO, - int x, int y, int4 rect, - int pass_stride, int frame_stride, + int x, + int y, + int4 rect, + int pass_stride, + int frame_stride, bool use_time, - float *transform, int *rank, - int radius, float pca_threshold) + float *transform, + int *rank, + int radius, + float pca_threshold) { - int buffer_w = align_up(rect.z - rect.x, 4); - - float4 features[DENOISE_FEATURES]; - const float *ccl_restrict pixel_buffer; - int3 pixel; - - int num_features = use_time? 11 : 10; - - /* === Calculate denoising window. === */ - int2 low = make_int2(max(rect.x, x - radius), - max(rect.y, y - radius)); - int2 high = make_int2(min(rect.z, x + radius + 1), - min(rect.w, y + radius + 1)); - int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames; - - /* === Shift feature passes to have mean 0. === */ - float4 feature_means[DENOISE_FEATURES]; - math_vector_zero_sse(feature_means, num_features); - FOR_PIXEL_WINDOW_SSE { - filter_get_features_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, NULL, pass_stride); - math_vector_add_sse(feature_means, num_features, features); - } END_FOR_PIXEL_WINDOW_SSE - - float4 pixel_scale = make_float4(1.0f / num_pixels); - for(int i = 0; i < num_features; i++) { - feature_means[i] = reduce_add(feature_means[i]) * pixel_scale; - } - - /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */ - float4 feature_scale[DENOISE_FEATURES]; - math_vector_zero_sse(feature_scale, num_features); - FOR_PIXEL_WINDOW_SSE { - filter_get_feature_scales_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride); - math_vector_max_sse(feature_scale, features, num_features); - } END_FOR_PIXEL_WINDOW_SSE - - filter_calculate_scale_sse(feature_scale, use_time); - - /* === Generate the feature transformation. === - * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space - * which generally has fewer dimensions. This mainly helps to prevent overfitting. */ - float4 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES]; - math_matrix_zero_sse(feature_matrix_sse, num_features); - FOR_PIXEL_WINDOW_SSE { - filter_get_features_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride); - math_vector_mul_sse(features, num_features, feature_scale); - math_matrix_add_gramian_sse(feature_matrix_sse, num_features, features, make_float4(1.0f)); - } END_FOR_PIXEL_WINDOW_SSE - - float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES]; - math_matrix_hsum(feature_matrix, num_features, feature_matrix_sse); - - math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1); - - *rank = 0; - /* Prevent overfitting when a small window is used. */ - int max_rank = min(num_features, num_pixels/3); - if(pca_threshold < 0.0f) { - float threshold_energy = 0.0f; - for(int i = 0; i < num_features; i++) { - threshold_energy += feature_matrix[i*num_features+i]; - } - threshold_energy *= 1.0f - (-pca_threshold); - - float reduced_energy = 0.0f; - for(int i = 0; i < max_rank; i++, (*rank)++) { - if(i >= 2 && reduced_energy >= threshold_energy) - break; - float s = feature_matrix[i*num_features+i]; - reduced_energy += s; - } - } - else { - for(int i = 0; i < max_rank; i++, (*rank)++) { - float s = feature_matrix[i*num_features+i]; - if(i >= 2 && sqrtf(s) < pca_threshold) - break; - } - } - - math_matrix_transpose(transform, num_features, 1); - - /* Bake the feature scaling into the transformation matrix. */ - for(int i = 0; i < num_features; i++) { - math_vector_scale(transform + i*num_features, feature_scale[i][0], *rank); - } + int buffer_w = align_up(rect.z - rect.x, 4); + + float4 features[DENOISE_FEATURES]; + const float *ccl_restrict pixel_buffer; + int3 pixel; + + int num_features = use_time ? 11 : 10; + + /* === Calculate denoising window. === */ + int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius)); + int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1)); + int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames; + + /* === Shift feature passes to have mean 0. === */ + float4 feature_means[DENOISE_FEATURES]; + math_vector_zero_sse(feature_means, num_features); + FOR_PIXEL_WINDOW_SSE + { + filter_get_features_sse( + x4, y4, t4, active_pixels, pixel_buffer, features, use_time, NULL, pass_stride); + math_vector_add_sse(feature_means, num_features, features); + } + END_FOR_PIXEL_WINDOW_SSE + + float4 pixel_scale = make_float4(1.0f / num_pixels); + for (int i = 0; i < num_features; i++) { + feature_means[i] = reduce_add(feature_means[i]) * pixel_scale; + } + + /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */ + float4 feature_scale[DENOISE_FEATURES]; + math_vector_zero_sse(feature_scale, num_features); + FOR_PIXEL_WINDOW_SSE + { + filter_get_feature_scales_sse( + x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride); + math_vector_max_sse(feature_scale, features, num_features); + } + END_FOR_PIXEL_WINDOW_SSE + + filter_calculate_scale_sse(feature_scale, use_time); + + /* === Generate the feature transformation. === + * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space + * which generally has fewer dimensions. This mainly helps to prevent overfitting. */ + float4 feature_matrix_sse[DENOISE_FEATURES * DENOISE_FEATURES]; + math_matrix_zero_sse(feature_matrix_sse, num_features); + FOR_PIXEL_WINDOW_SSE + { + filter_get_features_sse( + x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride); + math_vector_mul_sse(features, num_features, feature_scale); + math_matrix_add_gramian_sse(feature_matrix_sse, num_features, features, make_float4(1.0f)); + } + END_FOR_PIXEL_WINDOW_SSE + + float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES]; + math_matrix_hsum(feature_matrix, num_features, feature_matrix_sse); + + math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1); + + *rank = 0; + /* Prevent overfitting when a small window is used. */ + int max_rank = min(num_features, num_pixels / 3); + if (pca_threshold < 0.0f) { + float threshold_energy = 0.0f; + for (int i = 0; i < num_features; i++) { + threshold_energy += feature_matrix[i * num_features + i]; + } + threshold_energy *= 1.0f - (-pca_threshold); + + float reduced_energy = 0.0f; + for (int i = 0; i < max_rank; i++, (*rank)++) { + if (i >= 2 && reduced_energy >= threshold_energy) + break; + float s = feature_matrix[i * num_features + i]; + reduced_energy += s; + } + } + else { + for (int i = 0; i < max_rank; i++, (*rank)++) { + float s = feature_matrix[i * num_features + i]; + if (i >= 2 && sqrtf(s) < pca_threshold) + break; + } + } + + math_matrix_transpose(transform, num_features, 1); + + /* Bake the feature scaling into the transformation matrix. */ + for (int i = 0; i < num_features; i++) { + math_vector_scale(transform + i * num_features, feature_scale[i][0], *rank); + } } CCL_NAMESPACE_END |