Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCampbell Barton <ideasman42@gmail.com>2019-04-17 07:17:24 +0300
committerCampbell Barton <ideasman42@gmail.com>2019-04-17 07:21:24 +0300
commite12c08e8d170b7ca40f204a5b0423c23a9fbc2c1 (patch)
tree8cf3453d12edb177a218ef8009357518ec6cab6a /intern/cycles/kernel/filter
parentb3dabc200a4b0399ec6b81f2ff2730d07b44fcaa (diff)
ClangFormat: apply to source, most of intern
Apply clang format as proposed in T53211. For details on usage and instructions for migrating branches without conflicts, see: https://wiki.blender.org/wiki/Tools/ClangFormat
Diffstat (limited to 'intern/cycles/kernel/filter')
-rw-r--r--intern/cycles/kernel/filter/filter.h6
-rw-r--r--intern/cycles/kernel/filter/filter_defines.h75
-rw-r--r--intern/cycles/kernel/filter/filter_features.h168
-rw-r--r--intern/cycles/kernel/filter/filter_features_sse.h129
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_cpu.h334
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_gpu.h365
-rw-r--r--intern/cycles/kernel/filter/filter_prefilter.h325
-rw-r--r--intern/cycles/kernel/filter/filter_reconstruction.h142
-rw-r--r--intern/cycles/kernel/filter/filter_transform.h177
-rw-r--r--intern/cycles/kernel/filter/filter_transform_gpu.h186
-rw-r--r--intern/cycles/kernel/filter/filter_transform_sse.h192
11 files changed, 1101 insertions, 998 deletions
diff --git a/intern/cycles/kernel/filter/filter.h b/intern/cycles/kernel/filter/filter.h
index 4209d69ee73..b067e53a8bf 100644
--- a/intern/cycles/kernel/filter/filter.h
+++ b/intern/cycles/kernel/filter/filter.h
@@ -25,8 +25,8 @@
CCL_NAMESPACE_BEGIN
-#define KERNEL_NAME_JOIN(x, y, z) x ## _ ## y ## _ ## z
-#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
+#define KERNEL_NAME_JOIN(x, y, z) x##_##y##_##z
+#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
#define KERNEL_ARCH cpu
@@ -49,4 +49,4 @@ CCL_NAMESPACE_BEGIN
CCL_NAMESPACE_END
-#endif /* __FILTER_H__ */
+#endif /* __FILTER_H__ */
diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h
index cb04aac35f4..0e51eeef92f 100644
--- a/intern/cycles/kernel/filter/filter_defines.h
+++ b/intern/cycles/kernel/filter/filter_defines.h
@@ -18,59 +18,56 @@
#define __FILTER_DEFINES_H__
#define DENOISE_FEATURES 11
-#define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES)
-#define XTWX_SIZE (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2)
-#define XTWY_SIZE (DENOISE_FEATURES+1)
+#define TRANSFORM_SIZE (DENOISE_FEATURES * DENOISE_FEATURES)
+#define XTWX_SIZE (((DENOISE_FEATURES + 1) * (DENOISE_FEATURES + 2)) / 2)
+#define XTWY_SIZE (DENOISE_FEATURES + 1)
#define DENOISE_MAX_FRAMES 16
typedef struct TileInfo {
- int offsets[9];
- int strides[9];
- int x[4];
- int y[4];
- int from_render;
- int frames[DENOISE_MAX_FRAMES];
- int num_frames;
- /* TODO(lukas): CUDA doesn't have uint64_t... */
+ int offsets[9];
+ int strides[9];
+ int x[4];
+ int y[4];
+ int from_render;
+ int frames[DENOISE_MAX_FRAMES];
+ int num_frames;
+ /* TODO(lukas): CUDA doesn't have uint64_t... */
#ifdef __KERNEL_OPENCL__
- ccl_global float *buffers[9];
+ ccl_global float *buffers[9];
#else
- long long int buffers[9];
+ long long int buffers[9];
#endif
} TileInfo;
#ifdef __KERNEL_OPENCL__
-# define CCL_FILTER_TILE_INFO ccl_global TileInfo* tile_info, \
- ccl_global float *tile_buffer_1, \
- ccl_global float *tile_buffer_2, \
- ccl_global float *tile_buffer_3, \
- ccl_global float *tile_buffer_4, \
- ccl_global float *tile_buffer_5, \
- ccl_global float *tile_buffer_6, \
- ccl_global float *tile_buffer_7, \
- ccl_global float *tile_buffer_8, \
- ccl_global float *tile_buffer_9
-# define CCL_FILTER_TILE_INFO_ARG tile_info, \
- tile_buffer_1, tile_buffer_2, tile_buffer_3, \
- tile_buffer_4, tile_buffer_5, tile_buffer_6, \
- tile_buffer_7, tile_buffer_8, tile_buffer_9
-# define ccl_get_tile_buffer(id) (id == 0 ? tile_buffer_1 \
- : id == 1 ? tile_buffer_2 \
- : id == 2 ? tile_buffer_3 \
- : id == 3 ? tile_buffer_4 \
- : id == 4 ? tile_buffer_5 \
- : id == 5 ? tile_buffer_6 \
- : id == 6 ? tile_buffer_7 \
- : id == 7 ? tile_buffer_8 \
- : tile_buffer_9)
+# define CCL_FILTER_TILE_INFO \
+ ccl_global TileInfo *tile_info, ccl_global float *tile_buffer_1, \
+ ccl_global float *tile_buffer_2, ccl_global float *tile_buffer_3, \
+ ccl_global float *tile_buffer_4, ccl_global float *tile_buffer_5, \
+ ccl_global float *tile_buffer_6, ccl_global float *tile_buffer_7, \
+ ccl_global float *tile_buffer_8, ccl_global float *tile_buffer_9
+# define CCL_FILTER_TILE_INFO_ARG \
+ tile_info, tile_buffer_1, tile_buffer_2, tile_buffer_3, tile_buffer_4, tile_buffer_5, \
+ tile_buffer_6, tile_buffer_7, tile_buffer_8, tile_buffer_9
+# define ccl_get_tile_buffer(id) \
+ (id == 0 ? tile_buffer_1 : \
+ id == 1 ? \
+ tile_buffer_2 : \
+ id == 2 ? \
+ tile_buffer_3 : \
+ id == 3 ? tile_buffer_4 : \
+ id == 4 ? tile_buffer_5 : \
+ id == 5 ? tile_buffer_6 : \
+ id == 6 ? tile_buffer_7 : \
+ id == 7 ? tile_buffer_8 : tile_buffer_9)
#else
# ifdef __KERNEL_CUDA__
-# define CCL_FILTER_TILE_INFO ccl_global TileInfo* tile_info
+# define CCL_FILTER_TILE_INFO ccl_global TileInfo *tile_info
# else
-# define CCL_FILTER_TILE_INFO TileInfo* tile_info
+# define CCL_FILTER_TILE_INFO TileInfo *tile_info
# endif
# define ccl_get_tile_buffer(id) (tile_info->buffers[id])
#endif
-#endif /* __FILTER_DEFINES_H__*/
+#endif /* __FILTER_DEFINES_H__*/
diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h
index e1ea6487aa9..809ccfe8be6 100644
--- a/intern/cycles/kernel/filter/filter_features.h
+++ b/intern/cycles/kernel/filter/filter_features.h
@@ -14,22 +14,25 @@
* limitations under the License.
*/
- CCL_NAMESPACE_BEGIN
+CCL_NAMESPACE_BEGIN
#define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride]
/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).
 * pixel_buffer always points to the current pixel in the first pass.
* Repeat the loop for every secondary frame if there are any. */
-#define FOR_PIXEL_WINDOW for(int frame = 0; frame < tile_info->num_frames; frame++) { \
- pixel.z = tile_info->frames[frame]; \
- pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x) + frame*frame_stride; \
- for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
- for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
+#define FOR_PIXEL_WINDOW \
+ for (int frame = 0; frame < tile_info->num_frames; frame++) { \
+ pixel.z = tile_info->frames[frame]; \
+ pixel_buffer = buffer + (low.y - rect.y) * buffer_w + (low.x - rect.x) + \
+ frame * frame_stride; \
+ for (pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
+ for (pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
-#define END_FOR_PIXEL_WINDOW } \
- pixel_buffer += buffer_w - (high.x - low.x); \
- } \
- }
+#define END_FOR_PIXEL_WINDOW \
+ } \
+ pixel_buffer += buffer_w - (high.x - low.x); \
+ } \
+ }
ccl_device_inline void filter_get_features(int3 pixel,
const ccl_global float *ccl_restrict buffer,
@@ -38,24 +41,24 @@ ccl_device_inline void filter_get_features(int3 pixel,
const float *ccl_restrict mean,
int pass_stride)
{
- features[0] = pixel.x;
- features[1] = pixel.y;
- features[2] = fabsf(ccl_get_feature(buffer, 0));
- features[3] = ccl_get_feature(buffer, 1);
- features[4] = ccl_get_feature(buffer, 2);
- features[5] = ccl_get_feature(buffer, 3);
- features[6] = ccl_get_feature(buffer, 4);
- features[7] = ccl_get_feature(buffer, 5);
- features[8] = ccl_get_feature(buffer, 6);
- features[9] = ccl_get_feature(buffer, 7);
- if(use_time) {
- features[10] = pixel.z;
- }
- if(mean) {
- for(int i = 0; i < (use_time? 11 : 10); i++) {
- features[i] -= mean[i];
- }
- }
+ features[0] = pixel.x;
+ features[1] = pixel.y;
+ features[2] = fabsf(ccl_get_feature(buffer, 0));
+ features[3] = ccl_get_feature(buffer, 1);
+ features[4] = ccl_get_feature(buffer, 2);
+ features[5] = ccl_get_feature(buffer, 3);
+ features[6] = ccl_get_feature(buffer, 4);
+ features[7] = ccl_get_feature(buffer, 5);
+ features[8] = ccl_get_feature(buffer, 6);
+ features[9] = ccl_get_feature(buffer, 7);
+ if (use_time) {
+ features[10] = pixel.z;
+ }
+ if (mean) {
+ for (int i = 0; i < (use_time ? 11 : 10); i++) {
+ features[i] -= mean[i];
+ }
+ }
}
ccl_device_inline void filter_get_feature_scales(int3 pixel,
@@ -65,38 +68,39 @@ ccl_device_inline void filter_get_feature_scales(int3 pixel,
const float *ccl_restrict mean,
int pass_stride)
{
- scales[0] = fabsf(pixel.x - mean[0]);
- scales[1] = fabsf(pixel.y - mean[1]);
- scales[2] = fabsf(fabsf(ccl_get_feature(buffer, 0)) - mean[2]);
- scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3],
- ccl_get_feature(buffer, 2) - mean[4],
- ccl_get_feature(buffer, 3) - mean[5]));
- scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]);
- scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7],
- ccl_get_feature(buffer, 6) - mean[8],
- ccl_get_feature(buffer, 7) - mean[9]));
- if(use_time) {
- scales[6] = fabsf(pixel.z - mean[10]);
- }
+ scales[0] = fabsf(pixel.x - mean[0]);
+ scales[1] = fabsf(pixel.y - mean[1]);
+ scales[2] = fabsf(fabsf(ccl_get_feature(buffer, 0)) - mean[2]);
+ scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3],
+ ccl_get_feature(buffer, 2) - mean[4],
+ ccl_get_feature(buffer, 3) - mean[5]));
+ scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]);
+ scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7],
+ ccl_get_feature(buffer, 6) - mean[8],
+ ccl_get_feature(buffer, 7) - mean[9]));
+ if (use_time) {
+ scales[6] = fabsf(pixel.z - mean[10]);
+ }
}
ccl_device_inline void filter_calculate_scale(float *scale, bool use_time)
{
- scale[0] = 1.0f/max(scale[0], 0.01f);
- scale[1] = 1.0f/max(scale[1], 0.01f);
- scale[2] = 1.0f/max(scale[2], 0.01f);
- if(use_time) {
- scale[10] = 1.0f/max(scale[6], 0.01f);
- }
- scale[6] = 1.0f/max(scale[4], 0.01f);
- scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f);
- scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f);
+ scale[0] = 1.0f / max(scale[0], 0.01f);
+ scale[1] = 1.0f / max(scale[1], 0.01f);
+ scale[2] = 1.0f / max(scale[2], 0.01f);
+ if (use_time) {
+ scale[10] = 1.0f / max(scale[6], 0.01f);
+ }
+ scale[6] = 1.0f / max(scale[4], 0.01f);
+ scale[7] = scale[8] = scale[9] = 1.0f / max(sqrtf(scale[5]), 0.01f);
+ scale[3] = scale[4] = scale[5] = 1.0f / max(sqrtf(scale[3]), 0.01f);
}
ccl_device_inline float3 filter_get_color(const ccl_global float *ccl_restrict buffer,
int pass_stride)
{
- return make_float3(ccl_get_feature(buffer, 8), ccl_get_feature(buffer, 9), ccl_get_feature(buffer, 10));
+ return make_float3(
+ ccl_get_feature(buffer, 8), ccl_get_feature(buffer, 9), ccl_get_feature(buffer, 10));
}
ccl_device_inline void design_row_add(float *design_row,
@@ -107,42 +111,44 @@ ccl_device_inline void design_row_add(float *design_row,
float feature,
int transform_row_stride)
{
- for(int i = 0; i < rank; i++) {
- design_row[1+i] += transform[(row*transform_row_stride + i)*stride]*feature;
- }
+ for (int i = 0; i < rank; i++) {
+ design_row[1 + i] += transform[(row * transform_row_stride + i) * stride] * feature;
+ }
}
/* Fill the design row. */
-ccl_device_inline void filter_get_design_row_transform(int3 p_pixel,
- const ccl_global float *ccl_restrict p_buffer,
- int3 q_pixel,
- const ccl_global float *ccl_restrict q_buffer,
- int pass_stride,
- int rank,
- float *design_row,
- const ccl_global float *ccl_restrict transform,
- int stride,
- bool use_time)
+ccl_device_inline void filter_get_design_row_transform(
+ int3 p_pixel,
+ const ccl_global float *ccl_restrict p_buffer,
+ int3 q_pixel,
+ const ccl_global float *ccl_restrict q_buffer,
+ int pass_stride,
+ int rank,
+ float *design_row,
+ const ccl_global float *ccl_restrict transform,
+ int stride,
+ bool use_time)
{
- int num_features = use_time? 11 : 10;
+ int num_features = use_time ? 11 : 10;
- design_row[0] = 1.0f;
- math_vector_zero(design_row+1, rank);
+ design_row[0] = 1.0f;
+ math_vector_zero(design_row + 1, rank);
-#define DESIGN_ROW_ADD(I, F) design_row_add(design_row, rank, transform, stride, I, F, num_features);
- DESIGN_ROW_ADD(0, q_pixel.x - p_pixel.x);
- DESIGN_ROW_ADD(1, q_pixel.y - p_pixel.y);
- DESIGN_ROW_ADD(2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0)));
- DESIGN_ROW_ADD(3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
- DESIGN_ROW_ADD(4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
- DESIGN_ROW_ADD(5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
- DESIGN_ROW_ADD(6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4));
- DESIGN_ROW_ADD(7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5));
- DESIGN_ROW_ADD(8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6));
- DESIGN_ROW_ADD(9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7));
- if(use_time) {
- DESIGN_ROW_ADD(10, q_pixel.z - p_pixel.z)
- }
+#define DESIGN_ROW_ADD(I, F) \
+ design_row_add(design_row, rank, transform, stride, I, F, num_features);
+ DESIGN_ROW_ADD(0, q_pixel.x - p_pixel.x);
+ DESIGN_ROW_ADD(1, q_pixel.y - p_pixel.y);
+ DESIGN_ROW_ADD(2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0)));
+ DESIGN_ROW_ADD(3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
+ DESIGN_ROW_ADD(4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
+ DESIGN_ROW_ADD(5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
+ DESIGN_ROW_ADD(6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4));
+ DESIGN_ROW_ADD(7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5));
+ DESIGN_ROW_ADD(8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6));
+ DESIGN_ROW_ADD(9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7));
+ if (use_time) {
+ DESIGN_ROW_ADD(10, q_pixel.z - p_pixel.z)
+ }
#undef DESIGN_ROW_ADD
}
diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h
index 5dd001ffb93..1e0d6e93453 100644
--- a/intern/cycles/kernel/filter/filter_features_sse.h
+++ b/intern/cycles/kernel/filter/filter_features_sse.h
@@ -22,22 +22,27 @@ CCL_NAMESPACE_BEGIN
* pixel_buffer always points to the first of the 4 current pixel in the first pass.
* x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window.
* Repeat the loop for every secondary frame if there are any. */
-#define FOR_PIXEL_WINDOW_SSE for(int frame = 0; frame < tile_info->num_frames; frame++) { \
- pixel.z = tile_info->frames[frame]; \
- pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x) + frame*frame_stride; \
- float4 t4 = make_float4(pixel.z); \
- for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
- float4 y4 = make_float4(pixel.y); \
- for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
- float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \
- int4 active_pixels = x4 < make_float4(high.x);
+#define FOR_PIXEL_WINDOW_SSE \
+ for (int frame = 0; frame < tile_info->num_frames; frame++) { \
+ pixel.z = tile_info->frames[frame]; \
+ pixel_buffer = buffer + (low.y - rect.y) * buffer_w + (low.x - rect.x) + \
+ frame * frame_stride; \
+ float4 t4 = make_float4(pixel.z); \
+ for (pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
+ float4 y4 = make_float4(pixel.y); \
+ for (pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
+ float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \
+ int4 active_pixels = x4 < make_float4(high.x);
-#define END_FOR_PIXEL_WINDOW_SSE } \
- pixel_buffer += buffer_w - (high.x - low.x); \
- } \
- }
+#define END_FOR_PIXEL_WINDOW_SSE \
+ } \
+ pixel_buffer += buffer_w - (high.x - low.x); \
+ } \
+ }
-ccl_device_inline void filter_get_features_sse(float4 x, float4 y, float4 t,
+ccl_device_inline void filter_get_features_sse(float4 x,
+ float4 y,
+ float4 t,
int4 active_pixels,
const float *ccl_restrict buffer,
float4 *features,
@@ -45,33 +50,35 @@ ccl_device_inline void filter_get_features_sse(float4 x, float4 y, float4 t,
const float4 *ccl_restrict mean,
int pass_stride)
{
- int num_features = use_time? 11 : 10;
+ int num_features = use_time ? 11 : 10;
- features[0] = x;
- features[1] = y;
- features[2] = fabs(ccl_get_feature_sse(0));
- features[3] = ccl_get_feature_sse(1);
- features[4] = ccl_get_feature_sse(2);
- features[5] = ccl_get_feature_sse(3);
- features[6] = ccl_get_feature_sse(4);
- features[7] = ccl_get_feature_sse(5);
- features[8] = ccl_get_feature_sse(6);
- features[9] = ccl_get_feature_sse(7);
- if(use_time) {
- features[10] = t;
- }
+ features[0] = x;
+ features[1] = y;
+ features[2] = fabs(ccl_get_feature_sse(0));
+ features[3] = ccl_get_feature_sse(1);
+ features[4] = ccl_get_feature_sse(2);
+ features[5] = ccl_get_feature_sse(3);
+ features[6] = ccl_get_feature_sse(4);
+ features[7] = ccl_get_feature_sse(5);
+ features[8] = ccl_get_feature_sse(6);
+ features[9] = ccl_get_feature_sse(7);
+ if (use_time) {
+ features[10] = t;
+ }
- if(mean) {
- for(int i = 0; i < num_features; i++) {
- features[i] = features[i] - mean[i];
- }
- }
- for(int i = 0; i < num_features; i++) {
- features[i] = mask(active_pixels, features[i]);
- }
+ if (mean) {
+ for (int i = 0; i < num_features; i++) {
+ features[i] = features[i] - mean[i];
+ }
+ }
+ for (int i = 0; i < num_features; i++) {
+ features[i] = mask(active_pixels, features[i]);
+ }
}
-ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, float4 t,
+ccl_device_inline void filter_get_feature_scales_sse(float4 x,
+ float4 y,
+ float4 t,
int4 active_pixels,
const float *ccl_restrict buffer,
float4 *scales,
@@ -79,36 +86,34 @@ ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, float4
const float4 *ccl_restrict mean,
int pass_stride)
{
- scales[0] = fabs(x - mean[0]);
- scales[1] = fabs(y - mean[1]);
- scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]);
- scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) +
- sqr(ccl_get_feature_sse(2) - mean[4]) +
- sqr(ccl_get_feature_sse(3) - mean[5]);
- scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]);
- scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) +
- sqr(ccl_get_feature_sse(6) - mean[8]) +
- sqr(ccl_get_feature_sse(7) - mean[9]);
- if(use_time) {
- scales[6] = fabs(t - mean[10]);
- }
+ scales[0] = fabs(x - mean[0]);
+ scales[1] = fabs(y - mean[1]);
+ scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]);
+ scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) + sqr(ccl_get_feature_sse(2) - mean[4]) +
+ sqr(ccl_get_feature_sse(3) - mean[5]);
+ scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]);
+ scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) + sqr(ccl_get_feature_sse(6) - mean[8]) +
+ sqr(ccl_get_feature_sse(7) - mean[9]);
+ if (use_time) {
+ scales[6] = fabs(t - mean[10]);
+ }
- for(int i = 0; i < (use_time? 7 : 6); i++)
- scales[i] = mask(active_pixels, scales[i]);
+ for (int i = 0; i < (use_time ? 7 : 6); i++)
+ scales[i] = mask(active_pixels, scales[i]);
}
ccl_device_inline void filter_calculate_scale_sse(float4 *scale, bool use_time)
{
- scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f)));
- scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f)));
- scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f)));
- if(use_time) {
- scale[10] = rcp(max(reduce_max(scale[6]), make_float4(0.01f)));;
- }
- scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f)));
- scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f)));
- scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f)));
+ scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f)));
+ scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f)));
+ scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f)));
+ if (use_time) {
+ scale[10] = rcp(max(reduce_max(scale[6]), make_float4(0.01f)));
+ ;
+ }
+ scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f)));
+ scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f)));
+ scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f)));
}
-
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
index 9eb3c603a4a..a94266a8786 100644
--- a/intern/cycles/kernel/filter/filter_nlm_cpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h
@@ -16,10 +16,11 @@
CCL_NAMESPACE_BEGIN
-#define load4_a(buf, ofs) (*((float4*) ((buf) + (ofs))))
-#define load4_u(buf, ofs) load_float4((buf)+(ofs))
+#define load4_a(buf, ofs) (*((float4 *)((buf) + (ofs))))
+#define load4_u(buf, ofs) load_float4((buf) + (ofs))
-ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
+ccl_device_inline void kernel_filter_nlm_calc_difference(int dx,
+ int dy,
const float *ccl_restrict weight_image,
const float *ccl_restrict variance_image,
const float *ccl_restrict scale_image,
@@ -31,122 +32,117 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
float a,
float k_2)
{
- /* Strides need to be aligned to 16 bytes. */
- kernel_assert((stride % 4) == 0 && (channel_offset % 4) == 0);
-
- int aligned_lowx = rect.x & (~3);
- const int numChannels = (channel_offset > 0)? 3 : 1;
- const float4 channel_fac = make_float4(1.0f / numChannels);
-
- for(int y = rect.y; y < rect.w; y++) {
- int idx_p = y*stride + aligned_lowx;
- int idx_q = (y+dy)*stride + aligned_lowx + dx + frame_offset;
- for(int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) {
- float4 diff = make_float4(0.0f);
- float4 scale_fac;
- if(scale_image) {
- scale_fac = clamp(load4_a(scale_image, idx_p) / load4_u(scale_image, idx_q),
- make_float4(0.25f), make_float4(4.0f));
- }
- else {
- scale_fac = make_float4(1.0f);
- }
- for(int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) {
- /* idx_p is guaranteed to be aligned, but idx_q isn't. */
- float4 color_p = load4_a(weight_image, idx_p + chan_ofs);
- float4 color_q = scale_fac*load4_u(weight_image, idx_q + chan_ofs);
- float4 cdiff = color_p - color_q;
- float4 var_p = load4_a(variance_image, idx_p + chan_ofs);
- float4 var_q = sqr(scale_fac)*load4_u(variance_image, idx_q + chan_ofs);
- diff += (cdiff*cdiff - a*(var_p + min(var_p, var_q))) / (make_float4(1e-8f) + k_2*(var_p+var_q));
- }
- load4_a(difference_image, idx_p) = diff*channel_fac;
- }
- }
+ /* Strides need to be aligned to 16 bytes. */
+ kernel_assert((stride % 4) == 0 && (channel_offset % 4) == 0);
+
+ int aligned_lowx = rect.x & (~3);
+ const int numChannels = (channel_offset > 0) ? 3 : 1;
+ const float4 channel_fac = make_float4(1.0f / numChannels);
+
+ for (int y = rect.y; y < rect.w; y++) {
+ int idx_p = y * stride + aligned_lowx;
+ int idx_q = (y + dy) * stride + aligned_lowx + dx + frame_offset;
+ for (int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) {
+ float4 diff = make_float4(0.0f);
+ float4 scale_fac;
+ if (scale_image) {
+ scale_fac = clamp(load4_a(scale_image, idx_p) / load4_u(scale_image, idx_q),
+ make_float4(0.25f),
+ make_float4(4.0f));
+ }
+ else {
+ scale_fac = make_float4(1.0f);
+ }
+ for (int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) {
+ /* idx_p is guaranteed to be aligned, but idx_q isn't. */
+ float4 color_p = load4_a(weight_image, idx_p + chan_ofs);
+ float4 color_q = scale_fac * load4_u(weight_image, idx_q + chan_ofs);
+ float4 cdiff = color_p - color_q;
+ float4 var_p = load4_a(variance_image, idx_p + chan_ofs);
+ float4 var_q = sqr(scale_fac) * load4_u(variance_image, idx_q + chan_ofs);
+ diff += (cdiff * cdiff - a * (var_p + min(var_p, var_q))) /
+ (make_float4(1e-8f) + k_2 * (var_p + var_q));
+ }
+ load4_a(difference_image, idx_p) = diff * channel_fac;
+ }
+ }
}
-ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict difference_image,
- float *out_image,
- int4 rect,
- int stride,
- int f)
+ccl_device_inline void kernel_filter_nlm_blur(
+ const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f)
{
- int aligned_lowx = round_down(rect.x, 4);
- for(int y = rect.y; y < rect.w; y++) {
- const int low = max(rect.y, y-f);
- const int high = min(rect.w, y+f+1);
- for(int x = aligned_lowx; x < rect.z; x += 4) {
- load4_a(out_image, y*stride + x) = make_float4(0.0f);
- }
- for(int y1 = low; y1 < high; y1++) {
- for(int x = aligned_lowx; x < rect.z; x += 4) {
- load4_a(out_image, y*stride + x) += load4_a(difference_image, y1*stride + x);
- }
- }
- float fac = 1.0f/(high - low);
- for(int x = aligned_lowx; x < rect.z; x += 4) {
- load4_a(out_image, y*stride + x) *= fac;
- }
- }
+ int aligned_lowx = round_down(rect.x, 4);
+ for (int y = rect.y; y < rect.w; y++) {
+ const int low = max(rect.y, y - f);
+ const int high = min(rect.w, y + f + 1);
+ for (int x = aligned_lowx; x < rect.z; x += 4) {
+ load4_a(out_image, y * stride + x) = make_float4(0.0f);
+ }
+ for (int y1 = low; y1 < high; y1++) {
+ for (int x = aligned_lowx; x < rect.z; x += 4) {
+ load4_a(out_image, y * stride + x) += load4_a(difference_image, y1 * stride + x);
+ }
+ }
+ float fac = 1.0f / (high - low);
+ for (int x = aligned_lowx; x < rect.z; x += 4) {
+ load4_a(out_image, y * stride + x) *= fac;
+ }
+ }
}
-ccl_device_inline void nlm_blur_horizontal(const float *ccl_restrict difference_image,
- float *out_image,
- int4 rect,
- int stride,
- int f)
+ccl_device_inline void nlm_blur_horizontal(
+ const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f)
{
- int aligned_lowx = round_down(rect.x, 4);
- for(int y = rect.y; y < rect.w; y++) {
- for(int x = aligned_lowx; x < rect.z; x += 4) {
- load4_a(out_image, y*stride + x) = make_float4(0.0f);
- }
- }
-
- for(int dx = -f; dx <= f; dx++) {
- aligned_lowx = round_down(rect.x - min(0, dx), 4);
- int highx = rect.z - max(0, dx);
- int4 lowx4 = make_int4(rect.x - min(0, dx));
- int4 highx4 = make_int4(rect.z - max(0, dx));
- for(int y = rect.y; y < rect.w; y++) {
- for(int x = aligned_lowx; x < highx; x += 4) {
- int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
- int4 active = (x4 >= lowx4) & (x4 < highx4);
-
- float4 diff = load4_u(difference_image, y*stride + x + dx);
- load4_a(out_image, y*stride + x) += mask(active, diff);
- }
- }
- }
-
- aligned_lowx = round_down(rect.x, 4);
- for(int y = rect.y; y < rect.w; y++) {
- for(int x = aligned_lowx; x < rect.z; x += 4) {
- float4 x4 = make_float4(x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f);
- float4 low = max(make_float4(rect.x), x4 - make_float4(f));
- float4 high = min(make_float4(rect.z), x4 + make_float4(f+1));
- load4_a(out_image, y*stride + x) *= rcp(high - low);
- }
- }
+ int aligned_lowx = round_down(rect.x, 4);
+ for (int y = rect.y; y < rect.w; y++) {
+ for (int x = aligned_lowx; x < rect.z; x += 4) {
+ load4_a(out_image, y * stride + x) = make_float4(0.0f);
+ }
+ }
+
+ for (int dx = -f; dx <= f; dx++) {
+ aligned_lowx = round_down(rect.x - min(0, dx), 4);
+ int highx = rect.z - max(0, dx);
+ int4 lowx4 = make_int4(rect.x - min(0, dx));
+ int4 highx4 = make_int4(rect.z - max(0, dx));
+ for (int y = rect.y; y < rect.w; y++) {
+ for (int x = aligned_lowx; x < highx; x += 4) {
+ int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
+ int4 active = (x4 >= lowx4) & (x4 < highx4);
+
+ float4 diff = load4_u(difference_image, y * stride + x + dx);
+ load4_a(out_image, y * stride + x) += mask(active, diff);
+ }
+ }
+ }
+
+ aligned_lowx = round_down(rect.x, 4);
+ for (int y = rect.y; y < rect.w; y++) {
+ for (int x = aligned_lowx; x < rect.z; x += 4) {
+ float4 x4 = make_float4(x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f);
+ float4 low = max(make_float4(rect.x), x4 - make_float4(f));
+ float4 high = min(make_float4(rect.z), x4 + make_float4(f + 1));
+ load4_a(out_image, y * stride + x) *= rcp(high - low);
+ }
+ }
}
-ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
- float *out_image,
- int4 rect,
- int stride,
- int f)
+ccl_device_inline void kernel_filter_nlm_calc_weight(
+ const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f)
{
- nlm_blur_horizontal(difference_image, out_image, rect, stride, f);
-
- int aligned_lowx = round_down(rect.x, 4);
- for(int y = rect.y; y < rect.w; y++) {
- for(int x = aligned_lowx; x < rect.z; x += 4) {
- load4_a(out_image, y*stride + x) = fast_expf4(-max(load4_a(out_image, y*stride + x), make_float4(0.0f)));
- }
- }
+ nlm_blur_horizontal(difference_image, out_image, rect, stride, f);
+
+ int aligned_lowx = round_down(rect.x, 4);
+ for (int y = rect.y; y < rect.w; y++) {
+ for (int x = aligned_lowx; x < rect.z; x += 4) {
+ load4_a(out_image, y * stride + x) = fast_expf4(
+ -max(load4_a(out_image, y * stride + x), make_float4(0.0f)));
+ }
+ }
}
-ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
+ccl_device_inline void kernel_filter_nlm_update_output(int dx,
+ int dy,
const float *ccl_restrict difference_image,
const float *ccl_restrict image,
float *temp_image,
@@ -157,33 +153,36 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
int stride,
int f)
{
- nlm_blur_horizontal(difference_image, temp_image, rect, stride, f);
+ nlm_blur_horizontal(difference_image, temp_image, rect, stride, f);
- int aligned_lowx = round_down(rect.x, 4);
- for(int y = rect.y; y < rect.w; y++) {
- for(int x = aligned_lowx; x < rect.z; x += 4) {
- int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
- int4 active = (x4 >= make_int4(rect.x)) & (x4 < make_int4(rect.z));
+ int aligned_lowx = round_down(rect.x, 4);
+ for (int y = rect.y; y < rect.w; y++) {
+ for (int x = aligned_lowx; x < rect.z; x += 4) {
+ int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
+ int4 active = (x4 >= make_int4(rect.x)) & (x4 < make_int4(rect.z));
- int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx);
+ int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx);
- float4 weight = load4_a(temp_image, idx_p);
- load4_a(accum_image, idx_p) += mask(active, weight);
+ float4 weight = load4_a(temp_image, idx_p);
+ load4_a(accum_image, idx_p) += mask(active, weight);
- float4 val = load4_u(image, idx_q);
- if(channel_offset) {
- val += load4_u(image, idx_q + channel_offset);
- val += load4_u(image, idx_q + 2*channel_offset);
- val *= 1.0f/3.0f;
- }
+ float4 val = load4_u(image, idx_q);
+ if (channel_offset) {
+ val += load4_u(image, idx_q + channel_offset);
+ val += load4_u(image, idx_q + 2 * channel_offset);
+ val *= 1.0f / 3.0f;
+ }
- load4_a(out_image, idx_p) += mask(active, weight*val);
- }
- }
+ load4_a(out_image, idx_p) += mask(active, weight * val);
+ }
+ }
}
-ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, int t,
- const float *ccl_restrict difference_image,
+ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx,
+ int dy,
+ int t,
+ const float *ccl_restrict
+ difference_image,
const float *ccl_restrict buffer,
float *transform,
int *rank,
@@ -191,40 +190,49 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, int t
float3 *XtWY,
int4 rect,
int4 filter_window,
- int stride, int f,
+ int stride,
+ int f,
int pass_stride,
int frame_offset,
bool use_time)
{
- int4 clip_area = rect_clip(rect, filter_window);
- /* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
- for(int y = clip_area.y; y < clip_area.w; y++) {
- for(int x = clip_area.x; x < clip_area.z; x++) {
- const int low = max(rect.x, x-f);
- const int high = min(rect.z, x+f+1);
- float sum = 0.0f;
- for(int x1 = low; x1 < high; x1++) {
- sum += difference_image[y*stride + x1];
- }
- float weight = sum * (1.0f/(high - low));
-
- int storage_ofs = coord_to_local_index(filter_window, x, y);
- float *l_transform = transform + storage_ofs*TRANSFORM_SIZE;
- float *l_XtWX = XtWX + storage_ofs*XTWX_SIZE;
- float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE;
- int *l_rank = rank + storage_ofs;
-
- kernel_filter_construct_gramian(x, y, 1,
- dx, dy, t,
- stride,
- pass_stride,
- frame_offset,
- use_time,
- buffer,
- l_transform, l_rank,
- weight, l_XtWX, l_XtWY, 0);
- }
- }
+ int4 clip_area = rect_clip(rect, filter_window);
+ /* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
+ for (int y = clip_area.y; y < clip_area.w; y++) {
+ for (int x = clip_area.x; x < clip_area.z; x++) {
+ const int low = max(rect.x, x - f);
+ const int high = min(rect.z, x + f + 1);
+ float sum = 0.0f;
+ for (int x1 = low; x1 < high; x1++) {
+ sum += difference_image[y * stride + x1];
+ }
+ float weight = sum * (1.0f / (high - low));
+
+ int storage_ofs = coord_to_local_index(filter_window, x, y);
+ float *l_transform = transform + storage_ofs * TRANSFORM_SIZE;
+ float *l_XtWX = XtWX + storage_ofs * XTWX_SIZE;
+ float3 *l_XtWY = XtWY + storage_ofs * XTWY_SIZE;
+ int *l_rank = rank + storage_ofs;
+
+ kernel_filter_construct_gramian(x,
+ y,
+ 1,
+ dx,
+ dy,
+ t,
+ stride,
+ pass_stride,
+ frame_offset,
+ use_time,
+ buffer,
+ l_transform,
+ l_rank,
+ weight,
+ l_XtWX,
+ l_XtWY,
+ 0);
+ }
+ }
}
ccl_device_inline void kernel_filter_nlm_normalize(float *out_image,
@@ -232,11 +240,11 @@ ccl_device_inline void kernel_filter_nlm_normalize(float *out_image,
int4 rect,
int w)
{
- for(int y = rect.y; y < rect.w; y++) {
- for(int x = rect.x; x < rect.z; x++) {
- out_image[y*w+x] /= accum_image[y*w+x];
- }
- }
+ for (int y = rect.y; y < rect.w; y++) {
+ for (int x = rect.x; x < rect.z; x++) {
+ out_image[y * w + x] /= accum_image[y * w + x];
+ }
+ }
}
#undef load4_a
diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h
index 12636393243..650c743f34f 100644
--- a/intern/cycles/kernel/filter/filter_nlm_gpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h
@@ -24,203 +24,232 @@ CCL_NAMESPACE_BEGIN
* Window is the rect that should be processed.
* co is filled with (x, y, dx, dy).
*/
-ccl_device_inline bool get_nlm_coords_window(int w, int h, int r, int stride,
- int4 *rect, int4 *co, int *ofs,
- int4 window)
+ccl_device_inline bool get_nlm_coords_window(
+ int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs, int4 window)
{
- /* Determine the pixel offset that this thread should apply. */
- int s = 2*r+1;
- int si = ccl_global_id(1);
- int sx = si % s;
- int sy = si / s;
- if(sy >= s) {
- return false;
- }
-
- /* Pixels still need to lie inside the denoising buffer after applying the offset,
- * so determine the area for which this is the case. */
- int dx = sx - r;
- int dy = sy - r;
-
- *rect = make_int4(max(0, -dx), max(0, -dy),
- w - max(0, dx), h - max(0, dy));
-
- /* Find the intersection of the area that we want to process (window) and the area
- * that can be processed (rect) to get the final area for this offset. */
- int4 clip_area = rect_clip(window, *rect);
-
- /* If the radius is larger than one of the sides of the window,
- * there will be shifts for which there is no usable pixel at all. */
- if(!rect_is_valid(clip_area)) {
- return false;
- }
-
- /* Map the linear thread index to pixels inside the clip area. */
- int x, y;
- if(!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) {
- return false;
- }
-
- *co = make_int4(x, y, dx, dy);
-
- *ofs = (sy*s + sx) * stride;
-
- return true;
+ /* Determine the pixel offset that this thread should apply. */
+ int s = 2 * r + 1;
+ int si = ccl_global_id(1);
+ int sx = si % s;
+ int sy = si / s;
+ if (sy >= s) {
+ return false;
+ }
+
+ /* Pixels still need to lie inside the denoising buffer after applying the offset,
+ * so determine the area for which this is the case. */
+ int dx = sx - r;
+ int dy = sy - r;
+
+ *rect = make_int4(max(0, -dx), max(0, -dy), w - max(0, dx), h - max(0, dy));
+
+ /* Find the intersection of the area that we want to process (window) and the area
+ * that can be processed (rect) to get the final area for this offset. */
+ int4 clip_area = rect_clip(window, *rect);
+
+ /* If the radius is larger than one of the sides of the window,
+ * there will be shifts for which there is no usable pixel at all. */
+ if (!rect_is_valid(clip_area)) {
+ return false;
+ }
+
+ /* Map the linear thread index to pixels inside the clip area. */
+ int x, y;
+ if (!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) {
+ return false;
+ }
+
+ *co = make_int4(x, y, dx, dy);
+
+ *ofs = (sy * s + sx) * stride;
+
+ return true;
}
-ccl_device_inline bool get_nlm_coords(int w, int h, int r, int stride,
- int4 *rect, int4 *co, int *ofs)
+ccl_device_inline bool get_nlm_coords(
+ int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs)
{
- return get_nlm_coords_window(w, h, r, stride, rect, co, ofs, make_int4(0, 0, w, h));
+ return get_nlm_coords_window(w, h, r, stride, rect, co, ofs, make_int4(0, 0, w, h));
}
-ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
- int dx, int dy,
- const ccl_global float *ccl_restrict weight_image,
- const ccl_global float *ccl_restrict variance_image,
- const ccl_global float *ccl_restrict scale_image,
- ccl_global float *difference_image,
- int4 rect, int stride,
- int channel_offset,
- int frame_offset,
- float a, float k_2)
+ccl_device_inline void kernel_filter_nlm_calc_difference(
+ int x,
+ int y,
+ int dx,
+ int dy,
+ const ccl_global float *ccl_restrict weight_image,
+ const ccl_global float *ccl_restrict variance_image,
+ const ccl_global float *ccl_restrict scale_image,
+ ccl_global float *difference_image,
+ int4 rect,
+ int stride,
+ int channel_offset,
+ int frame_offset,
+ float a,
+ float k_2)
{
- int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx) + frame_offset;
- int numChannels = channel_offset? 3 : 1;
-
- float diff = 0.0f;
- float scale_fac = 1.0f;
- if(scale_image) {
- scale_fac = clamp(scale_image[idx_p] / scale_image[idx_q], 0.25f, 4.0f);
- }
-
- for(int c = 0; c < numChannels; c++, idx_p += channel_offset, idx_q += channel_offset) {
- float cdiff = weight_image[idx_p] - scale_fac*weight_image[idx_q];
- float pvar = variance_image[idx_p];
- float qvar = sqr(scale_fac)*variance_image[idx_q];
- diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
- }
- if(numChannels > 1) {
- diff *= 1.0f/numChannels;
- }
- difference_image[y*stride + x] = diff;
+ int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx) + frame_offset;
+ int numChannels = channel_offset ? 3 : 1;
+
+ float diff = 0.0f;
+ float scale_fac = 1.0f;
+ if (scale_image) {
+ scale_fac = clamp(scale_image[idx_p] / scale_image[idx_q], 0.25f, 4.0f);
+ }
+
+ for (int c = 0; c < numChannels; c++, idx_p += channel_offset, idx_q += channel_offset) {
+ float cdiff = weight_image[idx_p] - scale_fac * weight_image[idx_q];
+ float pvar = variance_image[idx_p];
+ float qvar = sqr(scale_fac) * variance_image[idx_q];
+ diff += (cdiff * cdiff - a * (pvar + min(pvar, qvar))) / (1e-8f + k_2 * (pvar + qvar));
+ }
+ if (numChannels > 1) {
+ diff *= 1.0f / numChannels;
+ }
+ difference_image[y * stride + x] = diff;
}
-ccl_device_inline void kernel_filter_nlm_blur(int x, int y,
- const ccl_global float *ccl_restrict difference_image,
+ccl_device_inline void kernel_filter_nlm_blur(int x,
+ int y,
+ const ccl_global float *ccl_restrict
+ difference_image,
ccl_global float *out_image,
- int4 rect, int stride, int f)
+ int4 rect,
+ int stride,
+ int f)
{
- float sum = 0.0f;
- const int low = max(rect.y, y-f);
- const int high = min(rect.w, y+f+1);
- for(int y1 = low; y1 < high; y1++) {
- sum += difference_image[y1*stride + x];
- }
- sum *= 1.0f/(high-low);
- out_image[y*stride + x] = sum;
+ float sum = 0.0f;
+ const int low = max(rect.y, y - f);
+ const int high = min(rect.w, y + f + 1);
+ for (int y1 = low; y1 < high; y1++) {
+ sum += difference_image[y1 * stride + x];
+ }
+ sum *= 1.0f / (high - low);
+ out_image[y * stride + x] = sum;
}
-ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y,
- const ccl_global float *ccl_restrict difference_image,
+ccl_device_inline void kernel_filter_nlm_calc_weight(int x,
+ int y,
+ const ccl_global float *ccl_restrict
+ difference_image,
ccl_global float *out_image,
- int4 rect, int stride, int f)
+ int4 rect,
+ int stride,
+ int f)
{
- float sum = 0.0f;
- const int low = max(rect.x, x-f);
- const int high = min(rect.z, x+f+1);
- for(int x1 = low; x1 < high; x1++) {
- sum += difference_image[y*stride + x1];
- }
- sum *= 1.0f/(high-low);
- out_image[y*stride + x] = fast_expf(-max(sum, 0.0f));
+ float sum = 0.0f;
+ const int low = max(rect.x, x - f);
+ const int high = min(rect.z, x + f + 1);
+ for (int x1 = low; x1 < high; x1++) {
+ sum += difference_image[y * stride + x1];
+ }
+ sum *= 1.0f / (high - low);
+ out_image[y * stride + x] = fast_expf(-max(sum, 0.0f));
}
-ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
- int dx, int dy,
- const ccl_global float *ccl_restrict difference_image,
+ccl_device_inline void kernel_filter_nlm_update_output(int x,
+ int y,
+ int dx,
+ int dy,
+ const ccl_global float *ccl_restrict
+ difference_image,
const ccl_global float *ccl_restrict image,
ccl_global float *out_image,
ccl_global float *accum_image,
- int4 rect, int channel_offset,
- int stride, int f)
+ int4 rect,
+ int channel_offset,
+ int stride,
+ int f)
{
- float sum = 0.0f;
- const int low = max(rect.x, x-f);
- const int high = min(rect.z, x+f+1);
- for(int x1 = low; x1 < high; x1++) {
- sum += difference_image[y*stride + x1];
- }
- sum *= 1.0f/(high-low);
-
- int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx);
- if(out_image) {
- atomic_add_and_fetch_float(accum_image + idx_p, sum);
-
- float val = image[idx_q];
- if(channel_offset) {
- val += image[idx_q + channel_offset];
- val += image[idx_q + 2*channel_offset];
- val *= 1.0f/3.0f;
- }
- atomic_add_and_fetch_float(out_image + idx_p, sum*val);
- }
- else {
- accum_image[idx_p] = sum;
- }
+ float sum = 0.0f;
+ const int low = max(rect.x, x - f);
+ const int high = min(rect.z, x + f + 1);
+ for (int x1 = low; x1 < high; x1++) {
+ sum += difference_image[y * stride + x1];
+ }
+ sum *= 1.0f / (high - low);
+
+ int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx);
+ if (out_image) {
+ atomic_add_and_fetch_float(accum_image + idx_p, sum);
+
+ float val = image[idx_q];
+ if (channel_offset) {
+ val += image[idx_q + channel_offset];
+ val += image[idx_q + 2 * channel_offset];
+ val *= 1.0f / 3.0f;
+ }
+ atomic_add_and_fetch_float(out_image + idx_p, sum * val);
+ }
+ else {
+ accum_image[idx_p] = sum;
+ }
}
-ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y,
- int dx, int dy, int t,
- const ccl_global float *ccl_restrict difference_image,
- const ccl_global float *ccl_restrict buffer,
- const ccl_global float *ccl_restrict transform,
- ccl_global int *rank,
- ccl_global float *XtWX,
- ccl_global float3 *XtWY,
- int4 rect,
- int4 filter_window,
- int stride, int f,
- int pass_stride,
- int frame_offset,
- bool use_time,
- int localIdx)
+ccl_device_inline void kernel_filter_nlm_construct_gramian(
+ int x,
+ int y,
+ int dx,
+ int dy,
+ int t,
+ const ccl_global float *ccl_restrict difference_image,
+ const ccl_global float *ccl_restrict buffer,
+ const ccl_global float *ccl_restrict transform,
+ ccl_global int *rank,
+ ccl_global float *XtWX,
+ ccl_global float3 *XtWY,
+ int4 rect,
+ int4 filter_window,
+ int stride,
+ int f,
+ int pass_stride,
+ int frame_offset,
+ bool use_time,
+ int localIdx)
{
- const int low = max(rect.x, x-f);
- const int high = min(rect.z, x+f+1);
- float sum = 0.0f;
- for(int x1 = low; x1 < high; x1++) {
- sum += difference_image[y*stride + x1];
- }
- float weight = sum * (1.0f/(high - low));
-
- /* Reconstruction data is only stored for pixels inside the filter window,
- * so compute the pixels's index in there. */
- int storage_ofs = coord_to_local_index(filter_window, x, y);
- transform += storage_ofs;
- rank += storage_ofs;
- XtWX += storage_ofs;
- XtWY += storage_ofs;
-
- kernel_filter_construct_gramian(x, y,
- rect_size(filter_window),
- dx, dy, t,
- stride,
- pass_stride,
- frame_offset,
- use_time,
- buffer,
- transform, rank,
- weight, XtWX, XtWY,
- localIdx);
+ const int low = max(rect.x, x - f);
+ const int high = min(rect.z, x + f + 1);
+ float sum = 0.0f;
+ for (int x1 = low; x1 < high; x1++) {
+ sum += difference_image[y * stride + x1];
+ }
+ float weight = sum * (1.0f / (high - low));
+
+ /* Reconstruction data is only stored for pixels inside the filter window,
+ * so compute the pixels's index in there. */
+ int storage_ofs = coord_to_local_index(filter_window, x, y);
+ transform += storage_ofs;
+ rank += storage_ofs;
+ XtWX += storage_ofs;
+ XtWY += storage_ofs;
+
+ kernel_filter_construct_gramian(x,
+ y,
+ rect_size(filter_window),
+ dx,
+ dy,
+ t,
+ stride,
+ pass_stride,
+ frame_offset,
+ use_time,
+ buffer,
+ transform,
+ rank,
+ weight,
+ XtWX,
+ XtWY,
+ localIdx);
}
-ccl_device_inline void kernel_filter_nlm_normalize(int x, int y,
+ccl_device_inline void kernel_filter_nlm_normalize(int x,
+ int y,
ccl_global float *out_image,
- const ccl_global float *ccl_restrict accum_image,
+ const ccl_global float *ccl_restrict
+ accum_image,
int stride)
{
- out_image[y*stride + x] /= accum_image[y*stride + x];
+ out_image[y * stride + x] /= accum_image[y * stride + x];
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h
index e24f4feb28d..8211311313d 100644
--- a/intern/cycles/kernel/filter/filter_prefilter.h
+++ b/intern/cycles/kernel/filter/filter_prefilter.h
@@ -27,7 +27,8 @@ CCL_NAMESPACE_BEGIN
*/
ccl_device void kernel_filter_divide_shadow(int sample,
CCL_FILTER_TILE_INFO,
- int x, int y,
+ int x,
+ int y,
ccl_global float *unfilteredA,
ccl_global float *unfilteredB,
ccl_global float *sampleVariance,
@@ -37,37 +38,39 @@ ccl_device void kernel_filter_divide_shadow(int sample,
int buffer_pass_stride,
int buffer_denoising_offset)
{
- int xtile = (x < tile_info->x[1])? 0: ((x < tile_info->x[2])? 1: 2);
- int ytile = (y < tile_info->y[1])? 0: ((y < tile_info->y[2])? 1: 2);
- int tile = ytile*3+xtile;
+ int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2);
+ int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2);
+ int tile = ytile * 3 + xtile;
- int offset = tile_info->offsets[tile];
- int stride = tile_info->strides[tile];
- const ccl_global float *ccl_restrict center_buffer = (ccl_global float*) ccl_get_tile_buffer(tile);
- center_buffer += (y*stride + x + offset)*buffer_pass_stride;
- center_buffer += buffer_denoising_offset + 14;
+ int offset = tile_info->offsets[tile];
+ int stride = tile_info->strides[tile];
+ const ccl_global float *ccl_restrict center_buffer = (ccl_global float *)ccl_get_tile_buffer(
+ tile);
+ center_buffer += (y * stride + x + offset) * buffer_pass_stride;
+ center_buffer += buffer_denoising_offset + 14;
- int buffer_w = align_up(rect.z - rect.x, 4);
- int idx = (y-rect.y)*buffer_w + (x - rect.x);
- unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f);
- unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f);
+ int buffer_w = align_up(rect.z - rect.x, 4);
+ int idx = (y - rect.y) * buffer_w + (x - rect.x);
+ unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f);
+ unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f);
- float varA = center_buffer[2];
- float varB = center_buffer[5];
- int odd_sample = (sample+1)/2;
- int even_sample = sample/2;
+ float varA = center_buffer[2];
+ float varB = center_buffer[5];
+ int odd_sample = (sample + 1) / 2;
+ int even_sample = sample / 2;
- /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
- * update does not work efficiently with atomics in the kernel. */
- varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample);
- varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample);
+ /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
+ * update does not work efficiently with atomics in the kernel. */
+ varA = max(0.0f, varA - unfilteredA[idx] * unfilteredA[idx] * odd_sample);
+ varB = max(0.0f, varB - unfilteredB[idx] * unfilteredB[idx] * even_sample);
- varA /= max(odd_sample - 1, 1);
- varB /= max(even_sample - 1, 1);
+ varA /= max(odd_sample - 1, 1);
+ varB /= max(even_sample - 1, 1);
- sampleVariance[idx] = 0.5f*(varA + varB) / sample;
- sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample*sample);
- bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) * (unfilteredA[idx] - unfilteredB[idx]);
+ sampleVariance[idx] = 0.5f * (varA + varB) / sample;
+ sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample * sample);
+ bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) *
+ (unfilteredA[idx] - unfilteredB[idx]);
}
/* Load a regular feature from the render buffers into the denoise buffer.
@@ -80,55 +83,65 @@ ccl_device void kernel_filter_divide_shadow(int sample,
*/
ccl_device void kernel_filter_get_feature(int sample,
CCL_FILTER_TILE_INFO,
- int m_offset, int v_offset,
- int x, int y,
+ int m_offset,
+ int v_offset,
+ int x,
+ int y,
ccl_global float *mean,
ccl_global float *variance,
float scale,
- int4 rect, int buffer_pass_stride,
+ int4 rect,
+ int buffer_pass_stride,
int buffer_denoising_offset)
{
- int xtile = (x < tile_info->x[1])? 0: ((x < tile_info->x[2])? 1: 2);
- int ytile = (y < tile_info->y[1])? 0: ((y < tile_info->y[2])? 1: 2);
- int tile = ytile*3+xtile;
- ccl_global float *center_buffer = ((ccl_global float*) ccl_get_tile_buffer(tile)) + (tile_info->offsets[tile] + y*tile_info->strides[tile] + x)*buffer_pass_stride + buffer_denoising_offset;
+ int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2);
+ int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2);
+ int tile = ytile * 3 + xtile;
+ ccl_global float *center_buffer = ((ccl_global float *)ccl_get_tile_buffer(tile)) +
+ (tile_info->offsets[tile] + y * tile_info->strides[tile] + x) *
+ buffer_pass_stride +
+ buffer_denoising_offset;
- int buffer_w = align_up(rect.z - rect.x, 4);
- int idx = (y-rect.y)*buffer_w + (x - rect.x);
+ int buffer_w = align_up(rect.z - rect.x, 4);
+ int idx = (y - rect.y) * buffer_w + (x - rect.x);
- float val = scale * center_buffer[m_offset];
- mean[idx] = val;
+ float val = scale * center_buffer[m_offset];
+ mean[idx] = val;
- if(v_offset >= 0) {
- if(sample > 1) {
- /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
- * update does not work efficiently with atomics in the kernel. */
- variance[idx] = max(0.0f, (center_buffer[v_offset] - val*val*sample) / (sample * (sample-1)));
- }
- else {
- /* Can't compute variance with single sample, just set it very high. */
- variance[idx] = 1e10f;
- }
- }
+ if (v_offset >= 0) {
+ if (sample > 1) {
+ /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
+ * update does not work efficiently with atomics in the kernel. */
+ variance[idx] = max(
+ 0.0f, (center_buffer[v_offset] - val * val * sample) / (sample * (sample - 1)));
+ }
+ else {
+ /* Can't compute variance with single sample, just set it very high. */
+ variance[idx] = 1e10f;
+ }
+ }
}
ccl_device void kernel_filter_write_feature(int sample,
- int x, int y,
+ int x,
+ int y,
int4 buffer_params,
ccl_global float *from,
ccl_global float *buffer,
int out_offset,
int4 rect)
{
- ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
+ ccl_global float *combined_buffer = buffer + (y * buffer_params.y + x + buffer_params.x) *
+ buffer_params.z;
- int buffer_w = align_up(rect.z - rect.x, 4);
- int idx = (y-rect.y)*buffer_w + (x - rect.x);
+ int buffer_w = align_up(rect.z - rect.x, 4);
+ int idx = (y - rect.y) * buffer_w + (x - rect.x);
- combined_buffer[out_offset] = from[idx];
+ combined_buffer[out_offset] = from[idx];
}
-ccl_device void kernel_filter_detect_outliers(int x, int y,
+ccl_device void kernel_filter_detect_outliers(int x,
+ int y,
ccl_global float *image,
ccl_global float *variance,
ccl_global float *depth,
@@ -136,123 +149,131 @@ ccl_device void kernel_filter_detect_outliers(int x, int y,
int4 rect,
int pass_stride)
{
- int buffer_w = align_up(rect.z - rect.x, 4);
+ int buffer_w = align_up(rect.z - rect.x, 4);
- int n = 0;
- float values[25];
- float pixel_variance, max_variance = 0.0f;
- for(int y1 = max(y-2, rect.y); y1 < min(y+3, rect.w); y1++) {
- for(int x1 = max(x-2, rect.x); x1 < min(x+3, rect.z); x1++) {
- int idx = (y1-rect.y)*buffer_w + (x1-rect.x);
- float3 color = make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]);
- color = max(color, make_float3(0.0f, 0.0f, 0.0f));
- float L = average(color);
+ int n = 0;
+ float values[25];
+ float pixel_variance, max_variance = 0.0f;
+ for (int y1 = max(y - 2, rect.y); y1 < min(y + 3, rect.w); y1++) {
+ for (int x1 = max(x - 2, rect.x); x1 < min(x + 3, rect.z); x1++) {
+ int idx = (y1 - rect.y) * buffer_w + (x1 - rect.x);
+ float3 color = make_float3(
+ image[idx], image[idx + pass_stride], image[idx + 2 * pass_stride]);
+ color = max(color, make_float3(0.0f, 0.0f, 0.0f));
+ float L = average(color);
- /* Find the position of L. */
- int i;
- for(i = 0; i < n; i++) {
- if(values[i] > L) break;
- }
- /* Make space for L by shifting all following values to the right. */
- for(int j = n; j > i; j--) {
- values[j] = values[j-1];
- }
- /* Insert L. */
- values[i] = L;
- n++;
+ /* Find the position of L. */
+ int i;
+ for (i = 0; i < n; i++) {
+ if (values[i] > L)
+ break;
+ }
+ /* Make space for L by shifting all following values to the right. */
+ for (int j = n; j > i; j--) {
+ values[j] = values[j - 1];
+ }
+ /* Insert L. */
+ values[i] = L;
+ n++;
- float3 pixel_var = make_float3(variance[idx], variance[idx+pass_stride], variance[idx+2*pass_stride]);
- float var = average(pixel_var);
- if((x1 == x) && (y1 == y)) {
- pixel_variance = (pixel_var.x < 0.0f || pixel_var.y < 0.0f || pixel_var.z < 0.0f)? -1.0f : var;
- }
- else {
- max_variance = max(max_variance, var);
- }
- }
- }
+ float3 pixel_var = make_float3(
+ variance[idx], variance[idx + pass_stride], variance[idx + 2 * pass_stride]);
+ float var = average(pixel_var);
+ if ((x1 == x) && (y1 == y)) {
+ pixel_variance = (pixel_var.x < 0.0f || pixel_var.y < 0.0f || pixel_var.z < 0.0f) ? -1.0f :
+ var;
+ }
+ else {
+ max_variance = max(max_variance, var);
+ }
+ }
+ }
- max_variance += 1e-4f;
+ max_variance += 1e-4f;
- int idx = (y-rect.y)*buffer_w + (x-rect.x);
- float3 color = make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]);
- color = max(color, make_float3(0.0f, 0.0f, 0.0f));
- float L = average(color);
+ int idx = (y - rect.y) * buffer_w + (x - rect.x);
+ float3 color = make_float3(image[idx], image[idx + pass_stride], image[idx + 2 * pass_stride]);
+ color = max(color, make_float3(0.0f, 0.0f, 0.0f));
+ float L = average(color);
- float ref = 2.0f*values[(int)(n*0.75f)];
+ float ref = 2.0f * values[(int)(n * 0.75f)];
- /* Slightly offset values to avoid false positives in (almost) black areas. */
- max_variance += 1e-5f;
- ref -= 1e-5f;
+ /* Slightly offset values to avoid false positives in (almost) black areas. */
+ max_variance += 1e-5f;
+ ref -= 1e-5f;
- if(L > ref) {
- /* The pixel appears to be an outlier.
- * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is that the pixel
- * should actually be at the reference value:
- * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier.
- * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight.
- */
+ if (L > ref) {
+ /* The pixel appears to be an outlier.
+ * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is that the pixel
+ * should actually be at the reference value:
+ * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier.
+ * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight.
+ */
- if(pixel_variance < 0.0f || pixel_variance > 9.0f * max_variance) {
- depth[idx] = -depth[idx];
- color *= ref/L;
- variance[idx] = variance[idx + pass_stride] = variance[idx + 2*pass_stride] = max_variance;
- }
- else {
- float stddev = sqrtf(pixel_variance);
- if(L - 3*stddev < ref) {
- /* The pixel is an outlier, so negate the depth value to mark it as one.
- * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */
- depth[idx] = -depth[idx];
- float fac = ref/L;
- color *= fac;
- variance[idx ] *= fac*fac;
- variance[idx + pass_stride] *= fac*fac;
- variance[idx+2*pass_stride] *= fac*fac;
- }
- }
- }
- out[idx ] = color.x;
- out[idx + pass_stride] = color.y;
- out[idx+2*pass_stride] = color.z;
+ if (pixel_variance < 0.0f || pixel_variance > 9.0f * max_variance) {
+ depth[idx] = -depth[idx];
+ color *= ref / L;
+ variance[idx] = variance[idx + pass_stride] = variance[idx + 2 * pass_stride] = max_variance;
+ }
+ else {
+ float stddev = sqrtf(pixel_variance);
+ if (L - 3 * stddev < ref) {
+ /* The pixel is an outlier, so negate the depth value to mark it as one.
+ * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */
+ depth[idx] = -depth[idx];
+ float fac = ref / L;
+ color *= fac;
+ variance[idx] *= fac * fac;
+ variance[idx + pass_stride] *= fac * fac;
+ variance[idx + 2 * pass_stride] *= fac * fac;
+ }
+ }
+ }
+ out[idx] = color.x;
+ out[idx + pass_stride] = color.y;
+ out[idx + 2 * pass_stride] = color.z;
}
/* Combine A/B buffers.
* Calculates the combined mean and the buffer variance. */
-ccl_device void kernel_filter_combine_halves(int x, int y,
+ccl_device void kernel_filter_combine_halves(int x,
+ int y,
ccl_global float *mean,
ccl_global float *variance,
ccl_global float *a,
ccl_global float *b,
- int4 rect, int r)
+ int4 rect,
+ int r)
{
- int buffer_w = align_up(rect.z - rect.x, 4);
- int idx = (y-rect.y)*buffer_w + (x - rect.x);
+ int buffer_w = align_up(rect.z - rect.x, 4);
+ int idx = (y - rect.y) * buffer_w + (x - rect.x);
- if(mean) mean[idx] = 0.5f * (a[idx]+b[idx]);
- if(variance) {
- if(r == 0) variance[idx] = 0.25f * (a[idx]-b[idx])*(a[idx]-b[idx]);
- else {
- variance[idx] = 0.0f;
- float values[25];
- int numValues = 0;
- for(int py = max(y-r, rect.y); py < min(y+r+1, rect.w); py++) {
- for(int px = max(x-r, rect.x); px < min(x+r+1, rect.z); px++) {
- int pidx = (py-rect.y)*buffer_w + (px-rect.x);
- values[numValues++] = 0.25f * (a[pidx]-b[pidx])*(a[pidx]-b[pidx]);
- }
- }
- /* Insertion-sort the variances (fast enough for 25 elements). */
- for(int i = 1; i < numValues; i++) {
- float v = values[i];
- int j;
- for(j = i-1; j >= 0 && values[j] > v; j--)
- values[j+1] = values[j];
- values[j+1] = v;
- }
- variance[idx] = values[(7*numValues)/8];
- }
- }
+ if (mean)
+ mean[idx] = 0.5f * (a[idx] + b[idx]);
+ if (variance) {
+ if (r == 0)
+ variance[idx] = 0.25f * (a[idx] - b[idx]) * (a[idx] - b[idx]);
+ else {
+ variance[idx] = 0.0f;
+ float values[25];
+ int numValues = 0;
+ for (int py = max(y - r, rect.y); py < min(y + r + 1, rect.w); py++) {
+ for (int px = max(x - r, rect.x); px < min(x + r + 1, rect.z); px++) {
+ int pidx = (py - rect.y) * buffer_w + (px - rect.x);
+ values[numValues++] = 0.25f * (a[pidx] - b[pidx]) * (a[pidx] - b[pidx]);
+ }
+ }
+ /* Insertion-sort the variances (fast enough for 25 elements). */
+ for (int i = 1; i < numValues; i++) {
+ float v = values[i];
+ int j;
+ for (j = i - 1; j >= 0 && values[j] > v; j--)
+ values[j + 1] = values[j];
+ values[j + 1] = v;
+ }
+ variance[idx] = values[(7 * numValues) / 8];
+ }
+ }
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h
index ceda8f71f98..850f20584da 100644
--- a/intern/cycles/kernel/filter/filter_reconstruction.h
+++ b/intern/cycles/kernel/filter/filter_reconstruction.h
@@ -16,63 +16,75 @@
CCL_NAMESPACE_BEGIN
-ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
+ccl_device_inline void kernel_filter_construct_gramian(int x,
+ int y,
int storage_stride,
- int dx, int dy, int t,
+ int dx,
+ int dy,
+ int t,
int buffer_stride,
int pass_stride,
int frame_offset,
bool use_time,
const ccl_global float *ccl_restrict buffer,
- const ccl_global float *ccl_restrict transform,
+ const ccl_global float *ccl_restrict
+ transform,
ccl_global int *rank,
float weight,
ccl_global float *XtWX,
ccl_global float3 *XtWY,
int localIdx)
{
- if(weight < 1e-3f) {
- return;
- }
+ if (weight < 1e-3f) {
+ return;
+ }
- int p_offset = y * buffer_stride + x;
- int q_offset = (y+dy) * buffer_stride + (x+dx) + frame_offset;
+ int p_offset = y * buffer_stride + x;
+ int q_offset = (y + dy) * buffer_stride + (x + dx) + frame_offset;
#ifdef __KERNEL_GPU__
- const int stride = storage_stride;
+ const int stride = storage_stride;
#else
- const int stride = 1;
- (void) storage_stride;
+ const int stride = 1;
+ (void)storage_stride;
#endif
#ifdef __KERNEL_CUDA__
- ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE];
- ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1);
+ ccl_local float shared_design_row[(DENOISE_FEATURES + 1) * CCL_MAX_LOCAL_SIZE];
+ ccl_local_param float *design_row = shared_design_row + localIdx * (DENOISE_FEATURES + 1);
#else
- float design_row[DENOISE_FEATURES+1];
+ float design_row[DENOISE_FEATURES + 1];
#endif
- float3 q_color = filter_get_color(buffer + q_offset, pass_stride);
+ float3 q_color = filter_get_color(buffer + q_offset, pass_stride);
- /* If the pixel was flagged as an outlier during prefiltering, skip it. */
- if(ccl_get_feature(buffer + q_offset, 0) < 0.0f) {
- return;
- }
+ /* If the pixel was flagged as an outlier during prefiltering, skip it. */
+ if (ccl_get_feature(buffer + q_offset, 0) < 0.0f) {
+ return;
+ }
- filter_get_design_row_transform(make_int3(x, y, t), buffer + p_offset,
- make_int3(x+dx, y+dy, t), buffer + q_offset,
- pass_stride, *rank, design_row, transform, stride, use_time);
+ filter_get_design_row_transform(make_int3(x, y, t),
+ buffer + p_offset,
+ make_int3(x + dx, y + dy, t),
+ buffer + q_offset,
+ pass_stride,
+ *rank,
+ design_row,
+ transform,
+ stride,
+ use_time);
#ifdef __KERNEL_GPU__
- math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride);
- math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride);
+ math_trimatrix_add_gramian_strided(XtWX, (*rank) + 1, design_row, weight, stride);
+ math_vec3_add_strided(XtWY, (*rank) + 1, design_row, weight * q_color, stride);
#else
- math_trimatrix_add_gramian(XtWX, (*rank)+1, design_row, weight);
- math_vec3_add(XtWY, (*rank)+1, design_row, weight * q_color);
+ math_trimatrix_add_gramian(XtWX, (*rank) + 1, design_row, weight);
+ math_vec3_add(XtWY, (*rank) + 1, design_row, weight * q_color);
#endif
}
-ccl_device_inline void kernel_filter_finalize(int x, int y,
+ccl_device_inline void kernel_filter_finalize(int x,
+ int y,
ccl_global float *buffer,
ccl_global int *rank,
int storage_stride,
@@ -82,47 +94,47 @@ ccl_device_inline void kernel_filter_finalize(int x, int y,
int sample)
{
#ifdef __KERNEL_GPU__
- const int stride = storage_stride;
+ const int stride = storage_stride;
#else
- const int stride = 1;
- (void) storage_stride;
+ const int stride = 1;
+ (void)storage_stride;
#endif
- if(XtWX[0] < 1e-3f) {
- /* There is not enough information to determine a denoised result.
- * As a fallback, keep the original value of the pixel. */
- return;
- }
-
- /* The weighted average of pixel colors (essentially, the NLM-filtered image).
- * In case the solution of the linear model fails due to numerical issues or
- * returns non-sensical negative values, fall back to this value. */
- float3 mean_color = XtWY[0]/XtWX[0];
-
- math_trimatrix_vec3_solve(XtWX, XtWY, (*rank)+1, stride);
-
- float3 final_color = XtWY[0];
- if(!isfinite3_safe(final_color) ||
- (final_color.x < -0.01f || final_color.y < -0.01f || final_color.z < -0.01f))
- {
- final_color = mean_color;
- }
-
- /* Clamp pixel value to positive values. */
- final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f));
-
- ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
- if(buffer_params.w >= 0) {
- final_color *= sample;
- if(buffer_params.w > 0) {
- final_color.x += combined_buffer[buffer_params.w+0];
- final_color.y += combined_buffer[buffer_params.w+1];
- final_color.z += combined_buffer[buffer_params.w+2];
- }
- }
- combined_buffer[0] = final_color.x;
- combined_buffer[1] = final_color.y;
- combined_buffer[2] = final_color.z;
+ if (XtWX[0] < 1e-3f) {
+ /* There is not enough information to determine a denoised result.
+ * As a fallback, keep the original value of the pixel. */
+ return;
+ }
+
+ /* The weighted average of pixel colors (essentially, the NLM-filtered image).
+ * In case the solution of the linear model fails due to numerical issues or
+ * returns non-sensical negative values, fall back to this value. */
+ float3 mean_color = XtWY[0] / XtWX[0];
+
+ math_trimatrix_vec3_solve(XtWX, XtWY, (*rank) + 1, stride);
+
+ float3 final_color = XtWY[0];
+ if (!isfinite3_safe(final_color) ||
+ (final_color.x < -0.01f || final_color.y < -0.01f || final_color.z < -0.01f)) {
+ final_color = mean_color;
+ }
+
+ /* Clamp pixel value to positive values. */
+ final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f));
+
+ ccl_global float *combined_buffer = buffer + (y * buffer_params.y + x + buffer_params.x) *
+ buffer_params.z;
+ if (buffer_params.w >= 0) {
+ final_color *= sample;
+ if (buffer_params.w > 0) {
+ final_color.x += combined_buffer[buffer_params.w + 0];
+ final_color.y += combined_buffer[buffer_params.w + 1];
+ final_color.z += combined_buffer[buffer_params.w + 2];
+ }
+ }
+ combined_buffer[0] = final_color.x;
+ combined_buffer[1] = final_color.y;
+ combined_buffer[2] = final_color.z;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h
index 94e27bb02fd..69e3c7c458d 100644
--- a/intern/cycles/kernel/filter/filter_transform.h
+++ b/intern/cycles/kernel/filter/filter_transform.h
@@ -18,92 +18,101 @@ CCL_NAMESPACE_BEGIN
ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
CCL_FILTER_TILE_INFO,
- int x, int y, int4 rect,
- int pass_stride, int frame_stride,
+ int x,
+ int y,
+ int4 rect,
+ int pass_stride,
+ int frame_stride,
bool use_time,
- float *transform, int *rank,
- int radius, float pca_threshold)
+ float *transform,
+ int *rank,
+ int radius,
+ float pca_threshold)
{
- int buffer_w = align_up(rect.z - rect.x, 4);
-
- float features[DENOISE_FEATURES];
-
- const float *ccl_restrict pixel_buffer;
- int3 pixel;
-
- int num_features = use_time? 11 : 10;
-
- /* === Calculate denoising window. === */
- int2 low = make_int2(max(rect.x, x - radius),
- max(rect.y, y - radius));
- int2 high = make_int2(min(rect.z, x + radius + 1),
- min(rect.w, y + radius + 1));
- int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
-
- /* === Shift feature passes to have mean 0. === */
- float feature_means[DENOISE_FEATURES];
- math_vector_zero(feature_means, num_features);
- FOR_PIXEL_WINDOW {
- filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride);
- math_vector_add(feature_means, features, num_features);
- } END_FOR_PIXEL_WINDOW
-
- math_vector_scale(feature_means, 1.0f / num_pixels, num_features);
-
- /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
- float feature_scale[DENOISE_FEATURES];
- math_vector_zero(feature_scale, num_features);
-
- FOR_PIXEL_WINDOW {
- filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_max(feature_scale, features, num_features);
- } END_FOR_PIXEL_WINDOW
-
- filter_calculate_scale(feature_scale, use_time);
-
- /* === Generate the feature transformation. ===
- * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space
- * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
- float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
- math_matrix_zero(feature_matrix, num_features);
- FOR_PIXEL_WINDOW {
- filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_mul(features, feature_scale, num_features);
- math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f);
- } END_FOR_PIXEL_WINDOW
-
- math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1);
- *rank = 0;
- /* Prevent overfitting when a small window is used. */
- int max_rank = min(num_features, num_pixels/3);
- if(pca_threshold < 0.0f) {
- float threshold_energy = 0.0f;
- for(int i = 0; i < num_features; i++) {
- threshold_energy += feature_matrix[i*num_features+i];
- }
- threshold_energy *= 1.0f - (-pca_threshold);
-
- float reduced_energy = 0.0f;
- for(int i = 0; i < max_rank; i++, (*rank)++) {
- if(i >= 2 && reduced_energy >= threshold_energy)
- break;
- float s = feature_matrix[i*num_features+i];
- reduced_energy += s;
- }
- }
- else {
- for(int i = 0; i < max_rank; i++, (*rank)++) {
- float s = feature_matrix[i*num_features+i];
- if(i >= 2 && sqrtf(s) < pca_threshold)
- break;
- }
- }
-
- /* Bake the feature scaling into the transformation matrix. */
- for(int i = 0; i < (*rank); i++) {
- math_vector_mul(transform + i*num_features, feature_scale, num_features);
- }
- math_matrix_transpose(transform, num_features, 1);
+ int buffer_w = align_up(rect.z - rect.x, 4);
+
+ float features[DENOISE_FEATURES];
+
+ const float *ccl_restrict pixel_buffer;
+ int3 pixel;
+
+ int num_features = use_time ? 11 : 10;
+
+ /* === Calculate denoising window. === */
+ int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius));
+ int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1));
+ int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
+
+ /* === Shift feature passes to have mean 0. === */
+ float feature_means[DENOISE_FEATURES];
+ math_vector_zero(feature_means, num_features);
+ FOR_PIXEL_WINDOW
+ {
+ filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride);
+ math_vector_add(feature_means, features, num_features);
+ }
+ END_FOR_PIXEL_WINDOW
+
+ math_vector_scale(feature_means, 1.0f / num_pixels, num_features);
+
+ /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
+ float feature_scale[DENOISE_FEATURES];
+ math_vector_zero(feature_scale, num_features);
+
+ FOR_PIXEL_WINDOW
+ {
+ filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
+ math_vector_max(feature_scale, features, num_features);
+ }
+ END_FOR_PIXEL_WINDOW
+
+ filter_calculate_scale(feature_scale, use_time);
+
+ /* === Generate the feature transformation. ===
+ * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space
+ * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
+ float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES];
+ math_matrix_zero(feature_matrix, num_features);
+ FOR_PIXEL_WINDOW
+ {
+ filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
+ math_vector_mul(features, feature_scale, num_features);
+ math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f);
+ }
+ END_FOR_PIXEL_WINDOW
+
+ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1);
+ *rank = 0;
+ /* Prevent overfitting when a small window is used. */
+ int max_rank = min(num_features, num_pixels / 3);
+ if (pca_threshold < 0.0f) {
+ float threshold_energy = 0.0f;
+ for (int i = 0; i < num_features; i++) {
+ threshold_energy += feature_matrix[i * num_features + i];
+ }
+ threshold_energy *= 1.0f - (-pca_threshold);
+
+ float reduced_energy = 0.0f;
+ for (int i = 0; i < max_rank; i++, (*rank)++) {
+ if (i >= 2 && reduced_energy >= threshold_energy)
+ break;
+ float s = feature_matrix[i * num_features + i];
+ reduced_energy += s;
+ }
+ }
+ else {
+ for (int i = 0; i < max_rank; i++, (*rank)++) {
+ float s = feature_matrix[i * num_features + i];
+ if (i >= 2 && sqrtf(s) < pca_threshold)
+ break;
+ }
+ }
+
+ /* Bake the feature scaling into the transformation matrix. */
+ for (int i = 0; i < (*rank); i++) {
+ math_vector_mul(transform + i * num_features, feature_scale, num_features);
+ }
+ math_matrix_transpose(transform, num_features, 1);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h
index ed8ddcb49b1..89cddfd927f 100644
--- a/intern/cycles/kernel/filter/filter_transform_gpu.h
+++ b/intern/cycles/kernel/filter/filter_transform_gpu.h
@@ -18,106 +18,110 @@ CCL_NAMESPACE_BEGIN
ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
CCL_FILTER_TILE_INFO,
- int x, int y, int4 rect,
- int pass_stride, int frame_stride,
+ int x,
+ int y,
+ int4 rect,
+ int pass_stride,
+ int frame_stride,
bool use_time,
ccl_global float *transform,
ccl_global int *rank,
- int radius, float pca_threshold,
- int transform_stride, int localIdx)
+ int radius,
+ float pca_threshold,
+ int transform_stride,
+ int localIdx)
{
- int buffer_w = align_up(rect.z - rect.x, 4);
+ int buffer_w = align_up(rect.z - rect.x, 4);
#ifdef __KERNEL_CUDA__
- ccl_local float shared_features[DENOISE_FEATURES*CCL_MAX_LOCAL_SIZE];
- ccl_local_param float *features = shared_features + localIdx*DENOISE_FEATURES;
+ ccl_local float shared_features[DENOISE_FEATURES * CCL_MAX_LOCAL_SIZE];
+ ccl_local_param float *features = shared_features + localIdx * DENOISE_FEATURES;
#else
- float features[DENOISE_FEATURES];
+ float features[DENOISE_FEATURES];
#endif
- int num_features = use_time? 11 : 10;
-
- /* === Calculate denoising window. === */
- int2 low = make_int2(max(rect.x, x - radius),
- max(rect.y, y - radius));
- int2 high = make_int2(min(rect.z, x + radius + 1),
- min(rect.w, y + radius + 1));
- int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
- const ccl_global float *ccl_restrict pixel_buffer;
- int3 pixel;
-
-
-
-
- /* === Shift feature passes to have mean 0. === */
- float feature_means[DENOISE_FEATURES];
- math_vector_zero(feature_means, num_features);
- FOR_PIXEL_WINDOW {
- filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride);
- math_vector_add(feature_means, features, num_features);
- } END_FOR_PIXEL_WINDOW
-
- math_vector_scale(feature_means, 1.0f / num_pixels, num_features);
-
- /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
- float feature_scale[DENOISE_FEATURES];
- math_vector_zero(feature_scale, num_features);
-
- FOR_PIXEL_WINDOW {
- filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_max(feature_scale, features, num_features);
- } END_FOR_PIXEL_WINDOW
-
- filter_calculate_scale(feature_scale, use_time);
-
-
-
- /* === Generate the feature transformation. ===
- * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space
- * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
- float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
- math_matrix_zero(feature_matrix, num_features);
- FOR_PIXEL_WINDOW {
- filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_mul(features, feature_scale, num_features);
- math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f);
- } END_FOR_PIXEL_WINDOW
-
- math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, transform_stride);
- *rank = 0;
- /* Prevent overfitting when a small window is used. */
- int max_rank = min(num_features, num_pixels/3);
- if(pca_threshold < 0.0f) {
- float threshold_energy = 0.0f;
- for(int i = 0; i < num_features; i++) {
- threshold_energy += feature_matrix[i*num_features+i];
- }
- threshold_energy *= 1.0f - (-pca_threshold);
-
- float reduced_energy = 0.0f;
- for(int i = 0; i < max_rank; i++, (*rank)++) {
- if(i >= 2 && reduced_energy >= threshold_energy)
- break;
- float s = feature_matrix[i*num_features+i];
- reduced_energy += s;
- }
- }
- else {
- for(int i = 0; i < max_rank; i++, (*rank)++) {
- float s = feature_matrix[i*num_features+i];
- if(i >= 2 && sqrtf(s) < pca_threshold)
- break;
- }
- }
-
- math_matrix_transpose(transform, num_features, transform_stride);
-
- /* Bake the feature scaling into the transformation matrix. */
- for(int i = 0; i < num_features; i++) {
- for(int j = 0; j < (*rank); j++) {
- transform[(i*num_features + j)*transform_stride] *= feature_scale[i];
- }
- }
+ int num_features = use_time ? 11 : 10;
+
+ /* === Calculate denoising window. === */
+ int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius));
+ int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1));
+ int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
+ const ccl_global float *ccl_restrict pixel_buffer;
+ int3 pixel;
+
+ /* === Shift feature passes to have mean 0. === */
+ float feature_means[DENOISE_FEATURES];
+ math_vector_zero(feature_means, num_features);
+ FOR_PIXEL_WINDOW
+ {
+ filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride);
+ math_vector_add(feature_means, features, num_features);
+ }
+ END_FOR_PIXEL_WINDOW
+
+ math_vector_scale(feature_means, 1.0f / num_pixels, num_features);
+
+ /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
+ float feature_scale[DENOISE_FEATURES];
+ math_vector_zero(feature_scale, num_features);
+
+ FOR_PIXEL_WINDOW
+ {
+ filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
+ math_vector_max(feature_scale, features, num_features);
+ }
+ END_FOR_PIXEL_WINDOW
+
+ filter_calculate_scale(feature_scale, use_time);
+
+ /* === Generate the feature transformation. ===
+ * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space
+ * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
+ float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES];
+ math_matrix_zero(feature_matrix, num_features);
+ FOR_PIXEL_WINDOW
+ {
+ filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
+ math_vector_mul(features, feature_scale, num_features);
+ math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f);
+ }
+ END_FOR_PIXEL_WINDOW
+
+ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, transform_stride);
+ *rank = 0;
+ /* Prevent overfitting when a small window is used. */
+ int max_rank = min(num_features, num_pixels / 3);
+ if (pca_threshold < 0.0f) {
+ float threshold_energy = 0.0f;
+ for (int i = 0; i < num_features; i++) {
+ threshold_energy += feature_matrix[i * num_features + i];
+ }
+ threshold_energy *= 1.0f - (-pca_threshold);
+
+ float reduced_energy = 0.0f;
+ for (int i = 0; i < max_rank; i++, (*rank)++) {
+ if (i >= 2 && reduced_energy >= threshold_energy)
+ break;
+ float s = feature_matrix[i * num_features + i];
+ reduced_energy += s;
+ }
+ }
+ else {
+ for (int i = 0; i < max_rank; i++, (*rank)++) {
+ float s = feature_matrix[i * num_features + i];
+ if (i >= 2 && sqrtf(s) < pca_threshold)
+ break;
+ }
+ }
+
+ math_matrix_transpose(transform, num_features, transform_stride);
+
+ /* Bake the feature scaling into the transformation matrix. */
+ for (int i = 0; i < num_features; i++) {
+ for (int j = 0; j < (*rank); j++) {
+ transform[(i * num_features + j) * transform_stride] *= feature_scale[i];
+ }
+ }
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h
index 10bd3e477e9..22397b292db 100644
--- a/intern/cycles/kernel/filter/filter_transform_sse.h
+++ b/intern/cycles/kernel/filter/filter_transform_sse.h
@@ -18,98 +18,110 @@ CCL_NAMESPACE_BEGIN
ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
CCL_FILTER_TILE_INFO,
- int x, int y, int4 rect,
- int pass_stride, int frame_stride,
+ int x,
+ int y,
+ int4 rect,
+ int pass_stride,
+ int frame_stride,
bool use_time,
- float *transform, int *rank,
- int radius, float pca_threshold)
+ float *transform,
+ int *rank,
+ int radius,
+ float pca_threshold)
{
- int buffer_w = align_up(rect.z - rect.x, 4);
-
- float4 features[DENOISE_FEATURES];
- const float *ccl_restrict pixel_buffer;
- int3 pixel;
-
- int num_features = use_time? 11 : 10;
-
- /* === Calculate denoising window. === */
- int2 low = make_int2(max(rect.x, x - radius),
- max(rect.y, y - radius));
- int2 high = make_int2(min(rect.z, x + radius + 1),
- min(rect.w, y + radius + 1));
- int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
-
- /* === Shift feature passes to have mean 0. === */
- float4 feature_means[DENOISE_FEATURES];
- math_vector_zero_sse(feature_means, num_features);
- FOR_PIXEL_WINDOW_SSE {
- filter_get_features_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, NULL, pass_stride);
- math_vector_add_sse(feature_means, num_features, features);
- } END_FOR_PIXEL_WINDOW_SSE
-
- float4 pixel_scale = make_float4(1.0f / num_pixels);
- for(int i = 0; i < num_features; i++) {
- feature_means[i] = reduce_add(feature_means[i]) * pixel_scale;
- }
-
- /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
- float4 feature_scale[DENOISE_FEATURES];
- math_vector_zero_sse(feature_scale, num_features);
- FOR_PIXEL_WINDOW_SSE {
- filter_get_feature_scales_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_max_sse(feature_scale, features, num_features);
- } END_FOR_PIXEL_WINDOW_SSE
-
- filter_calculate_scale_sse(feature_scale, use_time);
-
- /* === Generate the feature transformation. ===
- * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space
- * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
- float4 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
- math_matrix_zero_sse(feature_matrix_sse, num_features);
- FOR_PIXEL_WINDOW_SSE {
- filter_get_features_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_mul_sse(features, num_features, feature_scale);
- math_matrix_add_gramian_sse(feature_matrix_sse, num_features, features, make_float4(1.0f));
- } END_FOR_PIXEL_WINDOW_SSE
-
- float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
- math_matrix_hsum(feature_matrix, num_features, feature_matrix_sse);
-
- math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1);
-
- *rank = 0;
- /* Prevent overfitting when a small window is used. */
- int max_rank = min(num_features, num_pixels/3);
- if(pca_threshold < 0.0f) {
- float threshold_energy = 0.0f;
- for(int i = 0; i < num_features; i++) {
- threshold_energy += feature_matrix[i*num_features+i];
- }
- threshold_energy *= 1.0f - (-pca_threshold);
-
- float reduced_energy = 0.0f;
- for(int i = 0; i < max_rank; i++, (*rank)++) {
- if(i >= 2 && reduced_energy >= threshold_energy)
- break;
- float s = feature_matrix[i*num_features+i];
- reduced_energy += s;
- }
- }
- else {
- for(int i = 0; i < max_rank; i++, (*rank)++) {
- float s = feature_matrix[i*num_features+i];
- if(i >= 2 && sqrtf(s) < pca_threshold)
- break;
- }
- }
-
- math_matrix_transpose(transform, num_features, 1);
-
- /* Bake the feature scaling into the transformation matrix. */
- for(int i = 0; i < num_features; i++) {
- math_vector_scale(transform + i*num_features, feature_scale[i][0], *rank);
- }
+ int buffer_w = align_up(rect.z - rect.x, 4);
+
+ float4 features[DENOISE_FEATURES];
+ const float *ccl_restrict pixel_buffer;
+ int3 pixel;
+
+ int num_features = use_time ? 11 : 10;
+
+ /* === Calculate denoising window. === */
+ int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius));
+ int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1));
+ int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
+
+ /* === Shift feature passes to have mean 0. === */
+ float4 feature_means[DENOISE_FEATURES];
+ math_vector_zero_sse(feature_means, num_features);
+ FOR_PIXEL_WINDOW_SSE
+ {
+ filter_get_features_sse(
+ x4, y4, t4, active_pixels, pixel_buffer, features, use_time, NULL, pass_stride);
+ math_vector_add_sse(feature_means, num_features, features);
+ }
+ END_FOR_PIXEL_WINDOW_SSE
+
+ float4 pixel_scale = make_float4(1.0f / num_pixels);
+ for (int i = 0; i < num_features; i++) {
+ feature_means[i] = reduce_add(feature_means[i]) * pixel_scale;
+ }
+
+ /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
+ float4 feature_scale[DENOISE_FEATURES];
+ math_vector_zero_sse(feature_scale, num_features);
+ FOR_PIXEL_WINDOW_SSE
+ {
+ filter_get_feature_scales_sse(
+ x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride);
+ math_vector_max_sse(feature_scale, features, num_features);
+ }
+ END_FOR_PIXEL_WINDOW_SSE
+
+ filter_calculate_scale_sse(feature_scale, use_time);
+
+ /* === Generate the feature transformation. ===
+ * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space
+ * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
+ float4 feature_matrix_sse[DENOISE_FEATURES * DENOISE_FEATURES];
+ math_matrix_zero_sse(feature_matrix_sse, num_features);
+ FOR_PIXEL_WINDOW_SSE
+ {
+ filter_get_features_sse(
+ x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride);
+ math_vector_mul_sse(features, num_features, feature_scale);
+ math_matrix_add_gramian_sse(feature_matrix_sse, num_features, features, make_float4(1.0f));
+ }
+ END_FOR_PIXEL_WINDOW_SSE
+
+ float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES];
+ math_matrix_hsum(feature_matrix, num_features, feature_matrix_sse);
+
+ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1);
+
+ *rank = 0;
+ /* Prevent overfitting when a small window is used. */
+ int max_rank = min(num_features, num_pixels / 3);
+ if (pca_threshold < 0.0f) {
+ float threshold_energy = 0.0f;
+ for (int i = 0; i < num_features; i++) {
+ threshold_energy += feature_matrix[i * num_features + i];
+ }
+ threshold_energy *= 1.0f - (-pca_threshold);
+
+ float reduced_energy = 0.0f;
+ for (int i = 0; i < max_rank; i++, (*rank)++) {
+ if (i >= 2 && reduced_energy >= threshold_energy)
+ break;
+ float s = feature_matrix[i * num_features + i];
+ reduced_energy += s;
+ }
+ }
+ else {
+ for (int i = 0; i < max_rank; i++, (*rank)++) {
+ float s = feature_matrix[i * num_features + i];
+ if (i >= 2 && sqrtf(s) < pca_threshold)
+ break;
+ }
+ }
+
+ math_matrix_transpose(transform, num_features, 1);
+
+ /* Bake the feature scaling into the transformation matrix. */
+ for (int i = 0; i < num_features; i++) {
+ math_vector_scale(transform + i * num_features, feature_scale[i][0], *rank);
+ }
}
CCL_NAMESPACE_END