Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLukas Stockner <lukas.stockner@freenet.de>2019-02-06 16:19:20 +0300
committerBrecht Van Lommel <brechtvanlommel@gmail.com>2019-02-06 17:18:42 +0300
commitfccf506ed7fd96f8a8f5edda7b99f564a386321a (patch)
tree80a4d10012b13e1601011e5cf6d4771d0e382775 /intern/cycles/kernel
parentc183ac73dcfd20d0acf5ca07a2b062deadc4d73a (diff)
Cycles: animation denoising support in the kernel.
This is the internal implementation, not available from the API or interface yet. The algorithm takes into account past and future frames, both to get more coherent animation and reduce noise. Ref D3889.
Diffstat (limited to 'intern/cycles/kernel')
-rw-r--r--intern/cycles/kernel/filter/filter_defines.h6
-rw-r--r--intern/cycles/kernel/filter/filter_features.h77
-rw-r--r--intern/cycles/kernel/filter/filter_features_sse.h53
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_cpu.h13
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_gpu.h11
-rw-r--r--intern/cycles/kernel/filter/filter_reconstruction.h12
-rw-r--r--intern/cycles/kernel/filter/filter_transform.h59
-rw-r--r--intern/cycles/kernel/filter/filter_transform_gpu.h54
-rw-r--r--intern/cycles/kernel/filter/filter_transform_sse.h60
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_cpu.h9
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h19
-rw-r--r--intern/cycles/kernel/kernels/cuda/filter.cu25
-rw-r--r--intern/cycles/kernel/kernels/opencl/filter.cl20
13 files changed, 271 insertions, 147 deletions
diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h
index 9ac7c3db23d..cb04aac35f4 100644
--- a/intern/cycles/kernel/filter/filter_defines.h
+++ b/intern/cycles/kernel/filter/filter_defines.h
@@ -17,17 +17,21 @@
#ifndef __FILTER_DEFINES_H__
#define __FILTER_DEFINES_H__
-#define DENOISE_FEATURES 10
+#define DENOISE_FEATURES 11
#define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES)
#define XTWX_SIZE (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2)
#define XTWY_SIZE (DENOISE_FEATURES+1)
+#define DENOISE_MAX_FRAMES 16
+
typedef struct TileInfo {
int offsets[9];
int strides[9];
int x[4];
int y[4];
int from_render;
+ int frames[DENOISE_MAX_FRAMES];
+ int num_frames;
/* TODO(lukas): CUDA doesn't have uint64_t... */
#ifdef __KERNEL_OPENCL__
ccl_global float *buffers[9];
diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h
index 6226ed2c2ef..e1ea6487aa9 100644
--- a/intern/cycles/kernel/filter/filter_features.h
+++ b/intern/cycles/kernel/filter/filter_features.h
@@ -18,19 +18,23 @@
#define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride]
-/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).
- * pixel_buffer always points to the current pixel in the first pass. */
-#define FOR_PIXEL_WINDOW pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
- for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
- for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
+/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).+ * pixel_buffer always points to the current pixel in the first pass.
+ * Repeat the loop for every secondary frame if there are any. */
+#define FOR_PIXEL_WINDOW for(int frame = 0; frame < tile_info->num_frames; frame++) { \
+ pixel.z = tile_info->frames[frame]; \
+ pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x) + frame*frame_stride; \
+ for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
+ for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
-#define END_FOR_PIXEL_WINDOW } \
- pixel_buffer += buffer_w - (high.x - low.x); \
+#define END_FOR_PIXEL_WINDOW } \
+ pixel_buffer += buffer_w - (high.x - low.x); \
+ } \
}
-ccl_device_inline void filter_get_features(int2 pixel,
+ccl_device_inline void filter_get_features(int3 pixel,
const ccl_global float *ccl_restrict buffer,
float *features,
+ bool use_time,
const float *ccl_restrict mean,
int pass_stride)
{
@@ -44,15 +48,20 @@ ccl_device_inline void filter_get_features(int2 pixel,
features[7] = ccl_get_feature(buffer, 5);
features[8] = ccl_get_feature(buffer, 6);
features[9] = ccl_get_feature(buffer, 7);
+ if(use_time) {
+ features[10] = pixel.z;
+ }
if(mean) {
- for(int i = 0; i < DENOISE_FEATURES; i++)
+ for(int i = 0; i < (use_time? 11 : 10); i++) {
features[i] -= mean[i];
+ }
}
}
-ccl_device_inline void filter_get_feature_scales(int2 pixel,
+ccl_device_inline void filter_get_feature_scales(int3 pixel,
const ccl_global float *ccl_restrict buffer,
float *scales,
+ bool use_time,
const float *ccl_restrict mean,
int pass_stride)
{
@@ -66,13 +75,19 @@ ccl_device_inline void filter_get_feature_scales(int2 pixel,
scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7],
ccl_get_feature(buffer, 6) - mean[8],
ccl_get_feature(buffer, 7) - mean[9]));
+ if(use_time) {
+ scales[6] = fabsf(pixel.z - mean[10]);
+ }
}
-ccl_device_inline void filter_calculate_scale(float *scale)
+ccl_device_inline void filter_calculate_scale(float *scale, bool use_time)
{
scale[0] = 1.0f/max(scale[0], 0.01f);
scale[1] = 1.0f/max(scale[1], 0.01f);
scale[2] = 1.0f/max(scale[2], 0.01f);
+ if(use_time) {
+ scale[10] = 1.0f/max(scale[6], 0.01f);
+ }
scale[6] = 1.0f/max(scale[4], 0.01f);
scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f);
scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f);
@@ -89,36 +104,46 @@ ccl_device_inline void design_row_add(float *design_row,
const ccl_global float *ccl_restrict transform,
int stride,
int row,
- float feature)
+ float feature,
+ int transform_row_stride)
{
for(int i = 0; i < rank; i++) {
- design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature;
+ design_row[1+i] += transform[(row*transform_row_stride + i)*stride]*feature;
}
}
/* Fill the design row. */
-ccl_device_inline void filter_get_design_row_transform(int2 p_pixel,
+ccl_device_inline void filter_get_design_row_transform(int3 p_pixel,
const ccl_global float *ccl_restrict p_buffer,
- int2 q_pixel,
+ int3 q_pixel,
const ccl_global float *ccl_restrict q_buffer,
int pass_stride,
int rank,
float *design_row,
const ccl_global float *ccl_restrict transform,
- int stride)
+ int stride,
+ bool use_time)
{
+ int num_features = use_time? 11 : 10;
+
design_row[0] = 1.0f;
math_vector_zero(design_row+1, rank);
- design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x);
- design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y);
- design_row_add(design_row, rank, transform, stride, 2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0)));
- design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
- design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
- design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
- design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4));
- design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5));
- design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6));
- design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7));
+
+#define DESIGN_ROW_ADD(I, F) design_row_add(design_row, rank, transform, stride, I, F, num_features);
+ DESIGN_ROW_ADD(0, q_pixel.x - p_pixel.x);
+ DESIGN_ROW_ADD(1, q_pixel.y - p_pixel.y);
+ DESIGN_ROW_ADD(2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0)));
+ DESIGN_ROW_ADD(3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
+ DESIGN_ROW_ADD(4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
+ DESIGN_ROW_ADD(5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
+ DESIGN_ROW_ADD(6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4));
+ DESIGN_ROW_ADD(7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5));
+ DESIGN_ROW_ADD(8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6));
+ DESIGN_ROW_ADD(9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7));
+ if(use_time) {
+ DESIGN_ROW_ADD(10, q_pixel.z - p_pixel.z)
+ }
+#undef DESIGN_ROW_ADD
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h
index 3ddd8712266..5dd001ffb93 100644
--- a/intern/cycles/kernel/filter/filter_features_sse.h
+++ b/intern/cycles/kernel/filter/filter_features_sse.h
@@ -20,26 +20,33 @@ CCL_NAMESPACE_BEGIN
/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time.
* pixel_buffer always points to the first of the 4 current pixel in the first pass.
- * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. */
+ * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window.
+ * Repeat the loop for every secondary frame if there are any. */
+#define FOR_PIXEL_WINDOW_SSE for(int frame = 0; frame < tile_info->num_frames; frame++) { \
+ pixel.z = tile_info->frames[frame]; \
+ pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x) + frame*frame_stride; \
+ float4 t4 = make_float4(pixel.z); \
+ for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
+ float4 y4 = make_float4(pixel.y); \
+ for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
+ float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \
+ int4 active_pixels = x4 < make_float4(high.x);
-#define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
- for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
- float4 y4 = make_float4(pixel.y); \
- for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
- float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \
- int4 active_pixels = x4 < make_float4(high.x);
-
-#define END_FOR_PIXEL_WINDOW_SSE } \
- pixel_buffer += buffer_w - (pixel.x - low.x); \
+#define END_FOR_PIXEL_WINDOW_SSE } \
+ pixel_buffer += buffer_w - (high.x - low.x); \
+ } \
}
-ccl_device_inline void filter_get_features_sse(float4 x, float4 y,
+ccl_device_inline void filter_get_features_sse(float4 x, float4 y, float4 t,
int4 active_pixels,
const float *ccl_restrict buffer,
float4 *features,
+ bool use_time,
const float4 *ccl_restrict mean,
int pass_stride)
{
+ int num_features = use_time? 11 : 10;
+
features[0] = x;
features[1] = y;
features[2] = fabs(ccl_get_feature_sse(0));
@@ -50,18 +57,25 @@ ccl_device_inline void filter_get_features_sse(float4 x, float4 y,
features[7] = ccl_get_feature_sse(5);
features[8] = ccl_get_feature_sse(6);
features[9] = ccl_get_feature_sse(7);
+ if(use_time) {
+ features[10] = t;
+ }
+
if(mean) {
- for(int i = 0; i < DENOISE_FEATURES; i++)
+ for(int i = 0; i < num_features; i++) {
features[i] = features[i] - mean[i];
+ }
}
- for(int i = 0; i < DENOISE_FEATURES; i++)
+ for(int i = 0; i < num_features; i++) {
features[i] = mask(active_pixels, features[i]);
+ }
}
-ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y,
+ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, float4 t,
int4 active_pixels,
const float *ccl_restrict buffer,
float4 *scales,
+ bool use_time,
const float4 *ccl_restrict mean,
int pass_stride)
{
@@ -75,15 +89,22 @@ ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y,
scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) +
sqr(ccl_get_feature_sse(6) - mean[8]) +
sqr(ccl_get_feature_sse(7) - mean[9]);
- for(int i = 0; i < 6; i++)
+ if(use_time) {
+ scales[6] = fabs(t - mean[10]);
+ }
+
+ for(int i = 0; i < (use_time? 7 : 6); i++)
scales[i] = mask(active_pixels, scales[i]);
}
-ccl_device_inline void filter_calculate_scale_sse(float4 *scale)
+ccl_device_inline void filter_calculate_scale_sse(float4 *scale, bool use_time)
{
scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f)));
scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f)));
scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f)));
+ if(use_time) {
+ scale[10] = rcp(max(reduce_max(scale[6]), make_float4(0.01f)));;
+ }
scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f)));
scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f)));
scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f)));
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
index 0c4387af540..9eb3c603a4a 100644
--- a/intern/cycles/kernel/filter/filter_nlm_cpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h
@@ -27,6 +27,7 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
int4 rect,
int stride,
int channel_offset,
+ int frame_offset,
float a,
float k_2)
{
@@ -39,7 +40,7 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
for(int y = rect.y; y < rect.w; y++) {
int idx_p = y*stride + aligned_lowx;
- int idx_q = (y+dy)*stride + aligned_lowx + dx;
+ int idx_q = (y+dy)*stride + aligned_lowx + dx + frame_offset;
for(int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) {
float4 diff = make_float4(0.0f);
float4 scale_fac;
@@ -181,7 +182,7 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
}
}
-ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
+ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, int t,
const float *ccl_restrict difference_image,
const float *ccl_restrict buffer,
float *transform,
@@ -191,7 +192,9 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
int4 rect,
int4 filter_window,
int stride, int f,
- int pass_stride)
+ int pass_stride,
+ int frame_offset,
+ bool use_time)
{
int4 clip_area = rect_clip(rect, filter_window);
/* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
@@ -212,9 +215,11 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
int *l_rank = rank + storage_ofs;
kernel_filter_construct_gramian(x, y, 1,
- dx, dy,
+ dx, dy, t,
stride,
pass_stride,
+ frame_offset,
+ use_time,
buffer,
l_transform, l_rank,
weight, l_XtWX, l_XtWY, 0);
diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h
index d8e2e4d08aa..12636393243 100644
--- a/intern/cycles/kernel/filter/filter_nlm_gpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h
@@ -82,9 +82,10 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
ccl_global float *difference_image,
int4 rect, int stride,
int channel_offset,
+ int frame_offset,
float a, float k_2)
{
- int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx);
+ int idx_p = y*stride + x, idx_q = (y+dy)*stride + (x+dx) + frame_offset;
int numChannels = channel_offset? 3 : 1;
float diff = 0.0f;
@@ -170,7 +171,7 @@ ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
}
ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y,
- int dx, int dy,
+ int dx, int dy, int t,
const ccl_global float *ccl_restrict difference_image,
const ccl_global float *ccl_restrict buffer,
const ccl_global float *ccl_restrict transform,
@@ -181,6 +182,8 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y,
int4 filter_window,
int stride, int f,
int pass_stride,
+ int frame_offset,
+ bool use_time,
int localIdx)
{
const int low = max(rect.x, x-f);
@@ -201,9 +204,11 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y,
kernel_filter_construct_gramian(x, y,
rect_size(filter_window),
- dx, dy,
+ dx, dy, t,
stride,
pass_stride,
+ frame_offset,
+ use_time,
buffer,
transform, rank,
weight, XtWX, XtWY,
diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h
index e5d3b0da835..31a7487c77a 100644
--- a/intern/cycles/kernel/filter/filter_reconstruction.h
+++ b/intern/cycles/kernel/filter/filter_reconstruction.h
@@ -18,9 +18,11 @@ CCL_NAMESPACE_BEGIN
ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
int storage_stride,
- int dx, int dy,
+ int dx, int dy, int t,
int buffer_stride,
int pass_stride,
+ int frame_offset,
+ bool use_time,
const ccl_global float *ccl_restrict buffer,
const ccl_global float *ccl_restrict transform,
ccl_global int *rank,
@@ -34,7 +36,7 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
}
int p_offset = y * buffer_stride + x;
- int q_offset = (y+dy) * buffer_stride + (x+dx);
+ int q_offset = (y+dy) * buffer_stride + (x+dx) + frame_offset;
#ifdef __KERNEL_GPU__
const int stride = storage_stride;
@@ -57,9 +59,9 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
return;
}
- filter_get_design_row_transform(make_int2(x, y), buffer + p_offset,
- make_int2(x+dx, y+dy), buffer + q_offset,
- pass_stride, *rank, design_row, transform, stride);
+ filter_get_design_row_transform(make_int3(x, y, t), buffer + p_offset,
+ make_int3(x+dx, y+dy, t), buffer + q_offset,
+ pass_stride, *rank, design_row, transform, stride, use_time);
#ifdef __KERNEL_GPU__
math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride);
diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h
index a5f87c05ec0..94e27bb02fd 100644
--- a/intern/cycles/kernel/filter/filter_transform.h
+++ b/intern/cycles/kernel/filter/filter_transform.h
@@ -17,8 +17,10 @@
CCL_NAMESPACE_BEGIN
ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
+ CCL_FILTER_TILE_INFO,
int x, int y, int4 rect,
- int pass_stride,
+ int pass_stride, int frame_stride,
+ bool use_time,
float *transform, int *rank,
int radius, float pca_threshold)
{
@@ -26,59 +28,58 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
float features[DENOISE_FEATURES];
- /* Temporary storage, used in different steps of the algorithm. */
- float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES];
- float tempvector[2*DENOISE_FEATURES];
const float *ccl_restrict pixel_buffer;
- int2 pixel;
+ int3 pixel;
+
+ int num_features = use_time? 11 : 10;
/* === Calculate denoising window. === */
int2 low = make_int2(max(rect.x, x - radius),
max(rect.y, y - radius));
int2 high = make_int2(min(rect.z, x + radius + 1),
min(rect.w, y + radius + 1));
- int num_pixels = (high.y - low.y) * (high.x - low.x);
+ int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
/* === Shift feature passes to have mean 0. === */
float feature_means[DENOISE_FEATURES];
- math_vector_zero(feature_means, DENOISE_FEATURES);
+ math_vector_zero(feature_means, num_features);
FOR_PIXEL_WINDOW {
- filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
- math_vector_add(feature_means, features, DENOISE_FEATURES);
+ filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride);
+ math_vector_add(feature_means, features, num_features);
} END_FOR_PIXEL_WINDOW
- math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES);
+ math_vector_scale(feature_means, 1.0f / num_pixels, num_features);
/* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
- float *feature_scale = tempvector;
- math_vector_zero(feature_scale, DENOISE_FEATURES);
+ float feature_scale[DENOISE_FEATURES];
+ math_vector_zero(feature_scale, num_features);
FOR_PIXEL_WINDOW {
- filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
- math_vector_max(feature_scale, features, DENOISE_FEATURES);
+ filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
+ math_vector_max(feature_scale, features, num_features);
} END_FOR_PIXEL_WINDOW
- filter_calculate_scale(feature_scale);
+ filter_calculate_scale(feature_scale, use_time);
/* === Generate the feature transformation. ===
- * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space
+ * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space
* which generally has fewer dimensions. This mainly helps to prevent overfitting. */
- float* feature_matrix = tempmatrix;
- math_matrix_zero(feature_matrix, DENOISE_FEATURES);
+ float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
+ math_matrix_zero(feature_matrix, num_features);
FOR_PIXEL_WINDOW {
- filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
- math_vector_mul(features, feature_scale, DENOISE_FEATURES);
- math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+ filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
+ math_vector_mul(features, feature_scale, num_features);
+ math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f);
} END_FOR_PIXEL_WINDOW
- math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
+ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1);
*rank = 0;
/* Prevent overfitting when a small window is used. */
- int max_rank = min(DENOISE_FEATURES, num_pixels/3);
+ int max_rank = min(num_features, num_pixels/3);
if(pca_threshold < 0.0f) {
float threshold_energy = 0.0f;
- for(int i = 0; i < DENOISE_FEATURES; i++) {
- threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+ for(int i = 0; i < num_features; i++) {
+ threshold_energy += feature_matrix[i*num_features+i];
}
threshold_energy *= 1.0f - (-pca_threshold);
@@ -86,13 +87,13 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
for(int i = 0; i < max_rank; i++, (*rank)++) {
if(i >= 2 && reduced_energy >= threshold_energy)
break;
- float s = feature_matrix[i*DENOISE_FEATURES+i];
+ float s = feature_matrix[i*num_features+i];
reduced_energy += s;
}
}
else {
for(int i = 0; i < max_rank; i++, (*rank)++) {
- float s = feature_matrix[i*DENOISE_FEATURES+i];
+ float s = feature_matrix[i*num_features+i];
if(i >= 2 && sqrtf(s) < pca_threshold)
break;
}
@@ -100,9 +101,9 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
/* Bake the feature scaling into the transformation matrix. */
for(int i = 0; i < (*rank); i++) {
- math_vector_mul(transform + i*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES);
+ math_vector_mul(transform + i*num_features, feature_scale, num_features);
}
- math_matrix_transpose(transform, DENOISE_FEATURES, 1);
+ math_matrix_transpose(transform, num_features, 1);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h
index 83a1222bbdb..ed8ddcb49b1 100644
--- a/intern/cycles/kernel/filter/filter_transform_gpu.h
+++ b/intern/cycles/kernel/filter/filter_transform_gpu.h
@@ -17,8 +17,10 @@
CCL_NAMESPACE_BEGIN
ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
+ CCL_FILTER_TILE_INFO,
int x, int y, int4 rect,
- int pass_stride,
+ int pass_stride, int frame_stride,
+ bool use_time,
ccl_global float *transform,
ccl_global int *rank,
int radius, float pca_threshold,
@@ -33,60 +35,62 @@ ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_re
float features[DENOISE_FEATURES];
#endif
+ int num_features = use_time? 11 : 10;
+
/* === Calculate denoising window. === */
int2 low = make_int2(max(rect.x, x - radius),
max(rect.y, y - radius));
int2 high = make_int2(min(rect.z, x + radius + 1),
min(rect.w, y + radius + 1));
- int num_pixels = (high.y - low.y) * (high.x - low.x);
+ int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
const ccl_global float *ccl_restrict pixel_buffer;
- int2 pixel;
+ int3 pixel;
/* === Shift feature passes to have mean 0. === */
float feature_means[DENOISE_FEATURES];
- math_vector_zero(feature_means, DENOISE_FEATURES);
+ math_vector_zero(feature_means, num_features);
FOR_PIXEL_WINDOW {
- filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
- math_vector_add(feature_means, features, DENOISE_FEATURES);
+ filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride);
+ math_vector_add(feature_means, features, num_features);
} END_FOR_PIXEL_WINDOW
- math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES);
+ math_vector_scale(feature_means, 1.0f / num_pixels, num_features);
/* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
float feature_scale[DENOISE_FEATURES];
- math_vector_zero(feature_scale, DENOISE_FEATURES);
+ math_vector_zero(feature_scale, num_features);
FOR_PIXEL_WINDOW {
- filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
- math_vector_max(feature_scale, features, DENOISE_FEATURES);
+ filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
+ math_vector_max(feature_scale, features, num_features);
} END_FOR_PIXEL_WINDOW
- filter_calculate_scale(feature_scale);
+ filter_calculate_scale(feature_scale, use_time);
/* === Generate the feature transformation. ===
- * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space
+ * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space
* which generally has fewer dimensions. This mainly helps to prevent overfitting. */
float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
- math_matrix_zero(feature_matrix, DENOISE_FEATURES);
+ math_matrix_zero(feature_matrix, num_features);
FOR_PIXEL_WINDOW {
- filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
- math_vector_mul(features, feature_scale, DENOISE_FEATURES);
- math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+ filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
+ math_vector_mul(features, feature_scale, num_features);
+ math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f);
} END_FOR_PIXEL_WINDOW
- math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride);
+ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, transform_stride);
*rank = 0;
/* Prevent overfitting when a small window is used. */
- int max_rank = min(DENOISE_FEATURES, num_pixels/3);
+ int max_rank = min(num_features, num_pixels/3);
if(pca_threshold < 0.0f) {
float threshold_energy = 0.0f;
- for(int i = 0; i < DENOISE_FEATURES; i++) {
- threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+ for(int i = 0; i < num_features; i++) {
+ threshold_energy += feature_matrix[i*num_features+i];
}
threshold_energy *= 1.0f - (-pca_threshold);
@@ -94,24 +98,24 @@ ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_re
for(int i = 0; i < max_rank; i++, (*rank)++) {
if(i >= 2 && reduced_energy >= threshold_energy)
break;
- float s = feature_matrix[i*DENOISE_FEATURES+i];
+ float s = feature_matrix[i*num_features+i];
reduced_energy += s;
}
}
else {
for(int i = 0; i < max_rank; i++, (*rank)++) {
- float s = feature_matrix[i*DENOISE_FEATURES+i];
+ float s = feature_matrix[i*num_features+i];
if(i >= 2 && sqrtf(s) < pca_threshold)
break;
}
}
- math_matrix_transpose(transform, DENOISE_FEATURES, transform_stride);
+ math_matrix_transpose(transform, num_features, transform_stride);
/* Bake the feature scaling into the transformation matrix. */
- for(int i = 0; i < DENOISE_FEATURES; i++) {
+ for(int i = 0; i < num_features; i++) {
for(int j = 0; j < (*rank); j++) {
- transform[(i*DENOISE_FEATURES + j)*transform_stride] *= feature_scale[i];
+ transform[(i*num_features + j)*transform_stride] *= feature_scale[i];
}
}
}
diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h
index 9e65f61664b..10bd3e477e9 100644
--- a/intern/cycles/kernel/filter/filter_transform_sse.h
+++ b/intern/cycles/kernel/filter/filter_transform_sse.h
@@ -17,8 +17,10 @@
CCL_NAMESPACE_BEGIN
ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
+ CCL_FILTER_TILE_INFO,
int x, int y, int4 rect,
- int pass_stride,
+ int pass_stride, int frame_stride,
+ bool use_time,
float *transform, int *rank,
int radius, float pca_threshold)
{
@@ -26,55 +28,63 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
float4 features[DENOISE_FEATURES];
const float *ccl_restrict pixel_buffer;
- int2 pixel;
+ int3 pixel;
+ int num_features = use_time? 11 : 10;
+
+ /* === Calculate denoising window. === */
int2 low = make_int2(max(rect.x, x - radius),
max(rect.y, y - radius));
int2 high = make_int2(min(rect.z, x + radius + 1),
min(rect.w, y + radius + 1));
- int num_pixels = (high.y - low.y) * (high.x - low.x);
+ int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
+ /* === Shift feature passes to have mean 0. === */
float4 feature_means[DENOISE_FEATURES];
- math_vector_zero_sse(feature_means, DENOISE_FEATURES);
+ math_vector_zero_sse(feature_means, num_features);
FOR_PIXEL_WINDOW_SSE {
- filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride);
- math_vector_add_sse(feature_means, DENOISE_FEATURES, features);
+ filter_get_features_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, NULL, pass_stride);
+ math_vector_add_sse(feature_means, num_features, features);
} END_FOR_PIXEL_WINDOW_SSE
float4 pixel_scale = make_float4(1.0f / num_pixels);
- for(int i = 0; i < DENOISE_FEATURES; i++) {
+ for(int i = 0; i < num_features; i++) {
feature_means[i] = reduce_add(feature_means[i]) * pixel_scale;
}
+ /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
float4 feature_scale[DENOISE_FEATURES];
- math_vector_zero_sse(feature_scale, DENOISE_FEATURES);
+ math_vector_zero_sse(feature_scale, num_features);
FOR_PIXEL_WINDOW_SSE {
- filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
- math_vector_max_sse(feature_scale, features, DENOISE_FEATURES);
+ filter_get_feature_scales_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride);
+ math_vector_max_sse(feature_scale, features, num_features);
} END_FOR_PIXEL_WINDOW_SSE
- filter_calculate_scale_sse(feature_scale);
+ filter_calculate_scale_sse(feature_scale, use_time);
+ /* === Generate the feature transformation. ===
+ * This transformation maps the num_features-dimentional feature space to a reduced feature (r-feature) space
+ * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
float4 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
- math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES);
+ math_matrix_zero_sse(feature_matrix_sse, num_features);
FOR_PIXEL_WINDOW_SSE {
- filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
- math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale);
- math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, make_float4(1.0f));
+ filter_get_features_sse(x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride);
+ math_vector_mul_sse(features, num_features, feature_scale);
+ math_matrix_add_gramian_sse(feature_matrix_sse, num_features, features, make_float4(1.0f));
} END_FOR_PIXEL_WINDOW_SSE
float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
- math_matrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse);
+ math_matrix_hsum(feature_matrix, num_features, feature_matrix_sse);
- math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
+ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1);
*rank = 0;
/* Prevent overfitting when a small window is used. */
- int max_rank = min(DENOISE_FEATURES, num_pixels/3);
+ int max_rank = min(num_features, num_pixels/3);
if(pca_threshold < 0.0f) {
float threshold_energy = 0.0f;
- for(int i = 0; i < DENOISE_FEATURES; i++) {
- threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+ for(int i = 0; i < num_features; i++) {
+ threshold_energy += feature_matrix[i*num_features+i];
}
threshold_energy *= 1.0f - (-pca_threshold);
@@ -82,23 +92,23 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
for(int i = 0; i < max_rank; i++, (*rank)++) {
if(i >= 2 && reduced_energy >= threshold_energy)
break;
- float s = feature_matrix[i*DENOISE_FEATURES+i];
+ float s = feature_matrix[i*num_features+i];
reduced_energy += s;
}
}
else {
for(int i = 0; i < max_rank; i++, (*rank)++) {
- float s = feature_matrix[i*DENOISE_FEATURES+i];
+ float s = feature_matrix[i*num_features+i];
if(i >= 2 && sqrtf(s) < pca_threshold)
break;
}
}
- math_matrix_transpose(transform, DENOISE_FEATURES, 1);
+ math_matrix_transpose(transform, num_features, 1);
/* Bake the feature scaling into the transformation matrix. */
- for(int i = 0; i < DENOISE_FEATURES; i++) {
- math_vector_scale(transform + i*DENOISE_FEATURES, feature_scale[i][0], *rank);
+ for(int i = 0; i < num_features; i++) {
+ math_vector_scale(transform + i*num_features, feature_scale[i][0], *rank);
}
}
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
index 08333c7a455..02c85562db8 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
@@ -68,6 +68,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
int r);
void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
+ TileInfo *tiles,
int x,
int y,
int storage_ofs,
@@ -75,6 +76,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
int *rank,
int* rect,
int pass_stride,
+ int frame_stride,
+ bool use_time,
int radius,
float pca_threshold);
@@ -87,6 +90,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
int* rect,
int stride,
int channel_offset,
+ int frame_offset,
float a,
float k_2);
@@ -116,6 +120,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
int dy,
+ int t,
float *difference_image,
float *buffer,
float *transform,
@@ -126,7 +131,9 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
int *filter_window,
int stride,
int f,
- int pass_stride);
+ int pass_stride,
+ int frame_offset,
+ bool use_time);
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
float *accum_image,
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
index b792367e3ab..c29505880cb 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
@@ -135,6 +135,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
}
void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
+ TileInfo *tile_info,
int x,
int y,
int storage_ofs,
@@ -142,6 +143,8 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
int *rank,
int* prefilter_rect,
int pass_stride,
+ int frame_stride,
+ bool use_time,
int radius,
float pca_threshold)
{
@@ -151,9 +154,12 @@ void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
rank += storage_ofs;
transform += storage_ofs*TRANSFORM_SIZE;
kernel_filter_construct_transform(buffer,
+ tile_info,
x, y,
load_int4(prefilter_rect),
pass_stride,
+ frame_stride,
+ use_time,
transform,
rank,
radius,
@@ -170,6 +176,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
int *rect,
int stride,
int channel_offset,
+ int frame_offset,
float a,
float k_2)
{
@@ -184,6 +191,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
load_int4(rect),
stride,
channel_offset,
+ frame_offset,
a, k_2);
#endif
}
@@ -243,6 +251,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
int dy,
+ int t,
float *difference_image,
float *buffer,
float *transform,
@@ -253,12 +262,14 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
int *filter_window,
int stride,
int f,
- int pass_stride)
+ int pass_stride,
+ int frame_offset,
+ bool use_time)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
#else
- kernel_filter_nlm_construct_gramian(dx, dy,
+ kernel_filter_nlm_construct_gramian(dx, dy, t,
difference_image,
buffer,
transform, rank,
@@ -266,7 +277,9 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
load_int4(rect),
load_int4(filter_window),
stride, f,
- pass_stride);
+ pass_stride,
+ frame_offset,
+ use_time);
#endif
}
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu
index 3b51bb41aed..5b552b01413 100644
--- a/intern/cycles/kernel/kernels/cuda/filter.cu
+++ b/intern/cycles/kernel/kernels/cuda/filter.cu
@@ -29,7 +29,7 @@
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_divide_shadow(int sample,
- TileInfo *tile_info,
+ CCL_FILTER_TILE_INFO,
float *unfilteredA,
float *unfilteredB,
float *sampleVariance,
@@ -59,7 +59,7 @@ kernel_cuda_filter_divide_shadow(int sample,
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_get_feature(int sample,
- TileInfo *tile_info,
+ CCL_FILTER_TILE_INFO,
int m_offset,
int v_offset,
float *mean,
@@ -138,10 +138,12 @@ kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
+ CCL_FILTER_TILE_INFO,
float *transform, int *rank,
int4 filter_area, int4 rect,
int radius, float pca_threshold,
- int pass_stride)
+ int pass_stride, int frame_stride,
+ bool use_time)
{
int x = blockDim.x*blockIdx.x + threadIdx.x;
int y = blockDim.y*blockIdx.y + threadIdx.y;
@@ -149,8 +151,11 @@ kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
int *l_rank = rank + y*filter_area.z + x;
float *l_transform = transform + y*filter_area.z + x;
kernel_filter_construct_transform(buffer,
+ tile_info,
x + filter_area.x, y + filter_area.y,
- rect, pass_stride,
+ rect,
+ pass_stride, frame_stride,
+ use_time,
l_transform, l_rank,
radius, pca_threshold,
filter_area.z*filter_area.w,
@@ -170,6 +175,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
int pass_stride,
int r,
int channel_offset,
+ int frame_offset,
float a,
float k_2)
{
@@ -183,6 +189,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
difference_image + ofs,
rect, stride,
channel_offset,
+ frame_offset,
a, k_2);
}
}
@@ -274,7 +281,8 @@ kernel_cuda_filter_nlm_normalize(float *out_image,
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_image,
+kernel_cuda_filter_nlm_construct_gramian(int t,
+ const float *ccl_restrict difference_image,
const float *ccl_restrict buffer,
float const* __restrict__ transform,
int *rank,
@@ -286,13 +294,16 @@ kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_im
int stride,
int pass_stride,
int r,
- int f)
+ int f,
+ int frame_offset,
+ bool use_time)
{
int4 co, rect;
int ofs;
if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
kernel_filter_nlm_construct_gramian(co.x, co.y,
co.z, co.w,
+ t,
difference_image + ofs,
buffer,
transform, rank,
@@ -300,6 +311,8 @@ kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_im
rect, filter_window,
stride, f,
pass_stride,
+ frame_offset,
+ use_time,
threadIdx.y*blockDim.x + threadIdx.x);
}
}
diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl
index 8a821ee281d..996bc27f71b 100644
--- a/intern/cycles/kernel/kernels/opencl/filter.cl
+++ b/intern/cycles/kernel/kernels/opencl/filter.cl
@@ -127,11 +127,14 @@ __kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean,
}
__kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
+ CCL_FILTER_TILE_INFO,
ccl_global float *transform,
ccl_global int *rank,
int4 filter_area,
int4 rect,
int pass_stride,
+ int frame_stride,
+ char use_time,
int radius,
float pca_threshold)
{
@@ -141,8 +144,11 @@ __kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_
ccl_global int *l_rank = rank + y*filter_area.z + x;
ccl_global float *l_transform = transform + y*filter_area.z + x;
kernel_filter_construct_transform(buffer,
+ CCL_FILTER_TILE_INFO_ARG,
x + filter_area.x, y + filter_area.y,
- rect, pass_stride,
+ rect,
+ pass_stride, frame_stride,
+ use_time,
l_transform, l_rank,
radius, pca_threshold,
filter_area.z*filter_area.w,
@@ -160,6 +166,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_
int pass_stride,
int r,
int channel_offset,
+ int frame_offset,
float a,
float k_2)
{
@@ -173,6 +180,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_
difference_image + ofs,
rect, stride,
channel_offset,
+ frame_offset,
a, k_2);
}
}
@@ -254,7 +262,8 @@ __kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image,
}
}
-__kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *ccl_restrict difference_image,
+__kernel void kernel_ocl_filter_nlm_construct_gramian(int t,
+ const ccl_global float *ccl_restrict difference_image,
const ccl_global float *ccl_restrict buffer,
const ccl_global float *ccl_restrict transform,
ccl_global int *rank,
@@ -266,13 +275,16 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *cc
int stride,
int pass_stride,
int r,
- int f)
+ int f,
+ int frame_offset,
+ char use_time)
{
int4 co, rect;
int ofs;
if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
kernel_filter_nlm_construct_gramian(co.x, co.y,
co.z, co.w,
+ t,
difference_image + ofs,
buffer,
transform, rank,
@@ -280,6 +292,8 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *cc
rect, filter_window,
stride, f,
pass_stride,
+ frame_offset,
+ use_time,
get_local_id(1)*get_local_size(0) + get_local_id(0));
}
}