diff options
Diffstat (limited to 'intern/cycles/kernel/filter')
-rw-r--r-- | intern/cycles/kernel/filter/filter.h | 52 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_defines.h | 72 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_features.h | 156 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_features_sse.h | 118 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_kernel.h | 50 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_nlm_cpu.h | 254 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_nlm_gpu.h | 255 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_prefilter.h | 303 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_reconstruction.h | 140 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_transform.h | 120 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_transform_gpu.h | 129 | ||||
-rw-r--r-- | intern/cycles/kernel/filter/filter_transform_sse.h | 129 |
12 files changed, 0 insertions, 1778 deletions
diff --git a/intern/cycles/kernel/filter/filter.h b/intern/cycles/kernel/filter/filter.h deleted file mode 100644 index b067e53a8bf..00000000000 --- a/intern/cycles/kernel/filter/filter.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __FILTER_H__ -#define __FILTER_H__ - -/* CPU Filter Kernel Interface */ - -#include "util/util_types.h" - -#include "kernel/filter/filter_defines.h" - -CCL_NAMESPACE_BEGIN - -#define KERNEL_NAME_JOIN(x, y, z) x##_##y##_##z -#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name) -#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name) - -#define KERNEL_ARCH cpu -#include "kernel/kernels/cpu/filter_cpu.h" - -#define KERNEL_ARCH cpu_sse2 -#include "kernel/kernels/cpu/filter_cpu.h" - -#define KERNEL_ARCH cpu_sse3 -#include "kernel/kernels/cpu/filter_cpu.h" - -#define KERNEL_ARCH cpu_sse41 -#include "kernel/kernels/cpu/filter_cpu.h" - -#define KERNEL_ARCH cpu_avx -#include "kernel/kernels/cpu/filter_cpu.h" - -#define KERNEL_ARCH cpu_avx2 -#include "kernel/kernels/cpu/filter_cpu.h" - -CCL_NAMESPACE_END - -#endif /* __FILTER_H__ */ diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h deleted file mode 100644 index 1c0ac5e2cb7..00000000000 --- a/intern/cycles/kernel/filter/filter_defines.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __FILTER_DEFINES_H__ -#define __FILTER_DEFINES_H__ - -#define DENOISE_FEATURES 11 -#define TRANSFORM_SIZE (DENOISE_FEATURES * DENOISE_FEATURES) -#define XTWX_SIZE (((DENOISE_FEATURES + 1) * (DENOISE_FEATURES + 2)) / 2) -#define XTWY_SIZE (DENOISE_FEATURES + 1) - -#define DENOISE_MAX_FRAMES 16 - -typedef struct TileInfo { - int offsets[9]; - int strides[9]; - int x[4]; - int y[4]; - int from_render; - int frames[DENOISE_MAX_FRAMES]; - int num_frames; - /* TODO(lukas): CUDA doesn't have uint64_t... */ -#ifdef __KERNEL_OPENCL__ - ccl_global float *buffers[9]; -#else - long long int buffers[9]; -#endif -} TileInfo; - -#ifdef __KERNEL_OPENCL__ -# define CCL_FILTER_TILE_INFO \ - ccl_global TileInfo *tile_info, ccl_global float *tile_buffer_1, \ - ccl_global float *tile_buffer_2, ccl_global float *tile_buffer_3, \ - ccl_global float *tile_buffer_4, ccl_global float *tile_buffer_5, \ - ccl_global float *tile_buffer_6, ccl_global float *tile_buffer_7, \ - ccl_global float *tile_buffer_8, ccl_global float *tile_buffer_9 -# define CCL_FILTER_TILE_INFO_ARG \ - tile_info, tile_buffer_1, tile_buffer_2, tile_buffer_3, tile_buffer_4, tile_buffer_5, \ - tile_buffer_6, tile_buffer_7, tile_buffer_8, tile_buffer_9 -# define ccl_get_tile_buffer(id) \ - (id == 0 ? tile_buffer_1 : \ - id == 1 ? tile_buffer_2 : \ - id == 2 ? tile_buffer_3 : \ - id == 3 ? tile_buffer_4 : \ - id == 4 ? tile_buffer_5 : \ - id == 5 ? tile_buffer_6 : \ - id == 6 ? tile_buffer_7 : \ - id == 7 ? tile_buffer_8 : \ - tile_buffer_9) -#else -# ifdef __KERNEL_CUDA__ -# define CCL_FILTER_TILE_INFO ccl_global TileInfo *tile_info -# else -# define CCL_FILTER_TILE_INFO TileInfo *tile_info -# endif -# define ccl_get_tile_buffer(id) (tile_info->buffers[id]) -#endif - -#endif /* __FILTER_DEFINES_H__*/ diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h deleted file mode 100644 index 8a2af957146..00000000000 --- a/intern/cycles/kernel/filter/filter_features.h +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -#define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride] - -/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).+ * pixel_buffer always - * points to the current pixel in the first pass. Repeat the loop for every secondary frame if - * there are any. */ -#define FOR_PIXEL_WINDOW \ - for (int frame = 0; frame < tile_info->num_frames; frame++) { \ - pixel.z = tile_info->frames[frame]; \ - pixel_buffer = buffer + (low.y - rect.y) * buffer_w + (low.x - rect.x) + \ - frame * frame_stride; \ - for (pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ - for (pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) { - -#define END_FOR_PIXEL_WINDOW \ - } \ - pixel_buffer += buffer_w - (high.x - low.x); \ - } \ - } - -ccl_device_inline void filter_get_features(int3 pixel, - const ccl_global float *ccl_restrict buffer, - float *features, - bool use_time, - const float *ccl_restrict mean, - int pass_stride) -{ - features[0] = pixel.x; - features[1] = pixel.y; - features[2] = fabsf(ccl_get_feature(buffer, 0)); - features[3] = ccl_get_feature(buffer, 1); - features[4] = ccl_get_feature(buffer, 2); - features[5] = ccl_get_feature(buffer, 3); - features[6] = ccl_get_feature(buffer, 4); - features[7] = ccl_get_feature(buffer, 5); - features[8] = ccl_get_feature(buffer, 6); - features[9] = ccl_get_feature(buffer, 7); - if (use_time) { - features[10] = pixel.z; - } - if (mean) { - for (int i = 0; i < (use_time ? 11 : 10); i++) { - features[i] -= mean[i]; - } - } -} - -ccl_device_inline void filter_get_feature_scales(int3 pixel, - const ccl_global float *ccl_restrict buffer, - float *scales, - bool use_time, - const float *ccl_restrict mean, - int pass_stride) -{ - scales[0] = fabsf(pixel.x - mean[0]); - scales[1] = fabsf(pixel.y - mean[1]); - scales[2] = fabsf(fabsf(ccl_get_feature(buffer, 0)) - mean[2]); - scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3], - ccl_get_feature(buffer, 2) - mean[4], - ccl_get_feature(buffer, 3) - mean[5])); - scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]); - scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7], - ccl_get_feature(buffer, 6) - mean[8], - ccl_get_feature(buffer, 7) - mean[9])); - if (use_time) { - scales[6] = fabsf(pixel.z - mean[10]); - } -} - -ccl_device_inline void filter_calculate_scale(float *scale, bool use_time) -{ - scale[0] = 1.0f / max(scale[0], 0.01f); - scale[1] = 1.0f / max(scale[1], 0.01f); - scale[2] = 1.0f / max(scale[2], 0.01f); - if (use_time) { - scale[10] = 1.0f / max(scale[6], 0.01f); - } - scale[6] = 1.0f / max(scale[4], 0.01f); - scale[7] = scale[8] = scale[9] = 1.0f / max(sqrtf(scale[5]), 0.01f); - scale[3] = scale[4] = scale[5] = 1.0f / max(sqrtf(scale[3]), 0.01f); -} - -ccl_device_inline float3 filter_get_color(const ccl_global float *ccl_restrict buffer, - int pass_stride) -{ - return make_float3( - ccl_get_feature(buffer, 8), ccl_get_feature(buffer, 9), ccl_get_feature(buffer, 10)); -} - -ccl_device_inline void design_row_add(float *design_row, - int rank, - const ccl_global float *ccl_restrict transform, - int stride, - int row, - float feature, - int transform_row_stride) -{ - for (int i = 0; i < rank; i++) { - design_row[1 + i] += transform[(row * transform_row_stride + i) * stride] * feature; - } -} - -/* Fill the design row. */ -ccl_device_inline void filter_get_design_row_transform( - int3 p_pixel, - const ccl_global float *ccl_restrict p_buffer, - int3 q_pixel, - const ccl_global float *ccl_restrict q_buffer, - int pass_stride, - int rank, - float *design_row, - const ccl_global float *ccl_restrict transform, - int stride, - bool use_time) -{ - int num_features = use_time ? 11 : 10; - - design_row[0] = 1.0f; - math_vector_zero(design_row + 1, rank); - -#define DESIGN_ROW_ADD(I, F) \ - design_row_add(design_row, rank, transform, stride, I, F, num_features); - DESIGN_ROW_ADD(0, q_pixel.x - p_pixel.x); - DESIGN_ROW_ADD(1, q_pixel.y - p_pixel.y); - DESIGN_ROW_ADD(2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0))); - DESIGN_ROW_ADD(3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1)); - DESIGN_ROW_ADD(4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2)); - DESIGN_ROW_ADD(5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3)); - DESIGN_ROW_ADD(6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4)); - DESIGN_ROW_ADD(7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5)); - DESIGN_ROW_ADD(8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6)); - DESIGN_ROW_ADD(9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7)); - if (use_time) { - DESIGN_ROW_ADD(10, q_pixel.z - p_pixel.z) - } -#undef DESIGN_ROW_ADD -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h deleted file mode 100644 index 59d4ace2bef..00000000000 --- a/intern/cycles/kernel/filter/filter_features_sse.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -#define ccl_get_feature_sse(pass) load_float4(buffer + (pass)*pass_stride) - -/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time. - * pixel_buffer always points to the first of the 4 current pixel in the first pass. - * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set - * for all pixels within the window. Repeat the loop for every secondary frame if there are any. */ -#define FOR_PIXEL_WINDOW_SSE \ - for (int frame = 0; frame < tile_info->num_frames; frame++) { \ - pixel.z = tile_info->frames[frame]; \ - pixel_buffer = buffer + (low.y - rect.y) * buffer_w + (low.x - rect.x) + \ - frame * frame_stride; \ - float4 t4 = make_float4(pixel.z); \ - for (pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ - float4 y4 = make_float4(pixel.y); \ - for (pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \ - float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \ - int4 active_pixels = x4 < make_float4(high.x); - -#define END_FOR_PIXEL_WINDOW_SSE \ - } \ - pixel_buffer += buffer_w - (high.x - low.x); \ - } \ - } - -ccl_device_inline void filter_get_features_sse(float4 x, - float4 y, - float4 t, - int4 active_pixels, - const float *ccl_restrict buffer, - float4 *features, - bool use_time, - const float4 *ccl_restrict mean, - int pass_stride) -{ - int num_features = use_time ? 11 : 10; - - features[0] = x; - features[1] = y; - features[2] = fabs(ccl_get_feature_sse(0)); - features[3] = ccl_get_feature_sse(1); - features[4] = ccl_get_feature_sse(2); - features[5] = ccl_get_feature_sse(3); - features[6] = ccl_get_feature_sse(4); - features[7] = ccl_get_feature_sse(5); - features[8] = ccl_get_feature_sse(6); - features[9] = ccl_get_feature_sse(7); - if (use_time) { - features[10] = t; - } - - if (mean) { - for (int i = 0; i < num_features; i++) { - features[i] = features[i] - mean[i]; - } - } - for (int i = 0; i < num_features; i++) { - features[i] = mask(active_pixels, features[i]); - } -} - -ccl_device_inline void filter_get_feature_scales_sse(float4 x, - float4 y, - float4 t, - int4 active_pixels, - const float *ccl_restrict buffer, - float4 *scales, - bool use_time, - const float4 *ccl_restrict mean, - int pass_stride) -{ - scales[0] = fabs(x - mean[0]); - scales[1] = fabs(y - mean[1]); - scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]); - scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) + sqr(ccl_get_feature_sse(2) - mean[4]) + - sqr(ccl_get_feature_sse(3) - mean[5]); - scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]); - scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) + sqr(ccl_get_feature_sse(6) - mean[8]) + - sqr(ccl_get_feature_sse(7) - mean[9]); - if (use_time) { - scales[6] = fabs(t - mean[10]); - } - - for (int i = 0; i < (use_time ? 7 : 6); i++) - scales[i] = mask(active_pixels, scales[i]); -} - -ccl_device_inline void filter_calculate_scale_sse(float4 *scale, bool use_time) -{ - scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f))); - scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f))); - scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f))); - if (use_time) { - scale[10] = rcp(max(reduce_max(scale[6]), make_float4(0.01f))); - } - scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f))); - scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f))); - scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f))); -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_kernel.h b/intern/cycles/kernel/filter/filter_kernel.h deleted file mode 100644 index 2ef03dc0a02..00000000000 --- a/intern/cycles/kernel/filter/filter_kernel.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "util/util_color.h" -#include "util/util_math.h" -#include "util/util_math_fast.h" -#include "util/util_texture.h" - -#include "util/util_atomic.h" -#include "util/util_math_matrix.h" - -#include "kernel/filter/filter_defines.h" - -#include "kernel/filter/filter_features.h" -#ifdef __KERNEL_SSE3__ -# include "kernel/filter/filter_features_sse.h" -#endif - -#include "kernel/filter/filter_prefilter.h" - -#ifdef __KERNEL_GPU__ -# include "kernel/filter/filter_transform_gpu.h" -#else -# ifdef __KERNEL_SSE3__ -# include "kernel/filter/filter_transform_sse.h" -# else -# include "kernel/filter/filter_transform.h" -# endif -#endif - -#include "kernel/filter/filter_reconstruction.h" - -#ifdef __KERNEL_CPU__ -# include "kernel/filter/filter_nlm_cpu.h" -#else -# include "kernel/filter/filter_nlm_gpu.h" -#endif diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h deleted file mode 100644 index 24200c29203..00000000000 --- a/intern/cycles/kernel/filter/filter_nlm_cpu.h +++ /dev/null @@ -1,254 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -#define load4_a(buf, ofs) (*((float4 *)((buf) + (ofs)))) -#define load4_u(buf, ofs) load_float4((buf) + (ofs)) - -ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, - int dy, - const float *ccl_restrict weight_image, - const float *ccl_restrict variance_image, - const float *ccl_restrict scale_image, - float *difference_image, - int4 rect, - int stride, - int channel_offset, - int frame_offset, - float a, - float k_2) -{ - /* Strides need to be aligned to 16 bytes. */ - kernel_assert((stride % 4) == 0 && (channel_offset % 4) == 0); - - int aligned_lowx = rect.x & (~3); - const int numChannels = (channel_offset > 0) ? 3 : 1; - const float4 channel_fac = make_float4(1.0f / numChannels); - - for (int y = rect.y; y < rect.w; y++) { - int idx_p = y * stride + aligned_lowx; - int idx_q = (y + dy) * stride + aligned_lowx + dx + frame_offset; - for (int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) { - float4 diff = make_float4(0.0f); - float4 scale_fac; - if (scale_image) { - scale_fac = clamp(load4_a(scale_image, idx_p) / load4_u(scale_image, idx_q), - make_float4(0.25f), - make_float4(4.0f)); - } - else { - scale_fac = make_float4(1.0f); - } - for (int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) { - /* idx_p is guaranteed to be aligned, but idx_q isn't. */ - float4 color_p = load4_a(weight_image, idx_p + chan_ofs); - float4 color_q = scale_fac * load4_u(weight_image, idx_q + chan_ofs); - float4 cdiff = color_p - color_q; - float4 var_p = load4_a(variance_image, idx_p + chan_ofs); - float4 var_q = sqr(scale_fac) * load4_u(variance_image, idx_q + chan_ofs); - diff += (cdiff * cdiff - a * (var_p + min(var_p, var_q))) / - (make_float4(1e-8f) + k_2 * (var_p + var_q)); - } - load4_a(difference_image, idx_p) = diff * channel_fac; - } - } -} - -ccl_device_inline void kernel_filter_nlm_blur( - const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f) -{ - int aligned_lowx = round_down(rect.x, 4); - for (int y = rect.y; y < rect.w; y++) { - const int low = max(rect.y, y - f); - const int high = min(rect.w, y + f + 1); - for (int x = aligned_lowx; x < rect.z; x += 4) { - load4_a(out_image, y * stride + x) = make_float4(0.0f); - } - for (int y1 = low; y1 < high; y1++) { - for (int x = aligned_lowx; x < rect.z; x += 4) { - load4_a(out_image, y * stride + x) += load4_a(difference_image, y1 * stride + x); - } - } - float fac = 1.0f / (high - low); - for (int x = aligned_lowx; x < rect.z; x += 4) { - load4_a(out_image, y * stride + x) *= fac; - } - } -} - -ccl_device_inline void nlm_blur_horizontal( - const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f) -{ - int aligned_lowx = round_down(rect.x, 4); - for (int y = rect.y; y < rect.w; y++) { - for (int x = aligned_lowx; x < rect.z; x += 4) { - load4_a(out_image, y * stride + x) = make_float4(0.0f); - } - } - - for (int dx = -f; dx <= f; dx++) { - aligned_lowx = round_down(rect.x - min(0, dx), 4); - int highx = rect.z - max(0, dx); - int4 lowx4 = make_int4(rect.x - min(0, dx)); - int4 highx4 = make_int4(rect.z - max(0, dx)); - for (int y = rect.y; y < rect.w; y++) { - for (int x = aligned_lowx; x < highx; x += 4) { - int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3); - int4 active = (x4 >= lowx4) & (x4 < highx4); - - float4 diff = load4_u(difference_image, y * stride + x + dx); - load4_a(out_image, y * stride + x) += mask(active, diff); - } - } - } - - aligned_lowx = round_down(rect.x, 4); - for (int y = rect.y; y < rect.w; y++) { - for (int x = aligned_lowx; x < rect.z; x += 4) { - float4 x4 = make_float4(x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); - float4 low = max(make_float4(rect.x), x4 - make_float4(f)); - float4 high = min(make_float4(rect.z), x4 + make_float4(f + 1)); - load4_a(out_image, y * stride + x) *= rcp(high - low); - } - } -} - -ccl_device_inline void kernel_filter_nlm_calc_weight( - const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f) -{ - nlm_blur_horizontal(difference_image, out_image, rect, stride, f); - - int aligned_lowx = round_down(rect.x, 4); - for (int y = rect.y; y < rect.w; y++) { - for (int x = aligned_lowx; x < rect.z; x += 4) { - load4_a(out_image, y * stride + x) = fast_expf4( - -max(load4_a(out_image, y * stride + x), make_float4(0.0f))); - } - } -} - -ccl_device_inline void kernel_filter_nlm_update_output(int dx, - int dy, - const float *ccl_restrict difference_image, - const float *ccl_restrict image, - float *temp_image, - float *out_image, - float *accum_image, - int4 rect, - int channel_offset, - int stride, - int f) -{ - nlm_blur_horizontal(difference_image, temp_image, rect, stride, f); - - int aligned_lowx = round_down(rect.x, 4); - for (int y = rect.y; y < rect.w; y++) { - for (int x = aligned_lowx; x < rect.z; x += 4) { - int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3); - int4 active = (x4 >= make_int4(rect.x)) & (x4 < make_int4(rect.z)); - - int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx); - - float4 weight = load4_a(temp_image, idx_p); - load4_a(accum_image, idx_p) += mask(active, weight); - - float4 val = load4_u(image, idx_q); - if (channel_offset) { - val += load4_u(image, idx_q + channel_offset); - val += load4_u(image, idx_q + 2 * channel_offset); - val *= 1.0f / 3.0f; - } - - load4_a(out_image, idx_p) += mask(active, weight * val); - } - } -} - -ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, - int dy, - int t, - const float *ccl_restrict - difference_image, - const float *ccl_restrict buffer, - float *transform, - int *rank, - float *XtWX, - float3 *XtWY, - int4 rect, - int4 filter_window, - int stride, - int f, - int pass_stride, - int frame_offset, - bool use_time) -{ - int4 clip_area = rect_clip(rect, filter_window); - /* fy and fy are in filter-window-relative coordinates, - * while x and y are in feature-window-relative coordinates. */ - for (int y = clip_area.y; y < clip_area.w; y++) { - for (int x = clip_area.x; x < clip_area.z; x++) { - const int low = max(rect.x, x - f); - const int high = min(rect.z, x + f + 1); - float sum = 0.0f; - for (int x1 = low; x1 < high; x1++) { - sum += difference_image[y * stride + x1]; - } - float weight = sum * (1.0f / (high - low)); - - int storage_ofs = coord_to_local_index(filter_window, x, y); - float *l_transform = transform + storage_ofs * TRANSFORM_SIZE; - float *l_XtWX = XtWX + storage_ofs * XTWX_SIZE; - float3 *l_XtWY = XtWY + storage_ofs * XTWY_SIZE; - int *l_rank = rank + storage_ofs; - - kernel_filter_construct_gramian(x, - y, - 1, - dx, - dy, - t, - stride, - pass_stride, - frame_offset, - use_time, - buffer, - l_transform, - l_rank, - weight, - l_XtWX, - l_XtWY, - 0); - } - } -} - -ccl_device_inline void kernel_filter_nlm_normalize(float *out_image, - const float *ccl_restrict accum_image, - int4 rect, - int w) -{ - for (int y = rect.y; y < rect.w; y++) { - for (int x = rect.x; x < rect.z; x++) { - out_image[y * w + x] /= accum_image[y * w + x]; - } - } -} - -#undef load4_a -#undef load4_u - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h deleted file mode 100644 index 650c743f34f..00000000000 --- a/intern/cycles/kernel/filter/filter_nlm_gpu.h +++ /dev/null @@ -1,255 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* Determines pixel coordinates and offset for the current thread. - * Returns whether the thread should do any work. - * - * All coordinates are relative to the denoising buffer! - * - * Window is the rect that should be processed. - * co is filled with (x, y, dx, dy). - */ -ccl_device_inline bool get_nlm_coords_window( - int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs, int4 window) -{ - /* Determine the pixel offset that this thread should apply. */ - int s = 2 * r + 1; - int si = ccl_global_id(1); - int sx = si % s; - int sy = si / s; - if (sy >= s) { - return false; - } - - /* Pixels still need to lie inside the denoising buffer after applying the offset, - * so determine the area for which this is the case. */ - int dx = sx - r; - int dy = sy - r; - - *rect = make_int4(max(0, -dx), max(0, -dy), w - max(0, dx), h - max(0, dy)); - - /* Find the intersection of the area that we want to process (window) and the area - * that can be processed (rect) to get the final area for this offset. */ - int4 clip_area = rect_clip(window, *rect); - - /* If the radius is larger than one of the sides of the window, - * there will be shifts for which there is no usable pixel at all. */ - if (!rect_is_valid(clip_area)) { - return false; - } - - /* Map the linear thread index to pixels inside the clip area. */ - int x, y; - if (!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) { - return false; - } - - *co = make_int4(x, y, dx, dy); - - *ofs = (sy * s + sx) * stride; - - return true; -} - -ccl_device_inline bool get_nlm_coords( - int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs) -{ - return get_nlm_coords_window(w, h, r, stride, rect, co, ofs, make_int4(0, 0, w, h)); -} - -ccl_device_inline void kernel_filter_nlm_calc_difference( - int x, - int y, - int dx, - int dy, - const ccl_global float *ccl_restrict weight_image, - const ccl_global float *ccl_restrict variance_image, - const ccl_global float *ccl_restrict scale_image, - ccl_global float *difference_image, - int4 rect, - int stride, - int channel_offset, - int frame_offset, - float a, - float k_2) -{ - int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx) + frame_offset; - int numChannels = channel_offset ? 3 : 1; - - float diff = 0.0f; - float scale_fac = 1.0f; - if (scale_image) { - scale_fac = clamp(scale_image[idx_p] / scale_image[idx_q], 0.25f, 4.0f); - } - - for (int c = 0; c < numChannels; c++, idx_p += channel_offset, idx_q += channel_offset) { - float cdiff = weight_image[idx_p] - scale_fac * weight_image[idx_q]; - float pvar = variance_image[idx_p]; - float qvar = sqr(scale_fac) * variance_image[idx_q]; - diff += (cdiff * cdiff - a * (pvar + min(pvar, qvar))) / (1e-8f + k_2 * (pvar + qvar)); - } - if (numChannels > 1) { - diff *= 1.0f / numChannels; - } - difference_image[y * stride + x] = diff; -} - -ccl_device_inline void kernel_filter_nlm_blur(int x, - int y, - const ccl_global float *ccl_restrict - difference_image, - ccl_global float *out_image, - int4 rect, - int stride, - int f) -{ - float sum = 0.0f; - const int low = max(rect.y, y - f); - const int high = min(rect.w, y + f + 1); - for (int y1 = low; y1 < high; y1++) { - sum += difference_image[y1 * stride + x]; - } - sum *= 1.0f / (high - low); - out_image[y * stride + x] = sum; -} - -ccl_device_inline void kernel_filter_nlm_calc_weight(int x, - int y, - const ccl_global float *ccl_restrict - difference_image, - ccl_global float *out_image, - int4 rect, - int stride, - int f) -{ - float sum = 0.0f; - const int low = max(rect.x, x - f); - const int high = min(rect.z, x + f + 1); - for (int x1 = low; x1 < high; x1++) { - sum += difference_image[y * stride + x1]; - } - sum *= 1.0f / (high - low); - out_image[y * stride + x] = fast_expf(-max(sum, 0.0f)); -} - -ccl_device_inline void kernel_filter_nlm_update_output(int x, - int y, - int dx, - int dy, - const ccl_global float *ccl_restrict - difference_image, - const ccl_global float *ccl_restrict image, - ccl_global float *out_image, - ccl_global float *accum_image, - int4 rect, - int channel_offset, - int stride, - int f) -{ - float sum = 0.0f; - const int low = max(rect.x, x - f); - const int high = min(rect.z, x + f + 1); - for (int x1 = low; x1 < high; x1++) { - sum += difference_image[y * stride + x1]; - } - sum *= 1.0f / (high - low); - - int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx); - if (out_image) { - atomic_add_and_fetch_float(accum_image + idx_p, sum); - - float val = image[idx_q]; - if (channel_offset) { - val += image[idx_q + channel_offset]; - val += image[idx_q + 2 * channel_offset]; - val *= 1.0f / 3.0f; - } - atomic_add_and_fetch_float(out_image + idx_p, sum * val); - } - else { - accum_image[idx_p] = sum; - } -} - -ccl_device_inline void kernel_filter_nlm_construct_gramian( - int x, - int y, - int dx, - int dy, - int t, - const ccl_global float *ccl_restrict difference_image, - const ccl_global float *ccl_restrict buffer, - const ccl_global float *ccl_restrict transform, - ccl_global int *rank, - ccl_global float *XtWX, - ccl_global float3 *XtWY, - int4 rect, - int4 filter_window, - int stride, - int f, - int pass_stride, - int frame_offset, - bool use_time, - int localIdx) -{ - const int low = max(rect.x, x - f); - const int high = min(rect.z, x + f + 1); - float sum = 0.0f; - for (int x1 = low; x1 < high; x1++) { - sum += difference_image[y * stride + x1]; - } - float weight = sum * (1.0f / (high - low)); - - /* Reconstruction data is only stored for pixels inside the filter window, - * so compute the pixels's index in there. */ - int storage_ofs = coord_to_local_index(filter_window, x, y); - transform += storage_ofs; - rank += storage_ofs; - XtWX += storage_ofs; - XtWY += storage_ofs; - - kernel_filter_construct_gramian(x, - y, - rect_size(filter_window), - dx, - dy, - t, - stride, - pass_stride, - frame_offset, - use_time, - buffer, - transform, - rank, - weight, - XtWX, - XtWY, - localIdx); -} - -ccl_device_inline void kernel_filter_nlm_normalize(int x, - int y, - ccl_global float *out_image, - const ccl_global float *ccl_restrict - accum_image, - int stride) -{ - out_image[y * stride + x] /= accum_image[y * stride + x]; -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h deleted file mode 100644 index 97cecba190e..00000000000 --- a/intern/cycles/kernel/filter/filter_prefilter.h +++ /dev/null @@ -1,303 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/** - * First step of the shadow prefiltering, performs the shadow division and stores all data - * in a nice and easy rectangular array that can be passed to the NLM filter. - * - * Calculates: - * \param unfiltered: Contains the two half images of the shadow feature pass - * \param sampleVariance: The sample-based variance calculated in the kernel. - * Note: This calculation is biased in general, - * and especially here since the variance of the ratio can only be approximated. - * \param sampleVarianceV: Variance of the sample variance estimation, quite noisy - * (since it's essentially the buffer variance of the two variance halves) - * \param bufferVariance: The buffer-based variance of the shadow feature. - * Unbiased, but quite noisy. - */ -ccl_device void kernel_filter_divide_shadow(int sample, - CCL_FILTER_TILE_INFO, - int x, - int y, - ccl_global float *unfilteredA, - ccl_global float *unfilteredB, - ccl_global float *sampleVariance, - ccl_global float *sampleVarianceV, - ccl_global float *bufferVariance, - int4 rect, - int buffer_pass_stride, - int buffer_denoising_offset) -{ - int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2); - int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2); - int tile = ytile * 3 + xtile; - - int offset = tile_info->offsets[tile]; - int stride = tile_info->strides[tile]; - const ccl_global float *ccl_restrict center_buffer = (ccl_global float *)ccl_get_tile_buffer( - tile); - center_buffer += (y * stride + x + offset) * buffer_pass_stride; - center_buffer += buffer_denoising_offset + 14; - - int buffer_w = align_up(rect.z - rect.x, 4); - int idx = (y - rect.y) * buffer_w + (x - rect.x); - unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f); - unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f); - - float varA = center_buffer[2]; - float varB = center_buffer[5]; - int odd_sample = (sample + 1) / 2; - int even_sample = sample / 2; - - /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance - * update does not work efficiently with atomics in the kernel. */ - varA = max(0.0f, varA - unfilteredA[idx] * unfilteredA[idx] * odd_sample); - varB = max(0.0f, varB - unfilteredB[idx] * unfilteredB[idx] * even_sample); - - varA /= max(odd_sample - 1, 1); - varB /= max(even_sample - 1, 1); - - sampleVariance[idx] = 0.5f * (varA + varB) / sample; - sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample * sample); - bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) * - (unfilteredA[idx] - unfilteredB[idx]); -} - -/* Load a regular feature from the render buffers into the denoise buffer. - * Parameters: - * - sample: The sample amount in the buffer, used to normalize the buffer. - * - m_offset, v_offset: Render Buffer Pass offsets of mean and variance of the feature. - * - x, y: Current pixel - * - mean, variance: Target denoise buffers. - * - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive). - */ -ccl_device void kernel_filter_get_feature(int sample, - CCL_FILTER_TILE_INFO, - int m_offset, - int v_offset, - int x, - int y, - ccl_global float *mean, - ccl_global float *variance, - float scale, - int4 rect, - int buffer_pass_stride, - int buffer_denoising_offset) -{ - int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2); - int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2); - int tile = ytile * 3 + xtile; - ccl_global float *center_buffer = ((ccl_global float *)ccl_get_tile_buffer(tile)) + - (tile_info->offsets[tile] + y * tile_info->strides[tile] + x) * - buffer_pass_stride + - buffer_denoising_offset; - - int buffer_w = align_up(rect.z - rect.x, 4); - int idx = (y - rect.y) * buffer_w + (x - rect.x); - - float val = scale * center_buffer[m_offset]; - mean[idx] = val; - - if (v_offset >= 0) { - if (sample > 1) { - /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance - * update does not work efficiently with atomics in the kernel. */ - variance[idx] = max( - 0.0f, (center_buffer[v_offset] - val * val * sample) / (sample * (sample - 1))); - } - else { - /* Can't compute variance with single sample, just set it very high. */ - variance[idx] = 1e10f; - } - } -} - -ccl_device void kernel_filter_write_feature(int sample, - int x, - int y, - int4 buffer_params, - ccl_global float *from, - ccl_global float *buffer, - int out_offset, - int4 rect) -{ - ccl_global float *combined_buffer = buffer + (y * buffer_params.y + x + buffer_params.x) * - buffer_params.z; - - int buffer_w = align_up(rect.z - rect.x, 4); - int idx = (y - rect.y) * buffer_w + (x - rect.x); - - combined_buffer[out_offset] = from[idx]; -} - -#define GET_COLOR(image) \ - make_float3(image[idx], image[idx + pass_stride], image[idx + 2 * pass_stride]) -#define SET_COLOR(image, color) \ - image[idx] = color.x; \ - image[idx + pass_stride] = color.y; \ - image[idx + 2 * pass_stride] = color.z - -ccl_device void kernel_filter_detect_outliers(int x, - int y, - ccl_global float *in, - ccl_global float *variance_out, - ccl_global float *depth, - ccl_global float *image_out, - int4 rect, - int pass_stride) -{ - int buffer_w = align_up(rect.z - rect.x, 4); - - ccl_global float *image_in = in; - ccl_global float *variance_in = in + 3 * pass_stride; - - int n = 0; - float values[25]; - float pixel_variance, max_variance = 0.0f; - for (int y1 = max(y - 2, rect.y); y1 < min(y + 3, rect.w); y1++) { - for (int x1 = max(x - 2, rect.x); x1 < min(x + 3, rect.z); x1++) { - int idx = (y1 - rect.y) * buffer_w + (x1 - rect.x); - float3 color = GET_COLOR(image_in); - color = max(color, make_float3(0.0f, 0.0f, 0.0f)); - float L = average(color); - - /* Find the position of L. */ - int i; - for (i = 0; i < n; i++) { - if (values[i] > L) - break; - } - /* Make space for L by shifting all following values to the right. */ - for (int j = n; j > i; j--) { - values[j] = values[j - 1]; - } - /* Insert L. */ - values[i] = L; - n++; - - float3 pixel_var = GET_COLOR(variance_in); - float var = average(pixel_var); - if ((x1 == x) && (y1 == y)) { - pixel_variance = (pixel_var.x < 0.0f || pixel_var.y < 0.0f || pixel_var.z < 0.0f) ? -1.0f : - var; - } - else { - max_variance = max(max_variance, var); - } - } - } - - max_variance += 1e-4f; - - int idx = (y - rect.y) * buffer_w + (x - rect.x); - - float3 color = GET_COLOR(image_in); - float3 variance = GET_COLOR(variance_in); - color = max(color, make_float3(0.0f, 0.0f, 0.0f)); - variance = max(variance, make_float3(0.0f, 0.0f, 0.0f)); - - float L = average(color); - - float ref = 2.0f * values[(int)(n * 0.75f)]; - - /* Slightly offset values to avoid false positives in (almost) black areas. */ - max_variance += 1e-5f; - ref -= 1e-5f; - - if (L > ref) { - /* The pixel appears to be an outlier. - * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is - * that the pixel should actually be at the reference value: If the reference is within the - * 3-sigma interval, the pixel is assumed to be a statistical outlier. Otherwise, it is very - * unlikely that the pixel should be darker, which indicates a legitimate highlight. - */ - - if (pixel_variance < 0.0f || pixel_variance > 9.0f * max_variance) { - depth[idx] = -depth[idx]; - color *= ref / L; - variance = make_float3(max_variance, max_variance, max_variance); - } - else { - float stddev = sqrtf(pixel_variance); - if (L - 3 * stddev < ref) { - /* The pixel is an outlier, so negate the depth value to mark it as one. - * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM - * weights. */ - depth[idx] = -depth[idx]; - float fac = ref / L; - color *= fac; - variance *= sqr(fac); - } - } - } - - /* Apply log(1+x) transform to compress highlights and avoid halos in the denoised results. - * Variance is transformed accordingly - the derivative of the transform is 1/(1+x), so we - * scale by the square of that (since we have variance instead of standard deviation). */ - color = color_highlight_compress(color, &variance); - - SET_COLOR(image_out, color); - SET_COLOR(variance_out, variance); -} - -#undef GET_COLOR -#undef SET_COLOR - -/* Combine A/B buffers. - * Calculates the combined mean and the buffer variance. */ -ccl_device void kernel_filter_combine_halves(int x, - int y, - ccl_global float *mean, - ccl_global float *variance, - ccl_global float *a, - ccl_global float *b, - int4 rect, - int r) -{ - int buffer_w = align_up(rect.z - rect.x, 4); - int idx = (y - rect.y) * buffer_w + (x - rect.x); - - if (mean) - mean[idx] = 0.5f * (a[idx] + b[idx]); - if (variance) { - if (r == 0) - variance[idx] = 0.25f * (a[idx] - b[idx]) * (a[idx] - b[idx]); - else { - variance[idx] = 0.0f; - float values[25]; - int numValues = 0; - for (int py = max(y - r, rect.y); py < min(y + r + 1, rect.w); py++) { - for (int px = max(x - r, rect.x); px < min(x + r + 1, rect.z); px++) { - int pidx = (py - rect.y) * buffer_w + (px - rect.x); - values[numValues++] = 0.25f * (a[pidx] - b[pidx]) * (a[pidx] - b[pidx]); - } - } - /* Insertion-sort the variances (fast enough for 25 elements). */ - for (int i = 1; i < numValues; i++) { - float v = values[i]; - int j; - for (j = i - 1; j >= 0 && values[j] > v; j--) - values[j + 1] = values[j]; - values[j + 1] = v; - } - variance[idx] = values[(7 * numValues) / 8]; - } - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h deleted file mode 100644 index 17941689ad5..00000000000 --- a/intern/cycles/kernel/filter/filter_reconstruction.h +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -ccl_device_inline void kernel_filter_construct_gramian(int x, - int y, - int storage_stride, - int dx, - int dy, - int t, - int buffer_stride, - int pass_stride, - int frame_offset, - bool use_time, - const ccl_global float *ccl_restrict buffer, - const ccl_global float *ccl_restrict - transform, - ccl_global int *rank, - float weight, - ccl_global float *XtWX, - ccl_global float3 *XtWY, - int localIdx) -{ - if (weight < 1e-3f) { - return; - } - - int p_offset = y * buffer_stride + x; - int q_offset = (y + dy) * buffer_stride + (x + dx) + frame_offset; - -#ifdef __KERNEL_GPU__ - const int stride = storage_stride; -#else - const int stride = 1; - (void)storage_stride; -#endif - -#ifdef __KERNEL_CUDA__ - ccl_local float shared_design_row[(DENOISE_FEATURES + 1) * CCL_MAX_LOCAL_SIZE]; - ccl_local_param float *design_row = shared_design_row + localIdx * (DENOISE_FEATURES + 1); -#else - float design_row[DENOISE_FEATURES + 1]; -#endif - - float3 q_color = filter_get_color(buffer + q_offset, pass_stride); - - /* If the pixel was flagged as an outlier during prefiltering, skip it. */ - if (ccl_get_feature(buffer + q_offset, 0) < 0.0f) { - return; - } - - filter_get_design_row_transform(make_int3(x, y, t), - buffer + p_offset, - make_int3(x + dx, y + dy, t), - buffer + q_offset, - pass_stride, - *rank, - design_row, - transform, - stride, - use_time); - -#ifdef __KERNEL_GPU__ - math_trimatrix_add_gramian_strided(XtWX, (*rank) + 1, design_row, weight, stride); - math_vec3_add_strided(XtWY, (*rank) + 1, design_row, weight * q_color, stride); -#else - math_trimatrix_add_gramian(XtWX, (*rank) + 1, design_row, weight); - math_vec3_add(XtWY, (*rank) + 1, design_row, weight * q_color); -#endif -} - -ccl_device_inline void kernel_filter_finalize(int x, - int y, - ccl_global float *buffer, - ccl_global int *rank, - int storage_stride, - ccl_global float *XtWX, - ccl_global float3 *XtWY, - int4 buffer_params, - int sample) -{ -#ifdef __KERNEL_GPU__ - const int stride = storage_stride; -#else - const int stride = 1; - (void)storage_stride; -#endif - - if (XtWX[0] < 1e-3f) { - /* There is not enough information to determine a denoised result. - * As a fallback, keep the original value of the pixel. */ - return; - } - - /* The weighted average of pixel colors (essentially, the NLM-filtered image). - * In case the solution of the linear model fails due to numerical issues or - * returns nonsensical negative values, fall back to this value. */ - float3 mean_color = XtWY[0] / XtWX[0]; - - math_trimatrix_vec3_solve(XtWX, XtWY, (*rank) + 1, stride); - - float3 final_color = XtWY[0]; - if (!isfinite3_safe(final_color) || - (final_color.x < -0.01f || final_color.y < -0.01f || final_color.z < -0.01f)) { - final_color = mean_color; - } - - /* Clamp pixel value to positive values and reverse the highlight compression transform. */ - final_color = color_highlight_uncompress(max(final_color, make_float3(0.0f, 0.0f, 0.0f))); - - ccl_global float *combined_buffer = buffer + (y * buffer_params.y + x + buffer_params.x) * - buffer_params.z; - if (buffer_params.w >= 0) { - final_color *= sample; - if (buffer_params.w > 0) { - final_color.x += combined_buffer[buffer_params.w + 0]; - final_color.y += combined_buffer[buffer_params.w + 1]; - final_color.z += combined_buffer[buffer_params.w + 2]; - } - } - combined_buffer[0] = final_color.x; - combined_buffer[1] = final_color.y; - combined_buffer[2] = final_color.z; -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h deleted file mode 100644 index 880a661214e..00000000000 --- a/intern/cycles/kernel/filter/filter_transform.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer, - CCL_FILTER_TILE_INFO, - int x, - int y, - int4 rect, - int pass_stride, - int frame_stride, - bool use_time, - float *transform, - int *rank, - int radius, - float pca_threshold) -{ - int buffer_w = align_up(rect.z - rect.x, 4); - - float features[DENOISE_FEATURES]; - - const float *ccl_restrict pixel_buffer; - int3 pixel; - - int num_features = use_time ? 11 : 10; - - /* === Calculate denoising window. === */ - int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius)); - int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1)); - int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames; - - /* === Shift feature passes to have mean 0. === */ - float feature_means[DENOISE_FEATURES]; - math_vector_zero(feature_means, num_features); - FOR_PIXEL_WINDOW - { - filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride); - math_vector_add(feature_means, features, num_features); - } - END_FOR_PIXEL_WINDOW - - math_vector_scale(feature_means, 1.0f / num_pixels, num_features); - - /* === Scale the shifted feature passes to a range of [-1; 1] === - * Will be baked into the transform later. */ - float feature_scale[DENOISE_FEATURES]; - math_vector_zero(feature_scale, num_features); - - FOR_PIXEL_WINDOW - { - filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); - math_vector_max(feature_scale, features, num_features); - } - END_FOR_PIXEL_WINDOW - - filter_calculate_scale(feature_scale, use_time); - - /* === Generate the feature transformation. === - * This transformation maps the num_features-dimensional feature space to a reduced feature - * (r-feature) space which generally has fewer dimensions. - * This mainly helps to prevent over-fitting. */ - float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES]; - math_matrix_zero(feature_matrix, num_features); - FOR_PIXEL_WINDOW - { - filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); - math_vector_mul(features, feature_scale, num_features); - math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f); - } - END_FOR_PIXEL_WINDOW - - math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1); - *rank = 0; - /* Prevent over-fitting when a small window is used. */ - int max_rank = min(num_features, num_pixels / 3); - if (pca_threshold < 0.0f) { - float threshold_energy = 0.0f; - for (int i = 0; i < num_features; i++) { - threshold_energy += feature_matrix[i * num_features + i]; - } - threshold_energy *= 1.0f - (-pca_threshold); - - float reduced_energy = 0.0f; - for (int i = 0; i < max_rank; i++, (*rank)++) { - if (i >= 2 && reduced_energy >= threshold_energy) - break; - float s = feature_matrix[i * num_features + i]; - reduced_energy += s; - } - } - else { - for (int i = 0; i < max_rank; i++, (*rank)++) { - float s = feature_matrix[i * num_features + i]; - if (i >= 2 && sqrtf(s) < pca_threshold) - break; - } - } - - /* Bake the feature scaling into the transformation matrix. */ - for (int i = 0; i < (*rank); i++) { - math_vector_mul(transform + i * num_features, feature_scale, num_features); - } - math_matrix_transpose(transform, num_features, 1); -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h deleted file mode 100644 index ec258a5212a..00000000000 --- a/intern/cycles/kernel/filter/filter_transform_gpu.h +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer, - CCL_FILTER_TILE_INFO, - int x, - int y, - int4 rect, - int pass_stride, - int frame_stride, - bool use_time, - ccl_global float *transform, - ccl_global int *rank, - int radius, - float pca_threshold, - int transform_stride, - int localIdx) -{ - int buffer_w = align_up(rect.z - rect.x, 4); - -#ifdef __KERNEL_CUDA__ - ccl_local float shared_features[DENOISE_FEATURES * CCL_MAX_LOCAL_SIZE]; - ccl_local_param float *features = shared_features + localIdx * DENOISE_FEATURES; -#else - float features[DENOISE_FEATURES]; -#endif - - int num_features = use_time ? 11 : 10; - - /* === Calculate denoising window. === */ - int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius)); - int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1)); - int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames; - const ccl_global float *ccl_restrict pixel_buffer; - int3 pixel; - - /* === Shift feature passes to have mean 0. === */ - float feature_means[DENOISE_FEATURES]; - math_vector_zero(feature_means, num_features); - FOR_PIXEL_WINDOW - { - filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride); - math_vector_add(feature_means, features, num_features); - } - END_FOR_PIXEL_WINDOW - - math_vector_scale(feature_means, 1.0f / num_pixels, num_features); - - /* === Scale the shifted feature passes to a range of [-1; 1] === - * Will be baked into the transform later. */ - float feature_scale[DENOISE_FEATURES]; - math_vector_zero(feature_scale, num_features); - - FOR_PIXEL_WINDOW - { - filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); - math_vector_max(feature_scale, features, num_features); - } - END_FOR_PIXEL_WINDOW - - filter_calculate_scale(feature_scale, use_time); - - /* === Generate the feature transformation. === - * This transformation maps the num_features-dimensional feature space to a reduced feature - * (r-feature) space which generally has fewer dimensions. - * This mainly helps to prevent over-fitting. */ - float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES]; - math_matrix_zero(feature_matrix, num_features); - FOR_PIXEL_WINDOW - { - filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride); - math_vector_mul(features, feature_scale, num_features); - math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f); - } - END_FOR_PIXEL_WINDOW - - math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, transform_stride); - *rank = 0; - /* Prevent over-fitting when a small window is used. */ - int max_rank = min(num_features, num_pixels / 3); - if (pca_threshold < 0.0f) { - float threshold_energy = 0.0f; - for (int i = 0; i < num_features; i++) { - threshold_energy += feature_matrix[i * num_features + i]; - } - threshold_energy *= 1.0f - (-pca_threshold); - - float reduced_energy = 0.0f; - for (int i = 0; i < max_rank; i++, (*rank)++) { - if (i >= 2 && reduced_energy >= threshold_energy) - break; - float s = feature_matrix[i * num_features + i]; - reduced_energy += s; - } - } - else { - for (int i = 0; i < max_rank; i++, (*rank)++) { - float s = feature_matrix[i * num_features + i]; - if (i >= 2 && sqrtf(s) < pca_threshold) - break; - } - } - - math_matrix_transpose(transform, num_features, transform_stride); - - /* Bake the feature scaling into the transformation matrix. */ - for (int i = 0; i < num_features; i++) { - for (int j = 0; j < (*rank); j++) { - transform[(i * num_features + j) * transform_stride] *= feature_scale[i]; - } - } -} - -CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h deleted file mode 100644 index 0304d990f9f..00000000000 --- a/intern/cycles/kernel/filter/filter_transform_sse.h +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer, - CCL_FILTER_TILE_INFO, - int x, - int y, - int4 rect, - int pass_stride, - int frame_stride, - bool use_time, - float *transform, - int *rank, - int radius, - float pca_threshold) -{ - int buffer_w = align_up(rect.z - rect.x, 4); - - float4 features[DENOISE_FEATURES]; - const float *ccl_restrict pixel_buffer; - int3 pixel; - - int num_features = use_time ? 11 : 10; - - /* === Calculate denoising window. === */ - int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius)); - int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1)); - int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames; - - /* === Shift feature passes to have mean 0. === */ - float4 feature_means[DENOISE_FEATURES]; - math_vector_zero_sse(feature_means, num_features); - FOR_PIXEL_WINDOW_SSE - { - filter_get_features_sse( - x4, y4, t4, active_pixels, pixel_buffer, features, use_time, NULL, pass_stride); - math_vector_add_sse(feature_means, num_features, features); - } - END_FOR_PIXEL_WINDOW_SSE - - float4 pixel_scale = make_float4(1.0f / num_pixels); - for (int i = 0; i < num_features; i++) { - feature_means[i] = reduce_add(feature_means[i]) * pixel_scale; - } - - /* === Scale the shifted feature passes to a range of [-1; 1] === - * Will be baked into the transform later. */ - float4 feature_scale[DENOISE_FEATURES]; - math_vector_zero_sse(feature_scale, num_features); - FOR_PIXEL_WINDOW_SSE - { - filter_get_feature_scales_sse( - x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride); - math_vector_max_sse(feature_scale, features, num_features); - } - END_FOR_PIXEL_WINDOW_SSE - - filter_calculate_scale_sse(feature_scale, use_time); - - /* === Generate the feature transformation. === - * This transformation maps the num_features-dimensional feature space to a reduced feature - * (r-feature) space which generally has fewer dimensions. - * This mainly helps to prevent over-fitting. */ - float4 feature_matrix_sse[DENOISE_FEATURES * DENOISE_FEATURES]; - math_matrix_zero_sse(feature_matrix_sse, num_features); - FOR_PIXEL_WINDOW_SSE - { - filter_get_features_sse( - x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride); - math_vector_mul_sse(features, num_features, feature_scale); - math_matrix_add_gramian_sse(feature_matrix_sse, num_features, features, make_float4(1.0f)); - } - END_FOR_PIXEL_WINDOW_SSE - - float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES]; - math_matrix_hsum(feature_matrix, num_features, feature_matrix_sse); - - math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1); - - *rank = 0; - /* Prevent over-fitting when a small window is used. */ - int max_rank = min(num_features, num_pixels / 3); - if (pca_threshold < 0.0f) { - float threshold_energy = 0.0f; - for (int i = 0; i < num_features; i++) { - threshold_energy += feature_matrix[i * num_features + i]; - } - threshold_energy *= 1.0f - (-pca_threshold); - - float reduced_energy = 0.0f; - for (int i = 0; i < max_rank; i++, (*rank)++) { - if (i >= 2 && reduced_energy >= threshold_energy) - break; - float s = feature_matrix[i * num_features + i]; - reduced_energy += s; - } - } - else { - for (int i = 0; i < max_rank; i++, (*rank)++) { - float s = feature_matrix[i * num_features + i]; - if (i >= 2 && sqrtf(s) < pca_threshold) - break; - } - } - - math_matrix_transpose(transform, num_features, 1); - - /* Bake the feature scaling into the transformation matrix. */ - for (int i = 0; i < num_features; i++) { - math_vector_scale(transform + i * num_features, feature_scale[i][0], *rank); - } -} - -CCL_NAMESPACE_END |