Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'intern/cycles/kernel/filter')
-rw-r--r--intern/cycles/kernel/filter/filter.h52
-rw-r--r--intern/cycles/kernel/filter/filter_defines.h38
-rw-r--r--intern/cycles/kernel/filter/filter_features.h124
-rw-r--r--intern/cycles/kernel/filter/filter_features_sse.h105
-rw-r--r--intern/cycles/kernel/filter/filter_kernel.h50
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_cpu.h186
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_gpu.h144
-rw-r--r--intern/cycles/kernel/filter/filter_prefilter.h211
-rw-r--r--intern/cycles/kernel/filter/filter_reconstruction.h117
-rw-r--r--intern/cycles/kernel/filter/filter_transform.h108
-rw-r--r--intern/cycles/kernel/filter/filter_transform_gpu.h119
-rw-r--r--intern/cycles/kernel/filter/filter_transform_sse.h105
12 files changed, 1359 insertions, 0 deletions
diff --git a/intern/cycles/kernel/filter/filter.h b/intern/cycles/kernel/filter/filter.h
new file mode 100644
index 00000000000..f6e474d6702
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FILTER_H__
+#define __FILTER_H__
+
+/* CPU Filter Kernel Interface */
+
+#include "util/util_types.h"
+
+#include "kernel/filter/filter_defines.h"
+
+CCL_NAMESPACE_BEGIN
+
+#define KERNEL_NAME_JOIN(x, y, z) x ## _ ## y ## _ ## z
+#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
+#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
+
+#define KERNEL_ARCH cpu
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+CCL_NAMESPACE_END
+
+#endif /* __FILTER_H__ */
diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h
new file mode 100644
index 00000000000..ce96f733aff
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_defines.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FILTER_DEFINES_H__
+#define __FILTER_DEFINES_H__
+
+#define DENOISE_FEATURES 10
+#define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES)
+#define XTWX_SIZE (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2)
+#define XTWY_SIZE (DENOISE_FEATURES+1)
+
+typedef struct TilesInfo {
+ int offsets[9];
+ int strides[9];
+ int x[4];
+ int y[4];
+ /* TODO(lukas): CUDA doesn't have uint64_t... */
+#ifdef __KERNEL_OPENCL__
+ ccl_global float *buffers[9];
+#else
+ long long int buffers[9];
+#endif
+} TilesInfo;
+
+#endif /* __FILTER_DEFINES_H__*/
diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h
new file mode 100644
index 00000000000..6226ed2c2ef
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_features.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ CCL_NAMESPACE_BEGIN
+
+#define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride]
+
+/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).
+ * pixel_buffer always points to the current pixel in the first pass. */
+#define FOR_PIXEL_WINDOW pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
+ for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
+ for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
+
+#define END_FOR_PIXEL_WINDOW } \
+ pixel_buffer += buffer_w - (high.x - low.x); \
+ }
+
+ccl_device_inline void filter_get_features(int2 pixel,
+ const ccl_global float *ccl_restrict buffer,
+ float *features,
+ const float *ccl_restrict mean,
+ int pass_stride)
+{
+ features[0] = pixel.x;
+ features[1] = pixel.y;
+ features[2] = fabsf(ccl_get_feature(buffer, 0));
+ features[3] = ccl_get_feature(buffer, 1);
+ features[4] = ccl_get_feature(buffer, 2);
+ features[5] = ccl_get_feature(buffer, 3);
+ features[6] = ccl_get_feature(buffer, 4);
+ features[7] = ccl_get_feature(buffer, 5);
+ features[8] = ccl_get_feature(buffer, 6);
+ features[9] = ccl_get_feature(buffer, 7);
+ if(mean) {
+ for(int i = 0; i < DENOISE_FEATURES; i++)
+ features[i] -= mean[i];
+ }
+}
+
+ccl_device_inline void filter_get_feature_scales(int2 pixel,
+ const ccl_global float *ccl_restrict buffer,
+ float *scales,
+ const float *ccl_restrict mean,
+ int pass_stride)
+{
+ scales[0] = fabsf(pixel.x - mean[0]);
+ scales[1] = fabsf(pixel.y - mean[1]);
+ scales[2] = fabsf(fabsf(ccl_get_feature(buffer, 0)) - mean[2]);
+ scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3],
+ ccl_get_feature(buffer, 2) - mean[4],
+ ccl_get_feature(buffer, 3) - mean[5]));
+ scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]);
+ scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7],
+ ccl_get_feature(buffer, 6) - mean[8],
+ ccl_get_feature(buffer, 7) - mean[9]));
+}
+
+ccl_device_inline void filter_calculate_scale(float *scale)
+{
+ scale[0] = 1.0f/max(scale[0], 0.01f);
+ scale[1] = 1.0f/max(scale[1], 0.01f);
+ scale[2] = 1.0f/max(scale[2], 0.01f);
+ scale[6] = 1.0f/max(scale[4], 0.01f);
+ scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f);
+ scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f);
+}
+
+ccl_device_inline float3 filter_get_color(const ccl_global float *ccl_restrict buffer,
+ int pass_stride)
+{
+ return make_float3(ccl_get_feature(buffer, 8), ccl_get_feature(buffer, 9), ccl_get_feature(buffer, 10));
+}
+
+ccl_device_inline void design_row_add(float *design_row,
+ int rank,
+ const ccl_global float *ccl_restrict transform,
+ int stride,
+ int row,
+ float feature)
+{
+ for(int i = 0; i < rank; i++) {
+ design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature;
+ }
+}
+
+/* Fill the design row. */
+ccl_device_inline void filter_get_design_row_transform(int2 p_pixel,
+ const ccl_global float *ccl_restrict p_buffer,
+ int2 q_pixel,
+ const ccl_global float *ccl_restrict q_buffer,
+ int pass_stride,
+ int rank,
+ float *design_row,
+ const ccl_global float *ccl_restrict transform,
+ int stride)
+{
+ design_row[0] = 1.0f;
+ math_vector_zero(design_row+1, rank);
+ design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x);
+ design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y);
+ design_row_add(design_row, rank, transform, stride, 2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0)));
+ design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
+ design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
+ design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
+ design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4));
+ design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5));
+ design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6));
+ design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7));
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h
new file mode 100644
index 00000000000..3185330994c
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_features_sse.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride)
+
+/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time.
+ * pixel_buffer always points to the first of the 4 current pixel in the first pass.
+ * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. */
+
+#define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
+ for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
+ __m128 y4 = _mm_set1_ps(pixel.y); \
+ for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
+ __m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \
+ __m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x));
+
+#define END_FOR_PIXEL_WINDOW_SSE } \
+ pixel_buffer += buffer_w - (pixel.x - low.x); \
+ }
+
+ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y,
+ __m128 active_pixels,
+ const float *ccl_restrict buffer,
+ __m128 *features,
+ const __m128 *ccl_restrict mean,
+ int pass_stride)
+{
+ features[0] = x;
+ features[1] = y;
+ features[2] = _mm_fabs_ps(ccl_get_feature_sse(0));
+ features[3] = ccl_get_feature_sse(1);
+ features[4] = ccl_get_feature_sse(2);
+ features[5] = ccl_get_feature_sse(3);
+ features[6] = ccl_get_feature_sse(4);
+ features[7] = ccl_get_feature_sse(5);
+ features[8] = ccl_get_feature_sse(6);
+ features[9] = ccl_get_feature_sse(7);
+ if(mean) {
+ for(int i = 0; i < DENOISE_FEATURES; i++)
+ features[i] = _mm_sub_ps(features[i], mean[i]);
+ }
+ for(int i = 0; i < DENOISE_FEATURES; i++)
+ features[i] = _mm_mask_ps(features[i], active_pixels);
+}
+
+ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y,
+ __m128 active_pixels,
+ const float *ccl_restrict buffer,
+ __m128 *scales,
+ const __m128 *ccl_restrict mean,
+ int pass_stride)
+{
+ scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels);
+ scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels);
+
+ scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(_mm_fabs_ps(ccl_get_feature_sse(0)), mean[2])), active_pixels);
+
+ __m128 diff, scale;
+ diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]);
+ scale = _mm_mul_ps(diff, diff);
+ diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]);
+ scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
+ diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]);
+ scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
+ scales[3] = _mm_mask_ps(scale, active_pixels);
+
+ scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels);
+
+ diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]);
+ scale = _mm_mul_ps(diff, diff);
+ diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]);
+ scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
+ diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]);
+ scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
+ scales[5] = _mm_mask_ps(scale, active_pixels);
+}
+
+ccl_device_inline void filter_calculate_scale_sse(__m128 *scale)
+{
+ scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f)));
+ scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f)));
+ scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f)));
+ scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f)));
+
+ scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f)));
+ scale[3] = scale[4] = scale[5] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f)));
+}
+
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_kernel.h b/intern/cycles/kernel/filter/filter_kernel.h
new file mode 100644
index 00000000000..2ef03dc0a02
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_kernel.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "util/util_color.h"
+#include "util/util_math.h"
+#include "util/util_math_fast.h"
+#include "util/util_texture.h"
+
+#include "util/util_atomic.h"
+#include "util/util_math_matrix.h"
+
+#include "kernel/filter/filter_defines.h"
+
+#include "kernel/filter/filter_features.h"
+#ifdef __KERNEL_SSE3__
+# include "kernel/filter/filter_features_sse.h"
+#endif
+
+#include "kernel/filter/filter_prefilter.h"
+
+#ifdef __KERNEL_GPU__
+# include "kernel/filter/filter_transform_gpu.h"
+#else
+# ifdef __KERNEL_SSE3__
+# include "kernel/filter/filter_transform_sse.h"
+# else
+# include "kernel/filter/filter_transform.h"
+# endif
+#endif
+
+#include "kernel/filter/filter_reconstruction.h"
+
+#ifdef __KERNEL_CPU__
+# include "kernel/filter/filter_nlm_cpu.h"
+#else
+# include "kernel/filter/filter_nlm_gpu.h"
+#endif
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
new file mode 100644
index 00000000000..3e752bce68f
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
+ const float *ccl_restrict weight_image,
+ const float *ccl_restrict variance_image,
+ float *difference_image,
+ int4 rect,
+ int w,
+ int channel_offset,
+ float a,
+ float k_2)
+{
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ float diff = 0.0f;
+ int numChannels = channel_offset? 3 : 1;
+ for(int c = 0; c < numChannels; c++) {
+ float cdiff = weight_image[c*channel_offset + y*w+x] - weight_image[c*channel_offset + (y+dy)*w+(x+dx)];
+ float pvar = variance_image[c*channel_offset + y*w+x];
+ float qvar = variance_image[c*channel_offset + (y+dy)*w+(x+dx)];
+ diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
+ }
+ if(numChannels > 1) {
+ diff *= 1.0f/numChannels;
+ }
+ difference_image[y*w+x] = diff;
+ }
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict difference_image,
+ float *out_image,
+ int4 rect,
+ int w,
+ int f)
+{
+#ifdef __KERNEL_SSE3__
+ int aligned_lowx = (rect.x & ~(3));
+ int aligned_highx = ((rect.z + 3) & ~(3));
+#endif
+ for(int y = rect.y; y < rect.w; y++) {
+ const int low = max(rect.y, y-f);
+ const int high = min(rect.w, y+f+1);
+ for(int x = rect.x; x < rect.z; x++) {
+ out_image[y*w+x] = 0.0f;
+ }
+ for(int y1 = low; y1 < high; y1++) {
+#ifdef __KERNEL_SSE3__
+ for(int x = aligned_lowx; x < aligned_highx; x+=4) {
+ _mm_store_ps(out_image + y*w+x, _mm_add_ps(_mm_load_ps(out_image + y*w+x), _mm_load_ps(difference_image + y1*w+x)));
+ }
+#else
+ for(int x = rect.x; x < rect.z; x++) {
+ out_image[y*w+x] += difference_image[y1*w+x];
+ }
+#endif
+ }
+ for(int x = rect.x; x < rect.z; x++) {
+ out_image[y*w+x] *= 1.0f/(high - low);
+ }
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
+ float *out_image,
+ int4 rect,
+ int w,
+ int f)
+{
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ out_image[y*w+x] = 0.0f;
+ }
+ }
+ for(int dx = -f; dx <= f; dx++) {
+ int pos_dx = max(0, dx);
+ int neg_dx = min(0, dx);
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) {
+ out_image[y*w+x] += difference_image[y*w+dx+x];
+ }
+ }
+ }
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ out_image[y*w+x] = fast_expf(-max(out_image[y*w+x] * (1.0f/(high - low)), 0.0f));
+ }
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
+ const float *ccl_restrict difference_image,
+ const float *ccl_restrict image,
+ float *out_image,
+ float *accum_image,
+ int4 rect,
+ int w,
+ int f)
+{
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ float sum = 0.0f;
+ for(int x1 = low; x1 < high; x1++) {
+ sum += difference_image[y*w+x1];
+ }
+ float weight = sum * (1.0f/(high - low));
+ accum_image[y*w+x] += weight;
+ out_image[y*w+x] += weight*image[(y+dy)*w+(x+dx)];
+ }
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
+ const float *ccl_restrict difference_image,
+ const float *ccl_restrict buffer,
+ float *transform,
+ int *rank,
+ float *XtWX,
+ float3 *XtWY,
+ int4 rect,
+ int4 filter_rect,
+ int w, int h, int f,
+ int pass_stride)
+{
+ /* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
+ for(int fy = max(0, rect.y-filter_rect.y); fy < min(filter_rect.w, rect.w-filter_rect.y); fy++) {
+ int y = fy + filter_rect.y;
+ for(int fx = max(0, rect.x-filter_rect.x); fx < min(filter_rect.z, rect.z-filter_rect.x); fx++) {
+ int x = fx + filter_rect.x;
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ float sum = 0.0f;
+ for(int x1 = low; x1 < high; x1++) {
+ sum += difference_image[y*w+x1];
+ }
+ float weight = sum * (1.0f/(high - low));
+
+ int storage_ofs = fy*filter_rect.z + fx;
+ float *l_transform = transform + storage_ofs*TRANSFORM_SIZE;
+ float *l_XtWX = XtWX + storage_ofs*XTWX_SIZE;
+ float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE;
+ int *l_rank = rank + storage_ofs;
+
+ kernel_filter_construct_gramian(x, y, 1,
+ dx, dy, w, h,
+ pass_stride,
+ buffer,
+ l_transform, l_rank,
+ weight, l_XtWX, l_XtWY, 0);
+ }
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_normalize(float *out_image,
+ const float *ccl_restrict accum_image,
+ int4 rect,
+ int w)
+{
+ for(int y = rect.y; y < rect.w; y++) {
+ for(int x = rect.x; x < rect.z; x++) {
+ out_image[y*w+x] /= accum_image[y*w+x];
+ }
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h
new file mode 100644
index 00000000000..2c5ac807051
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
+ int dx, int dy,
+ const ccl_global float *ccl_restrict weight_image,
+ const ccl_global float *ccl_restrict variance_image,
+ ccl_global float *difference_image,
+ int4 rect, int w,
+ int channel_offset,
+ float a, float k_2)
+{
+ float diff = 0.0f;
+ int numChannels = channel_offset? 3 : 1;
+ for(int c = 0; c < numChannels; c++) {
+ float cdiff = weight_image[c*channel_offset + y*w+x] - weight_image[c*channel_offset + (y+dy)*w+(x+dx)];
+ float pvar = variance_image[c*channel_offset + y*w+x];
+ float qvar = variance_image[c*channel_offset + (y+dy)*w+(x+dx)];
+ diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
+ }
+ if(numChannels > 1) {
+ diff *= 1.0f/numChannels;
+ }
+ difference_image[y*w+x] = diff;
+}
+
+ccl_device_inline void kernel_filter_nlm_blur(int x, int y,
+ const ccl_global float *ccl_restrict difference_image,
+ ccl_global float *out_image,
+ int4 rect, int w, int f)
+{
+ float sum = 0.0f;
+ const int low = max(rect.y, y-f);
+ const int high = min(rect.w, y+f+1);
+ for(int y1 = low; y1 < high; y1++) {
+ sum += difference_image[y1*w+x];
+ }
+ sum *= 1.0f/(high-low);
+ out_image[y*w+x] = sum;
+}
+
+ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y,
+ const ccl_global float *ccl_restrict difference_image,
+ ccl_global float *out_image,
+ int4 rect, int w, int f)
+{
+ float sum = 0.0f;
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ for(int x1 = low; x1 < high; x1++) {
+ sum += difference_image[y*w+x1];
+ }
+ sum *= 1.0f/(high-low);
+ out_image[y*w+x] = fast_expf(-max(sum, 0.0f));
+}
+
+ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
+ int dx, int dy,
+ const ccl_global float *ccl_restrict difference_image,
+ const ccl_global float *ccl_restrict image,
+ ccl_global float *out_image,
+ ccl_global float *accum_image,
+ int4 rect, int w, int f)
+{
+ float sum = 0.0f;
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ for(int x1 = low; x1 < high; x1++) {
+ sum += difference_image[y*w+x1];
+ }
+ sum *= 1.0f/(high-low);
+ if(out_image) {
+ accum_image[y*w+x] += sum;
+ out_image[y*w+x] += sum*image[(y+dy)*w+(x+dx)];
+ }
+ else {
+ accum_image[y*w+x] = sum;
+ }
+}
+
+ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy,
+ int dx, int dy,
+ const ccl_global float *ccl_restrict difference_image,
+ const ccl_global float *ccl_restrict buffer,
+ const ccl_global float *ccl_restrict transform,
+ ccl_global int *rank,
+ ccl_global float *XtWX,
+ ccl_global float3 *XtWY,
+ int4 rect,
+ int4 filter_rect,
+ int w, int h, int f,
+ int pass_stride,
+ int localIdx)
+{
+ int y = fy + filter_rect.y;
+ int x = fx + filter_rect.x;
+ const int low = max(rect.x, x-f);
+ const int high = min(rect.z, x+f+1);
+ float sum = 0.0f;
+ for(int x1 = low; x1 < high; x1++) {
+ sum += difference_image[y*w+x1];
+ }
+ float weight = sum * (1.0f/(high - low));
+
+ int storage_ofs = fy*filter_rect.z + fx;
+ transform += storage_ofs;
+ rank += storage_ofs;
+ XtWX += storage_ofs;
+ XtWY += storage_ofs;
+
+ kernel_filter_construct_gramian(x, y,
+ filter_rect.z*filter_rect.w,
+ dx, dy, w, h,
+ pass_stride,
+ buffer,
+ transform, rank,
+ weight, XtWX, XtWY,
+ localIdx);
+}
+
+ccl_device_inline void kernel_filter_nlm_normalize(int x, int y,
+ ccl_global float *out_image,
+ const ccl_global float *ccl_restrict accum_image,
+ int4 rect, int w)
+{
+ out_image[y*w+x] /= accum_image[y*w+x];
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h
new file mode 100644
index 00000000000..a0b89c1111f
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_prefilter.h
@@ -0,0 +1,211 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* First step of the shadow prefiltering, performs the shadow division and stores all data
+ * in a nice and easy rectangular array that can be passed to the NLM filter.
+ *
+ * Calculates:
+ * unfiltered: Contains the two half images of the shadow feature pass
+ * sampleVariance: The sample-based variance calculated in the kernel. Note: This calculation is biased in general, and especially here since the variance of the ratio can only be approximated.
+ * sampleVarianceV: Variance of the sample variance estimation, quite noisy (since it's essentially the buffer variance of the two variance halves)
+ * bufferVariance: The buffer-based variance of the shadow feature. Unbiased, but quite noisy.
+ */
+ccl_device void kernel_filter_divide_shadow(int sample,
+ ccl_global TilesInfo *tiles,
+ int x, int y,
+ ccl_global float *unfilteredA,
+ ccl_global float *unfilteredB,
+ ccl_global float *sampleVariance,
+ ccl_global float *sampleVarianceV,
+ ccl_global float *bufferVariance,
+ int4 rect,
+ int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+ int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
+ int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
+ int tile = ytile*3+xtile;
+
+ int offset = tiles->offsets[tile];
+ int stride = tiles->strides[tile];
+ const ccl_global float *ccl_restrict center_buffer = (ccl_global float*) tiles->buffers[tile];
+ center_buffer += (y*stride + x + offset)*buffer_pass_stride;
+ center_buffer += buffer_denoising_offset + 14;
+
+ int buffer_w = align_up(rect.z - rect.x, 4);
+ int idx = (y-rect.y)*buffer_w + (x - rect.x);
+ unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f);
+ unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f);
+
+ float varA = center_buffer[2];
+ float varB = center_buffer[5];
+ int odd_sample = (sample+1)/2;
+ int even_sample = sample/2;
+ if(use_split_variance) {
+ varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample);
+ varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample);
+ }
+ varA /= max(odd_sample - 1, 1);
+ varB /= max(even_sample - 1, 1);
+
+ sampleVariance[idx] = 0.5f*(varA + varB) / sample;
+ sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample*sample);
+ bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) * (unfilteredA[idx] - unfilteredB[idx]);
+}
+
+/* Load a regular feature from the render buffers into the denoise buffer.
+ * Parameters:
+ * - sample: The sample amount in the buffer, used to normalize the buffer.
+ * - m_offset, v_offset: Render Buffer Pass offsets of mean and variance of the feature.
+ * - x, y: Current pixel
+ * - mean, variance: Target denoise buffers.
+ * - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive).
+ */
+ccl_device void kernel_filter_get_feature(int sample,
+ ccl_global TilesInfo *tiles,
+ int m_offset, int v_offset,
+ int x, int y,
+ ccl_global float *mean,
+ ccl_global float *variance,
+ int4 rect, int buffer_pass_stride,
+ int buffer_denoising_offset,
+ bool use_split_variance)
+{
+ int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
+ int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
+ int tile = ytile*3+xtile;
+ ccl_global float *center_buffer = ((ccl_global float*) tiles->buffers[tile]) + (tiles->offsets[tile] + y*tiles->strides[tile] + x)*buffer_pass_stride + buffer_denoising_offset;
+
+ int buffer_w = align_up(rect.z - rect.x, 4);
+ int idx = (y-rect.y)*buffer_w + (x - rect.x);
+
+ mean[idx] = center_buffer[m_offset] / sample;
+ if (sample > 1) {
+ if(use_split_variance) {
+ variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1)));
+ }
+ else {
+ variance[idx] = center_buffer[v_offset] / (sample * (sample-1));
+ }
+ }
+ else {
+ /* Can't compute variance with single sample, just set it very high. */
+ variance[idx] = 1e10f;
+ }
+}
+
+ccl_device void kernel_filter_detect_outliers(int x, int y,
+ ccl_global float *image,
+ ccl_global float *variance,
+ ccl_global float *depth,
+ ccl_global float *out,
+ int4 rect,
+ int pass_stride)
+{
+ int buffer_w = align_up(rect.z - rect.x, 4);
+
+ int n = 0;
+ float values[25];
+ for(int y1 = max(y-2, rect.y); y1 < min(y+3, rect.w); y1++) {
+ for(int x1 = max(x-2, rect.x); x1 < min(x+3, rect.z); x1++) {
+ int idx = (y1-rect.y)*buffer_w + (x1-rect.x);
+ float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]));
+
+ /* Find the position of L. */
+ int i;
+ for(i = 0; i < n; i++) {
+ if(values[i] > L) break;
+ }
+ /* Make space for L by shifting all following values to the right. */
+ for(int j = n; j > i; j--) {
+ values[j] = values[j-1];
+ }
+ /* Insert L. */
+ values[i] = L;
+ n++;
+ }
+ }
+
+ int idx = (y-rect.y)*buffer_w + (x-rect.x);
+ float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]));
+
+ float ref = 2.0f*values[(int)(n*0.75f)];
+ float fac = 1.0f;
+ if(L > ref) {
+ /* The pixel appears to be an outlier.
+ * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is that the pixel
+ * should actually be at the reference value:
+ * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier.
+ * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight.
+ */
+ float stddev = sqrtf(average(make_float3(variance[idx], variance[idx+pass_stride], variance[idx+2*pass_stride])));
+ if(L - 3*stddev < ref) {
+ /* The pixel is an outlier, so negate the depth value to mark it as one.
+ * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */
+ depth[idx] = -depth[idx];
+ fac = ref/L;
+ variance[idx ] *= fac*fac;
+ variance[idx + pass_stride] *= fac*fac;
+ variance[idx+2*pass_stride] *= fac*fac;
+ }
+ }
+ out[idx ] = fac*image[idx];
+ out[idx + pass_stride] = fac*image[idx + pass_stride];
+ out[idx+2*pass_stride] = fac*image[idx+2*pass_stride];
+}
+
+/* Combine A/B buffers.
+ * Calculates the combined mean and the buffer variance. */
+ccl_device void kernel_filter_combine_halves(int x, int y,
+ ccl_global float *mean,
+ ccl_global float *variance,
+ ccl_global float *a,
+ ccl_global float *b,
+ int4 rect, int r)
+{
+ int buffer_w = align_up(rect.z - rect.x, 4);
+ int idx = (y-rect.y)*buffer_w + (x - rect.x);
+
+ if(mean) mean[idx] = 0.5f * (a[idx]+b[idx]);
+ if(variance) {
+ if(r == 0) variance[idx] = 0.25f * (a[idx]-b[idx])*(a[idx]-b[idx]);
+ else {
+ variance[idx] = 0.0f;
+ float values[25];
+ int numValues = 0;
+ for(int py = max(y-r, rect.y); py < min(y+r+1, rect.w); py++) {
+ for(int px = max(x-r, rect.x); px < min(x+r+1, rect.z); px++) {
+ int pidx = (py-rect.y)*buffer_w + (px-rect.x);
+ values[numValues++] = 0.25f * (a[pidx]-b[pidx])*(a[pidx]-b[pidx]);
+ }
+ }
+ /* Insertion-sort the variances (fast enough for 25 elements). */
+ for(int i = 1; i < numValues; i++) {
+ float v = values[i];
+ int j;
+ for(j = i-1; j >= 0 && values[j] > v; j--)
+ values[j+1] = values[j];
+ values[j+1] = v;
+ }
+ variance[idx] = values[(7*numValues)/8];
+ }
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h
new file mode 100644
index 00000000000..25a3025056c
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_reconstruction.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
+ int storage_stride,
+ int dx, int dy,
+ int w, int h,
+ int pass_stride,
+ const ccl_global float *ccl_restrict buffer,
+ const ccl_global float *ccl_restrict transform,
+ ccl_global int *rank,
+ float weight,
+ ccl_global float *XtWX,
+ ccl_global float3 *XtWY,
+ int localIdx)
+{
+ if(weight < 1e-3f) {
+ return;
+ }
+
+ int p_offset = y *w + x;
+ int q_offset = (y+dy)*w + (x+dx);
+
+#ifdef __KERNEL_GPU__
+ const int stride = storage_stride;
+#else
+ const int stride = 1;
+ (void) storage_stride;
+#endif
+
+#ifdef __KERNEL_CUDA__
+ ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE];
+ ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1);
+#else
+ float design_row[DENOISE_FEATURES+1];
+#endif
+
+ float3 q_color = filter_get_color(buffer + q_offset, pass_stride);
+
+ /* If the pixel was flagged as an outlier during prefiltering, skip it. */
+ if(ccl_get_feature(buffer + q_offset, 0) < 0.0f) {
+ return;
+ }
+
+ filter_get_design_row_transform(make_int2(x, y), buffer + p_offset,
+ make_int2(x+dx, y+dy), buffer + q_offset,
+ pass_stride, *rank, design_row, transform, stride);
+
+ math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride);
+ math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride);
+}
+
+ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
+ ccl_global float *buffer,
+ ccl_global int *rank,
+ int storage_stride,
+ ccl_global float *XtWX,
+ ccl_global float3 *XtWY,
+ int4 buffer_params,
+ int sample)
+{
+#ifdef __KERNEL_GPU__
+ const int stride = storage_stride;
+#else
+ const int stride = 1;
+ (void) storage_stride;
+#endif
+
+ if(XtWX[0] < 1e-3f) {
+ /* There is not enough information to determine a denoised result.
+ * As a fallback, keep the original value of the pixel. */
+ return;
+ }
+
+ /* The weighted average of pixel colors (essentially, the NLM-filtered image).
+ * In case the solution of the linear model fails due to numerical issues,
+ * fall back to this value. */
+ float3 mean_color = XtWY[0]/XtWX[0];
+
+ math_trimatrix_vec3_solve(XtWX, XtWY, (*rank)+1, stride);
+
+ float3 final_color = XtWY[0];
+ if(!isfinite3_safe(final_color)) {
+ final_color = mean_color;
+ }
+
+ /* Clamp pixel value to positive values. */
+ final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f));
+
+ ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
+ final_color *= sample;
+ if(buffer_params.w) {
+ final_color.x += combined_buffer[buffer_params.w+0];
+ final_color.y += combined_buffer[buffer_params.w+1];
+ final_color.z += combined_buffer[buffer_params.w+2];
+ }
+ combined_buffer[0] = final_color.x;
+ combined_buffer[1] = final_color.y;
+ combined_buffer[2] = final_color.z;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h
new file mode 100644
index 00000000000..a5f87c05ec0
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_transform.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
+ int x, int y, int4 rect,
+ int pass_stride,
+ float *transform, int *rank,
+ int radius, float pca_threshold)
+{
+ int buffer_w = align_up(rect.z - rect.x, 4);
+
+ float features[DENOISE_FEATURES];
+
+ /* Temporary storage, used in different steps of the algorithm. */
+ float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES];
+ float tempvector[2*DENOISE_FEATURES];
+ const float *ccl_restrict pixel_buffer;
+ int2 pixel;
+
+ /* === Calculate denoising window. === */
+ int2 low = make_int2(max(rect.x, x - radius),
+ max(rect.y, y - radius));
+ int2 high = make_int2(min(rect.z, x + radius + 1),
+ min(rect.w, y + radius + 1));
+ int num_pixels = (high.y - low.y) * (high.x - low.x);
+
+ /* === Shift feature passes to have mean 0. === */
+ float feature_means[DENOISE_FEATURES];
+ math_vector_zero(feature_means, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW {
+ filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
+ math_vector_add(feature_means, features, DENOISE_FEATURES);
+ } END_FOR_PIXEL_WINDOW
+
+ math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES);
+
+ /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
+ float *feature_scale = tempvector;
+ math_vector_zero(feature_scale, DENOISE_FEATURES);
+
+ FOR_PIXEL_WINDOW {
+ filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_max(feature_scale, features, DENOISE_FEATURES);
+ } END_FOR_PIXEL_WINDOW
+
+ filter_calculate_scale(feature_scale);
+
+ /* === Generate the feature transformation. ===
+ * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space
+ * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
+ float* feature_matrix = tempmatrix;
+ math_matrix_zero(feature_matrix, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW {
+ filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_mul(features, feature_scale, DENOISE_FEATURES);
+ math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+ } END_FOR_PIXEL_WINDOW
+
+ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
+ *rank = 0;
+ /* Prevent overfitting when a small window is used. */
+ int max_rank = min(DENOISE_FEATURES, num_pixels/3);
+ if(pca_threshold < 0.0f) {
+ float threshold_energy = 0.0f;
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+ }
+ threshold_energy *= 1.0f - (-pca_threshold);
+
+ float reduced_energy = 0.0f;
+ for(int i = 0; i < max_rank; i++, (*rank)++) {
+ if(i >= 2 && reduced_energy >= threshold_energy)
+ break;
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ reduced_energy += s;
+ }
+ }
+ else {
+ for(int i = 0; i < max_rank; i++, (*rank)++) {
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ if(i >= 2 && sqrtf(s) < pca_threshold)
+ break;
+ }
+ }
+
+ /* Bake the feature scaling into the transformation matrix. */
+ for(int i = 0; i < (*rank); i++) {
+ math_vector_mul(transform + i*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES);
+ }
+ math_matrix_transpose(transform, DENOISE_FEATURES, 1);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h
new file mode 100644
index 00000000000..83a1222bbdb
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_transform_gpu.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
+ int x, int y, int4 rect,
+ int pass_stride,
+ ccl_global float *transform,
+ ccl_global int *rank,
+ int radius, float pca_threshold,
+ int transform_stride, int localIdx)
+{
+ int buffer_w = align_up(rect.z - rect.x, 4);
+
+#ifdef __KERNEL_CUDA__
+ ccl_local float shared_features[DENOISE_FEATURES*CCL_MAX_LOCAL_SIZE];
+ ccl_local_param float *features = shared_features + localIdx*DENOISE_FEATURES;
+#else
+ float features[DENOISE_FEATURES];
+#endif
+
+ /* === Calculate denoising window. === */
+ int2 low = make_int2(max(rect.x, x - radius),
+ max(rect.y, y - radius));
+ int2 high = make_int2(min(rect.z, x + radius + 1),
+ min(rect.w, y + radius + 1));
+ int num_pixels = (high.y - low.y) * (high.x - low.x);
+ const ccl_global float *ccl_restrict pixel_buffer;
+ int2 pixel;
+
+
+
+
+ /* === Shift feature passes to have mean 0. === */
+ float feature_means[DENOISE_FEATURES];
+ math_vector_zero(feature_means, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW {
+ filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
+ math_vector_add(feature_means, features, DENOISE_FEATURES);
+ } END_FOR_PIXEL_WINDOW
+
+ math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES);
+
+ /* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
+ float feature_scale[DENOISE_FEATURES];
+ math_vector_zero(feature_scale, DENOISE_FEATURES);
+
+ FOR_PIXEL_WINDOW {
+ filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_max(feature_scale, features, DENOISE_FEATURES);
+ } END_FOR_PIXEL_WINDOW
+
+ filter_calculate_scale(feature_scale);
+
+
+
+ /* === Generate the feature transformation. ===
+ * This transformation maps the DENOISE_FEATURES-dimentional feature space to a reduced feature (r-feature) space
+ * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
+ float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
+ math_matrix_zero(feature_matrix, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW {
+ filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_mul(features, feature_scale, DENOISE_FEATURES);
+ math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+ } END_FOR_PIXEL_WINDOW
+
+ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride);
+ *rank = 0;
+ /* Prevent overfitting when a small window is used. */
+ int max_rank = min(DENOISE_FEATURES, num_pixels/3);
+ if(pca_threshold < 0.0f) {
+ float threshold_energy = 0.0f;
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+ }
+ threshold_energy *= 1.0f - (-pca_threshold);
+
+ float reduced_energy = 0.0f;
+ for(int i = 0; i < max_rank; i++, (*rank)++) {
+ if(i >= 2 && reduced_energy >= threshold_energy)
+ break;
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ reduced_energy += s;
+ }
+ }
+ else {
+ for(int i = 0; i < max_rank; i++, (*rank)++) {
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ if(i >= 2 && sqrtf(s) < pca_threshold)
+ break;
+ }
+ }
+
+ math_matrix_transpose(transform, DENOISE_FEATURES, transform_stride);
+
+ /* Bake the feature scaling into the transformation matrix. */
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ for(int j = 0; j < (*rank); j++) {
+ transform[(i*DENOISE_FEATURES + j)*transform_stride] *= feature_scale[i];
+ }
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h
new file mode 100644
index 00000000000..30dc2969b11
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_transform_sse.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
+ int x, int y, int4 rect,
+ int pass_stride,
+ float *transform, int *rank,
+ int radius, float pca_threshold)
+{
+ int buffer_w = align_up(rect.z - rect.x, 4);
+
+ __m128 features[DENOISE_FEATURES];
+ const float *ccl_restrict pixel_buffer;
+ int2 pixel;
+
+ int2 low = make_int2(max(rect.x, x - radius),
+ max(rect.y, y - radius));
+ int2 high = make_int2(min(rect.z, x + radius + 1),
+ min(rect.w, y + radius + 1));
+ int num_pixels = (high.y - low.y) * (high.x - low.x);
+
+ __m128 feature_means[DENOISE_FEATURES];
+ math_vector_zero_sse(feature_means, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW_SSE {
+ filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride);
+ math_vector_add_sse(feature_means, DENOISE_FEATURES, features);
+ } END_FOR_PIXEL_WINDOW_SSE
+
+ __m128 pixel_scale = _mm_set1_ps(1.0f / num_pixels);
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale);
+ }
+
+ __m128 feature_scale[DENOISE_FEATURES];
+ math_vector_zero_sse(feature_scale, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW_SSE {
+ filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_max_sse(feature_scale, features, DENOISE_FEATURES);
+ } END_FOR_PIXEL_WINDOW_SSE
+
+ filter_calculate_scale_sse(feature_scale);
+
+ __m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
+ math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES);
+ FOR_PIXEL_WINDOW_SSE {
+ filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
+ math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale);
+ math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f));
+ } END_FOR_PIXEL_WINDOW_SSE
+
+ float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
+ math_matrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse);
+
+ math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
+
+ *rank = 0;
+ /* Prevent overfitting when a small window is used. */
+ int max_rank = min(DENOISE_FEATURES, num_pixels/3);
+ if(pca_threshold < 0.0f) {
+ float threshold_energy = 0.0f;
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+ }
+ threshold_energy *= 1.0f - (-pca_threshold);
+
+ float reduced_energy = 0.0f;
+ for(int i = 0; i < max_rank; i++, (*rank)++) {
+ if(i >= 2 && reduced_energy >= threshold_energy)
+ break;
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ reduced_energy += s;
+ }
+ }
+ else {
+ for(int i = 0; i < max_rank; i++, (*rank)++) {
+ float s = feature_matrix[i*DENOISE_FEATURES+i];
+ if(i >= 2 && sqrtf(s) < pca_threshold)
+ break;
+ }
+ }
+
+ math_matrix_transpose(transform, DENOISE_FEATURES, 1);
+
+ /* Bake the feature scaling into the transformation matrix. */
+ for(int i = 0; i < DENOISE_FEATURES; i++) {
+ math_vector_scale(transform + i*DENOISE_FEATURES, _mm_cvtss_f32(feature_scale[i]), *rank);
+ }
+}
+
+CCL_NAMESPACE_END