Welcome to mirror list, hosted at ThFree Co, Russian Federation.

filter_features_sse.h « filter « kernel « cycles « intern - git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 3185330994c69557718219ed1646c77775a78150 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
/*
 * Copyright 2011-2017 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

CCL_NAMESPACE_BEGIN

/* Load four consecutive floats of feature pass `pass` with an unaligned SSE load.
 * The expansion refers to `buffer` and `pass_stride` by name, so both must be visible
 * wherever the macro is used; the pass is located at buffer + pass*pass_stride. */
#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride)

/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), four at a time.
 * pixel_buffer always points to the first of the four current pixels in the first pass.
 * x4 and y4 contain the coordinates of the four pixels; active_pixels is a mask that is set
 * for every lane whose x coordinate is still inside the window (x4 < high.x).
 *
 * The macro uses buffer, buffer_w, rect, low, high, pixel and pixel_buffer by name, so they must be
 * declared by the surrounding code. It leaves both loop bodies open; END_FOR_PIXEL_WINDOW_SSE closes them. */

#define FOR_PIXEL_WINDOW_SSE     pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
                                 for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
                                     __m128 y4 = _mm_set1_ps(pixel.y); \
                                     for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
                                         __m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \
                                         __m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x));

/* Closes the two loops opened by FOR_PIXEL_WINDOW_SSE.
 * Between the inner and the outer closing brace, pixel_buffer is moved to the start of the next row:
 * the inner loop has already advanced it by (pixel.x - low.x) elements, so only the rest of buffer_w is added. */
#define END_FOR_PIXEL_WINDOW_SSE     } \
                                     pixel_buffer += buffer_w - (pixel.x - low.x); \
                                 }

/* Assemble the feature vector for four neighbouring pixels at once.
 *
 * x, y           - coordinates of the four pixels, stored to features[0] and features[1].
 * active_pixels  - lane mask handed to _mm_mask_ps for every feature
 *                  (presumably clears the lanes outside the window - confirm against _mm_mask_ps).
 * buffer         - points at the first of the four pixels in pass 0; pass p is read at buffer + p*pass_stride.
 * features       - output; entries 0..9 are written, then entries 0..DENOISE_FEATURES-1 are post-processed.
 * mean           - optional per-feature mean; when non-NULL it is subtracted from each feature.
 * pass_stride    - offset in floats between consecutive passes. */
ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y,
                                               __m128 active_pixels,
                                               const float *ccl_restrict buffer,
                                               __m128 *features,
                                               const __m128 *ccl_restrict mean,
                                               int pass_stride)
{
	/* The first two slots carry the pixel coordinates. */
	features[0] = x;
	features[1] = y;

	/* Slot 2 holds the magnitude of pass 0; slots 3..9 hold passes 1..7 unchanged. */
	features[2] = _mm_fabs_ps(ccl_get_feature_sse(0));
	for(int pass = 1; pass <= 7; pass++) {
		features[pass + 2] = ccl_get_feature_sse(pass);
	}

	/* Center each feature around its mean (if one was given) and apply the lane mask. */
	for(int i = 0; i < DENOISE_FEATURES; i++) {
		__m128 value = features[i];
		if(mean) {
			value = _mm_sub_ps(value, mean[i]);
		}
		features[i] = _mm_mask_ps(value, active_pixels);
	}
}

/* Measure, per lane, how far the features of four neighbouring pixels lie from their means.
 *
 * x, y           - coordinates of the four pixels.
 * active_pixels  - lane mask applied to every result through _mm_mask_ps.
 * buffer         - points at the first of the four pixels in pass 0; pass p is read at buffer + p*pass_stride.
 * scales         - output, six entries:
 *                    [0], [1]  |x - mean[0]| and |y - mean[1]|
 *                    [2]       | |pass 0| - mean[2] |
 *                    [3]       squared distance of passes 1..3 from mean[3..5]
 *                    [4]       |pass 4 - mean[6]|
 *                    [5]       squared distance of passes 5..7 from mean[7..9]
 * mean           - ten per-feature means; it is dereferenced unconditionally, so it must not be NULL.
 * pass_stride    - offset in floats between consecutive passes. */
ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y,
                                                     __m128 active_pixels,
                                                     const float *ccl_restrict buffer,
                                                     __m128 *scales,
                                                     const __m128 *ccl_restrict mean,
                                                     int pass_stride)
{
	/* Pixel coordinates: absolute offset from the mean position. */
	const __m128 abs_dx = _mm_fabs_ps(_mm_sub_ps(x, mean[0]));
	const __m128 abs_dy = _mm_fabs_ps(_mm_sub_ps(y, mean[1]));
	scales[0] = _mm_mask_ps(abs_dx, active_pixels);
	scales[1] = _mm_mask_ps(abs_dy, active_pixels);

	/* Pass 0 is compared by magnitude, matching how filter_get_features_sse stores it. */
	const __m128 pass0_abs = _mm_fabs_ps(ccl_get_feature_sse(0));
	const __m128 pass0_dev = _mm_fabs_ps(_mm_sub_ps(pass0_abs, mean[2]));
	scales[2] = _mm_mask_ps(pass0_dev, active_pixels);

	/* Passes 1..3 are treated as one three-component quantity: sum of squared differences. */
	const __m128 d1 = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]);
	const __m128 d2 = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]);
	const __m128 d3 = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]);
	__m128 first_sq = _mm_mul_ps(d1, d1);
	first_sq = _mm_add_ps(first_sq, _mm_mul_ps(d2, d2));
	first_sq = _mm_add_ps(first_sq, _mm_mul_ps(d3, d3));
	scales[3] = _mm_mask_ps(first_sq, active_pixels);

	/* Pass 4 is a single component: absolute difference. */
	const __m128 pass4_dev = _mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6]));
	scales[4] = _mm_mask_ps(pass4_dev, active_pixels);

	/* Passes 5..7 form the second three-component quantity. */
	const __m128 d5 = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]);
	const __m128 d6 = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]);
	const __m128 d7 = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]);
	__m128 second_sq = _mm_mul_ps(d5, d5);
	second_sq = _mm_add_ps(second_sq, _mm_mul_ps(d6, d6));
	second_sq = _mm_add_ps(second_sq, _mm_mul_ps(d7, d7));
	scales[5] = _mm_mask_ps(second_sq, active_pixels);
}

/* Turn the six accumulated feature extents into ten per-feature scale factors, in place.
 *
 * On entry scale[0..5] hold the values laid out as in filter_get_feature_scales_sse
 * (entries 3 and 5 being sums of squares). On exit scale[0..9] hold, for every feature slot,
 * the approximate reciprocal (_mm_rcp_ps) of the extent, where the extent is first reduced with
 * _mm_hmax_ps (presumably a horizontal maximum over the four lanes - confirm against its definition)
 * and clamped to at least 0.01 so the reciprocal stays bounded.
 *
 * Mapping of inputs to outputs:
 *   in[0] -> out[0]      in[1] -> out[1]      in[2] -> out[2]
 *   sqrt(in[3]) -> out[3], out[4], out[5]
 *   in[4] -> out[6]
 *   sqrt(in[5]) -> out[7], out[8], out[9] */
ccl_device_inline void filter_calculate_scale_sse(__m128 *scale)
{
	const __m128 min_extent = _mm_set1_ps(0.01f);

	/* Evaluate everything from the incoming values first: outputs 3..6 reuse the
	 * storage of inputs 3..5, so nothing is written until all inputs have been consumed. */
	const __m128 inv0 = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), min_extent));
	const __m128 inv1 = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), min_extent));
	const __m128 inv2 = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), min_extent));
	const __m128 inv_single = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), min_extent));

	/* The two three-component groups were accumulated as squared lengths; take the root before reducing. */
	const __m128 inv_second_group = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), min_extent));
	const __m128 inv_first_group = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), min_extent));

	scale[0] = inv0;
	scale[1] = inv1;
	scale[2] = inv2;

	scale[3] = inv_first_group;
	scale[4] = inv_first_group;
	scale[5] = inv_first_group;

	scale[6] = inv_single;

	scale[7] = inv_second_group;
	scale[8] = inv_second_group;
	scale[9] = inv_second_group;
}


CCL_NAMESPACE_END