diff options
Diffstat (limited to 'intern/cycles/kernel/filter/filter_features_sse.h')
-rw-r--r-- | intern/cycles/kernel/filter/filter_features_sse.h | 105 |
1 files changed, 105 insertions, 0 deletions
diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h new file mode 100644 index 00000000000..3185330994c --- /dev/null +++ b/intern/cycles/kernel/filter/filter_features_sse.h @@ -0,0 +1,105 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride) + +/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time. + * pixel_buffer always points to the first of the 4 current pixel in the first pass. + * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. */ + +#define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \ + for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ + __m128 y4 = _mm_set1_ps(pixel.y); \ + for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \ + __m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \ + __m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x)); + +#define END_FOR_PIXEL_WINDOW_SSE } \ + pixel_buffer += buffer_w - (pixel.x - low.x); \ + } + +ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, + __m128 active_pixels, + const float *ccl_restrict buffer, + __m128 *features, + const __m128 *ccl_restrict mean, + int pass_stride) +{ + features[0] = x; + features[1] = y; + features[2] = _mm_fabs_ps(ccl_get_feature_sse(0)); + features[3] = ccl_get_feature_sse(1); + features[4] = ccl_get_feature_sse(2); + features[5] = ccl_get_feature_sse(3); + features[6] = ccl_get_feature_sse(4); + features[7] = ccl_get_feature_sse(5); + features[8] = ccl_get_feature_sse(6); + features[9] = ccl_get_feature_sse(7); + if(mean) { + for(int i = 0; i < DENOISE_FEATURES; i++) + features[i] = _mm_sub_ps(features[i], mean[i]); + } + for(int i = 0; i < DENOISE_FEATURES; i++) + features[i] = _mm_mask_ps(features[i], active_pixels); +} + +ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, + __m128 active_pixels, + const float *ccl_restrict buffer, + __m128 *scales, + const __m128 *ccl_restrict mean, + int pass_stride) +{ + scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels); + scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels); + + scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(_mm_fabs_ps(ccl_get_feature_sse(0)), mean[2])), active_pixels); + + __m128 diff, scale; + diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]); + scale = _mm_mul_ps(diff, diff); + diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]); + scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); + diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]); + scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); + scales[3] = _mm_mask_ps(scale, active_pixels); + + scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels); + + diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]); + scale = _mm_mul_ps(diff, diff); + diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]); + scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); + diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]); + scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff)); + scales[5] = _mm_mask_ps(scale, active_pixels); +} + +ccl_device_inline void filter_calculate_scale_sse(__m128 *scale) +{ + scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f))); + scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f))); + scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f))); + scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f))); + + scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f))); + scale[3] = scale[4] = scale[5] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f))); +} + + +CCL_NAMESPACE_END |