diff options
Diffstat (limited to 'intern/cycles/kernel/filter/filter_features_sse.h')
-rw-r--r-- | intern/cycles/kernel/filter/filter_features_sse.h | 129 |
1 files changed, 67 insertions, 62 deletions
diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h index 5dd001ffb93..1e0d6e93453 100644 --- a/intern/cycles/kernel/filter/filter_features_sse.h +++ b/intern/cycles/kernel/filter/filter_features_sse.h @@ -22,22 +22,27 @@ CCL_NAMESPACE_BEGIN * pixel_buffer always points to the first of the 4 current pixel in the first pass. * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. * Repeat the loop for every secondary frame if there are any. */ -#define FOR_PIXEL_WINDOW_SSE for(int frame = 0; frame < tile_info->num_frames; frame++) { \ - pixel.z = tile_info->frames[frame]; \ - pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x) + frame*frame_stride; \ - float4 t4 = make_float4(pixel.z); \ - for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ - float4 y4 = make_float4(pixel.y); \ - for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \ - float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \ - int4 active_pixels = x4 < make_float4(high.x); +#define FOR_PIXEL_WINDOW_SSE \ + for (int frame = 0; frame < tile_info->num_frames; frame++) { \ + pixel.z = tile_info->frames[frame]; \ + pixel_buffer = buffer + (low.y - rect.y) * buffer_w + (low.x - rect.x) + \ + frame * frame_stride; \ + float4 t4 = make_float4(pixel.z); \ + for (pixel.y = low.y; pixel.y < high.y; pixel.y++) { \ + float4 y4 = make_float4(pixel.y); \ + for (pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \ + float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \ + int4 active_pixels = x4 < make_float4(high.x); -#define END_FOR_PIXEL_WINDOW_SSE } \ - pixel_buffer += buffer_w - (high.x - low.x); \ - } \ - } +#define END_FOR_PIXEL_WINDOW_SSE \ + } \ + pixel_buffer += buffer_w - (high.x - low.x); \ + } \ + } -ccl_device_inline void filter_get_features_sse(float4 x, float4 y, float4 t, +ccl_device_inline void filter_get_features_sse(float4 x, + float4 y, + float4 t, int4 active_pixels, const float *ccl_restrict buffer, float4 *features, @@ -45,33 +50,35 @@ ccl_device_inline void filter_get_features_sse(float4 x, float4 y, float4 t, const float4 *ccl_restrict mean, int pass_stride) { - int num_features = use_time? 11 : 10; + int num_features = use_time ? 11 : 10; - features[0] = x; - features[1] = y; - features[2] = fabs(ccl_get_feature_sse(0)); - features[3] = ccl_get_feature_sse(1); - features[4] = ccl_get_feature_sse(2); - features[5] = ccl_get_feature_sse(3); - features[6] = ccl_get_feature_sse(4); - features[7] = ccl_get_feature_sse(5); - features[8] = ccl_get_feature_sse(6); - features[9] = ccl_get_feature_sse(7); - if(use_time) { - features[10] = t; - } + features[0] = x; + features[1] = y; + features[2] = fabs(ccl_get_feature_sse(0)); + features[3] = ccl_get_feature_sse(1); + features[4] = ccl_get_feature_sse(2); + features[5] = ccl_get_feature_sse(3); + features[6] = ccl_get_feature_sse(4); + features[7] = ccl_get_feature_sse(5); + features[8] = ccl_get_feature_sse(6); + features[9] = ccl_get_feature_sse(7); + if (use_time) { + features[10] = t; + } - if(mean) { - for(int i = 0; i < num_features; i++) { - features[i] = features[i] - mean[i]; - } - } - for(int i = 0; i < num_features; i++) { - features[i] = mask(active_pixels, features[i]); - } + if (mean) { + for (int i = 0; i < num_features; i++) { + features[i] = features[i] - mean[i]; + } + } + for (int i = 0; i < num_features; i++) { + features[i] = mask(active_pixels, features[i]); + } } -ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, float4 t, +ccl_device_inline void filter_get_feature_scales_sse(float4 x, + float4 y, + float4 t, int4 active_pixels, const float *ccl_restrict buffer, float4 *scales, @@ -79,36 +86,34 @@ ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y, float4 const float4 *ccl_restrict mean, int pass_stride) { - scales[0] = fabs(x - mean[0]); - scales[1] = fabs(y - mean[1]); - scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]); - scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) + - sqr(ccl_get_feature_sse(2) - mean[4]) + - sqr(ccl_get_feature_sse(3) - mean[5]); - scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]); - scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) + - sqr(ccl_get_feature_sse(6) - mean[8]) + - sqr(ccl_get_feature_sse(7) - mean[9]); - if(use_time) { - scales[6] = fabs(t - mean[10]); - } + scales[0] = fabs(x - mean[0]); + scales[1] = fabs(y - mean[1]); + scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]); + scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) + sqr(ccl_get_feature_sse(2) - mean[4]) + + sqr(ccl_get_feature_sse(3) - mean[5]); + scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]); + scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) + sqr(ccl_get_feature_sse(6) - mean[8]) + + sqr(ccl_get_feature_sse(7) - mean[9]); + if (use_time) { + scales[6] = fabs(t - mean[10]); + } - for(int i = 0; i < (use_time? 7 : 6); i++) - scales[i] = mask(active_pixels, scales[i]); + for (int i = 0; i < (use_time ? 7 : 6); i++) + scales[i] = mask(active_pixels, scales[i]); } ccl_device_inline void filter_calculate_scale_sse(float4 *scale, bool use_time) { - scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f))); - scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f))); - scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f))); - if(use_time) { - scale[10] = rcp(max(reduce_max(scale[6]), make_float4(0.01f)));; - } - scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f))); - scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f))); - scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f))); + scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f))); + scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f))); + scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f))); + if (use_time) { + scale[10] = rcp(max(reduce_max(scale[6]), make_float4(0.01f))); + ; + } + scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f))); + scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f))); + scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f))); } - CCL_NAMESPACE_END |