diff options
author | Vic Lee <llyzs@163.com> | 2011-06-24 12:11:44 +0400 |
---|---|---|
committer | Vic Lee <llyzs@163.com> | 2011-06-24 12:11:44 +0400 |
commit | 2a897f3418638dfa1b54646d81a42fe8bb9d0d82 (patch) | |
tree | 9f704e72481d74d875ee8efcc074f6ae3af7eacb | |
parent | a79037ca6d95aed69f01f702af0acc95d931b551 (diff) |
libfreerdp-rfx: add SSE2 for DWT encoding.
-rw-r--r-- | include/freerdp/rfx.h | 1 | ||||
-rw-r--r-- | libfreerdp-rfx/librfx.c | 1 | ||||
-rw-r--r-- | libfreerdp-rfx/rfx_encode.c | 2 | ||||
-rw-r--r-- | libfreerdp-rfx/sse/rfx_sse.c | 2 | ||||
-rw-r--r-- | libfreerdp-rfx/sse/rfx_sse2.c | 149 | ||||
-rw-r--r-- | libfreerdp-rfx/sse/rfx_sse2.h | 1 |
6 files changed, 155 insertions, 1 deletions
diff --git a/include/freerdp/rfx.h b/include/freerdp/rfx.h index 7b2fede..932f5f5 100644 --- a/include/freerdp/rfx.h +++ b/include/freerdp/rfx.h @@ -167,6 +167,7 @@ struct _RFX_CONTEXT void (* quantization_decode)(sint16 * buffer, const uint32 * quantization_values); void (* quantization_encode)(sint16 * buffer, const uint32 * quantization_values); void (* dwt_2d_decode)(sint16 * buffer, sint16 * dwt_buffer); + void (* dwt_2d_encode)(sint16 * buffer, sint16 * dwt_buffer); /* profiler definitions */ PROFILER_DEFINE(prof_rfx_decode_rgb); diff --git a/libfreerdp-rfx/librfx.c b/libfreerdp-rfx/librfx.c index 1693bfb..5d9b5a6 100644 --- a/libfreerdp-rfx/librfx.c +++ b/libfreerdp-rfx/librfx.c @@ -146,6 +146,7 @@ rfx_context_new(void) context->quantization_decode = rfx_quantization_decode; context->quantization_encode = rfx_quantization_encode; context->dwt_2d_decode = rfx_dwt_2d_decode; + context->dwt_2d_encode = rfx_dwt_2d_encode; /* detect and enable SIMD CPU acceleration */ RFX_INIT_SIMD(context); diff --git a/libfreerdp-rfx/rfx_encode.c b/libfreerdp-rfx/rfx_encode.c index ddb9d2d..cd20200 100644 --- a/libfreerdp-rfx/rfx_encode.c +++ b/libfreerdp-rfx/rfx_encode.c @@ -132,7 +132,7 @@ rfx_encode_component(RFX_CONTEXT * context, const uint32 * quantization_values, PROFILER_ENTER(context->prof_rfx_encode_component); PROFILER_ENTER(context->prof_rfx_dwt_2d_encode); - rfx_dwt_2d_encode(data, context->dwt_buffer); + context->dwt_2d_encode(data, context->dwt_buffer); PROFILER_EXIT(context->prof_rfx_dwt_2d_encode); PROFILER_ENTER(context->prof_rfx_quantization_encode); diff --git a/libfreerdp-rfx/sse/rfx_sse.c b/libfreerdp-rfx/sse/rfx_sse.c index 0407323..76a632d 100644 --- a/libfreerdp-rfx/sse/rfx_sse.c +++ b/libfreerdp-rfx/sse/rfx_sse.c @@ -33,10 +33,12 @@ void rfx_init_sse(RFX_CONTEXT * context) IF_PROFILER(context->prof_rfx_quantization_decode->name = "rfx_quantization_decode_SSE2"); IF_PROFILER(context->prof_rfx_quantization_encode->name = "rfx_quantization_encode_SSE2"); IF_PROFILER(context->prof_rfx_dwt_2d_decode->name = "rfx_dwt_2d_decode_SSE2"); + IF_PROFILER(context->prof_rfx_dwt_2d_encode->name = "rfx_dwt_2d_encode_SSE2"); context->decode_YCbCr_to_RGB = rfx_decode_YCbCr_to_RGB_SSE2; context->encode_RGB_to_YCbCr = rfx_encode_RGB_to_YCbCr_SSE2; context->quantization_decode = rfx_quantization_decode_SSE2; context->quantization_encode = rfx_quantization_encode_SSE2; context->dwt_2d_decode = rfx_dwt_2d_decode_SSE2; + context->dwt_2d_encode = rfx_dwt_2d_encode_SSE2; } diff --git a/libfreerdp-rfx/sse/rfx_sse2.c b/libfreerdp-rfx/sse/rfx_sse2.c index a215f86..cdaef30 100644 --- a/libfreerdp-rfx/sse/rfx_sse2.c +++ b/libfreerdp-rfx/sse/rfx_sse2.c @@ -423,3 +423,152 @@ rfx_dwt_2d_decode_SSE2(sint16 * buffer, sint16 * dwt_buffer) rfx_dwt_2d_decode_block_SSE2(buffer + 3072, dwt_buffer, 16); rfx_dwt_2d_decode_block_SSE2(buffer, dwt_buffer, 32); } + +static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +rfx_dwt_2d_encode_block_vert_SSE2(sint16 * src, sint16 * l, sint16 * h, int subband_width) +{ + int total_width; + int x; + int n; + __m128i src_2n; + __m128i src_2n_1; + __m128i src_2n_2; + __m128i h_n; + __m128i h_n_m; + __m128i l_n; + + total_width = subband_width << 1; + + for (n = 0; n < subband_width; n++) + { + for (x = 0; x < total_width; x += 8) + { + src_2n = _mm_load_si128((__m128i*) src); + src_2n_1 = _mm_load_si128((__m128i*) (src + total_width)); + if (n < subband_width - 1) + src_2n_2 = _mm_load_si128((__m128i*) (src + 2 * total_width)); + else + src_2n_2 = src_2n_1; + + /* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */ + + h_n = _mm_add_epi16(src_2n, src_2n_2); + h_n = _mm_srai_epi16(h_n, 1); + h_n = _mm_sub_epi16(src_2n_1, h_n); + h_n = _mm_srai_epi16(h_n, 1); + + _mm_store_si128((__m128i*) h, h_n); + + if (n == 0) + h_n_m = h_n; + else + h_n_m = _mm_load_si128((__m128i*) (h - total_width)); + + /* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */ + + l_n = _mm_add_epi16(h_n_m, h_n); + l_n = _mm_srai_epi16(l_n, 1); + l_n = _mm_add_epi16(l_n, src_2n); + + _mm_store_si128((__m128i*) l, l_n); + + src += 8; + l += 8; + h += 8; + } + src += total_width; + } +} + +static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +rfx_dwt_2d_encode_block_horiz_SSE2(sint16 * src, sint16 * l, sint16 * h, int subband_width) +{ + int y; + int n; + int first; + __m128i src_2n; + __m128i src_2n_1; + __m128i src_2n_2; + __m128i h_n; + __m128i h_n_m; + __m128i l_n; + + for (y = 0; y < subband_width; y++) + { + for (n = 0; n < subband_width; n += 8) + { + /* The following 3 Set operations consumes more than half of the total DWT processing time! */ + src_2n = _mm_set_epi16(src[14], src[12], src[10], src[8], src[6], src[4], src[2], src[0]); + src_2n_1 = _mm_set_epi16(src[15], src[13], src[11], src[9], src[7], src[5], src[3], src[1]); + src_2n_2 = _mm_set_epi16(n == subband_width - 8 ? src[15] : src[16], + src[14], src[12], src[10], src[8], src[6], src[4], src[2]); + + /* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */ + + h_n = _mm_add_epi16(src_2n, src_2n_2); + h_n = _mm_srai_epi16(h_n, 1); + h_n = _mm_sub_epi16(src_2n_1, h_n); + h_n = _mm_srai_epi16(h_n, 1); + + _mm_store_si128((__m128i*) h, h_n); + + h_n_m = _mm_loadu_si128((__m128i*) (h - 1)); + if (n == 0) + { + first = _mm_extract_epi16(h_n_m, 1); + h_n_m = _mm_insert_epi16(h_n_m, first, 0); + } + + /* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */ + + l_n = _mm_add_epi16(h_n_m, h_n); + l_n = _mm_srai_epi16(l_n, 1); + l_n = _mm_add_epi16(l_n, src_2n); + + _mm_store_si128((__m128i*) l, l_n); + + src += 16; + l += 8; + h += 8; + } + } +} + +static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +rfx_dwt_2d_encode_block_SSE2(sint16 * buffer, sint16 * dwt, int subband_width) +{ + sint16 * hl, * lh, * hh, * ll; + sint16 * l_src, * h_src; + + _mm_prefetch_buffer((char *) dwt, subband_width * 4 * sizeof(sint16)); + + /* DWT in vertical direction, results in 2 sub-bands in L, H order in tmp buffer dwt. */ + + l_src = dwt; + h_src = dwt + subband_width * subband_width * 2; + + rfx_dwt_2d_encode_block_vert_SSE2(buffer, l_src, h_src, subband_width); + + /* DWT in horizontal direction, results in 4 sub-bands in HL(0), LH(1), HH(2), LL(3) order, stored in original buffer. */ + /* The lower part L generates LL(3) and HL(0). */ + /* The higher part H generates LH(1) and HH(2). */ + + ll = buffer + subband_width * subband_width * 3; + hl = buffer; + + lh = buffer + subband_width * subband_width; + hh = buffer + subband_width * subband_width * 2; + + rfx_dwt_2d_encode_block_horiz_SSE2(l_src, ll, hl, subband_width); + rfx_dwt_2d_encode_block_horiz_SSE2(h_src, lh, hh, subband_width); +} + +void +rfx_dwt_2d_encode_SSE2(sint16 * buffer, sint16 * dwt_buffer) +{ + _mm_prefetch_buffer((char *) buffer, 4096 * sizeof(sint16)); + + rfx_dwt_2d_encode_block_SSE2(buffer, dwt_buffer, 32); + rfx_dwt_2d_encode_block_SSE2(buffer + 3072, dwt_buffer, 16); + rfx_dwt_2d_encode_block_SSE2(buffer + 3840, dwt_buffer, 8); +} diff --git a/libfreerdp-rfx/sse/rfx_sse2.h b/libfreerdp-rfx/sse/rfx_sse2.h index 0fcf604..85921da 100644 --- a/libfreerdp-rfx/sse/rfx_sse2.h +++ b/libfreerdp-rfx/sse/rfx_sse2.h @@ -27,5 +27,6 @@ void rfx_encode_RGB_to_YCbCr_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sin void rfx_quantization_decode_SSE2(sint16 * buffer, const uint32 * quantization_values); void rfx_quantization_encode_SSE2(sint16 * buffer, const uint32 * quantization_values); void rfx_dwt_2d_decode_SSE2(sint16 * buffer, sint16 * dwt_buffer); +void rfx_dwt_2d_encode_SSE2(sint16 * buffer, sint16 * dwt_buffer); #endif /* __RFX_SSE2_H */ |