diff options
author | Stephen Erisman <serisman@serisman.com> | 2011-06-15 09:56:10 +0400 |
---|---|---|
committer | Stephen Erisman <serisman@serisman.com> | 2011-06-15 09:56:10 +0400 |
commit | 2fd11d6e001ed3290caea18a9bd8d814c61ff3a6 (patch) | |
tree | b373389553d576f862e9c2f11278f5b6341a1f52 | |
parent | 5a4c2b157226ebeefbc79c0502f3e2caf1c691e8 (diff) |
libfreerdp-rfx: SSE2 Optimizations for iDWT Decode (round 2)
-rw-r--r-- | libfreerdp-rfx/sse/rfx_sse2.c | 111 |
1 file changed, 79 insertions, 32 deletions
diff --git a/libfreerdp-rfx/sse/rfx_sse2.c b/libfreerdp-rfx/sse/rfx_sse2.c index 642d725..b94ac75 100644 --- a/libfreerdp-rfx/sse/rfx_sse2.c +++ b/libfreerdp-rfx/sse/rfx_sse2.c @@ -128,13 +128,13 @@ void rfx_quantization_decode_SSE2(sint16 * buffer, const uint32 * quantization_v rfx_quantization_decode_block_SSE2(buffer + 3840, 64, quantization_values[2]); // HL3 rfx_quantization_decode_block_SSE2(buffer + 3904, 64, quantization_values[1]); // LH3 rfx_quantization_decode_block_SSE2(buffer + 3868, 64, quantization_values[3]); // HH3 - rfx_quantization_decode_block_SSE2(buffer + 4032, 64, quantization_values[0]); // LL3 + rfx_quantization_decode_block_SSE2(buffer + 4032, 64, quantization_values[0]); // LL3 } static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) rfx_dwt_2d_decode_block_horiz_SSE2(sint16 * l, sint16 * h, sint16 * dst, int subband_width) { - int y, n; + int y, n; sint16 * l_ptr = l; sint16 * h_ptr = h; sint16 * dst_ptr = dst; @@ -166,7 +166,7 @@ rfx_dwt_2d_decode_block_horiz_SSE2(sint16 * l, sint16 * h, sint16 * dst, int sub l_ptr+=8; h_ptr+=8; - } + } l_ptr -= subband_width; h_ptr -= subband_width; @@ -206,19 +206,87 @@ rfx_dwt_2d_decode_block_horiz_SSE2(sint16 * l, sint16 * h, sint16 * dst, int sub } static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +rfx_dwt_2d_decode_block_vert_SSE2(sint16 * l, sint16 * h, sint16 * dst, int subband_width) +{ + int x, n; + sint16 * l_ptr = l; + sint16 * h_ptr = h; + sint16 * dst_ptr = dst; + + int total_width = subband_width * subband_width; + + /* Even coefficients */ + for (n = 0; n < subband_width; n++) + { + for (x = 0; x < total_width; x+=8) + { + // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); + + __m128i l_n = _mm_load_si128((__m128i*) l_ptr); + __m128i h_n = _mm_load_si128((__m128i*) h_ptr); + + __m128i tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1));; + if (n == 0) + tmp_n = _mm_add_epi16(tmp_n, h_n); + else + { + __m128i h_n_m = 
_mm_loadu_si128((__m128i*) (h_ptr - total_width)); + tmp_n = _mm_add_epi16(tmp_n, h_n_m); + } + tmp_n = _mm_srai_epi16(tmp_n, 1); + + __m128i dst_n = _mm_sub_epi16(l_n, tmp_n); + _mm_store_si128((__m128i*) dst_ptr, dst_n); + + l_ptr+=8; + h_ptr+=8; + dst_ptr+=8; + } + dst_ptr+=total_width; + } + + h_ptr = h; + dst_ptr = dst + total_width; + + /* Odd coefficients */ + for (n = 0; n < subband_width; n++) + { + for (x = 0; x < total_width; x+=8) + { + // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); + + __m128i h_n = _mm_load_si128((__m128i*) h_ptr); + __m128i dst_n_m = _mm_load_si128((__m128i*) (dst_ptr - total_width)); + h_n = _mm_slli_epi16(h_n, 1); + + __m128i tmp_n = dst_n_m; + if (n == subband_width - 1) + tmp_n = _mm_add_epi16(tmp_n, dst_n_m); + else + { + __m128i dst_n_p = _mm_loadu_si128((__m128i*) (dst_ptr + total_width)); + tmp_n = _mm_add_epi16(tmp_n, dst_n_p); + } + tmp_n = _mm_srai_epi16(tmp_n, 1); + + __m128i dst_n = _mm_add_epi16(tmp_n, h_n); + _mm_store_si128((__m128i*) dst_ptr, dst_n); + + h_ptr+=8; + dst_ptr+=8; + } + dst_ptr+=total_width; + } +} + +static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) rfx_dwt_2d_decode_block_SSE2(sint16 * buffer, sint16 * idwt, int subband_width) { - sint16 * dst, * l, * h; - sint16 * l_dst, * h_dst; sint16 * hl, * lh, * hh, * ll; - int total_width; - int x, y; - int n; + sint16 * l_dst, * h_dst; _mm_prefetch_buffer((char *) idwt, subband_width * 4 * sizeof(sint16)); - total_width = subband_width + subband_width; - /* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt. */ /* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */ /* The lower part L uses LL(3) and HL(0). */ @@ -237,28 +305,7 @@ rfx_dwt_2d_decode_block_SSE2(sint16 * buffer, sint16 * idwt, int subband_width) rfx_dwt_2d_decode_block_horiz_SSE2(lh, hh, h_dst, subband_width); /* Inverse DWT in vertical direction, results are stored in original buffer. 
*/ - for (x = 0; x < total_width; x++) - { - /* Even coefficients */ - for (n = 0; n < subband_width; n++) - { - y = n << 1; - dst = buffer + y * total_width + x; - l = idwt + n * total_width + x; - h = l + subband_width * total_width; - dst[0] = *l - (((n > 0 ? *(h - total_width) : *h) + (*h) + 1) >> 1); - } - - /* Odd coefficients */ - for (n = 0; n < subband_width; n++) - { - y = n << 1; - dst = buffer + y * total_width + x; - l = idwt + n * total_width + x; - h = l + subband_width * total_width; - dst[total_width] = (*h << 1) + ((dst[0] + dst[n < subband_width - 1 ? 2 * total_width : 0]) >> 1); - } - } + rfx_dwt_2d_decode_block_vert_SSE2(l_dst, h_dst, buffer, subband_width); } void rfx_dwt_2d_decode_SSE2(sint16 * buffer, sint16 * dwt_buffer_8, sint16 * dwt_buffer_16, sint16 * dwt_buffer_32) |