Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/FreeRDP/FreeRDP-old.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Stephen Erisman <serisman@serisman.com>, 2011-06-15 09:56:10 +0400
committer: Stephen Erisman <serisman@serisman.com>, 2011-06-15 09:56:10 +0400
commit: 2fd11d6e001ed3290caea18a9bd8d814c61ff3a6 (patch)
tree: b373389553d576f862e9c2f11278f5b6341a1f52
parent: 5a4c2b157226ebeefbc79c0502f3e2caf1c691e8 (diff)
libfreerdp-rfx: SSE2 Optimizations for iDWT Decode (round 2)
-rw-r--r-- libfreerdp-rfx/sse/rfx_sse2.c | 111
1 files changed, 79 insertions, 32 deletions
diff --git a/libfreerdp-rfx/sse/rfx_sse2.c b/libfreerdp-rfx/sse/rfx_sse2.c
index 642d725..b94ac75 100644
--- a/libfreerdp-rfx/sse/rfx_sse2.c
+++ b/libfreerdp-rfx/sse/rfx_sse2.c
@@ -128,13 +128,13 @@ void rfx_quantization_decode_SSE2(sint16 * buffer, const uint32 * quantization_v
rfx_quantization_decode_block_SSE2(buffer + 3840, 64, quantization_values[2]); // HL3
rfx_quantization_decode_block_SSE2(buffer + 3904, 64, quantization_values[1]); // LH3
rfx_quantization_decode_block_SSE2(buffer + 3868, 64, quantization_values[3]); // HH3
- rfx_quantization_decode_block_SSE2(buffer + 4032, 64, quantization_values[0]); // LL3
+ rfx_quantization_decode_block_SSE2(buffer + 4032, 64, quantization_values[0]); // LL3
}
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_horiz_SSE2(sint16 * l, sint16 * h, sint16 * dst, int subband_width)
{
- int y, n;
+ int y, n;
sint16 * l_ptr = l;
sint16 * h_ptr = h;
sint16 * dst_ptr = dst;
@@ -166,7 +166,7 @@ rfx_dwt_2d_decode_block_horiz_SSE2(sint16 * l, sint16 * h, sint16 * dst, int sub
l_ptr+=8;
h_ptr+=8;
- }
+ }
l_ptr -= subband_width;
h_ptr -= subband_width;
@@ -206,19 +206,87 @@ rfx_dwt_2d_decode_block_horiz_SSE2(sint16 * l, sint16 * h, sint16 * dst, int sub
}
+/*
+ * Vertical pass of the inverse DWT for one block, processing eight 16-bit
+ * coefficients per SSE2 operation.
+ *
+ * l             low-pass rows  (subband_width rows are read)
+ * h             high-pass rows (subband_width rows are read)
+ * dst           output; even rows are written first, then the odd rows
+ *               are derived from the even rows and h
+ * subband_width number of rows in each of l and h; every row is
+ *               2 * subband_width coefficients wide
+ *
+ * l, h and dst are accessed with aligned loads/stores (_mm_load_si128 /
+ * _mm_store_si128), so they must be 16-byte aligned and the row width must
+ * be a multiple of 8 coefficients.
+ */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+rfx_dwt_2d_decode_block_vert_SSE2(sint16 * l, sint16 * h, sint16 * dst, int subband_width)
+{
+	int x, n;
+	sint16 * l_ptr = l;
+	sint16 * h_ptr = h;
+	sint16 * dst_ptr = dst;
+
+	/*
+	 * Row stride in coefficients: the horizontal pass doubled the width, so
+	 * a row is subband_width + subband_width wide (not subband_width squared).
+	 * It is used both as the per-row loop bound and as the distance to the
+	 * neighbouring row.
+	 */
+	int total_width = subband_width + subband_width;
+
+	/* Even coefficients */
+	for (n = 0; n < subband_width; n++)
+	{
+		for (x = 0; x < total_width; x+=8)
+		{
+			// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
+
+			__m128i l_n = _mm_load_si128((__m128i*) l_ptr);
+			__m128i h_n = _mm_load_si128((__m128i*) h_ptr);
+
+			__m128i tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1));
+			if (n == 0)
+			{
+				/* Top boundary: there is no h[n-1], so h[n] is used twice. */
+				tmp_n = _mm_add_epi16(tmp_n, h_n);
+			}
+			else
+			{
+				/* h[n-1] is the same column one row up. */
+				__m128i h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - total_width));
+				tmp_n = _mm_add_epi16(tmp_n, h_n_m);
+			}
+			tmp_n = _mm_srai_epi16(tmp_n, 1);
+
+			__m128i dst_n = _mm_sub_epi16(l_n, tmp_n);
+			_mm_store_si128((__m128i*) dst_ptr, dst_n);
+
+			l_ptr+=8;
+			h_ptr+=8;
+			dst_ptr+=8;
+		}
+		/* Skip the odd output row; it is filled in by the second pass. */
+		dst_ptr+=total_width;
+	}
+
+	/* Restart at the first high-pass row and the first odd output row. */
+	h_ptr = h;
+	dst_ptr = dst + total_width;
+
+	/* Odd coefficients */
+	for (n = 0; n < subband_width; n++)
+	{
+		for (x = 0; x < total_width; x+=8)
+		{
+			// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
+
+			__m128i h_n = _mm_load_si128((__m128i*) h_ptr);
+			/* dst[2n]: the even row directly above the row being written. */
+			__m128i dst_n_m = _mm_load_si128((__m128i*) (dst_ptr - total_width));
+			h_n = _mm_slli_epi16(h_n, 1);
+
+			__m128i tmp_n = dst_n_m;
+			if (n == subband_width - 1)
+			{
+				/* Bottom boundary: there is no dst[2n + 2], so dst[2n] is used twice. */
+				tmp_n = _mm_add_epi16(tmp_n, dst_n_m);
+			}
+			else
+			{
+				/* dst[2n + 2]: the even row directly below. */
+				__m128i dst_n_p = _mm_loadu_si128((__m128i*) (dst_ptr + total_width));
+				tmp_n = _mm_add_epi16(tmp_n, dst_n_p);
+			}
+			tmp_n = _mm_srai_epi16(tmp_n, 1);
+
+			__m128i dst_n = _mm_add_epi16(tmp_n, h_n);
+			_mm_store_si128((__m128i*) dst_ptr, dst_n);
+
+			h_ptr+=8;
+			dst_ptr+=8;
+		}
+		/* Skip the even row that was already written by the first pass. */
+		dst_ptr+=total_width;
+	}
+}
+
+static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_SSE2(sint16 * buffer, sint16 * idwt, int subband_width)
{
- sint16 * dst, * l, * h;
- sint16 * l_dst, * h_dst;
sint16 * hl, * lh, * hh, * ll;
- int total_width;
- int x, y;
- int n;
+ sint16 * l_dst, * h_dst;
_mm_prefetch_buffer((char *) idwt, subband_width * 4 * sizeof(sint16));
- total_width = subband_width + subband_width;
-
/* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt. */
/* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */
/* The lower part L uses LL(3) and HL(0). */
@@ -237,28 +305,7 @@ rfx_dwt_2d_decode_block_SSE2(sint16 * buffer, sint16 * idwt, int subband_width)
rfx_dwt_2d_decode_block_horiz_SSE2(lh, hh, h_dst, subband_width);
/* Inverse DWT in vertical direction, results are stored in original buffer. */
- for (x = 0; x < total_width; x++)
- {
- /* Even coefficients */
- for (n = 0; n < subband_width; n++)
- {
- y = n << 1;
- dst = buffer + y * total_width + x;
- l = idwt + n * total_width + x;
- h = l + subband_width * total_width;
- dst[0] = *l - (((n > 0 ? *(h - total_width) : *h) + (*h) + 1) >> 1);
- }
-
- /* Odd coefficients */
- for (n = 0; n < subband_width; n++)
- {
- y = n << 1;
- dst = buffer + y * total_width + x;
- l = idwt + n * total_width + x;
- h = l + subband_width * total_width;
- dst[total_width] = (*h << 1) + ((dst[0] + dst[n < subband_width - 1 ? 2 * total_width : 0]) >> 1);
- }
- }
+ rfx_dwt_2d_decode_block_vert_SSE2(l_dst, h_dst, buffer, subband_width);
}
void rfx_dwt_2d_decode_SSE2(sint16 * buffer, sint16 * dwt_buffer_8, sint16 * dwt_buffer_16, sint16 * dwt_buffer_32)