Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/FreeRDP/FreeRDP-old.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorVic Lee <llyzs@163.com>2011-06-24 12:11:44 +0400
committerVic Lee <llyzs@163.com>2011-06-24 12:11:44 +0400
commit2a897f3418638dfa1b54646d81a42fe8bb9d0d82 (patch)
tree9f704e72481d74d875ee8efcc074f6ae3af7eacb
parenta79037ca6d95aed69f01f702af0acc95d931b551 (diff)
libfreerdp-rfx: add SSE2 for DWT encoding.
-rw-r--r--include/freerdp/rfx.h1
-rw-r--r--libfreerdp-rfx/librfx.c1
-rw-r--r--libfreerdp-rfx/rfx_encode.c2
-rw-r--r--libfreerdp-rfx/sse/rfx_sse.c2
-rw-r--r--libfreerdp-rfx/sse/rfx_sse2.c149
-rw-r--r--libfreerdp-rfx/sse/rfx_sse2.h1
6 files changed, 155 insertions, 1 deletions
diff --git a/include/freerdp/rfx.h b/include/freerdp/rfx.h
index 7b2fede..932f5f5 100644
--- a/include/freerdp/rfx.h
+++ b/include/freerdp/rfx.h
@@ -167,6 +167,7 @@ struct _RFX_CONTEXT
void (* quantization_decode)(sint16 * buffer, const uint32 * quantization_values);
void (* quantization_encode)(sint16 * buffer, const uint32 * quantization_values);
void (* dwt_2d_decode)(sint16 * buffer, sint16 * dwt_buffer);
+ void (* dwt_2d_encode)(sint16 * buffer, sint16 * dwt_buffer);
/* profiler definitions */
PROFILER_DEFINE(prof_rfx_decode_rgb);
diff --git a/libfreerdp-rfx/librfx.c b/libfreerdp-rfx/librfx.c
index 1693bfb..5d9b5a6 100644
--- a/libfreerdp-rfx/librfx.c
+++ b/libfreerdp-rfx/librfx.c
@@ -146,6 +146,7 @@ rfx_context_new(void)
context->quantization_decode = rfx_quantization_decode;
context->quantization_encode = rfx_quantization_encode;
context->dwt_2d_decode = rfx_dwt_2d_decode;
+ context->dwt_2d_encode = rfx_dwt_2d_encode;
/* detect and enable SIMD CPU acceleration */
RFX_INIT_SIMD(context);
diff --git a/libfreerdp-rfx/rfx_encode.c b/libfreerdp-rfx/rfx_encode.c
index ddb9d2d..cd20200 100644
--- a/libfreerdp-rfx/rfx_encode.c
+++ b/libfreerdp-rfx/rfx_encode.c
@@ -132,7 +132,7 @@ rfx_encode_component(RFX_CONTEXT * context, const uint32 * quantization_values,
PROFILER_ENTER(context->prof_rfx_encode_component);
PROFILER_ENTER(context->prof_rfx_dwt_2d_encode);
- rfx_dwt_2d_encode(data, context->dwt_buffer);
+ context->dwt_2d_encode(data, context->dwt_buffer);
PROFILER_EXIT(context->prof_rfx_dwt_2d_encode);
PROFILER_ENTER(context->prof_rfx_quantization_encode);
diff --git a/libfreerdp-rfx/sse/rfx_sse.c b/libfreerdp-rfx/sse/rfx_sse.c
index 0407323..76a632d 100644
--- a/libfreerdp-rfx/sse/rfx_sse.c
+++ b/libfreerdp-rfx/sse/rfx_sse.c
@@ -33,10 +33,12 @@ void rfx_init_sse(RFX_CONTEXT * context)
IF_PROFILER(context->prof_rfx_quantization_decode->name = "rfx_quantization_decode_SSE2");
IF_PROFILER(context->prof_rfx_quantization_encode->name = "rfx_quantization_encode_SSE2");
IF_PROFILER(context->prof_rfx_dwt_2d_decode->name = "rfx_dwt_2d_decode_SSE2");
+ IF_PROFILER(context->prof_rfx_dwt_2d_encode->name = "rfx_dwt_2d_encode_SSE2");
context->decode_YCbCr_to_RGB = rfx_decode_YCbCr_to_RGB_SSE2;
context->encode_RGB_to_YCbCr = rfx_encode_RGB_to_YCbCr_SSE2;
context->quantization_decode = rfx_quantization_decode_SSE2;
context->quantization_encode = rfx_quantization_encode_SSE2;
context->dwt_2d_decode = rfx_dwt_2d_decode_SSE2;
+ context->dwt_2d_encode = rfx_dwt_2d_encode_SSE2;
}
diff --git a/libfreerdp-rfx/sse/rfx_sse2.c b/libfreerdp-rfx/sse/rfx_sse2.c
index a215f86..cdaef30 100644
--- a/libfreerdp-rfx/sse/rfx_sse2.c
+++ b/libfreerdp-rfx/sse/rfx_sse2.c
@@ -423,3 +423,152 @@ rfx_dwt_2d_decode_SSE2(sint16 * buffer, sint16 * dwt_buffer)
rfx_dwt_2d_decode_block_SSE2(buffer + 3072, dwt_buffer, 16);
rfx_dwt_2d_decode_block_SSE2(buffer, dwt_buffer, 32);
}
+
+static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+rfx_dwt_2d_encode_block_vert_SSE2(sint16 * src, sint16 * l, sint16 * h, int subband_width)
+{
+ int total_width;
+ int x;
+ int n;
+ __m128i src_2n;
+ __m128i src_2n_1;
+ __m128i src_2n_2;
+ __m128i h_n;
+ __m128i h_n_m;
+ __m128i l_n;
+
+ total_width = subband_width << 1;
+
+ for (n = 0; n < subband_width; n++)
+ {
+ for (x = 0; x < total_width; x += 8)
+ {
+ src_2n = _mm_load_si128((__m128i*) src);
+ src_2n_1 = _mm_load_si128((__m128i*) (src + total_width));
+ if (n < subband_width - 1)
+ src_2n_2 = _mm_load_si128((__m128i*) (src + 2 * total_width));
+ else
+ src_2n_2 = src_2n_1;
+
+ /* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
+
+ h_n = _mm_add_epi16(src_2n, src_2n_2);
+ h_n = _mm_srai_epi16(h_n, 1);
+ h_n = _mm_sub_epi16(src_2n_1, h_n);
+ h_n = _mm_srai_epi16(h_n, 1);
+
+ _mm_store_si128((__m128i*) h, h_n);
+
+ if (n == 0)
+ h_n_m = h_n;
+ else
+ h_n_m = _mm_load_si128((__m128i*) (h - total_width));
+
+ /* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
+
+ l_n = _mm_add_epi16(h_n_m, h_n);
+ l_n = _mm_srai_epi16(l_n, 1);
+ l_n = _mm_add_epi16(l_n, src_2n);
+
+ _mm_store_si128((__m128i*) l, l_n);
+
+ src += 8;
+ l += 8;
+ h += 8;
+ }
+ src += total_width;
+ }
+}
+
+static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+rfx_dwt_2d_encode_block_horiz_SSE2(sint16 * src, sint16 * l, sint16 * h, int subband_width)
+{
+ int y;
+ int n;
+ int first;
+ __m128i src_2n;
+ __m128i src_2n_1;
+ __m128i src_2n_2;
+ __m128i h_n;
+ __m128i h_n_m;
+ __m128i l_n;
+
+ for (y = 0; y < subband_width; y++)
+ {
+ for (n = 0; n < subband_width; n += 8)
+ {
+ /* The following 3 Set operations consumes more than half of the total DWT processing time! */
+ src_2n = _mm_set_epi16(src[14], src[12], src[10], src[8], src[6], src[4], src[2], src[0]);
+ src_2n_1 = _mm_set_epi16(src[15], src[13], src[11], src[9], src[7], src[5], src[3], src[1]);
+ src_2n_2 = _mm_set_epi16(n == subband_width - 8 ? src[15] : src[16],
+ src[14], src[12], src[10], src[8], src[6], src[4], src[2]);
+
+ /* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
+
+ h_n = _mm_add_epi16(src_2n, src_2n_2);
+ h_n = _mm_srai_epi16(h_n, 1);
+ h_n = _mm_sub_epi16(src_2n_1, h_n);
+ h_n = _mm_srai_epi16(h_n, 1);
+
+ _mm_store_si128((__m128i*) h, h_n);
+
+ h_n_m = _mm_loadu_si128((__m128i*) (h - 1));
+ if (n == 0)
+ {
+ first = _mm_extract_epi16(h_n_m, 1);
+ h_n_m = _mm_insert_epi16(h_n_m, first, 0);
+ }
+
+ /* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
+
+ l_n = _mm_add_epi16(h_n_m, h_n);
+ l_n = _mm_srai_epi16(l_n, 1);
+ l_n = _mm_add_epi16(l_n, src_2n);
+
+ _mm_store_si128((__m128i*) l, l_n);
+
+ src += 16;
+ l += 8;
+ h += 8;
+ }
+ }
+}
+
+static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+rfx_dwt_2d_encode_block_SSE2(sint16 * buffer, sint16 * dwt, int subband_width)
+{
+ sint16 * hl, * lh, * hh, * ll;
+ sint16 * l_src, * h_src;
+
+ _mm_prefetch_buffer((char *) dwt, subband_width * 4 * sizeof(sint16));
+
+ /* DWT in vertical direction, results in 2 sub-bands in L, H order in tmp buffer dwt. */
+
+ l_src = dwt;
+ h_src = dwt + subband_width * subband_width * 2;
+
+ rfx_dwt_2d_encode_block_vert_SSE2(buffer, l_src, h_src, subband_width);
+
+ /* DWT in horizontal direction, results in 4 sub-bands in HL(0), LH(1), HH(2), LL(3) order, stored in original buffer. */
+ /* The lower part L generates LL(3) and HL(0). */
+ /* The higher part H generates LH(1) and HH(2). */
+
+ ll = buffer + subband_width * subband_width * 3;
+ hl = buffer;
+
+ lh = buffer + subband_width * subband_width;
+ hh = buffer + subband_width * subband_width * 2;
+
+ rfx_dwt_2d_encode_block_horiz_SSE2(l_src, ll, hl, subband_width);
+ rfx_dwt_2d_encode_block_horiz_SSE2(h_src, lh, hh, subband_width);
+}
+
+void
+rfx_dwt_2d_encode_SSE2(sint16 * buffer, sint16 * dwt_buffer)
+{
+ _mm_prefetch_buffer((char *) buffer, 4096 * sizeof(sint16));
+
+ rfx_dwt_2d_encode_block_SSE2(buffer, dwt_buffer, 32);
+ rfx_dwt_2d_encode_block_SSE2(buffer + 3072, dwt_buffer, 16);
+ rfx_dwt_2d_encode_block_SSE2(buffer + 3840, dwt_buffer, 8);
+}
diff --git a/libfreerdp-rfx/sse/rfx_sse2.h b/libfreerdp-rfx/sse/rfx_sse2.h
index 0fcf604..85921da 100644
--- a/libfreerdp-rfx/sse/rfx_sse2.h
+++ b/libfreerdp-rfx/sse/rfx_sse2.h
@@ -27,5 +27,6 @@ void rfx_encode_RGB_to_YCbCr_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sin
void rfx_quantization_decode_SSE2(sint16 * buffer, const uint32 * quantization_values);
void rfx_quantization_encode_SSE2(sint16 * buffer, const uint32 * quantization_values);
void rfx_dwt_2d_decode_SSE2(sint16 * buffer, sint16 * dwt_buffer);
+void rfx_dwt_2d_encode_SSE2(sint16 * buffer, sint16 * dwt_buffer);
#endif /* __RFX_SSE2_H */