diff options
author | Martin Fleisz <mfleisz@thinstuff.com> | 2011-06-28 14:17:01 +0400 |
---|---|---|
committer | Martin Fleisz <mfleisz@thinstuff.com> | 2011-06-28 14:17:01 +0400 |
commit | b958b1f154906982f4c96c1c19b5d4943057f1ae (patch) | |
tree | 539f27dcef14923638e3504dbb1bbbaecb5d82a2 | |
parent | 588966d52039b6ce3d8cfdebbdb36773eb720498 (diff) | |
parent | 3862f5fa79cc0de21550ad654521512322c3d8e8 (diff) |
Merge remote branch 'origin/remotefx' into remotefx
-rw-r--r-- | channels/rdpdr/devman.c | 6 | ||||
-rw-r--r-- | cunit/test_librfx.c | 2 | ||||
-rw-r--r-- | include/freerdp/rfx.h | 11 | ||||
-rw-r--r-- | libfreerdp-core/crypto/openssl.c | 8 | ||||
-rw-r--r-- | libfreerdp-gdi/decode.c | 51 | ||||
-rw-r--r-- | libfreerdp-rfx/librfx.c | 5 | ||||
-rw-r--r-- | libfreerdp-rfx/rfx_decode.c | 2 | ||||
-rw-r--r-- | libfreerdp-rfx/rfx_dwt.c | 18 | ||||
-rw-r--r-- | libfreerdp-rfx/rfx_dwt.h | 4 | ||||
-rw-r--r-- | libfreerdp-rfx/rfx_encode.c | 2 | ||||
-rw-r--r-- | libfreerdp-rfx/sse/rfx_sse.c | 2 | ||||
-rw-r--r-- | libfreerdp-rfx/sse/rfx_sse2.c | 268 | ||||
-rw-r--r-- | libfreerdp-rfx/sse/rfx_sse2.h | 3 | ||||
-rw-r--r-- | libfreerdp-utils/unicode.c | 1 |
14 files changed, 303 insertions, 80 deletions
diff --git a/channels/rdpdr/devman.c b/channels/rdpdr/devman.c index 49e0d46..d14a051 100644 --- a/channels/rdpdr/devman.c +++ b/channels/rdpdr/devman.c @@ -103,11 +103,11 @@ devman_unregister_service(DEVMAN* devman, SERVICE* srv) if (pdev->service == srv) { - devman_unregister_device(devman, pdev); - devman_rewind(devman); - if (pdev->service->type == RDPDR_DTYP_SMARTCARD && pdev->service->control) pthread_cancel(scard_thread); + + devman_unregister_device(devman, pdev); + devman_rewind(devman); } } diff --git a/cunit/test_librfx.c b/cunit/test_librfx.c index 584ac7a..78c453d 100644 --- a/cunit/test_librfx.c +++ b/cunit/test_librfx.c @@ -269,7 +269,7 @@ test_dwt(void) RFX_CONTEXT * context; context = rfx_context_new(); - rfx_dwt_2d_decode(buffer, context->dwt_buffer_8, context->dwt_buffer_16, context->dwt_buffer_32); + rfx_dwt_2d_decode(buffer, context->dwt_buffer); //dump_buffer(buffer, 4096); rfx_context_free(context); } diff --git a/include/freerdp/rfx.h b/include/freerdp/rfx.h index 7cf309c..932f5f5 100644 --- a/include/freerdp/rfx.h +++ b/include/freerdp/rfx.h @@ -157,20 +157,17 @@ struct _RFX_CONTEXT sint16 * cb_g_buffer; sint16 * cr_b_buffer; - sint16 dwt_mem_8[8*8*2*2 + 8]; /* sub-band width 8 */ - sint16 dwt_mem_16[16*16*2*2 + 8]; /* sub-band width 16 */ - sint16 dwt_mem_32[32*32*2*2 + 8]; /* sub-band width 32 */ + sint16 dwt_mem[32*32*2*2 + 8]; /* maximum sub-band width is 32 */ - sint16 * dwt_buffer_8; - sint16 * dwt_buffer_16; - sint16 * dwt_buffer_32; + sint16 * dwt_buffer; /* routines */ void (* decode_YCbCr_to_RGB)(sint16 * y_r_buf, sint16 * cb_g_buf, sint16 * cr_b_buf); void (* encode_RGB_to_YCbCr)(sint16 * y_r_buf, sint16 * cb_g_buf, sint16 * cr_b_buf); void (* quantization_decode)(sint16 * buffer, const uint32 * quantization_values); void (* quantization_encode)(sint16 * buffer, const uint32 * quantization_values); - void (* dwt_2d_decode)(sint16 * buffer, sint16 * dwt_buffer_8, sint16 * dwt_buffer_16, sint16 * dwt_buffer_32); + void (* dwt_2d_decode)(sint16 * buffer, sint16 * dwt_buffer); + void (* dwt_2d_encode)(sint16 * buffer, sint16 * dwt_buffer); /* profiler definitions */ PROFILER_DEFINE(prof_rfx_decode_rgb); diff --git a/libfreerdp-core/crypto/openssl.c b/libfreerdp-core/crypto/openssl.c index 333585b..84417d7 100644 --- a/libfreerdp-core/crypto/openssl.c +++ b/libfreerdp-core/crypto/openssl.c @@ -21,6 +21,7 @@ #include "crypto.h" #include <freerdp/utils/memory.h> #include <freerdp/constants/constants.h> +#include <time.h> #include "tls.h" #include "crypto/openssl.h" @@ -349,6 +350,7 @@ struct rdp_tls { SSL_CTX * ctx; SSL * ssl; + struct timespec ts; }; RD_BOOL @@ -428,6 +430,10 @@ tls_new(void) SSL_CTX_set_options(tls->ctx, SSL_OP_ALL); + /* a small 0.1ms delay when network blocking happens. */ + tls->ts.tv_sec = 0; + tls->ts.tv_nsec = 100000; + return tls; } @@ -515,6 +521,7 @@ tls_write(rdpTls * tls, char* b, int length) break; case SSL_ERROR_WANT_WRITE: + nanosleep(&tls->ts, NULL); break; default: @@ -543,6 +550,7 @@ tls_read(rdpTls * tls, char* b, int length) break; case SSL_ERROR_WANT_READ: + nanosleep(&tls->ts, NULL); break; default: diff --git a/libfreerdp-gdi/decode.c b/libfreerdp-gdi/decode.c index 1c1ad75..9f3f4c0 100644 --- a/libfreerdp-gdi/decode.c +++ b/libfreerdp-gdi/decode.c @@ -32,7 +32,8 @@ int gdi_decode_bitmap_data_ex(GDI *gdi, uint16 x, uint16 y, uint8 * data, int size) { - int i, tx, ty; + int i, j; + int tx, ty; uint8* bitmapData; uint32 bitmapDataLength; RFX_MESSAGE * message; @@ -50,17 +51,49 @@ int gdi_decode_bitmap_data_ex(GDI *gdi, uint16 x, uint16 y, uint8 * data, int si /* decode bitmap data */ message = rfx_process_message((RFX_CONTEXT *) gdi->rfx_context, bitmapData, bitmapDataLength); - /* blit each tile */ - for (i = 0; i < message->num_tiles; i++) + if (message->num_rects > 1) /* RDVH */ { - tx = message->tiles[i]->x + x; - ty = message->tiles[i]->y + y; - data = message->tiles[i]->data; + /* blit each tile */ + for (i = 0; i < message->num_tiles; i++) + { + tx = message->tiles[i]->x + x; + ty = message->tiles[i]->y + y; + data = message->tiles[i]->data; + + gdi_image_convert(data, gdi->tile->bitmap->data, 64, 64, 32, 32, gdi->clrconv); - gdi_image_convert(data, gdi->tile->bitmap->data, 64, 64, 32, 32, gdi->clrconv); - gdi_BitBlt(gdi->primary->hdc, tx, ty, 64, 64, gdi->tile->hdc, 0, 0, GDI_SRCCOPY); + for (j = 0; j < message->num_rects; j++) + { + gdi_SetClipRgn(gdi->primary->hdc, + message->rects[j].x, message->rects[j].y, + message->rects[j].width, message->rects[j].height); + + gdi_BitBlt(gdi->primary->hdc, tx, ty, 64, 64, gdi->tile->hdc, 0, 0, GDI_SRCCOPY); + } + } - gdi_InvalidateRegion(gdi->primary->hdc, tx, ty, 64, 64); + for (i = 0; i < message->num_rects; i++) + { + gdi_InvalidateRegion(gdi->primary->hdc, + message->rects[i].x, message->rects[i].y, + message->rects[i].width, message->rects[i].height); + } + } + else /* RDSH */ + { + /* blit each tile */ + for (i = 0; i < message->num_tiles; i++) + { + tx = message->tiles[i]->x + x; + ty = message->tiles[i]->y + y; + data = message->tiles[i]->data; + + gdi_image_convert(data, gdi->tile->bitmap->data, 64, 64, 32, 32, gdi->clrconv); + + gdi_BitBlt(gdi->primary->hdc, tx, ty, 64, 64, gdi->tile->hdc, 0, 0, GDI_SRCCOPY); + + gdi_InvalidateRegion(gdi->primary->hdc, tx, ty, 64, 64); + } } rfx_message_free(gdi->rfx_context, message); diff --git a/libfreerdp-rfx/librfx.c b/libfreerdp-rfx/librfx.c index c8e5bee..5d9b5a6 100644 --- a/libfreerdp-rfx/librfx.c +++ b/libfreerdp-rfx/librfx.c @@ -135,9 +135,7 @@ rfx_context_new(void) context->cb_g_buffer = (sint16 *)(((uintptr_t)context->cb_g_mem + 16) & ~ 0x0F); context->cr_b_buffer = (sint16 *)(((uintptr_t)context->cr_b_mem + 16) & ~ 0x0F); - context->dwt_buffer_8 = (sint16 *)(((uintptr_t)context->dwt_mem_8 + 16) & ~ 0x0F); - context->dwt_buffer_16 = (sint16 *)(((uintptr_t)context->dwt_mem_16 + 16) & ~ 0x0F); - context->dwt_buffer_32 = (sint16 *)(((uintptr_t)context->dwt_mem_32 + 16) & ~ 0x0F); + context->dwt_buffer = (sint16 *)(((uintptr_t)context->dwt_mem + 16) & ~ 0x0F); /* create profilers for default decoding routines */ rfx_profiler_create(context); @@ -148,6 +146,7 @@ rfx_context_new(void) context->quantization_decode = rfx_quantization_decode; context->quantization_encode = rfx_quantization_encode; context->dwt_2d_decode = rfx_dwt_2d_decode; + context->dwt_2d_encode = rfx_dwt_2d_encode; /* detect and enable SIMD CPU acceleration */ RFX_INIT_SIMD(context); diff --git a/libfreerdp-rfx/rfx_decode.c b/libfreerdp-rfx/rfx_decode.c index aff61ef..8dc8853 100644 --- a/libfreerdp-rfx/rfx_decode.c +++ b/libfreerdp-rfx/rfx_decode.c @@ -120,7 +120,7 @@ rfx_decode_component(RFX_CONTEXT * context, const uint32 * quantization_values, PROFILER_EXIT(context->prof_rfx_quantization_decode); PROFILER_ENTER(context->prof_rfx_dwt_2d_decode); - context->dwt_2d_decode(buffer, context->dwt_buffer_8, context->dwt_buffer_16, context->dwt_buffer_32); + context->dwt_2d_decode(buffer, context->dwt_buffer); PROFILER_EXIT(context->prof_rfx_dwt_2d_decode); PROFILER_EXIT(context->prof_rfx_decode_component); diff --git a/libfreerdp-rfx/rfx_dwt.c b/libfreerdp-rfx/rfx_dwt.c index 97b8ec6..7f80975 100644 --- a/libfreerdp-rfx/rfx_dwt.c +++ b/libfreerdp-rfx/rfx_dwt.c @@ -106,11 +106,11 @@ rfx_dwt_2d_decode_block(sint16 * buffer, sint16 * idwt, int subband_width) } void -rfx_dwt_2d_decode(sint16 * buffer, sint16 * dwt_buffer_8, sint16 * dwt_buffer_16, sint16 * dwt_buffer_32) +rfx_dwt_2d_decode(sint16 * buffer, sint16 * dwt_buffer) { - rfx_dwt_2d_decode_block(buffer + 3840, dwt_buffer_8, 8); - rfx_dwt_2d_decode_block(buffer + 3072, dwt_buffer_16, 16); - rfx_dwt_2d_decode_block(buffer, dwt_buffer_32, 32); + rfx_dwt_2d_decode_block(buffer + 3840, dwt_buffer, 8); + rfx_dwt_2d_decode_block(buffer + 3072, dwt_buffer, 16); + rfx_dwt_2d_decode_block(buffer, dwt_buffer, 32); } void @@ -136,7 +136,7 @@ rfx_dwt_2d_encode_block(sint16 * buffer, sint16 * dwt, int subband_width) src = buffer + y * total_width + x; /* H */ - *h = (src[total_width] - ((src[0] + src[n < subband_width - 1 ? 2 * total_width : 0]) >> 1)) >> 1; + *h = (src[total_width] - ((src[0] + src[n < subband_width - 1 ? 2 * total_width : total_width]) >> 1)) >> 1; /* L */ *l = src[0] + (n == 0 ? *h : (*(h - total_width) + *h) >> 1); @@ -190,9 +190,9 @@ rfx_dwt_2d_encode_block(sint16 * buffer, sint16 * dwt, int subband_width) } void -rfx_dwt_2d_encode(sint16 * buffer, sint16 * dwt_buffer_8, sint16 * dwt_buffer_16, sint16 * dwt_buffer_32) +rfx_dwt_2d_encode(sint16 * buffer, sint16 * dwt_buffer) { - rfx_dwt_2d_encode_block(buffer, dwt_buffer_32, 32); - rfx_dwt_2d_encode_block(buffer + 3072, dwt_buffer_16, 16); - rfx_dwt_2d_encode_block(buffer + 3840, dwt_buffer_8, 8); + rfx_dwt_2d_encode_block(buffer, dwt_buffer, 32); + rfx_dwt_2d_encode_block(buffer + 3072, dwt_buffer, 16); + rfx_dwt_2d_encode_block(buffer + 3840, dwt_buffer, 8); } diff --git a/libfreerdp-rfx/rfx_dwt.h b/libfreerdp-rfx/rfx_dwt.h index 6fee77d..449d61c 100644 --- a/libfreerdp-rfx/rfx_dwt.h +++ b/libfreerdp-rfx/rfx_dwt.h @@ -23,9 +23,9 @@ #include <freerdp/rfx.h> void -rfx_dwt_2d_decode(sint16 * buffer, sint16 * dwt_buffer_8, sint16 * dwt_buffer_16, sint16 * dwt_buffer_32); +rfx_dwt_2d_decode(sint16 * buffer, sint16 * dwt_buffer); void -rfx_dwt_2d_encode(sint16 * buffer, sint16 * dwt_buffer_8, sint16 * dwt_buffer_16, sint16 * dwt_buffer_32); +rfx_dwt_2d_encode(sint16 * buffer, sint16 * dwt_buffer); #endif diff --git a/libfreerdp-rfx/rfx_encode.c b/libfreerdp-rfx/rfx_encode.c index e458dd1..cd20200 100644 --- a/libfreerdp-rfx/rfx_encode.c +++ b/libfreerdp-rfx/rfx_encode.c @@ -132,7 +132,7 @@ rfx_encode_component(RFX_CONTEXT * context, const uint32 * quantization_values, PROFILER_ENTER(context->prof_rfx_encode_component); PROFILER_ENTER(context->prof_rfx_dwt_2d_encode); - rfx_dwt_2d_encode(data, context->dwt_buffer_8, context->dwt_buffer_16, context->dwt_buffer_32); + context->dwt_2d_encode(data, context->dwt_buffer); PROFILER_EXIT(context->prof_rfx_dwt_2d_encode); PROFILER_ENTER(context->prof_rfx_quantization_encode); diff --git a/libfreerdp-rfx/sse/rfx_sse.c b/libfreerdp-rfx/sse/rfx_sse.c index 0407323..76a632d 100644 --- a/libfreerdp-rfx/sse/rfx_sse.c +++ b/libfreerdp-rfx/sse/rfx_sse.c @@ -33,10 +33,12 @@ void rfx_init_sse(RFX_CONTEXT * context) IF_PROFILER(context->prof_rfx_quantization_decode->name = "rfx_quantization_decode_SSE2"); IF_PROFILER(context->prof_rfx_quantization_encode->name = "rfx_quantization_encode_SSE2"); IF_PROFILER(context->prof_rfx_dwt_2d_decode->name = "rfx_dwt_2d_decode_SSE2"); + IF_PROFILER(context->prof_rfx_dwt_2d_encode->name = "rfx_dwt_2d_encode_SSE2"); context->decode_YCbCr_to_RGB = rfx_decode_YCbCr_to_RGB_SSE2; context->encode_RGB_to_YCbCr = rfx_encode_RGB_to_YCbCr_SSE2; context->quantization_decode = rfx_quantization_decode_SSE2; context->quantization_encode = rfx_quantization_encode_SSE2; context->dwt_2d_decode = rfx_dwt_2d_decode_SSE2; + context->dwt_2d_encode = rfx_dwt_2d_encode_SSE2; } diff --git a/libfreerdp-rfx/sse/rfx_sse2.c b/libfreerdp-rfx/sse/rfx_sse2.c index 63dfdcf..3434ec5 100644 --- a/libfreerdp-rfx/sse/rfx_sse2.c +++ b/libfreerdp-rfx/sse/rfx_sse2.c @@ -48,7 +48,15 @@ rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * __m128i * cb_g_buf = (__m128i*) cb_g_buffer; __m128i * cr_b_buf = (__m128i*) cr_b_buffer; + __m128i y; + __m128i cr; + __m128i cb; + __m128i r; + __m128i g; + __m128i b; + int i; + for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i))) { _mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA); @@ -58,14 +66,14 @@ rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++) { /* y = y_r_buf[i] + 128; */ - __m128i y = _mm_load_si128(&y_r_buf[i]); + y = _mm_load_si128(&y_r_buf[i]); y = _mm_add_epi16(y, _mm_set1_epi16(128)); /* cr = cr_b_buf[i]; */ - __m128i cr = _mm_load_si128(&cr_b_buf[i]); + cr = _mm_load_si128(&cr_b_buf[i]); /* r = between(y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5), 0, 255); */ - __m128i r = _mm_add_epi16(y, cr); + r = _mm_add_epi16(y, cr); r = _mm_add_epi16(r, _mm_srai_epi16(cr, 2)); r = _mm_add_epi16(r, _mm_srai_epi16(cr, 3)); r = _mm_add_epi16(r, _mm_srai_epi16(cr, 5)); @@ -73,10 +81,10 @@ rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * _mm_store_si128(&y_r_buf[i], r); /* cb = cb_g_buf[i]; */ - __m128i cb = _mm_load_si128(&cb_g_buf[i]); + cb = _mm_load_si128(&cb_g_buf[i]); /* g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1) - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255); */ - __m128i g = _mm_sub_epi16(y, _mm_srai_epi16(cb, 2)); + g = _mm_sub_epi16(y, _mm_srai_epi16(cb, 2)); g = _mm_sub_epi16(g, _mm_srai_epi16(cb, 4)); g = _mm_sub_epi16(g, _mm_srai_epi16(cb, 5)); g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 1)); @@ -87,7 +95,7 @@ rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * _mm_store_si128(&cb_g_buf[i], g); /* b = between(y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6), 0, 255); */ - __m128i b = _mm_add_epi16(y, cb); + b = _mm_add_epi16(y, cb); b = _mm_add_epi16(b, _mm_srai_epi16(cb, 1)); b = _mm_add_epi16(b, _mm_srai_epi16(cb, 2)); b = _mm_add_epi16(b, _mm_srai_epi16(cb, 6)); @@ -106,7 +114,15 @@ rfx_encode_RGB_to_YCbCr_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * __m128i * cb_g_buf = (__m128i*) cb_g_buffer; __m128i * cr_b_buf = (__m128i*) cr_b_buffer; + __m128i y; + __m128i cr; + __m128i cb; + __m128i r; + __m128i g; + __m128i b; + int i; + for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i))) { _mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA); @@ -116,17 +132,17 @@ rfx_encode_RGB_to_YCbCr_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++) { /* r = y_r_buf[i]; */ - __m128i r = _mm_load_si128(&y_r_buf[i]); + r = _mm_load_si128(&y_r_buf[i]); /* g = cb_g_buf[i]; */ - __m128i g = _mm_load_si128(&cb_g_buf[i]); + g = _mm_load_si128(&cb_g_buf[i]); /* b = cr_b_buf[i]; */ - __m128i b = _mm_load_si128(&cr_b_buf[i]); + b = _mm_load_si128(&cr_b_buf[i]); /* y = ((r >> 2) + (r >> 5) + (r >> 6)) + ((g >> 1) + (g >> 4) + (g >> 6) + (g >> 7)) + ((b >> 4) + (b >> 5) + (b >> 6)); */ /* y_r_buf[i] = MINMAX(y, 0, 255) - 128; */ - __m128i y = _mm_add_epi16(_mm_srai_epi16(r, 2), _mm_srai_epi16(r, 5)); + y = _mm_add_epi16(_mm_srai_epi16(r, 2), _mm_srai_epi16(r, 5)); y = _mm_add_epi16(y, _mm_srai_epi16(r, 6)); y = _mm_add_epi16(y, _mm_srai_epi16(g, 1)); y = _mm_add_epi16(y, _mm_srai_epi16(g, 4)); @@ -141,7 +157,7 @@ rfx_encode_RGB_to_YCbCr_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * /* cb = 0 - ((r >> 3) + (r >> 5) + (r >> 7)) - ((g >> 2) + (g >> 4) + (g >> 6)) + (b >> 1); */ /* cb_g_buf[i] = MINMAX(cb, -128, 127); */ - __m128i cb = _mm_sub_epi16(_mm_srai_epi16(b, 1), _mm_srai_epi16(r, 3)); + cb = _mm_sub_epi16(_mm_srai_epi16(b, 1), _mm_srai_epi16(r, 3)); cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 5)); cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 7)); cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 2)); @@ -152,7 +168,7 @@ rfx_encode_RGB_to_YCbCr_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * /* cr = (r >> 1) - ((g >> 2) + (g >> 3) + (g >> 5) + (g >> 7)) - ((b >> 4) + (b >> 6)); */ /* cr_b_buf[i] = MINMAX(cr, -128, 127); */ - __m128i cr = _mm_sub_epi16(_mm_srai_epi16(r, 1), _mm_srai_epi16(g, 2)); + cr = _mm_sub_epi16(_mm_srai_epi16(r, 1), _mm_srai_epi16(g, 2)); cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 3)); cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 5)); cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 7)); @@ -244,29 +260,39 @@ rfx_dwt_2d_decode_block_horiz_SSE2(sint16 * l, sint16 * h, sint16 * dst, int sub sint16 * l_ptr = l; sint16 * h_ptr = h; sint16 * dst_ptr = dst; + int first; + int last; + __m128i l_n; + __m128i h_n; + __m128i h_n_m; + __m128i tmp_n; + __m128i dst_n; + __m128i dst_n_p; + __m128i dst1; + __m128i dst2; for (y = 0; y < subband_width; y++) { /* Even coefficients */ for (n = 0; n < subband_width; n+=8) { - // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); + /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */ - __m128i l_n = _mm_load_si128((__m128i*) l_ptr); + l_n = _mm_load_si128((__m128i*) l_ptr); - __m128i h_n = _mm_load_si128((__m128i*) h_ptr); - __m128i h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - 1)); + h_n = _mm_load_si128((__m128i*) h_ptr); + h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - 1)); if (n == 0) { - int first = _mm_extract_epi16(h_n_m, 1); + first = _mm_extract_epi16(h_n_m, 1); h_n_m = _mm_insert_epi16(h_n_m, first, 0); } - __m128i tmp_n = _mm_add_epi16(h_n, h_n_m); + tmp_n = _mm_add_epi16(h_n, h_n_m); tmp_n = _mm_add_epi16(tmp_n, _mm_set1_epi16(1)); tmp_n = _mm_srai_epi16(tmp_n, 1); - __m128i dst_n = _mm_sub_epi16(l_n, tmp_n); + dst_n = _mm_sub_epi16(l_n, tmp_n); _mm_store_si128((__m128i*) l_ptr, dst_n); @@ -279,27 +305,27 @@ rfx_dwt_2d_decode_block_horiz_SSE2(sint16 * l, sint16 * h, sint16 * dst, int sub /* Odd coefficients */ for (n = 0; n < subband_width; n+=8) { - // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); + /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */ - __m128i h_n = _mm_load_si128((__m128i*) h_ptr); + h_n = _mm_load_si128((__m128i*) h_ptr); h_n = _mm_slli_epi16(h_n, 1); - __m128i dst_n = _mm_load_si128((__m128i*) (l_ptr)); - __m128i dst_n_p = _mm_loadu_si128((__m128i*) (l_ptr + 1)); + dst_n = _mm_load_si128((__m128i*) (l_ptr)); + dst_n_p = _mm_loadu_si128((__m128i*) (l_ptr + 1)); if (n == subband_width - 8) { - int last = _mm_extract_epi16(dst_n_p, 6); + last = _mm_extract_epi16(dst_n_p, 6); dst_n_p = _mm_insert_epi16(dst_n_p, last, 7); } - __m128i tmp_n = _mm_add_epi16(dst_n_p, dst_n); + tmp_n = _mm_add_epi16(dst_n_p, dst_n); tmp_n = _mm_srai_epi16(tmp_n, 1); tmp_n = _mm_add_epi16(tmp_n, h_n); - __m128i dst1 = _mm_unpacklo_epi16(dst_n, tmp_n); - __m128i dst2 = _mm_unpackhi_epi16(dst_n, tmp_n); + dst1 = _mm_unpacklo_epi16(dst_n, tmp_n); + dst2 = _mm_unpackhi_epi16(dst_n, tmp_n); _mm_store_si128((__m128i*) dst_ptr, dst1); _mm_store_si128((__m128i*) (dst_ptr + 8), dst2); @@ -318,6 +344,13 @@ rfx_dwt_2d_decode_block_vert_SSE2(sint16 * l, sint16 * h, sint16 * dst, int subb sint16 * l_ptr = l; sint16 * h_ptr = h; sint16 * dst_ptr = dst; + __m128i l_n; + __m128i h_n; + __m128i tmp_n; + __m128i h_n_m; + __m128i dst_n; + __m128i dst_n_m; + __m128i dst_n_p; int total_width = subband_width + subband_width; @@ -326,22 +359,22 @@ rfx_dwt_2d_decode_block_vert_SSE2(sint16 * l, sint16 * h, sint16 * dst, int subb { for (x = 0; x < total_width; x+=8) { - // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); + /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */ - __m128i l_n = _mm_load_si128((__m128i*) l_ptr); - __m128i h_n = _mm_load_si128((__m128i*) h_ptr); + l_n = _mm_load_si128((__m128i*) l_ptr); + h_n = _mm_load_si128((__m128i*) h_ptr); - __m128i tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1));; + tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1));; if (n == 0) tmp_n = _mm_add_epi16(tmp_n, h_n); else { - __m128i h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - total_width)); + h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - total_width)); tmp_n = _mm_add_epi16(tmp_n, h_n_m); } tmp_n = _mm_srai_epi16(tmp_n, 1); - __m128i dst_n = _mm_sub_epi16(l_n, tmp_n); + dst_n = _mm_sub_epi16(l_n, tmp_n); _mm_store_si128((__m128i*) dst_ptr, dst_n); l_ptr+=8; @@ -359,23 +392,23 @@ rfx_dwt_2d_decode_block_vert_SSE2(sint16 * l, sint16 * h, sint16 * dst, int subb { for (x = 0; x < total_width; x+=8) { - // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); + /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */ - __m128i h_n = _mm_load_si128((__m128i*) h_ptr); - __m128i dst_n_m = _mm_load_si128((__m128i*) (dst_ptr - total_width)); + h_n = _mm_load_si128((__m128i*) h_ptr); + dst_n_m = _mm_load_si128((__m128i*) (dst_ptr - total_width)); h_n = _mm_slli_epi16(h_n, 1); - __m128i tmp_n = dst_n_m; + tmp_n = dst_n_m; if (n == subband_width - 1) tmp_n = _mm_add_epi16(tmp_n, dst_n_m); else { - __m128i dst_n_p = _mm_loadu_si128((__m128i*) (dst_ptr + total_width)); + dst_n_p = _mm_loadu_si128((__m128i*) (dst_ptr + total_width)); tmp_n = _mm_add_epi16(tmp_n, dst_n_p); } tmp_n = _mm_srai_epi16(tmp_n, 1); - __m128i dst_n = _mm_add_epi16(tmp_n, h_n); + dst_n = _mm_add_epi16(tmp_n, h_n); _mm_store_si128((__m128i*) dst_ptr, dst_n); h_ptr+=8; @@ -415,11 +448,160 @@ rfx_dwt_2d_decode_block_SSE2(sint16 * buffer, sint16 * idwt, int subband_width) } void -rfx_dwt_2d_decode_SSE2(sint16 * buffer, sint16 * dwt_buffer_8, sint16 * dwt_buffer_16, sint16 * dwt_buffer_32) +rfx_dwt_2d_decode_SSE2(sint16 * buffer, sint16 * dwt_buffer) +{ + _mm_prefetch_buffer((char *) buffer, 4096 * sizeof(sint16)); + + rfx_dwt_2d_decode_block_SSE2(buffer + 3840, dwt_buffer, 8); + rfx_dwt_2d_decode_block_SSE2(buffer + 3072, dwt_buffer, 16); + rfx_dwt_2d_decode_block_SSE2(buffer, dwt_buffer, 32); +} + +static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +rfx_dwt_2d_encode_block_vert_SSE2(sint16 * src, sint16 * l, sint16 * h, int subband_width) +{ + int total_width; + int x; + int n; + __m128i src_2n; + __m128i src_2n_1; + __m128i src_2n_2; + __m128i h_n; + __m128i h_n_m; + __m128i l_n; + + total_width = subband_width << 1; + + for (n = 0; n < subband_width; n++) + { + for (x = 0; x < total_width; x += 8) + { + src_2n = _mm_load_si128((__m128i*) src); + src_2n_1 = _mm_load_si128((__m128i*) (src + total_width)); + if (n < subband_width - 1) + src_2n_2 = _mm_load_si128((__m128i*) (src + 2 * total_width)); + else + src_2n_2 = src_2n_1; + + /* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */ + + h_n = _mm_add_epi16(src_2n, src_2n_2); + h_n = _mm_srai_epi16(h_n, 1); + h_n = _mm_sub_epi16(src_2n_1, h_n); + h_n = _mm_srai_epi16(h_n, 1); + + _mm_store_si128((__m128i*) h, h_n); + + if (n == 0) + h_n_m = h_n; + else + h_n_m = _mm_load_si128((__m128i*) (h - total_width)); + + /* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */ + + l_n = _mm_add_epi16(h_n_m, h_n); + l_n = _mm_srai_epi16(l_n, 1); + l_n = _mm_add_epi16(l_n, src_2n); + + _mm_store_si128((__m128i*) l, l_n); + + src += 8; + l += 8; + h += 8; + } + src += total_width; + } +} + +static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +rfx_dwt_2d_encode_block_horiz_SSE2(sint16 * src, sint16 * l, sint16 * h, int subband_width) +{ + int y; + int n; + int first; + __m128i src_2n; + __m128i src_2n_1; + __m128i src_2n_2; + __m128i h_n; + __m128i h_n_m; + __m128i l_n; + + for (y = 0; y < subband_width; y++) + { + for (n = 0; n < subband_width; n += 8) + { + /* The following 3 Set operations consumes more than half of the total DWT processing time! */ + src_2n = _mm_set_epi16(src[14], src[12], src[10], src[8], src[6], src[4], src[2], src[0]); + src_2n_1 = _mm_set_epi16(src[15], src[13], src[11], src[9], src[7], src[5], src[3], src[1]); + src_2n_2 = _mm_set_epi16(n == subband_width - 8 ? src[15] : src[16], + src[14], src[12], src[10], src[8], src[6], src[4], src[2]); + + /* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */ + + h_n = _mm_add_epi16(src_2n, src_2n_2); + h_n = _mm_srai_epi16(h_n, 1); + h_n = _mm_sub_epi16(src_2n_1, h_n); + h_n = _mm_srai_epi16(h_n, 1); + + _mm_store_si128((__m128i*) h, h_n); + + h_n_m = _mm_loadu_si128((__m128i*) (h - 1)); + if (n == 0) + { + first = _mm_extract_epi16(h_n_m, 1); + h_n_m = _mm_insert_epi16(h_n_m, first, 0); + } + + /* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */ + + l_n = _mm_add_epi16(h_n_m, h_n); + l_n = _mm_srai_epi16(l_n, 1); + l_n = _mm_add_epi16(l_n, src_2n); + + _mm_store_si128((__m128i*) l, l_n); + + src += 16; + l += 8; + h += 8; + } + } +} + +static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +rfx_dwt_2d_encode_block_SSE2(sint16 * buffer, sint16 * dwt, int subband_width) +{ + sint16 * hl, * lh, * hh, * ll; + sint16 * l_src, * h_src; + + _mm_prefetch_buffer((char *) dwt, subband_width * 4 * sizeof(sint16)); + + /* DWT in vertical direction, results in 2 sub-bands in L, H order in tmp buffer dwt. */ + + l_src = dwt; + h_src = dwt + subband_width * subband_width * 2; + + rfx_dwt_2d_encode_block_vert_SSE2(buffer, l_src, h_src, subband_width); + + /* DWT in horizontal direction, results in 4 sub-bands in HL(0), LH(1), HH(2), LL(3) order, stored in original buffer. */ + /* The lower part L generates LL(3) and HL(0). */ + /* The higher part H generates LH(1) and HH(2). */ + + ll = buffer + subband_width * subband_width * 3; + hl = buffer; + + lh = buffer + subband_width * subband_width; + hh = buffer + subband_width * subband_width * 2; + + rfx_dwt_2d_encode_block_horiz_SSE2(l_src, ll, hl, subband_width); + rfx_dwt_2d_encode_block_horiz_SSE2(h_src, lh, hh, subband_width); +} + +void +rfx_dwt_2d_encode_SSE2(sint16 * buffer, sint16 * dwt_buffer) { _mm_prefetch_buffer((char *) buffer, 4096 * sizeof(sint16)); - rfx_dwt_2d_decode_block_SSE2(buffer + 3840, dwt_buffer_8, 8); - rfx_dwt_2d_decode_block_SSE2(buffer + 3072, dwt_buffer_16, 16); - rfx_dwt_2d_decode_block_SSE2(buffer, dwt_buffer_32, 32); + rfx_dwt_2d_encode_block_SSE2(buffer, dwt_buffer, 32); + rfx_dwt_2d_encode_block_SSE2(buffer + 3072, dwt_buffer, 16); + rfx_dwt_2d_encode_block_SSE2(buffer + 3840, dwt_buffer, 8); } diff --git a/libfreerdp-rfx/sse/rfx_sse2.h b/libfreerdp-rfx/sse/rfx_sse2.h index 8f35f7c..85921da 100644 --- a/libfreerdp-rfx/sse/rfx_sse2.h +++ b/libfreerdp-rfx/sse/rfx_sse2.h @@ -26,6 +26,7 @@ void rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sin void rfx_encode_RGB_to_YCbCr_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer); void rfx_quantization_decode_SSE2(sint16 * buffer, const uint32 * quantization_values); void rfx_quantization_encode_SSE2(sint16 * buffer, const uint32 * quantization_values); -void rfx_dwt_2d_decode_SSE2(sint16 * buffer, sint16 * dwt_buffer_8, sint16 * dwt_buffer_16, sint16 * dwt_buffer_32); +void rfx_dwt_2d_decode_SSE2(sint16 * buffer, sint16 * dwt_buffer); +void rfx_dwt_2d_encode_SSE2(sint16 * buffer, sint16 * dwt_buffer); #endif /* __RFX_SSE2_H */ diff --git a/libfreerdp-utils/unicode.c b/libfreerdp-utils/unicode.c index dda7f96..c4c6fc7 100644 --- a/libfreerdp-utils/unicode.c +++ b/libfreerdp-utils/unicode.c @@ -17,6 +17,7 @@ limitations under the License. */ +#include "config.h" #include <errno.h> #include <freerdp/utils/memory.h> |