Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/FreeRDP/FreeRDP-old.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Martin Fleisz <mfleisz@thinstuff.com> 2011-06-28 14:17:01 +0400
committer: Martin Fleisz <mfleisz@thinstuff.com> 2011-06-28 14:17:01 +0400
commit: b958b1f154906982f4c96c1c19b5d4943057f1ae (patch)
tree: 539f27dcef14923638e3504dbb1bbbaecb5d82a2
parent: 588966d52039b6ce3d8cfdebbdb36773eb720498 (diff)
parent: 3862f5fa79cc0de21550ad654521512322c3d8e8 (diff)
Merge remote branch 'origin/remotefx' into remotefx
-rw-r--r-- channels/rdpdr/devman.c 6
-rw-r--r-- cunit/test_librfx.c 2
-rw-r--r-- include/freerdp/rfx.h 11
-rw-r--r-- libfreerdp-core/crypto/openssl.c 8
-rw-r--r-- libfreerdp-gdi/decode.c 51
-rw-r--r-- libfreerdp-rfx/librfx.c 5
-rw-r--r-- libfreerdp-rfx/rfx_decode.c 2
-rw-r--r-- libfreerdp-rfx/rfx_dwt.c 18
-rw-r--r-- libfreerdp-rfx/rfx_dwt.h 4
-rw-r--r-- libfreerdp-rfx/rfx_encode.c 2
-rw-r--r-- libfreerdp-rfx/sse/rfx_sse.c 2
-rw-r--r-- libfreerdp-rfx/sse/rfx_sse2.c 268
-rw-r--r-- libfreerdp-rfx/sse/rfx_sse2.h 3
-rw-r--r-- libfreerdp-utils/unicode.c 1
14 files changed, 303 insertions, 80 deletions
diff --git a/channels/rdpdr/devman.c b/channels/rdpdr/devman.c
index 49e0d46..d14a051 100644
--- a/channels/rdpdr/devman.c
+++ b/channels/rdpdr/devman.c
@@ -103,11 +103,11 @@ devman_unregister_service(DEVMAN* devman, SERVICE* srv)
if (pdev->service == srv)
{
- devman_unregister_device(devman, pdev);
- devman_rewind(devman);
-
if (pdev->service->type == RDPDR_DTYP_SMARTCARD && pdev->service->control)
pthread_cancel(scard_thread);
+
+ devman_unregister_device(devman, pdev);
+ devman_rewind(devman);
}
}
diff --git a/cunit/test_librfx.c b/cunit/test_librfx.c
index 584ac7a..78c453d 100644
--- a/cunit/test_librfx.c
+++ b/cunit/test_librfx.c
@@ -269,7 +269,7 @@ test_dwt(void)
RFX_CONTEXT * context;
context = rfx_context_new();
- rfx_dwt_2d_decode(buffer, context->dwt_buffer_8, context->dwt_buffer_16, context->dwt_buffer_32);
+ rfx_dwt_2d_decode(buffer, context->dwt_buffer);
//dump_buffer(buffer, 4096);
rfx_context_free(context);
}
diff --git a/include/freerdp/rfx.h b/include/freerdp/rfx.h
index 7cf309c..932f5f5 100644
--- a/include/freerdp/rfx.h
+++ b/include/freerdp/rfx.h
@@ -157,20 +157,17 @@ struct _RFX_CONTEXT
sint16 * cb_g_buffer;
sint16 * cr_b_buffer;
- sint16 dwt_mem_8[8*8*2*2 + 8]; /* sub-band width 8 */
- sint16 dwt_mem_16[16*16*2*2 + 8]; /* sub-band width 16 */
- sint16 dwt_mem_32[32*32*2*2 + 8]; /* sub-band width 32 */
+ sint16 dwt_mem[32*32*2*2 + 8]; /* maximum sub-band width is 32 */
- sint16 * dwt_buffer_8;
- sint16 * dwt_buffer_16;
- sint16 * dwt_buffer_32;
+ sint16 * dwt_buffer;
/* routines */
void (* decode_YCbCr_to_RGB)(sint16 * y_r_buf, sint16 * cb_g_buf, sint16 * cr_b_buf);
void (* encode_RGB_to_YCbCr)(sint16 * y_r_buf, sint16 * cb_g_buf, sint16 * cr_b_buf);
void (* quantization_decode)(sint16 * buffer, const uint32 * quantization_values);
void (* quantization_encode)(sint16 * buffer, const uint32 * quantization_values);
- void (* dwt_2d_decode)(sint16 * buffer, sint16 * dwt_buffer_8, sint16 * dwt_buffer_16, sint16 * dwt_buffer_32);
+ void (* dwt_2d_decode)(sint16 * buffer, sint16 * dwt_buffer);
+ void (* dwt_2d_encode)(sint16 * buffer, sint16 * dwt_buffer);
/* profiler definitions */
PROFILER_DEFINE(prof_rfx_decode_rgb);
diff --git a/libfreerdp-core/crypto/openssl.c b/libfreerdp-core/crypto/openssl.c
index 333585b..84417d7 100644
--- a/libfreerdp-core/crypto/openssl.c
+++ b/libfreerdp-core/crypto/openssl.c
@@ -21,6 +21,7 @@
#include "crypto.h"
#include <freerdp/utils/memory.h>
#include <freerdp/constants/constants.h>
+#include <time.h>
#include "tls.h"
#include "crypto/openssl.h"
@@ -349,6 +350,7 @@ struct rdp_tls
{
SSL_CTX * ctx;
SSL * ssl;
+ struct timespec ts;
};
RD_BOOL
@@ -428,6 +430,10 @@ tls_new(void)
SSL_CTX_set_options(tls->ctx, SSL_OP_ALL);
+ /* a small 0.1ms delay when network blocking happens. */
+ tls->ts.tv_sec = 0;
+ tls->ts.tv_nsec = 100000;
+
return tls;
}
@@ -515,6 +521,7 @@ tls_write(rdpTls * tls, char* b, int length)
break;
case SSL_ERROR_WANT_WRITE:
+ nanosleep(&tls->ts, NULL);
break;
default:
@@ -543,6 +550,7 @@ tls_read(rdpTls * tls, char* b, int length)
break;
case SSL_ERROR_WANT_READ:
+ nanosleep(&tls->ts, NULL);
break;
default:
diff --git a/libfreerdp-gdi/decode.c b/libfreerdp-gdi/decode.c
index 1c1ad75..9f3f4c0 100644
--- a/libfreerdp-gdi/decode.c
+++ b/libfreerdp-gdi/decode.c
@@ -32,7 +32,8 @@
int gdi_decode_bitmap_data_ex(GDI *gdi, uint16 x, uint16 y, uint8 * data, int size)
{
- int i, tx, ty;
+ int i, j;
+ int tx, ty;
uint8* bitmapData;
uint32 bitmapDataLength;
RFX_MESSAGE * message;
@@ -50,17 +51,49 @@ int gdi_decode_bitmap_data_ex(GDI *gdi, uint16 x, uint16 y, uint8 * data, int si
/* decode bitmap data */
message = rfx_process_message((RFX_CONTEXT *) gdi->rfx_context, bitmapData, bitmapDataLength);
- /* blit each tile */
- for (i = 0; i < message->num_tiles; i++)
+ if (message->num_rects > 1) /* RDVH */
{
- tx = message->tiles[i]->x + x;
- ty = message->tiles[i]->y + y;
- data = message->tiles[i]->data;
+ /* blit each tile */
+ for (i = 0; i < message->num_tiles; i++)
+ {
+ tx = message->tiles[i]->x + x;
+ ty = message->tiles[i]->y + y;
+ data = message->tiles[i]->data;
+
+ gdi_image_convert(data, gdi->tile->bitmap->data, 64, 64, 32, 32, gdi->clrconv);
- gdi_image_convert(data, gdi->tile->bitmap->data, 64, 64, 32, 32, gdi->clrconv);
- gdi_BitBlt(gdi->primary->hdc, tx, ty, 64, 64, gdi->tile->hdc, 0, 0, GDI_SRCCOPY);
+ for (j = 0; j < message->num_rects; j++)
+ {
+ gdi_SetClipRgn(gdi->primary->hdc,
+ message->rects[j].x, message->rects[j].y,
+ message->rects[j].width, message->rects[j].height);
+
+ gdi_BitBlt(gdi->primary->hdc, tx, ty, 64, 64, gdi->tile->hdc, 0, 0, GDI_SRCCOPY);
+ }
+ }
- gdi_InvalidateRegion(gdi->primary->hdc, tx, ty, 64, 64);
+ for (i = 0; i < message->num_rects; i++)
+ {
+ gdi_InvalidateRegion(gdi->primary->hdc,
+ message->rects[i].x, message->rects[i].y,
+ message->rects[i].width, message->rects[i].height);
+ }
+ }
+ else /* RDSH */
+ {
+ /* blit each tile */
+ for (i = 0; i < message->num_tiles; i++)
+ {
+ tx = message->tiles[i]->x + x;
+ ty = message->tiles[i]->y + y;
+ data = message->tiles[i]->data;
+
+ gdi_image_convert(data, gdi->tile->bitmap->data, 64, 64, 32, 32, gdi->clrconv);
+
+ gdi_BitBlt(gdi->primary->hdc, tx, ty, 64, 64, gdi->tile->hdc, 0, 0, GDI_SRCCOPY);
+
+ gdi_InvalidateRegion(gdi->primary->hdc, tx, ty, 64, 64);
+ }
}
rfx_message_free(gdi->rfx_context, message);
diff --git a/libfreerdp-rfx/librfx.c b/libfreerdp-rfx/librfx.c
index c8e5bee..5d9b5a6 100644
--- a/libfreerdp-rfx/librfx.c
+++ b/libfreerdp-rfx/librfx.c
@@ -135,9 +135,7 @@ rfx_context_new(void)
context->cb_g_buffer = (sint16 *)(((uintptr_t)context->cb_g_mem + 16) & ~ 0x0F);
context->cr_b_buffer = (sint16 *)(((uintptr_t)context->cr_b_mem + 16) & ~ 0x0F);
- context->dwt_buffer_8 = (sint16 *)(((uintptr_t)context->dwt_mem_8 + 16) & ~ 0x0F);
- context->dwt_buffer_16 = (sint16 *)(((uintptr_t)context->dwt_mem_16 + 16) & ~ 0x0F);
- context->dwt_buffer_32 = (sint16 *)(((uintptr_t)context->dwt_mem_32 + 16) & ~ 0x0F);
+ context->dwt_buffer = (sint16 *)(((uintptr_t)context->dwt_mem + 16) & ~ 0x0F);
/* create profilers for default decoding routines */
rfx_profiler_create(context);
@@ -148,6 +146,7 @@ rfx_context_new(void)
context->quantization_decode = rfx_quantization_decode;
context->quantization_encode = rfx_quantization_encode;
context->dwt_2d_decode = rfx_dwt_2d_decode;
+ context->dwt_2d_encode = rfx_dwt_2d_encode;
/* detect and enable SIMD CPU acceleration */
RFX_INIT_SIMD(context);
diff --git a/libfreerdp-rfx/rfx_decode.c b/libfreerdp-rfx/rfx_decode.c
index aff61ef..8dc8853 100644
--- a/libfreerdp-rfx/rfx_decode.c
+++ b/libfreerdp-rfx/rfx_decode.c
@@ -120,7 +120,7 @@ rfx_decode_component(RFX_CONTEXT * context, const uint32 * quantization_values,
PROFILER_EXIT(context->prof_rfx_quantization_decode);
PROFILER_ENTER(context->prof_rfx_dwt_2d_decode);
- context->dwt_2d_decode(buffer, context->dwt_buffer_8, context->dwt_buffer_16, context->dwt_buffer_32);
+ context->dwt_2d_decode(buffer, context->dwt_buffer);
PROFILER_EXIT(context->prof_rfx_dwt_2d_decode);
PROFILER_EXIT(context->prof_rfx_decode_component);
diff --git a/libfreerdp-rfx/rfx_dwt.c b/libfreerdp-rfx/rfx_dwt.c
index 97b8ec6..7f80975 100644
--- a/libfreerdp-rfx/rfx_dwt.c
+++ b/libfreerdp-rfx/rfx_dwt.c
@@ -106,11 +106,11 @@ rfx_dwt_2d_decode_block(sint16 * buffer, sint16 * idwt, int subband_width)
}
void
-rfx_dwt_2d_decode(sint16 * buffer, sint16 * dwt_buffer_8, sint16 * dwt_buffer_16, sint16 * dwt_buffer_32)
+rfx_dwt_2d_decode(sint16 * buffer, sint16 * dwt_buffer)
{
- rfx_dwt_2d_decode_block(buffer + 3840, dwt_buffer_8, 8);
- rfx_dwt_2d_decode_block(buffer + 3072, dwt_buffer_16, 16);
- rfx_dwt_2d_decode_block(buffer, dwt_buffer_32, 32);
+ rfx_dwt_2d_decode_block(buffer + 3840, dwt_buffer, 8);
+ rfx_dwt_2d_decode_block(buffer + 3072, dwt_buffer, 16);
+ rfx_dwt_2d_decode_block(buffer, dwt_buffer, 32);
}
void
@@ -136,7 +136,7 @@ rfx_dwt_2d_encode_block(sint16 * buffer, sint16 * dwt, int subband_width)
src = buffer + y * total_width + x;
/* H */
- *h = (src[total_width] - ((src[0] + src[n < subband_width - 1 ? 2 * total_width : 0]) >> 1)) >> 1;
+ *h = (src[total_width] - ((src[0] + src[n < subband_width - 1 ? 2 * total_width : total_width]) >> 1)) >> 1;
/* L */
*l = src[0] + (n == 0 ? *h : (*(h - total_width) + *h) >> 1);
@@ -190,9 +190,9 @@ rfx_dwt_2d_encode_block(sint16 * buffer, sint16 * dwt, int subband_width)
}
void
-rfx_dwt_2d_encode(sint16 * buffer, sint16 * dwt_buffer_8, sint16 * dwt_buffer_16, sint16 * dwt_buffer_32)
+rfx_dwt_2d_encode(sint16 * buffer, sint16 * dwt_buffer)
{
- rfx_dwt_2d_encode_block(buffer, dwt_buffer_32, 32);
- rfx_dwt_2d_encode_block(buffer + 3072, dwt_buffer_16, 16);
- rfx_dwt_2d_encode_block(buffer + 3840, dwt_buffer_8, 8);
+ rfx_dwt_2d_encode_block(buffer, dwt_buffer, 32);
+ rfx_dwt_2d_encode_block(buffer + 3072, dwt_buffer, 16);
+ rfx_dwt_2d_encode_block(buffer + 3840, dwt_buffer, 8);
}
diff --git a/libfreerdp-rfx/rfx_dwt.h b/libfreerdp-rfx/rfx_dwt.h
index 6fee77d..449d61c 100644
--- a/libfreerdp-rfx/rfx_dwt.h
+++ b/libfreerdp-rfx/rfx_dwt.h
@@ -23,9 +23,9 @@
#include <freerdp/rfx.h>
void
-rfx_dwt_2d_decode(sint16 * buffer, sint16 * dwt_buffer_8, sint16 * dwt_buffer_16, sint16 * dwt_buffer_32);
+rfx_dwt_2d_decode(sint16 * buffer, sint16 * dwt_buffer);
void
-rfx_dwt_2d_encode(sint16 * buffer, sint16 * dwt_buffer_8, sint16 * dwt_buffer_16, sint16 * dwt_buffer_32);
+rfx_dwt_2d_encode(sint16 * buffer, sint16 * dwt_buffer);
#endif
diff --git a/libfreerdp-rfx/rfx_encode.c b/libfreerdp-rfx/rfx_encode.c
index e458dd1..cd20200 100644
--- a/libfreerdp-rfx/rfx_encode.c
+++ b/libfreerdp-rfx/rfx_encode.c
@@ -132,7 +132,7 @@ rfx_encode_component(RFX_CONTEXT * context, const uint32 * quantization_values,
PROFILER_ENTER(context->prof_rfx_encode_component);
PROFILER_ENTER(context->prof_rfx_dwt_2d_encode);
- rfx_dwt_2d_encode(data, context->dwt_buffer_8, context->dwt_buffer_16, context->dwt_buffer_32);
+ context->dwt_2d_encode(data, context->dwt_buffer);
PROFILER_EXIT(context->prof_rfx_dwt_2d_encode);
PROFILER_ENTER(context->prof_rfx_quantization_encode);
diff --git a/libfreerdp-rfx/sse/rfx_sse.c b/libfreerdp-rfx/sse/rfx_sse.c
index 0407323..76a632d 100644
--- a/libfreerdp-rfx/sse/rfx_sse.c
+++ b/libfreerdp-rfx/sse/rfx_sse.c
@@ -33,10 +33,12 @@ void rfx_init_sse(RFX_CONTEXT * context)
IF_PROFILER(context->prof_rfx_quantization_decode->name = "rfx_quantization_decode_SSE2");
IF_PROFILER(context->prof_rfx_quantization_encode->name = "rfx_quantization_encode_SSE2");
IF_PROFILER(context->prof_rfx_dwt_2d_decode->name = "rfx_dwt_2d_decode_SSE2");
+ IF_PROFILER(context->prof_rfx_dwt_2d_encode->name = "rfx_dwt_2d_encode_SSE2");
context->decode_YCbCr_to_RGB = rfx_decode_YCbCr_to_RGB_SSE2;
context->encode_RGB_to_YCbCr = rfx_encode_RGB_to_YCbCr_SSE2;
context->quantization_decode = rfx_quantization_decode_SSE2;
context->quantization_encode = rfx_quantization_encode_SSE2;
context->dwt_2d_decode = rfx_dwt_2d_decode_SSE2;
+ context->dwt_2d_encode = rfx_dwt_2d_encode_SSE2;
}
diff --git a/libfreerdp-rfx/sse/rfx_sse2.c b/libfreerdp-rfx/sse/rfx_sse2.c
index 63dfdcf..3434ec5 100644
--- a/libfreerdp-rfx/sse/rfx_sse2.c
+++ b/libfreerdp-rfx/sse/rfx_sse2.c
@@ -48,7 +48,15 @@ rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 *
__m128i * cb_g_buf = (__m128i*) cb_g_buffer;
__m128i * cr_b_buf = (__m128i*) cr_b_buffer;
+ __m128i y;
+ __m128i cr;
+ __m128i cb;
+ __m128i r;
+ __m128i g;
+ __m128i b;
+
int i;
+
for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
{
_mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA);
@@ -58,14 +66,14 @@ rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 *
for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++)
{
/* y = y_r_buf[i] + 128; */
- __m128i y = _mm_load_si128(&y_r_buf[i]);
+ y = _mm_load_si128(&y_r_buf[i]);
y = _mm_add_epi16(y, _mm_set1_epi16(128));
/* cr = cr_b_buf[i]; */
- __m128i cr = _mm_load_si128(&cr_b_buf[i]);
+ cr = _mm_load_si128(&cr_b_buf[i]);
/* r = between(y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5), 0, 255); */
- __m128i r = _mm_add_epi16(y, cr);
+ r = _mm_add_epi16(y, cr);
r = _mm_add_epi16(r, _mm_srai_epi16(cr, 2));
r = _mm_add_epi16(r, _mm_srai_epi16(cr, 3));
r = _mm_add_epi16(r, _mm_srai_epi16(cr, 5));
@@ -73,10 +81,10 @@ rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 *
_mm_store_si128(&y_r_buf[i], r);
/* cb = cb_g_buf[i]; */
- __m128i cb = _mm_load_si128(&cb_g_buf[i]);
+ cb = _mm_load_si128(&cb_g_buf[i]);
/* g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1) - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255); */
- __m128i g = _mm_sub_epi16(y, _mm_srai_epi16(cb, 2));
+ g = _mm_sub_epi16(y, _mm_srai_epi16(cb, 2));
g = _mm_sub_epi16(g, _mm_srai_epi16(cb, 4));
g = _mm_sub_epi16(g, _mm_srai_epi16(cb, 5));
g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 1));
@@ -87,7 +95,7 @@ rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 *
_mm_store_si128(&cb_g_buf[i], g);
/* b = between(y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6), 0, 255); */
- __m128i b = _mm_add_epi16(y, cb);
+ b = _mm_add_epi16(y, cb);
b = _mm_add_epi16(b, _mm_srai_epi16(cb, 1));
b = _mm_add_epi16(b, _mm_srai_epi16(cb, 2));
b = _mm_add_epi16(b, _mm_srai_epi16(cb, 6));
@@ -106,7 +114,15 @@ rfx_encode_RGB_to_YCbCr_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 *
__m128i * cb_g_buf = (__m128i*) cb_g_buffer;
__m128i * cr_b_buf = (__m128i*) cr_b_buffer;
+ __m128i y;
+ __m128i cr;
+ __m128i cb;
+ __m128i r;
+ __m128i g;
+ __m128i b;
+
int i;
+
for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
{
_mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA);
@@ -116,17 +132,17 @@ rfx_encode_RGB_to_YCbCr_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 *
for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++)
{
/* r = y_r_buf[i]; */
- __m128i r = _mm_load_si128(&y_r_buf[i]);
+ r = _mm_load_si128(&y_r_buf[i]);
/* g = cb_g_buf[i]; */
- __m128i g = _mm_load_si128(&cb_g_buf[i]);
+ g = _mm_load_si128(&cb_g_buf[i]);
/* b = cr_b_buf[i]; */
- __m128i b = _mm_load_si128(&cr_b_buf[i]);
+ b = _mm_load_si128(&cr_b_buf[i]);
/* y = ((r >> 2) + (r >> 5) + (r >> 6)) + ((g >> 1) + (g >> 4) + (g >> 6) + (g >> 7)) + ((b >> 4) + (b >> 5) + (b >> 6)); */
/* y_r_buf[i] = MINMAX(y, 0, 255) - 128; */
- __m128i y = _mm_add_epi16(_mm_srai_epi16(r, 2), _mm_srai_epi16(r, 5));
+ y = _mm_add_epi16(_mm_srai_epi16(r, 2), _mm_srai_epi16(r, 5));
y = _mm_add_epi16(y, _mm_srai_epi16(r, 6));
y = _mm_add_epi16(y, _mm_srai_epi16(g, 1));
y = _mm_add_epi16(y, _mm_srai_epi16(g, 4));
@@ -141,7 +157,7 @@ rfx_encode_RGB_to_YCbCr_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 *
/* cb = 0 - ((r >> 3) + (r >> 5) + (r >> 7)) - ((g >> 2) + (g >> 4) + (g >> 6)) + (b >> 1); */
/* cb_g_buf[i] = MINMAX(cb, -128, 127); */
- __m128i cb = _mm_sub_epi16(_mm_srai_epi16(b, 1), _mm_srai_epi16(r, 3));
+ cb = _mm_sub_epi16(_mm_srai_epi16(b, 1), _mm_srai_epi16(r, 3));
cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 5));
cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 7));
cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 2));
@@ -152,7 +168,7 @@ rfx_encode_RGB_to_YCbCr_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 *
/* cr = (r >> 1) - ((g >> 2) + (g >> 3) + (g >> 5) + (g >> 7)) - ((b >> 4) + (b >> 6)); */
/* cr_b_buf[i] = MINMAX(cr, -128, 127); */
- __m128i cr = _mm_sub_epi16(_mm_srai_epi16(r, 1), _mm_srai_epi16(g, 2));
+ cr = _mm_sub_epi16(_mm_srai_epi16(r, 1), _mm_srai_epi16(g, 2));
cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 3));
cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 5));
cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 7));
@@ -244,29 +260,39 @@ rfx_dwt_2d_decode_block_horiz_SSE2(sint16 * l, sint16 * h, sint16 * dst, int sub
sint16 * l_ptr = l;
sint16 * h_ptr = h;
sint16 * dst_ptr = dst;
+ int first;
+ int last;
+ __m128i l_n;
+ __m128i h_n;
+ __m128i h_n_m;
+ __m128i tmp_n;
+ __m128i dst_n;
+ __m128i dst_n_p;
+ __m128i dst1;
+ __m128i dst2;
for (y = 0; y < subband_width; y++)
{
/* Even coefficients */
for (n = 0; n < subband_width; n+=8)
{
- // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
+ /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
- __m128i l_n = _mm_load_si128((__m128i*) l_ptr);
+ l_n = _mm_load_si128((__m128i*) l_ptr);
- __m128i h_n = _mm_load_si128((__m128i*) h_ptr);
- __m128i h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - 1));
+ h_n = _mm_load_si128((__m128i*) h_ptr);
+ h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - 1));
if (n == 0)
{
- int first = _mm_extract_epi16(h_n_m, 1);
+ first = _mm_extract_epi16(h_n_m, 1);
h_n_m = _mm_insert_epi16(h_n_m, first, 0);
}
- __m128i tmp_n = _mm_add_epi16(h_n, h_n_m);
+ tmp_n = _mm_add_epi16(h_n, h_n_m);
tmp_n = _mm_add_epi16(tmp_n, _mm_set1_epi16(1));
tmp_n = _mm_srai_epi16(tmp_n, 1);
- __m128i dst_n = _mm_sub_epi16(l_n, tmp_n);
+ dst_n = _mm_sub_epi16(l_n, tmp_n);
_mm_store_si128((__m128i*) l_ptr, dst_n);
@@ -279,27 +305,27 @@ rfx_dwt_2d_decode_block_horiz_SSE2(sint16 * l, sint16 * h, sint16 * dst, int sub
/* Odd coefficients */
for (n = 0; n < subband_width; n+=8)
{
- // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
+ /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
- __m128i h_n = _mm_load_si128((__m128i*) h_ptr);
+ h_n = _mm_load_si128((__m128i*) h_ptr);
h_n = _mm_slli_epi16(h_n, 1);
- __m128i dst_n = _mm_load_si128((__m128i*) (l_ptr));
- __m128i dst_n_p = _mm_loadu_si128((__m128i*) (l_ptr + 1));
+ dst_n = _mm_load_si128((__m128i*) (l_ptr));
+ dst_n_p = _mm_loadu_si128((__m128i*) (l_ptr + 1));
if (n == subband_width - 8)
{
- int last = _mm_extract_epi16(dst_n_p, 6);
+ last = _mm_extract_epi16(dst_n_p, 6);
dst_n_p = _mm_insert_epi16(dst_n_p, last, 7);
}
- __m128i tmp_n = _mm_add_epi16(dst_n_p, dst_n);
+ tmp_n = _mm_add_epi16(dst_n_p, dst_n);
tmp_n = _mm_srai_epi16(tmp_n, 1);
tmp_n = _mm_add_epi16(tmp_n, h_n);
- __m128i dst1 = _mm_unpacklo_epi16(dst_n, tmp_n);
- __m128i dst2 = _mm_unpackhi_epi16(dst_n, tmp_n);
+ dst1 = _mm_unpacklo_epi16(dst_n, tmp_n);
+ dst2 = _mm_unpackhi_epi16(dst_n, tmp_n);
_mm_store_si128((__m128i*) dst_ptr, dst1);
_mm_store_si128((__m128i*) (dst_ptr + 8), dst2);
@@ -318,6 +344,13 @@ rfx_dwt_2d_decode_block_vert_SSE2(sint16 * l, sint16 * h, sint16 * dst, int subb
sint16 * l_ptr = l;
sint16 * h_ptr = h;
sint16 * dst_ptr = dst;
+ __m128i l_n;
+ __m128i h_n;
+ __m128i tmp_n;
+ __m128i h_n_m;
+ __m128i dst_n;
+ __m128i dst_n_m;
+ __m128i dst_n_p;
int total_width = subband_width + subband_width;
@@ -326,22 +359,22 @@ rfx_dwt_2d_decode_block_vert_SSE2(sint16 * l, sint16 * h, sint16 * dst, int subb
{
for (x = 0; x < total_width; x+=8)
{
- // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
+ /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
- __m128i l_n = _mm_load_si128((__m128i*) l_ptr);
- __m128i h_n = _mm_load_si128((__m128i*) h_ptr);
+ l_n = _mm_load_si128((__m128i*) l_ptr);
+ h_n = _mm_load_si128((__m128i*) h_ptr);
- __m128i tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1));;
+ tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1));;
if (n == 0)
tmp_n = _mm_add_epi16(tmp_n, h_n);
else
{
- __m128i h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - total_width));
+ h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - total_width));
tmp_n = _mm_add_epi16(tmp_n, h_n_m);
}
tmp_n = _mm_srai_epi16(tmp_n, 1);
- __m128i dst_n = _mm_sub_epi16(l_n, tmp_n);
+ dst_n = _mm_sub_epi16(l_n, tmp_n);
_mm_store_si128((__m128i*) dst_ptr, dst_n);
l_ptr+=8;
@@ -359,23 +392,23 @@ rfx_dwt_2d_decode_block_vert_SSE2(sint16 * l, sint16 * h, sint16 * dst, int subb
{
for (x = 0; x < total_width; x+=8)
{
- // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
+ /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
- __m128i h_n = _mm_load_si128((__m128i*) h_ptr);
- __m128i dst_n_m = _mm_load_si128((__m128i*) (dst_ptr - total_width));
+ h_n = _mm_load_si128((__m128i*) h_ptr);
+ dst_n_m = _mm_load_si128((__m128i*) (dst_ptr - total_width));
h_n = _mm_slli_epi16(h_n, 1);
- __m128i tmp_n = dst_n_m;
+ tmp_n = dst_n_m;
if (n == subband_width - 1)
tmp_n = _mm_add_epi16(tmp_n, dst_n_m);
else
{
- __m128i dst_n_p = _mm_loadu_si128((__m128i*) (dst_ptr + total_width));
+ dst_n_p = _mm_loadu_si128((__m128i*) (dst_ptr + total_width));
tmp_n = _mm_add_epi16(tmp_n, dst_n_p);
}
tmp_n = _mm_srai_epi16(tmp_n, 1);
- __m128i dst_n = _mm_add_epi16(tmp_n, h_n);
+ dst_n = _mm_add_epi16(tmp_n, h_n);
_mm_store_si128((__m128i*) dst_ptr, dst_n);
h_ptr+=8;
@@ -415,11 +448,160 @@ rfx_dwt_2d_decode_block_SSE2(sint16 * buffer, sint16 * idwt, int subband_width)
}
void
-rfx_dwt_2d_decode_SSE2(sint16 * buffer, sint16 * dwt_buffer_8, sint16 * dwt_buffer_16, sint16 * dwt_buffer_32)
+rfx_dwt_2d_decode_SSE2(sint16 * buffer, sint16 * dwt_buffer)
+{
+ _mm_prefetch_buffer((char *) buffer, 4096 * sizeof(sint16));
+
+ rfx_dwt_2d_decode_block_SSE2(buffer + 3840, dwt_buffer, 8);
+ rfx_dwt_2d_decode_block_SSE2(buffer + 3072, dwt_buffer, 16);
+ rfx_dwt_2d_decode_block_SSE2(buffer, dwt_buffer, 32);
+}
+
+static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+rfx_dwt_2d_encode_block_vert_SSE2(sint16 * src, sint16 * l, sint16 * h, int subband_width)
+{
+ int total_width;
+ int x;
+ int n;
+ __m128i src_2n;
+ __m128i src_2n_1;
+ __m128i src_2n_2;
+ __m128i h_n;
+ __m128i h_n_m;
+ __m128i l_n;
+
+ total_width = subband_width << 1;
+
+ for (n = 0; n < subband_width; n++)
+ {
+ for (x = 0; x < total_width; x += 8)
+ {
+ src_2n = _mm_load_si128((__m128i*) src);
+ src_2n_1 = _mm_load_si128((__m128i*) (src + total_width));
+ if (n < subband_width - 1)
+ src_2n_2 = _mm_load_si128((__m128i*) (src + 2 * total_width));
+ else
+ src_2n_2 = src_2n_1;
+
+ /* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
+
+ h_n = _mm_add_epi16(src_2n, src_2n_2);
+ h_n = _mm_srai_epi16(h_n, 1);
+ h_n = _mm_sub_epi16(src_2n_1, h_n);
+ h_n = _mm_srai_epi16(h_n, 1);
+
+ _mm_store_si128((__m128i*) h, h_n);
+
+ if (n == 0)
+ h_n_m = h_n;
+ else
+ h_n_m = _mm_load_si128((__m128i*) (h - total_width));
+
+ /* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
+
+ l_n = _mm_add_epi16(h_n_m, h_n);
+ l_n = _mm_srai_epi16(l_n, 1);
+ l_n = _mm_add_epi16(l_n, src_2n);
+
+ _mm_store_si128((__m128i*) l, l_n);
+
+ src += 8;
+ l += 8;
+ h += 8;
+ }
+ src += total_width;
+ }
+}
+
+static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+rfx_dwt_2d_encode_block_horiz_SSE2(sint16 * src, sint16 * l, sint16 * h, int subband_width)
+{
+ int y;
+ int n;
+ int first;
+ __m128i src_2n;
+ __m128i src_2n_1;
+ __m128i src_2n_2;
+ __m128i h_n;
+ __m128i h_n_m;
+ __m128i l_n;
+
+ for (y = 0; y < subband_width; y++)
+ {
+ for (n = 0; n < subband_width; n += 8)
+ {
+ /* The following 3 Set operations consumes more than half of the total DWT processing time! */
+ src_2n = _mm_set_epi16(src[14], src[12], src[10], src[8], src[6], src[4], src[2], src[0]);
+ src_2n_1 = _mm_set_epi16(src[15], src[13], src[11], src[9], src[7], src[5], src[3], src[1]);
+ src_2n_2 = _mm_set_epi16(n == subband_width - 8 ? src[15] : src[16],
+ src[14], src[12], src[10], src[8], src[6], src[4], src[2]);
+
+ /* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
+
+ h_n = _mm_add_epi16(src_2n, src_2n_2);
+ h_n = _mm_srai_epi16(h_n, 1);
+ h_n = _mm_sub_epi16(src_2n_1, h_n);
+ h_n = _mm_srai_epi16(h_n, 1);
+
+ _mm_store_si128((__m128i*) h, h_n);
+
+ h_n_m = _mm_loadu_si128((__m128i*) (h - 1));
+ if (n == 0)
+ {
+ first = _mm_extract_epi16(h_n_m, 1);
+ h_n_m = _mm_insert_epi16(h_n_m, first, 0);
+ }
+
+ /* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
+
+ l_n = _mm_add_epi16(h_n_m, h_n);
+ l_n = _mm_srai_epi16(l_n, 1);
+ l_n = _mm_add_epi16(l_n, src_2n);
+
+ _mm_store_si128((__m128i*) l, l_n);
+
+ src += 16;
+ l += 8;
+ h += 8;
+ }
+ }
+}
+
+static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+rfx_dwt_2d_encode_block_SSE2(sint16 * buffer, sint16 * dwt, int subband_width)
+{
+ sint16 * hl, * lh, * hh, * ll;
+ sint16 * l_src, * h_src;
+
+ _mm_prefetch_buffer((char *) dwt, subband_width * 4 * sizeof(sint16));
+
+ /* DWT in vertical direction, results in 2 sub-bands in L, H order in tmp buffer dwt. */
+
+ l_src = dwt;
+ h_src = dwt + subband_width * subband_width * 2;
+
+ rfx_dwt_2d_encode_block_vert_SSE2(buffer, l_src, h_src, subband_width);
+
+ /* DWT in horizontal direction, results in 4 sub-bands in HL(0), LH(1), HH(2), LL(3) order, stored in original buffer. */
+ /* The lower part L generates LL(3) and HL(0). */
+ /* The higher part H generates LH(1) and HH(2). */
+
+ ll = buffer + subband_width * subband_width * 3;
+ hl = buffer;
+
+ lh = buffer + subband_width * subband_width;
+ hh = buffer + subband_width * subband_width * 2;
+
+ rfx_dwt_2d_encode_block_horiz_SSE2(l_src, ll, hl, subband_width);
+ rfx_dwt_2d_encode_block_horiz_SSE2(h_src, lh, hh, subband_width);
+}
+
+void
+rfx_dwt_2d_encode_SSE2(sint16 * buffer, sint16 * dwt_buffer)
{
_mm_prefetch_buffer((char *) buffer, 4096 * sizeof(sint16));
- rfx_dwt_2d_decode_block_SSE2(buffer + 3840, dwt_buffer_8, 8);
- rfx_dwt_2d_decode_block_SSE2(buffer + 3072, dwt_buffer_16, 16);
- rfx_dwt_2d_decode_block_SSE2(buffer, dwt_buffer_32, 32);
+ rfx_dwt_2d_encode_block_SSE2(buffer, dwt_buffer, 32);
+ rfx_dwt_2d_encode_block_SSE2(buffer + 3072, dwt_buffer, 16);
+ rfx_dwt_2d_encode_block_SSE2(buffer + 3840, dwt_buffer, 8);
}
diff --git a/libfreerdp-rfx/sse/rfx_sse2.h b/libfreerdp-rfx/sse/rfx_sse2.h
index 8f35f7c..85921da 100644
--- a/libfreerdp-rfx/sse/rfx_sse2.h
+++ b/libfreerdp-rfx/sse/rfx_sse2.h
@@ -26,6 +26,7 @@ void rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sin
void rfx_encode_RGB_to_YCbCr_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer);
void rfx_quantization_decode_SSE2(sint16 * buffer, const uint32 * quantization_values);
void rfx_quantization_encode_SSE2(sint16 * buffer, const uint32 * quantization_values);
-void rfx_dwt_2d_decode_SSE2(sint16 * buffer, sint16 * dwt_buffer_8, sint16 * dwt_buffer_16, sint16 * dwt_buffer_32);
+void rfx_dwt_2d_decode_SSE2(sint16 * buffer, sint16 * dwt_buffer);
+void rfx_dwt_2d_encode_SSE2(sint16 * buffer, sint16 * dwt_buffer);
#endif /* __RFX_SSE2_H */
diff --git a/libfreerdp-utils/unicode.c b/libfreerdp-utils/unicode.c
index dda7f96..c4c6fc7 100644
--- a/libfreerdp-utils/unicode.c
+++ b/libfreerdp-utils/unicode.c
@@ -17,6 +17,7 @@
limitations under the License.
*/
+#include "config.h"
#include <errno.h>
#include <freerdp/utils/memory.h>