diff options
author | Marc-André Moreau <marcandre.moreau@gmail.com> | 2011-06-28 18:19:40 +0400 |
---|---|---|
committer | Marc-André Moreau <marcandre.moreau@gmail.com> | 2011-06-28 18:19:40 +0400 |
commit | 46c1d75a72521b548e8f0f677b86f3aab01027e8 (patch) | |
tree | 0a5f57437e5c7cfcfee3db4a71564bc4211a4b06 | |
parent | b2e7d3410410a6e79e95ed6c3ba354fdd4e7ed05 (diff) | |
parent | c78f823e681a9e53cb8f89d78ab9bdf65405656c (diff) |
Merge pull request #54 from FreeRDP/remotefx
Remotefx
58 files changed, 3264 insertions, 393 deletions
diff --git a/Makefile.am b/Makefile.am index 5d63bc9..daa2a61 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,9 +1,8 @@ ## Process this file with automake to produce Makefile.in REQUIRED_SUBDIRS = \ libfreerdp-asn1 \ - libfreerdp-gdi \ - libfreerdp-rfx/sse \ libfreerdp-rfx \ + libfreerdp-gdi \ libfreerdp-utils \ libfreerdp-core \ docs \ diff --git a/X11/xf_win.c b/X11/xf_win.c index ce1cc7f..3f6accf 100644 --- a/X11/xf_win.c +++ b/X11/xf_win.c @@ -1073,6 +1073,8 @@ RD_BOOL l_ui_check_certificate(rdpInst * inst, const char * fingerprint, const char * subject, const char * issuer, RD_BOOL verified) { + //char answer; + printf("certificate details:\n"); printf(" Subject:\n %s\n", subject); printf(" Issued by:\n %s\n", issuer); @@ -1081,7 +1083,17 @@ l_ui_check_certificate(rdpInst * inst, const char * fingerprint, if (!verified) printf("The server could not be authenticated. Connection security may be compromised!\n"); +#if 0 + printf("Accept this certificate? (Y/N): "); + answer = fgetc(stdin); + + if (answer == 'y' || answer == 'Y') + return True; + else + return False; +#else return True; +#endif } static int diff --git a/configure.ac b/configure.ac index 39e3863..6cf6e02 100644 --- a/configure.ac +++ b/configure.ac @@ -19,7 +19,9 @@ AH_TEMPLATE(IPv6, [IPv6]) AH_TEMPLATE(NEED_ALIGN, [Alignment]) AH_TEMPLATE(DISABLE_TLS, [Disable TLS encryption]) AH_TEMPLATE(WITH_SSE, [Enable SSE Optimizations]) +AH_TEMPLATE(WITH_NEON, [Enable NEON Optimizations]) AH_TEMPLATE(WITH_XKBFILE, [Use xkbfile for keyboard handling]) +AH_TEMPLATE(WITH_PROFILER, [Turn on the code profiler]) AH_TEMPLATE(WITH_DEBUG, [Turn on debugging messages]) AH_TEMPLATE(WITH_DEBUG_RDP, [Turn on debugging messages]) AH_TEMPLATE(WITH_DEBUG_GDI, [Turn on debugging messages]) @@ -1026,24 +1028,52 @@ AC_ARG_WITH([cunit], ) # -# SSE +# Profiler # -sse="yes" -AC_ARG_WITH([sse], - [AS_HELP_STRING([--with-sse], [Enable SSE Optimizations [default=yes]])]) -AS_IF([test "x$with_sse" == xno], +profiler="no" +AC_ARG_WITH([profiler], + [AS_HELP_STRING([--with-profiler], [enable the code profiler])]) +AS_IF([test "x$with_profiler" == xyes], [ - sse="no" - AM_CONDITIONAL(WITH_SSE, false) + profiler="yes" + AM_CONDITIONAL(WITH_PROFILER, true) + AC_DEFINE(WITH_PROFILER,1) ], [ - sse="yes" + profiler="no" + AM_CONDITIONAL(WITH_PROFILER, false) + ] +) + +# +# SSE +# +AM_CONDITIONAL(WITH_SSE, false) +AC_ARG_WITH(sse, + [ --with-sse enable SSE optimizations], + [ + if test $withval != "no"; + then AM_CONDITIONAL(WITH_SSE, true) AC_DEFINE(WITH_SSE,1) CFLAGS="$CFLAGS -msse2" - REQUIRED_SUBDIRS="libfreerdp-rfx/sse $REQUIRED_SUBDIRS" - ] -) + fi + ]) + +# +# NEON +# +AM_CONDITIONAL(WITH_NEON, false) +AC_ARG_WITH(neon, + [ --with-neon enable NEON optimizations], + [ + if test $withval != "no"; + then + AM_CONDITIONAL(WITH_NEON, true) + AC_DEFINE(WITH_NEON,1) + CFLAGS="$CFLAGS -mfpu=neon" + fi + ]) # # X11 @@ -1139,9 +1169,12 @@ AC_CONFIG_FILES([ Makefile freerdp.pc libfreerdp-asn1/Makefile -libfreerdp-gdi/Makefile libfreerdp-rfx/Makefile libfreerdp-rfx/sse/Makefile +libfreerdp-rfx/neon/Makefile +libfreerdp-gdi/Makefile +libfreerdp-gdi/sse/Makefile +libfreerdp-gdi/neon/Makefile libfreerdp-utils/Makefile libfreerdp-core/Makefile docs/Makefile diff --git a/cunit/test_librfx.c b/cunit/test_librfx.c index 42913d4..78c453d 100644 --- a/cunit/test_librfx.c +++ b/cunit/test_librfx.c @@ -32,6 +32,7 @@ #include "rfx_quantization.h" #include "rfx_dwt.h" #include "rfx_decode.h" +#include "rfx_encode.h" #include "test_librfx.h" @@ -60,7 +61,7 @@ static const uint8 y_data[] = static const uint8 cb_data[] = { - 0x1b, 0x04, 0x7f, 0x04, 0x31, 0x5f, 0xc2, + 0x1b, 0x04, 0x7f, 0x04, 0x31, 0x5f, 0xc2, 0x94, 0xaf, 0x05, 0x29, 0x5e, 0x0a, 0x52, 0xbc, 0x14, 0xa5, 0x78, 0x29, 0x25, 0x78, 0x29, 0x25, 0x78, 0x29, 0x25, 0x68, 0x52, 0x4a, 0xf0, 0x52, 0x4a, 0xf0, 0x52, 0x4a, 0xd0, 0xa4, 0x95, 0xe0, 0xa4, 0x95, 0xe0, 0xa4, 0x95, 0xa1, 0x49, 0x2b, 0xc1, 0x49, 0x2b, 0xc1, 0x49, 0x2b, 0x42, 0x92, @@ -85,7 +86,7 @@ static const uint8 cb_data[] = static const uint8 cr_data[] = { - 0x1b, 0xfc, 0x11, 0xc1, 0x0f, 0x4a, 0xc1, 0x4f, 0x4a, 0xc1, + 0x1b, 0xfc, 0x11, 0xc1, 0x0f, 0x4a, 0xc1, 0x4f, 0x4a, 0xc1, 0x4f, 0x4a, 0xa1, 0x4d, 0x95, 0x42, 0x9e, 0x95, 0x42, 0x9e, 0x95, 0x42, 0x9b, 0x2a, 0x85, 0x3d, 0x2a, 0x85, 0x3d, 0x2a, 0x85, 0x36, 0x55, 0x0a, 0x7a, 0x55, 0x0a, 0x7a, 0x55, 0x0a, 0x6c, 0xaa, 0x14, 0xf4, 0xaa, 0x14, 0xf4, 0xaa, 0x14, 0xd9, 0x54, 0x29, 0xe9, 0x54, 0x29, 0xe9, 0x54, 0x29, @@ -114,6 +115,44 @@ static const unsigned int test_quantization_values[] = 6, 6, 6, 6, 7, 7, 8, 8, 8, 9 }; +static const uint8 rgb_scanline_data[] = +{ + 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, + 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, + 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, + 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, + 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, + 0xFF, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, + 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, + 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, + 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, + 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, + 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, + 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, + 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, + 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, + 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, + 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, + + 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, + 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, + 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, + 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, + 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, + 0xFF, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, + 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, + 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, + 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, + 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, + 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, + 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, + 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, + 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, + 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, + 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF +}; + +static uint8 * rgb_data; int init_librfx_suite(void) { @@ -130,11 +169,14 @@ int add_librfx_suite(void) add_test_suite(librfx); add_test_function(bitstream); + add_test_function(bitstream_enc); add_test_function(rlgr); add_test_function(differential); add_test_function(quantization); add_test_function(dwt); add_test_function(decode); + add_test_function(encode); + add_test_function(message); return 0; } @@ -142,7 +184,7 @@ int add_librfx_suite(void) void test_bitstream(void) { - unsigned int b; + uint16 b; RFX_BITSTREAM * bs; bs = rfx_bitstream_new(); @@ -157,10 +199,33 @@ test_bitstream(void) //printf("\n"); } -static unsigned int buffer[4096]; +void +test_bitstream_enc(void) +{ + uint8 buffer[10]; + RFX_BITSTREAM * bs; + int i; + + bs = rfx_bitstream_new(); + memset(buffer, 0, sizeof(buffer)); + rfx_bitstream_put_buffer(bs, buffer, sizeof(buffer)); + for (i = 0; i < 16; i++) + { + rfx_bitstream_put_bits(bs, i, 5); + } + /*for (i = 0; i < sizeof(buffer); i++) + { + printf("%X ", buffer[i]); + }*/ + rfx_bitstream_free(bs); + + //printf("\n"); +} + +static sint16 buffer[4096]; void -dump_buffer(int * buf, int n) +dump_buffer(sint16 * buf, int n) { int i; @@ -168,7 +233,7 @@ dump_buffer(int * buf, int n) { if (i % 16 == 0) printf("\n%04d ", i); - printf("% 3d ", buf[i]); + printf("% 4d ", buf[i]); } printf("\n"); } @@ -194,16 +259,7 @@ test_differential(void) void test_quantization(void) { - rfx_quantization_decode(buffer, 1024, test_quantization_values[0]); /* HL1 */ - rfx_quantization_decode(buffer + 1024, 1024, test_quantization_values[1]); /* LH1 */ - rfx_quantization_decode(buffer + 2048, 1024, test_quantization_values[2]); /* HH1 */ - rfx_quantization_decode(buffer + 3072, 256, test_quantization_values[3]); /* HL2 */ - rfx_quantization_decode(buffer + 3328, 256, test_quantization_values[4]); /* LH2 */ - rfx_quantization_decode(buffer + 3584, 256, test_quantization_values[5]); /* HH2 */ - rfx_quantization_decode(buffer + 3840, 64, test_quantization_values[6]); /* HL3 */ - rfx_quantization_decode(buffer + 3904, 64, test_quantization_values[7]); /* LH3 */ - rfx_quantization_decode(buffer + 3868, 64, test_quantization_values[8]); /* HH3 */ - rfx_quantization_decode(buffer + 4032, 64, test_quantization_values[9]); /* LL3 */ + rfx_quantization_decode(buffer, test_quantization_values); //dump_buffer(buffer, 4096); } @@ -213,13 +269,30 @@ test_dwt(void) RFX_CONTEXT * context; context = rfx_context_new(); - rfx_dwt_2d_decode(context, (int*) buffer + 3840, 8); - rfx_dwt_2d_decode(context, (int*) buffer + 3072, 16); - rfx_dwt_2d_decode(context, (int*) buffer, 32); + rfx_dwt_2d_decode(buffer, context->dwt_buffer); //dump_buffer(buffer, 4096); rfx_context_free(context); } +static void +dump_ppm_image(uint8 * image_buf) +{ + /* Dump a .ppm image. */ + static int frame_id = 0; + char buf[100]; + FILE * fp; + + snprintf(buf, sizeof(buf), "/tmp/FreeRDP_Frame_%d.ppm", frame_id); + fp = fopen(buf, "wb"); + fwrite("P6\n", 1, 3, fp); + fwrite("64 64\n", 1, 6, fp); + fwrite("255\n", 1, 4, fp); + fwrite(image_buf, 1, 4096 * 3, fp); + fflush(fp); + fclose(fp); + frame_id++; +} + void test_decode(void) { @@ -236,20 +309,91 @@ test_decode(void) decode_buffer); rfx_context_free(context); - /* Dump a .ppm image. */ - static int frame_id = 0; - char buf[100]; - FILE * fp; + dump_ppm_image(decode_buffer); +} - snprintf(buf, sizeof(buf), "/tmp/FreeRDP_Frame_%d.ppm", frame_id); - fp = fopen(buf, "wb"); - fwrite("P6\n", 1, 3, fp); - fwrite("64 64\n", 1, 6, fp); - fwrite("255\n", 1, 4, fp); - fwrite(decode_buffer, 1, 4096 * 3, fp); - fflush(fp); - fclose(fp); - frame_id++; +void +test_encode(void) +{ + RFX_CONTEXT * context; + uint8 ycbcr_buffer[1024000]; + int y_size, cb_size, cr_size; + int i; + uint8 decode_buffer[4096 * 3]; + + rgb_data = (uint8 *) malloc(64 * 64 * 3); + for (i = 0; i < 64; i++) + memcpy(rgb_data + i * 64 * 3, rgb_scanline_data, 64 * 3); + //hexdump(rgb_data, 64 * 64 * 3); + + context = rfx_context_new(); + context->mode = RLGR3; + rfx_context_set_pixel_format(context, RFX_PIXEL_FORMAT_RGB); + + rfx_encode_rgb(context, rgb_data, 64, 64, 64 * 3, + test_quantization_values, test_quantization_values, test_quantization_values, + ycbcr_buffer, sizeof(ycbcr_buffer), &y_size, &cb_size, &cr_size); + //dump_buffer(context->cb_g_buffer, 4096); + + /*printf("*** Y ***\n"); + hexdump(ycbcr_buffer, y_size); + printf("*** Cb ***\n"); + hexdump(ycbcr_buffer + y_size, cb_size); + printf("*** Cr ***\n"); + hexdump(ycbcr_buffer + y_size + cb_size, cr_size);*/ + + rfx_decode_rgb(context, + ycbcr_buffer, y_size, test_quantization_values, + ycbcr_buffer + y_size, cb_size, test_quantization_values, + ycbcr_buffer + y_size + cb_size, cr_size, test_quantization_values, + decode_buffer); + dump_ppm_image(decode_buffer); + + rfx_context_free(context); + free(rgb_data); } +void +test_message(void) +{ + RFX_CONTEXT * context; + uint8 buffer[1024000]; + int size; + int i, j; + RFX_RECT rect = {0, 0, 100, 80}; + RFX_MESSAGE * message; + rgb_data = (uint8 *) malloc(100 * 80 * 3); + for (i = 0; i < 80; i++) + memcpy(rgb_data + i * 100 * 3, rgb_scanline_data, 100 * 3); + + context = rfx_context_new(); + context->mode = RLGR3; + context->width = 800; + context->height = 600; + rfx_context_set_pixel_format(context, RFX_PIXEL_FORMAT_RGB); + + size = rfx_compose_message_header(context, buffer, sizeof(buffer)); + /*hexdump(buffer, size);*/ + message = rfx_process_message(context, buffer, size); + rfx_message_free(context, message); + + for (i = 0; i < 1000; i++) + { + size = rfx_compose_message_data(context, buffer, sizeof(buffer), + &rect, 1, rgb_data, 100, 80, 100 * 3); + /*hexdump(buffer, size);*/ + message = rfx_process_message(context, buffer, size); + if (i == 0) + { + for (j = 0; j < message->num_tiles; j++) + { + dump_ppm_image(message->tiles[j]->data); + } + } + rfx_message_free(context, message); + } + + rfx_context_free(context); + free(rgb_data); +} diff --git a/cunit/test_librfx.h b/cunit/test_librfx.h index 726146a..51d9bcf 100644 --- a/cunit/test_librfx.h +++ b/cunit/test_librfx.h @@ -26,6 +26,8 @@ int add_librfx_suite(void); void test_bitstream(void); void +test_bitstream_enc(void); +void test_rlgr(void); void test_differential(void); @@ -35,4 +37,8 @@ void test_dwt(void); void test_decode(void); +void +test_encode(void); +void +test_message(void); diff --git a/dfb/Makefile.am b/dfb/Makefile.am index c65c5bc..42079ce 100644 --- a/dfb/Makefile.am +++ b/dfb/Makefile.am @@ -23,6 +23,7 @@ dfbfreerdp_LDADD = \ ../libfreerdp-kbd/libfreerdp-kbd.la \ ../libfreerdp-chanman/libfreerdp-chanman.la \ ../libfreerdp-core/libfreerdp-core.la \ + ../libfreerdp-utils/libfreerdp-utils.la \ @DFB_LIBS@ -lfusion -ldirect -lz diff --git a/dfb/dfb_win.c b/dfb/dfb_win.c index 5dd7c89..ac92c57 100644 --- a/dfb/dfb_win.c +++ b/dfb/dfb_win.c @@ -568,12 +568,16 @@ dfb_post_connect(rdpInst * inst) } void -dfb_uninit(void * dfb_info) +dfb_uninit(rdpInst * inst) { - dfbInfo * dfbi; - dfbi = (dfbInfo *) dfb_info; - dfbi->primary->Release(dfbi->primary); - dfbi->dfb->Release(dfbi->dfb); + dfbInfo *dfbi = GET_DFBI(inst); + + if (inst->settings->software_gdi == 1) + { + gdi_free(inst); + dfbi->primary->Release(dfbi->primary); + dfbi->dfb->Release(dfbi->dfb); + } } int diff --git a/dfb/dfb_win.h b/dfb/dfb_win.h index b33ca39..7dfcb05 100644 --- a/dfb/dfb_win.h +++ b/dfb/dfb_win.h @@ -29,7 +29,7 @@ dfb_pre_connect(rdpInst * inst); int dfb_post_connect(rdpInst * inst); void -dfb_uninit(void * dfb_info); +dfb_uninit(rdpInst * inst); int dfb_get_fds(rdpInst * inst, void ** read_fds, int * read_count, void ** write_fds, int * write_count); diff --git a/dfb/dfbfreerdp.c b/dfb/dfbfreerdp.c index 275fe81..c207523 100644 --- a/dfb/dfbfreerdp.c +++ b/dfb/dfbfreerdp.c @@ -503,7 +503,6 @@ static int run_dfbfreerdp(rdpSet * settings, rdpChanMan * chan_man) { rdpInst * inst; - void * dfb_info; void * read_fds[32]; void * write_fds[32]; int read_count; @@ -643,10 +642,9 @@ run_dfbfreerdp(rdpSet * settings, rdpChanMan * chan_man) } } /* cleanup */ - dfb_info = inst->param1; inst->rdp_disconnect(inst); + dfb_uninit(inst); freerdp_free(inst); - dfb_uninit(dfb_info); return 0; } diff --git a/include/freerdp/constants/ui.h b/include/freerdp/constants/ui.h index 02c17e9..61db60c 100644 --- a/include/freerdp/constants/ui.h +++ b/include/freerdp/constants/ui.h @@ -54,7 +54,7 @@ #define PERF_ENABLE_DESKTOP_COMPOSITION 0x00000100 /* Surface Command */ -#define CMDTYPE_SET_SURFACE_BITS 0x0001 +#define CMDTYPE_SET_SURFACE_BITS 0x0001 #define CMDTYPE_FRAME_MARKER 0x0004 #define CMDTYPE_STREAM_SURFACE_BITS 0x0006 diff --git a/include/freerdp/rfx.h b/include/freerdp/rfx.h index 346a26f..932f5f5 100644 --- a/include/freerdp/rfx.h +++ b/include/freerdp/rfx.h @@ -22,6 +22,8 @@ #include "types/base.h" +#include <freerdp/utils/profiler.h> + #ifdef __cplusplus extern "C" { #endif @@ -39,9 +41,13 @@ extern "C" { #define WBT_FRAME_END 0xCCC5 #define WBT_REGION 0xCCC6 #define WBT_EXTENSION 0xCCC7 +#define CBT_REGION 0xCAC1 #define CBT_TILESET 0xCAC2 #define CBT_TILE 0xCAC3 +/* tileSize */ +#define CT_TILE_64x64 0x0040 + /* properties.flags */ #define CODEC_MODE 0x02 @@ -120,7 +126,8 @@ typedef struct _RFX_MESSAGE RFX_MESSAGE; struct _RFX_CONTEXT { - int flags; + uint16 flags; + uint16 properties; uint16 width; uint16 height; RLGR_MODE mode; @@ -128,29 +135,58 @@ struct _RFX_CONTEXT uint32 codec_id; uint32 codec_version; RFX_PIXEL_FORMAT pixel_format; + uint8 bytes_per_pixel; /* temporary data within a frame */ + uint32 frame_idx; uint8 num_quants; uint32 * quants; + uint8 quant_idx_y; + uint8 quant_idx_cb; + uint8 quant_idx_cr; /* pre-allocated buffers */ RFX_POOL* pool; /* memory pool */ - uint32 y_r_mem[4096+4]; /* 4096 = 64x64 (+ 4x4 = 16 for mem align) */ - uint32 cb_g_mem[4096+4]; /* 4096 = 64x64 (+ 4x4 = 16 for mem align) */ - uint32 cr_b_mem[4096+4]; /* 4096 = 64x64 (+ 4x4 = 16 for mem align) */ + sint16 y_r_mem[4096+8]; /* 4096 = 64x64 (+ 8x2 = 16 for mem align) */ + sint16 cb_g_mem[4096+8]; /* 4096 = 64x64 (+ 8x2 = 16 for mem align) */ + sint16 cr_b_mem[4096+8]; /* 4096 = 64x64 (+ 8x2 = 16 for mem align) */ - uint32* y_r_buffer; - uint32* cb_g_buffer; - uint32* cr_b_buffer; + sint16 * y_r_buffer; + sint16 * cb_g_buffer; + sint16 * cr_b_buffer; - uint32 idwt_buffer_8[256]; /* sub-band width 8 */ - uint32 idwt_buffer_16[1024]; /* sub-band width 16 */ - uint32 idwt_buffer_32[4096]; /* sub-band width 32 */ - uint32* idwt_buffers[5]; /* sub-band buffer array */ - - void (* decode_YCbCr_to_RGB)(uint32 * y_r_buf, uint32 * cb_g_buf, uint32 * cr_b_buf); + sint16 dwt_mem[32*32*2*2 + 8]; /* maximum sub-band width is 32 */ + + sint16 * dwt_buffer; + + /* routines */ + void (* decode_YCbCr_to_RGB)(sint16 * y_r_buf, sint16 * cb_g_buf, sint16 * cr_b_buf); + void (* encode_RGB_to_YCbCr)(sint16 * y_r_buf, sint16 * cb_g_buf, sint16 * cr_b_buf); + void (* quantization_decode)(sint16 * buffer, const uint32 * quantization_values); + void (* quantization_encode)(sint16 * buffer, const uint32 * quantization_values); + void (* dwt_2d_decode)(sint16 * buffer, sint16 * dwt_buffer); + void (* dwt_2d_encode)(sint16 * buffer, sint16 * dwt_buffer); + + /* profiler definitions */ + PROFILER_DEFINE(prof_rfx_decode_rgb); + PROFILER_DEFINE(prof_rfx_decode_component); + PROFILER_DEFINE(prof_rfx_rlgr_decode); + PROFILER_DEFINE(prof_rfx_differential_decode); + PROFILER_DEFINE(prof_rfx_quantization_decode); + PROFILER_DEFINE(prof_rfx_dwt_2d_decode); + PROFILER_DEFINE(prof_rfx_decode_YCbCr_to_RGB); + PROFILER_DEFINE(prof_rfx_decode_format_RGB); + + PROFILER_DEFINE(prof_rfx_encode_rgb); + PROFILER_DEFINE(prof_rfx_encode_component); + PROFILER_DEFINE(prof_rfx_rlgr_encode); + PROFILER_DEFINE(prof_rfx_differential_encode); + PROFILER_DEFINE(prof_rfx_quantization_encode); + PROFILER_DEFINE(prof_rfx_dwt_2d_encode); + PROFILER_DEFINE(prof_rfx_encode_RGB_to_YCbCr); + PROFILER_DEFINE(prof_rfx_encode_format_RGB); }; typedef struct _RFX_CONTEXT RFX_CONTEXT; @@ -158,9 +194,13 @@ RFX_CONTEXT* rfx_context_new(void); void rfx_context_free(RFX_CONTEXT * context); void rfx_context_set_pixel_format(RFX_CONTEXT * context, RFX_PIXEL_FORMAT pixel_format); -RFX_MESSAGE* rfx_process_message(RFX_CONTEXT * context, uint8 * data, int data_size); +RFX_MESSAGE* rfx_process_message(RFX_CONTEXT * context, uint8 * data, int size); void rfx_message_free(RFX_CONTEXT * context, RFX_MESSAGE * message); +int rfx_compose_message_header(RFX_CONTEXT * context, uint8 * buffer, int buffer_size); +int rfx_compose_message_data(RFX_CONTEXT * context, uint8 * buffer, int buffer_size, + const RFX_RECT * rects, int num_rects, uint8 * image_data, int width, int height, int rowstride); + #ifdef __cplusplus } #endif diff --git a/include/freerdp/utils/Makefile.am b/include/freerdp/utils/Makefile.am index eec4969..410babf 100644 --- a/include/freerdp/utils/Makefile.am +++ b/include/freerdp/utils/Makefile.am @@ -6,7 +6,9 @@ include_HEADERS = \ chan_plugin.h \ datablob.h \ memory.h \ + profiler.h \ semaphore.h \ + stopwatch.h \ stream.h \ unicode.h \ wait_obj.h diff --git a/include/freerdp/utils/profiler.h b/include/freerdp/utils/profiler.h new file mode 100644 index 0000000..1a149c1 --- /dev/null +++ b/include/freerdp/utils/profiler.h @@ -0,0 +1,71 @@ +/* + FreeRDP: A Remote Desktop Protocol client. + Profiler Utils + + Copyright 2011 Stephen Erisman + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __UTILS_PROFILER_H +#define __UTILS_PROFILER_H + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <stdio.h> + +#include <freerdp/utils/memory.h> +#include <freerdp/utils/stopwatch.h> + +struct _PROFILER +{ + char *name; + STOPWATCH *stopwatch; +}; +typedef struct _PROFILER PROFILER; + +PROFILER * profiler_create(char * name); +void profiler_free(PROFILER * profiler); + +void profiler_enter(PROFILER * profiler); +void profiler_exit(PROFILER * profiler); + +void profiler_print_header(); +void profiler_print(PROFILER * profiler); +void profiler_print_footer(); + +#ifdef WITH_PROFILER +#define IF_PROFILER(then) then +#define PROFILER_DEFINE(prof) PROFILER * prof +#define PROFILER_CREATE(prof,name) prof = profiler_create(name) +#define PROFILER_FREE(prof) profiler_free(prof) +#define PROFILER_ENTER(prof) profiler_enter(prof) +#define PROFILER_EXIT(prof) profiler_exit(prof) +#define PROFILER_PRINT_HEADER profiler_print_header() +#define PROFILER_PRINT(prof) profiler_print(prof) +#define PROFILER_PRINT_FOOTER profiler_print_footer() +#else +#define IF_PROFILER(then) ; +#define PROFILER_DEFINE(prof) ; +#define PROFILER_CREATE(prof,name) ; +#define PROFILER_FREE(prof) ; +#define PROFILER_ENTER(prof) ; +#define PROFILER_EXIT(prof) ; +#define PROFILER_PRINT_HEADER ; +#define PROFILER_PRINT(prof) ; +#define PROFILER_PRINT_FOOTER ; +#endif + +#endif /* __UTILS_PROFILER_H */ diff --git a/include/freerdp/utils/stopwatch.h b/include/freerdp/utils/stopwatch.h new file mode 100644 index 0000000..02db7ea --- /dev/null +++ b/include/freerdp/utils/stopwatch.h @@ -0,0 +1,44 @@ +/* + FreeRDP: A Remote Desktop Protocol client. + Stopwatch Utils + + Copyright 2011 Stephen Erisman + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __UTILS_STOPWATCH_H +#define __UTILS_STOPWATCH_H + +#include <freerdp/utils/memory.h> +#include <time.h> + +struct _STOPWATCH +{ + clock_t start; + clock_t end; + double elapsed; + clock_t count; +}; +typedef struct _STOPWATCH STOPWATCH; + +STOPWATCH * stopwatch_create(); +void stopwatch_free(STOPWATCH * stopwatch); + +void stopwatch_start(STOPWATCH * stopwatch); +void stopwatch_stop(STOPWATCH * stopwatch); +void stopwatch_reset(STOPWATCH * stopwatch); + +double stopwatch_get_elapsed_time_in_seconds(STOPWATCH * stopwatch); + +#endif /* __UTILS_STOPWATCH_H */ diff --git a/libfreerdp-gdi/Makefile.am b/libfreerdp-gdi/Makefile.am index bda50bc..2210079 100644 --- a/libfreerdp-gdi/Makefile.am +++ b/libfreerdp-gdi/Makefile.am @@ -21,6 +21,7 @@ libfreerdp_gdi_la_SOURCES = \ gdi_8bpp.c gdi_8bpp.h \ color.c color.h \ decode.c decode.h \ + libgdi.h \ gdi.c gdi.h libfreerdp_gdi_la_CFLAGS = \ @@ -30,9 +31,21 @@ libfreerdp_gdi_la_CFLAGS = \ libfreerdp_gdi_la_LDFLAGS = -libfreerdp_gdi_la_LIBDADD = \ +libfreerdp_gdi_la_LIBADD = \ ../libfreerdp-rfx/libfreerdp-rfx.la +if WITH_SSE +SUBDIRS = sse +libfreerdp_gdi_la_CFLAGS += -I./sse +libfreerdp_gdi_la_LIBADD += ./sse/libfreerdp-gdi-sse.la +endif + +if WITH_NEON +SUBDIRS = neon +libfreerdp_gdi_la_CFLAGS += -I./neon +libfreerdp_gdi_la_LIBADD += ./neon/libfreerdp-gdi-neon.la +endif + # extra EXTRA_DIST = diff --git a/libfreerdp-gdi/color.c b/libfreerdp-gdi/color.c index 371f69a..76f72e9 100644 --- a/libfreerdp-gdi/color.c +++ b/libfreerdp-gdi/color.c @@ -618,7 +618,7 @@ uint8* gdi_image_convert_32bpp(uint8* srcData, uint8* dstData, int width, int he return srcData; } -p_gdi_image_convert gdi_image_convert_[5] = +p_gdi_image_convert_bpp gdi_image_convert_[5] = { NULL, gdi_image_convert_8bpp, @@ -629,7 +629,7 @@ p_gdi_image_convert gdi_image_convert_[5] = uint8* gdi_image_convert(uint8* srcData, uint8* dstData, int width, int height, int srcBpp, int dstBpp, HCLRCONV clrconv) { - p_gdi_image_convert _p_gdi_image_convert = gdi_image_convert_[IBPP(srcBpp)]; + p_gdi_image_convert_bpp _p_gdi_image_convert = gdi_image_convert_[IBPP(srcBpp)]; if (_p_gdi_image_convert != NULL) return _p_gdi_image_convert(srcData, dstData, width, height, srcBpp, dstBpp, clrconv); diff --git a/libfreerdp-gdi/color.h b/libfreerdp-gdi/color.h index 2e116c1..14e4f1e 100644 --- a/libfreerdp-gdi/color.h +++ b/libfreerdp-gdi/color.h @@ -233,7 +233,7 @@ typedef CLRCONV* HCLRCONV; #define IBPP(_bpp) (((_bpp + 1)/ 8) % 5) -typedef uint8* (*p_gdi_image_convert)(uint8* srcData, uint8* dstData, int width, int height, int srcBpp, int dstBpp, HCLRCONV clrconv); +typedef uint8* (*p_gdi_image_convert_bpp)(uint8* srcData, uint8* dstData, int width, int height, int srcBpp, int dstBpp, HCLRCONV clrconv); int gdi_get_pixel(uint8 * data, int x, int y, int width, int height, int bpp); void gdi_set_pixel(uint8* data, int x, int y, int width, int height, int bpp, int pixel); @@ -244,6 +244,8 @@ uint8* gdi_mono_image_convert(uint8* srcData, int width, int height, int srcBpp, int gdi_mono_cursor_convert(uint8* srcData, uint8* maskData, uint8* xorMask, uint8* andMask, int width, int height, int bpp, HCLRCONV clrconv); int gdi_alpha_cursor_convert(uint8* alphaData, uint8* xorMask, uint8* andMask, int width, int height, int bpp, HCLRCONV clrconv); +typedef uint8* (*p_gdi_image_convert)(uint8* srcData, uint8 *dstData, int width, int height, int srcBpp, int dstBpp, HCLRCONV clrconv); + #ifdef __cplusplus } #endif diff --git a/libfreerdp-gdi/decode.c b/libfreerdp-gdi/decode.c index 133d706..9f3f4c0 100644 --- a/libfreerdp-gdi/decode.c +++ b/libfreerdp-gdi/decode.c @@ -30,67 +30,150 @@ #include "decode.h" -void gdi_decode_frame(GDI *gdi, int x, int y, uint8 * data, uint32 length) +int gdi_decode_bitmap_data_ex(GDI *gdi, uint16 x, uint16 y, uint8 * data, int size) { - int i, tx, ty; + int i, j; + int tx, ty; + uint8* bitmapData; + uint32 bitmapDataLength; RFX_MESSAGE * message; - message = rfx_process_message((RFX_CONTEXT *) gdi->rfx_context, data, length); + /* BITMAP_DATA_EX */ + /* bpp (1 byte) */ + /* reserved1 (1 byte) */ + /* reserved2 (1 byte) */ + /* codecID (1 byte) */ + /* width (2 bytes) */ + /* height (2 bytes) */ + bitmapDataLength = GET_UINT32(data, 8); /* bitmapDataLength (4 bytes) */ + bitmapData = data + 12; /* bitmapData */ - for (i = 0; i < message->num_rects; i++) + /* decode bitmap data */ + message = rfx_process_message((RFX_CONTEXT *) gdi->rfx_context, bitmapData, bitmapDataLength); + + if (message->num_rects > 1) /* RDVH */ { - tx = message->rects[i].x + x; - ty = message->rects[i].y + y; - gdi_SetClipRgn(gdi->primary->hdc, tx, ty, message->rects[i].width, message->rects[i].height); - } + /* blit each tile */ + for (i = 0; i < message->num_tiles; i++) + { + tx = message->tiles[i]->x + x; + ty = message->tiles[i]->y + y; + data = message->tiles[i]->data; + + gdi_image_convert(data, gdi->tile->bitmap->data, 64, 64, 32, 32, gdi->clrconv); + + for (j = 0; j < message->num_rects; j++) + { + gdi_SetClipRgn(gdi->primary->hdc, + message->rects[j].x, message->rects[j].y, + message->rects[j].width, message->rects[j].height); + + gdi_BitBlt(gdi->primary->hdc, tx, ty, 64, 64, gdi->tile->hdc, 0, 0, GDI_SRCCOPY); + } + } - for (i = 0; i < message->num_tiles; i++) + for (i = 0; i < message->num_rects; i++) + { + gdi_InvalidateRegion(gdi->primary->hdc, + message->rects[i].x, message->rects[i].y, + message->rects[i].width, message->rects[i].height); + } + } + else /* RDSH */ { - tx = message->tiles[i]->x + x; - ty = message->tiles[i]->y + y; - data = message->tiles[i]->data; + /* blit each tile */ + for (i = 0; i < message->num_tiles; i++) + { + tx = message->tiles[i]->x + x; + ty = message->tiles[i]->y + y; + data = message->tiles[i]->data; - gdi_image_convert(data, gdi->tile->bitmap->data, 64, 64, 32, 32, gdi->clrconv); - gdi_BitBlt(gdi->primary->hdc, tx, ty, 64, 64, gdi->tile->hdc, 0, 0, GDI_SRCCOPY); + gdi_image_convert(data, gdi->tile->bitmap->data, 64, 64, 32, 32, gdi->clrconv); - gdi_InvalidateRegion(gdi->primary->hdc, tx, ty, 64, 64); + gdi_BitBlt(gdi->primary->hdc, tx, ty, 64, 64, gdi->tile->hdc, 0, 0, GDI_SRCCOPY); + + gdi_InvalidateRegion(gdi->primary->hdc, tx, ty, 64, 64); + } } rfx_message_free(gdi->rfx_context, message); + + return bitmapDataLength + 12; } -void gdi_decode_data(GDI *gdi, uint8 * data, int data_size) +int gdi_decode_surface_bits(GDI *gdi, uint8 * data, int size) { - int size; - int destLeft; - int destTop; + int length; + uint16 destLeft; + uint16 destTop; + uint16 destRight; + uint16 destBottom; + + /* SURFCMD_STREAM_SURF_BITS */ + /* cmdType (2 bytes) */ + destLeft = GET_UINT16(data, 2); /* destLeft (2 bytes) */ + destTop = GET_UINT16(data, 4); /* destTop (2 bytes) */ + destRight = GET_UINT16(data, 6); /* destRight (2 bytes) */ + destBottom = GET_UINT16(data, 8); /* destBottom (2 bytes) */ + + /* set clipping region */ + gdi_SetClipRgn(gdi->primary->hdc, destLeft, destTop, destRight - destLeft, destBottom - destTop); + + /* decode extended bitmap data */ + length = gdi_decode_bitmap_data_ex(gdi, destLeft, destTop, data + 10, size - 10) + 10; + + return length; +} + +int gdi_decode_frame_marker(GDI *gdi, uint8 * data, int size) +{ + uint16 frameAction; + uint32 frameId; + + frameAction = GET_UINT16(data, 0); /* frameAction */ + frameId = GET_UINT32(data, 2); /* frameId */ + + switch (frameAction) + { + case SURFACECMD_FRAMEACTION_BEGIN: + break; + + case SURFACECMD_FRAMEACTION_END: + break; + + default: + break; + } + + return 8; +} + +void gdi_decode_data(GDI *gdi, uint8 * data, int size) +{ + int cmdLength; uint16 cmdType; - uint32 length; - while (data_size > 0) + while (size > 0) { - cmdType = GET_UINT16(data, 0); + cmdType = GET_UINT16(data, 0); /* cmdType */ switch (cmdType) { case CMDTYPE_SET_SURFACE_BITS: case CMDTYPE_STREAM_SURFACE_BITS: - destLeft = GET_UINT16(data, 2); - destTop = GET_UINT16(data, 4); - length = GET_UINT32(data, 18); - gdi_decode_frame(gdi, destLeft, destTop, data + 22, length); - size = 22 + length; + cmdLength = gdi_decode_surface_bits(gdi, data, size); break; case CMDTYPE_FRAME_MARKER: - size = 8; + cmdLength = gdi_decode_frame_marker(gdi, data, size); break; default: - size = 2; + cmdLength = 2; break; } - data_size -= size; - data += size; + + size -= cmdLength; + data += cmdLength; } } diff --git a/libfreerdp-gdi/decode.h b/libfreerdp-gdi/decode.h index ccadb29..408241a 100644 --- a/libfreerdp-gdi/decode.h +++ b/libfreerdp-gdi/decode.h @@ -25,7 +25,7 @@ #include "gdi.h" -void gdi_decode_frame(GDI *gdi, int x, int y, uint8 * data, uint32 length); -void gdi_decode_data(GDI *gdi, uint8 * data, int data_size); +void gdi_decode_bitmap_data(GDI *gdi, int x, int y, uint8 * data, uint32 length); +void gdi_decode_data(GDI *gdi, uint8 * data, int size); #endif /* __DECODE_H */ diff --git a/libfreerdp-gdi/gdi.c b/libfreerdp-gdi/gdi.c index 8e86736..ab25825 100644 --- a/libfreerdp-gdi/gdi.c +++ b/libfreerdp-gdi/gdi.c @@ -23,19 +23,7 @@ #include <freerdp/rfx.h> #include <freerdp/freerdp.h> -#include "color.h" -#include "decode.h" - -#include "gdi_dc.h" -#include "gdi_pen.h" -#include "gdi_line.h" -#include "gdi_shape.h" -#include "gdi_brush.h" -#include "gdi_region.h" -#include "gdi_bitmap.h" -#include "gdi_palette.h" -#include "gdi_drawing.h" -#include "gdi_clipping.h" +#include "libgdi.h" #include "gdi.h" @@ -1188,6 +1176,11 @@ gdi_init(rdpInst * inst, uint32 flags) gdi_register_callbacks(inst); + gdi->BitBlt = gdi_BitBlt; + gdi->gdi_image_convert = gdi_image_convert; + + GDI_INIT_SIMD(gdi); + return 0; } @@ -1197,6 +1190,8 @@ void gdi_free(rdpInst* inst) if (gdi) { + gdi_bitmap_free(gdi->tile); + rfx_context_free(gdi->rfx_context); gdi_bitmap_free(gdi->primary); gdi_DeleteObject((HGDIOBJECT) gdi->hdc); free(gdi->clrconv); diff --git a/libfreerdp-gdi/gdi.h b/libfreerdp-gdi/gdi.h index 9398d36..38f2194 100644 --- a/libfreerdp-gdi/gdi.h +++ b/libfreerdp-gdi/gdi.h @@ -229,6 +229,17 @@ struct _GDI_IMAGE typedef struct _GDI_IMAGE GDI_IMAGE; typedef GDI_IMAGE* HGDI_IMAGE; +#include "gdi_dc.h" +#include "gdi_pen.h" +#include "gdi_line.h" +#include "gdi_shape.h" +#include "gdi_brush.h" +#include "gdi_region.h" +#include "gdi_bitmap.h" +#include "gdi_palette.h" +#include "gdi_drawing.h" +#include "gdi_clipping.h" + struct _GDI { int width; @@ -247,9 +258,15 @@ struct _GDI GDI_COLOR textColor; void * rfx_context; GDI_IMAGE *tile; + + /* callbacks */ + p_gdi_BitBlt BitBlt; + p_gdi_image_convert gdi_image_convert; }; typedef struct _GDI GDI; +#include "decode.h" + uint32 gdi_rop3_code(uint8 code); void gdi_copy_mem(uint8 *d, uint8 *s, int n); void gdi_copy_memb(uint8 *d, uint8 *s, int n); diff --git a/libfreerdp-gdi/gdi_bitmap.c b/libfreerdp-gdi/gdi_bitmap.c index 973a4ea..c5af510 100644 --- a/libfreerdp-gdi/gdi_bitmap.c +++ b/libfreerdp-gdi/gdi_bitmap.c @@ -31,7 +31,7 @@ #include "gdi_bitmap.h" -pBitBlt BitBlt_[5] = +p_gdi_BitBlt_bpp BitBlt_[5] = { NULL, BitBlt_8bpp, @@ -180,7 +180,7 @@ HGDI_BITMAP gdi_CreateCompatibleBitmap(HGDI_DC hdc, int nWidth, int nHeight) int gdi_BitBlt(HGDI_DC hdcDest, int nXDest, int nYDest, int nWidth, int nHeight, HGDI_DC hdcSrc, int nXSrc, int nYSrc, int rop) { - pBitBlt _BitBlt = BitBlt_[IBPP(hdcDest->bitsPerPixel)]; + p_gdi_BitBlt_bpp _BitBlt = BitBlt_[IBPP(hdcDest->bitsPerPixel)]; if (_BitBlt != NULL) return _BitBlt(hdcDest, nXDest, nYDest, nWidth, nHeight, hdcSrc, nXSrc, nYSrc, rop); diff --git a/libfreerdp-gdi/gdi_bitmap.h b/libfreerdp-gdi/gdi_bitmap.h index f6943a0..e4f7604 100644 --- a/libfreerdp-gdi/gdi_bitmap.h +++ b/libfreerdp-gdi/gdi_bitmap.h @@ -37,6 +37,7 @@ HGDI_BITMAP gdi_CreateBitmap(int nWidth, int nHeight, int cBitsPerPixel, uint8* HGDI_BITMAP gdi_CreateCompatibleBitmap(HGDI_DC hdc, int nWidth, int nHeight); int gdi_BitBlt(HGDI_DC hdcDest, int nXDest, int nYDest, int nWidth, int nHeight, HGDI_DC hdcSrc, int nXSrc, int nYSrc, int rop); -typedef int (*pBitBlt)(HGDI_DC hdcDest, int nXDest, int nYDest, int nWidth, int nHeight, HGDI_DC hdcSrc, int nXSrc, int nYSrc, int rop); +typedef int (*p_gdi_BitBlt_bpp)(HGDI_DC hdcDest, int nXDest, int nYDest, int nWidth, int nHeight, HGDI_DC hdcSrc, int nXSrc, int nYSrc, int rop); +typedef int (*p_gdi_BitBlt)(HGDI_DC hdcDest, int nXDest, int nYDest, int nWidth, int nHeight, HGDI_DC hdcSrc, int nXSrc, int nYSrc, int rop); #endif /* __GDI_BITMAP_H */ diff --git a/libfreerdp-gdi/libgdi.h b/libfreerdp-gdi/libgdi.h new file mode 100644 index 0000000..22f48f1 --- /dev/null +++ b/libfreerdp-gdi/libgdi.h @@ -0,0 +1,35 @@ +/* + FreeRDP: A Remote Desktop Protocol client. + GDI Library + + Copyright 2011 Marc-Andre Moreau <marcandre.moreau@gmail.com> + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __LIBGDI_H +#define __LIBGDI_H + +#ifdef WITH_SSE +#include "gdi_sse.h" +#endif + +#ifdef WITH_NEON +#include "gdi_neon.h" +#endif + +#ifndef GDI_INIT_SIMD +#define GDI_INIT_SIMD(_gdi) do { } while (0) +#endif + +#endif /* __LIBGDI_H */ diff --git a/libfreerdp-gdi/neon/Makefile.am b/libfreerdp-gdi/neon/Makefile.am new file mode 100644 index 0000000..27aee48 --- /dev/null +++ b/libfreerdp-gdi/neon/Makefile.am @@ -0,0 +1,25 @@ +## Process this file with automake to produce Makefile.in + +# libfreerdp-gdi-neon +noinst_LTLIBRARIES = libfreerdp-gdi-neon.la + +libfreerdp_gdi_neon_la_SOURCES = + +if WITH_NEON +libfreerdp_gdi_neon_la_SOURCES += \ + gdi_neon.c gdi_neon.h +endif + +libfreerdp_gdi_neon_la_CFLAGS = \ + -I$(top_srcdir) \ + -I$(top_srcdir)/include \ + -I.. + +libfreerdp_gdi_neon_la_LDFLAGS = + +libfreerdp_gdi_neon_la_LIBDADD = + +# extra +EXTRA_DIST = + +DISTCLEANFILES = diff --git a/libfreerdp-gdi/neon/gdi_neon.c b/libfreerdp-gdi/neon/gdi_neon.c new file mode 100644 index 0000000..a0a92b7 --- /dev/null +++ b/libfreerdp-gdi/neon/gdi_neon.c @@ -0,0 +1,32 @@ +/* + FreeRDP: A Remote Desktop Protocol client. + GDI NEON Optimizations + + Copyright 2011 Marc-Andre Moreau <marcandre.moreau@gmail.com> + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include <stdio.h> +#include <string.h> +#include <stdlib.h> + +#include <freerdp/freerdp.h> +#include "gdi.h" + +#include "gdi_neon.h" + +void gdi_init_neon(GDI* gdi) +{ + +} diff --git a/libfreerdp-gdi/neon/gdi_neon.h b/libfreerdp-gdi/neon/gdi_neon.h new file mode 100644 index 0000000..ef94d46 --- /dev/null +++ b/libfreerdp-gdi/neon/gdi_neon.h @@ -0,0 +1,31 @@ +/* + FreeRDP: A Remote Desktop Protocol client. + GDI NEON Optimizations + + Copyright 2011 Marc-Andre Moreau <marcandre.moreau@gmail.com> + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __GDI_NEON_H +#define __GDI_NEON_H + +#include "gdi.h" + +void gdi_init_neon(GDI* gdi); + +#ifndef GDI_INIT_SIMD +#define GDI_INIT_SIMD(_gdi) gdi_init_neon(_gdi) +#endif + +#endif /* __GDI_NEON_H */ diff --git a/libfreerdp-gdi/sse/Makefile.am b/libfreerdp-gdi/sse/Makefile.am new file mode 100644 index 0000000..23f6e27 --- /dev/null +++ b/libfreerdp-gdi/sse/Makefile.am @@ -0,0 +1,25 @@ +## Process this file with automake to produce Makefile.in + +# libfreerdp-gdi-sse +noinst_LTLIBRARIES = libfreerdp-gdi-sse.la + +libfreerdp_gdi_sse_la_SOURCES = + +if WITH_SSE +libfreerdp_gdi_sse_la_SOURCES += \ + gdi_sse.c gdi_sse.h +endif + +libfreerdp_gdi_sse_la_CFLAGS = \ + -I$(top_srcdir) \ + -I$(top_srcdir)/include \ + -I.. + +libfreerdp_gdi_sse_la_LDFLAGS = + +libfreerdp_gdi_sse_la_LIBDADD = + +# extra +EXTRA_DIST = + +DISTCLEANFILES = diff --git a/libfreerdp-gdi/sse/gdi_sse.c b/libfreerdp-gdi/sse/gdi_sse.c new file mode 100644 index 0000000..fb0cc8a --- /dev/null +++ b/libfreerdp-gdi/sse/gdi_sse.c @@ -0,0 +1,32 @@ +/* + FreeRDP: A Remote Desktop Protocol client. + GDI SSE Optimizations + + Copyright 2011 Marc-Andre Moreau <marcandre.moreau@gmail.com> + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include <stdio.h> +#include <string.h> +#include <stdlib.h> + +#include <freerdp/freerdp.h> +#include "gdi.h" + +#include "gdi_sse.h" + +void gdi_init_sse(GDI* gdi) +{ + +} diff --git a/libfreerdp-gdi/sse/gdi_sse.h b/libfreerdp-gdi/sse/gdi_sse.h new file mode 100644 index 0000000..e326503 --- /dev/null +++ b/libfreerdp-gdi/sse/gdi_sse.h @@ -0,0 +1,31 @@ +/* + FreeRDP: A Remote Desktop Protocol client. + GDI SSE Optimizations + + Copyright 2011 Marc-Andre Moreau <marcandre.moreau@gmail.com> + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __GDI_SSE_H +#define __GDI_SSE_H + +#include "gdi.h" + +void gdi_init_sse(GDI* gdi); + +#ifndef GDI_INIT_SIMD +#define GDI_INIT_SIMD(_gdi) gdi_init_sse(_gdi) +#endif + +#endif /* __GDI_SSE_H */ diff --git a/libfreerdp-rfx/Makefile.am b/libfreerdp-rfx/Makefile.am index bc60460..908dd06 100644 --- a/libfreerdp-rfx/Makefile.am +++ b/libfreerdp-rfx/Makefile.am @@ -12,6 +12,7 @@ libfreerdp_rfx_la_SOURCES = \ rfx_quantization.c rfx_quantization.h \ rfx_dwt.c rfx_dwt.h \ rfx_decode.c rfx_decode.h \ + rfx_encode.c rfx_encode.h \ rfx_pool.c rfx_pool.h \ librfx.c librfx.h @@ -24,10 +25,17 @@ libfreerdp_rfx_la_LDFLAGS = libfreerdp_rfx_la_LIBADD = if WITH_SSE +SUBDIRS = sse libfreerdp_rfx_la_CFLAGS += -I$(top_srcdir)/libfreerdp-rfx/sse libfreerdp_rfx_la_LIBADD += sse/libfreerdp-rfx-sse.la endif +if WITH_NEON +SUBDIRS = neon +libfreerdp_rfx_la_CFLAGS += -I$(top_srcdir)/libfreerdp-rfx/neon +libfreerdp_rfx_la_LIBADD += neon/libfreerdp-rfx-neon.la +endif + # extra EXTRA_DIST = diff --git a/libfreerdp-rfx/librfx.c b/libfreerdp-rfx/librfx.c index 6bc9f66..5d9b5a6 100644 --- a/libfreerdp-rfx/librfx.c +++ b/libfreerdp-rfx/librfx.c @@ -27,9 +27,96 @@ #include "rfx_pool.h" #include "rfx_decode.h" +#include "rfx_encode.h" +#include "rfx_quantization.h" +#include "rfx_dwt.h" #include "librfx.h" +/* + The quantization values control the compression rate and quality. The value + range is between 6 and 15. The higher value, the higher compression rate + and lower quality. + + This is the default values being use by the MS RDP server, and we will also + use it as our default values for the encoder. It can be overrided by setting + the context->num_quants and context->quants member. + + The order of the values are: + LL3, LH3, HL3, HH3, LH2, HL2, HH2, LH1, HL1, HH1 +*/ +static const uint32 rfx_default_quantization_values[] = +{ + 6, 6, 6, 6, 7, 7, 8, 8, 8, 9 +}; + +void rfx_profiler_create(RFX_CONTEXT * context) +{ + PROFILER_CREATE(context->prof_rfx_decode_rgb, "rfx_decode_rgb"); + PROFILER_CREATE(context->prof_rfx_decode_component, "rfx_decode_component"); + PROFILER_CREATE(context->prof_rfx_rlgr_decode, "rfx_rlgr_decode"); + PROFILER_CREATE(context->prof_rfx_differential_decode, "rfx_differential_decode"); + PROFILER_CREATE(context->prof_rfx_quantization_decode, "rfx_quantization_decode"); + PROFILER_CREATE(context->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode"); + PROFILER_CREATE(context->prof_rfx_decode_YCbCr_to_RGB, "rfx_decode_YCbCr_to_RGB"); + PROFILER_CREATE(context->prof_rfx_decode_format_RGB, "rfx_decode_format_RGB"); + + PROFILER_CREATE(context->prof_rfx_encode_rgb, "rfx_encode_rgb"); + PROFILER_CREATE(context->prof_rfx_encode_component, "rfx_encode_component"); + PROFILER_CREATE(context->prof_rfx_rlgr_encode, "rfx_rlgr_encode"); + PROFILER_CREATE(context->prof_rfx_differential_encode, "rfx_differential_encode"); + PROFILER_CREATE(context->prof_rfx_quantization_encode, "rfx_quantization_encode"); + PROFILER_CREATE(context->prof_rfx_dwt_2d_encode, "rfx_dwt_2d_encode"); + PROFILER_CREATE(context->prof_rfx_encode_RGB_to_YCbCr, "rfx_encode_RGB_to_YCbCr"); + PROFILER_CREATE(context->prof_rfx_encode_format_RGB, "rfx_encode_format_RGB"); +} + +void rfx_profiler_free(RFX_CONTEXT * context) +{ + PROFILER_FREE(context->prof_rfx_decode_rgb); + PROFILER_FREE(context->prof_rfx_decode_component); + PROFILER_FREE(context->prof_rfx_rlgr_decode); + PROFILER_FREE(context->prof_rfx_differential_decode); + PROFILER_FREE(context->prof_rfx_quantization_decode); + PROFILER_FREE(context->prof_rfx_dwt_2d_decode); + PROFILER_FREE(context->prof_rfx_decode_YCbCr_to_RGB); + PROFILER_FREE(context->prof_rfx_decode_format_RGB); + + PROFILER_FREE(context->prof_rfx_encode_rgb); + PROFILER_FREE(context->prof_rfx_encode_component); + PROFILER_FREE(context->prof_rfx_rlgr_encode); + PROFILER_FREE(context->prof_rfx_differential_encode); + PROFILER_FREE(context->prof_rfx_quantization_encode); + PROFILER_FREE(context->prof_rfx_dwt_2d_encode); + PROFILER_FREE(context->prof_rfx_encode_RGB_to_YCbCr); + PROFILER_FREE(context->prof_rfx_encode_format_RGB); +} + +void rfx_profiler_print(RFX_CONTEXT * context) +{ + PROFILER_PRINT_HEADER; + + PROFILER_PRINT(context->prof_rfx_decode_rgb); + PROFILER_PRINT(context->prof_rfx_decode_component); + PROFILER_PRINT(context->prof_rfx_rlgr_decode); + PROFILER_PRINT(context->prof_rfx_differential_decode); + PROFILER_PRINT(context->prof_rfx_quantization_decode); + PROFILER_PRINT(context->prof_rfx_dwt_2d_decode); + PROFILER_PRINT(context->prof_rfx_decode_YCbCr_to_RGB); + PROFILER_PRINT(context->prof_rfx_decode_format_RGB); + + PROFILER_PRINT(context->prof_rfx_encode_rgb); + PROFILER_PRINT(context->prof_rfx_encode_component); + PROFILER_PRINT(context->prof_rfx_rlgr_encode); + PROFILER_PRINT(context->prof_rfx_differential_encode); + PROFILER_PRINT(context->prof_rfx_quantization_encode); + PROFILER_PRINT(context->prof_rfx_dwt_2d_encode); + PROFILER_PRINT(context->prof_rfx_encode_RGB_to_YCbCr); + PROFILER_PRINT(context->prof_rfx_encode_format_RGB); + + PROFILER_PRINT_FOOTER; +} + RFX_CONTEXT * rfx_context_new(void) { @@ -40,17 +127,26 @@ rfx_context_new(void) context->pool = rfx_pool_new(); + /* initialize the default pixel format */ + rfx_context_set_pixel_format(context, RFX_PIXEL_FORMAT_BGRA); + /* align buffers to 16 byte boundary (needed for SSE/SSE2 instructions) */ - context->y_r_buffer = (uint32 *)(((uintptr_t)context->y_r_mem + 16) & ~ 0x0F); - context->cb_g_buffer = (uint32 *)(((uintptr_t)context->cb_g_mem + 16) & ~ 0x0F); - context->cr_b_buffer = (uint32 *)(((uintptr_t)context->cr_b_mem + 16) & ~ 0x0F); + context->y_r_buffer = (sint16 *)(((uintptr_t)context->y_r_mem + 16) & ~ 0x0F); + context->cb_g_buffer = (sint16 *)(((uintptr_t)context->cb_g_mem + 16) & ~ 0x0F); + context->cr_b_buffer = (sint16 *)(((uintptr_t)context->cr_b_mem + 16) & ~ 0x0F); + + context->dwt_buffer = (sint16 *)(((uintptr_t)context->dwt_mem + 16) & ~ 0x0F); - context->idwt_buffers[1] = (uint32*) context->idwt_buffer_8; - context->idwt_buffers[2] = (uint32*) context->idwt_buffer_16; - context->idwt_buffers[4] = (uint32*) context->idwt_buffer_32; + /* create profilers for default decoding routines */ + rfx_profiler_create(context); - /* set up default decoding routines */ + /* set up default routines */ context->decode_YCbCr_to_RGB = rfx_decode_YCbCr_to_RGB; + context->encode_RGB_to_YCbCr = rfx_encode_RGB_to_YCbCr; + context->quantization_decode = rfx_quantization_decode; + context->quantization_encode = rfx_quantization_encode; + context->dwt_2d_decode = rfx_dwt_2d_decode; + context->dwt_2d_encode = rfx_dwt_2d_encode; /* detect and enable SIMD CPU acceleration */ RFX_INIT_SIMD(context); @@ -66,6 +162,9 @@ rfx_context_free(RFX_CONTEXT * context) rfx_pool_free(context->pool); + rfx_profiler_print(context); + rfx_profiler_free(context); + if (context != NULL) free(context); } @@ -74,14 +173,29 @@ void rfx_context_set_pixel_format(RFX_CONTEXT * context, RFX_PIXEL_FORMAT pixel_format) { context->pixel_format = pixel_format; + switch (pixel_format) + { + case RFX_PIXEL_FORMAT_BGRA: + case RFX_PIXEL_FORMAT_RGBA: + context->bytes_per_pixel = 4; + break; + case RFX_PIXEL_FORMAT_BGR: + case RFX_PIXEL_FORMAT_RGB: + context->bytes_per_pixel = 3; + break; + default: + context->bytes_per_pixel = 0; + break; + } } static void -rfx_process_message_sync(RFX_CONTEXT * context, uint8 * data, int data_size) +rfx_process_message_sync(RFX_CONTEXT * context, uint8 * data, int size) { uint32 magic; - magic = GET_UINT32(data, 0); + /* RFX_SYNC */ + magic = GET_UINT32(data, 0); /* magic (4 bytes), 0xCACCACCA */ if (magic != WF_MAGIC) { @@ -89,7 +203,7 @@ rfx_process_message_sync(RFX_CONTEXT * context, uint8 * data, int data_size) return; } - context->version = GET_UINT16(data, 4); + context->version = GET_UINT16(data, 4); /* version (2 bytes), WF_VERSION_1_0 (0x0100) */ if (context->version != WF_VERSION_1_0) { @@ -101,70 +215,68 @@ rfx_process_message_sync(RFX_CONTEXT * context, uint8 * data, int data_size) } static void -rfx_process_message_codec_versions(RFX_CONTEXT * context, uint8 * data, int data_size) +rfx_process_message_codec_versions(RFX_CONTEXT * context, uint8 * data, int size) { int numCodecs; - numCodecs = GET_UINT8(data, 0); + numCodecs = GET_UINT8(data, 0); /* numCodecs (1 byte), must be set to 0x01 */ - if (numCodecs < 1) + if (numCodecs != 1) { - DEBUG_RFX("no version."); + DEBUG_RFX("numCodecs: %d, expected:1", numCodecs); return; } - context->codec_id = GET_UINT8(data, 1); - context->codec_version = GET_UINT16(data, 2); + /* RFX_CODEC_VERSIONT */ + context->codec_id = GET_UINT8(data, 1); /* codecId (1 byte) */ + context->codec_version = GET_UINT16(data, 2); /* version (2 bytes) */ DEBUG_RFX("id %d version 0x%X.", context->codec_id, context->codec_version); } static void -rfx_process_message_channels(RFX_CONTEXT * context, uint8 * data, int data_size) +rfx_process_message_channels(RFX_CONTEXT * context, uint8 * data, int size) { int channelId; - int numChannels; + uint8 numChannels; - numChannels = GET_UINT8(data, 0); + numChannels = GET_UINT8(data, 0); /* numChannels (1 byte), must bet set to 0x01 */ - if (numChannels < 1) + if (numChannels != 1) { - DEBUG_RFX("no channel."); + DEBUG_RFX("numChannels:%d, expected:1", numChannels); return; } - channelId = GET_UINT8(data, 1); - context->width = GET_UINT16(data, 2); - context->height = GET_UINT16(data, 4); + /* RFX_CHANNELT */ + channelId = GET_UINT8(data, 1); /* channelId (1 byte) */ + context->width = GET_UINT16(data, 2); /* width (2 bytes) */ + context->height = GET_UINT16(data, 4); /* height (2 bytes) */ DEBUG_RFX("numChannels %d id %d, %dx%d.", numChannels, channelId, context->width, context->height); } static void -rfx_process_message_context(RFX_CONTEXT * context, uint8 * data, int data_size) +rfx_process_message_context(RFX_CONTEXT * context, uint8 * data, int size) { uint8 ctxId; - uint8 codecId; - uint8 channelId; uint16 tileSize; uint16 properties; - codecId = GET_UINT8(data, 0); - channelId = GET_UINT8(data, 1); - ctxId = GET_UINT8(data, 2); - tileSize = GET_UINT16(data, 3); - properties = GET_UINT16(data, 5); + ctxId = GET_UINT8(data, 0); /* ctxId (1 byte), must be set to 0x00 */ + tileSize = GET_UINT16(data, 1); /* tileSize (2 bytes), must be set to CT_TILE_64x64 (0x0040) */ + properties = GET_UINT16(data, 3); /* properties (2 bytes) */ - DEBUG_RFX("codec %d channel %d ctx %d tileSize %d properties 0x%X.", - codecId, channelId, ctxId, tileSize, properties); + DEBUG_RFX("ctxId %d tileSize %d properties 0x%X.", ctxId, tileSize, properties); + context->properties = properties; context->flags = (properties & 0x0007); if (context->flags == CODEC_MODE) - DEBUG_RFX("codec in image mode."); + DEBUG_RFX("codec is in image mode."); else - DEBUG_RFX("codec in video mode."); + DEBUG_RFX("codec is in video mode."); switch ((properties & 0x1E00) >> 9) { @@ -185,11 +297,30 @@ rfx_process_message_context(RFX_CONTEXT * context, uint8 * data, int data_size) } static void -rfx_process_message_region(RFX_CONTEXT * context, RFX_MESSAGE * message, uint8 * data, int data_size) +rfx_process_message_frame_begin(RFX_CONTEXT * context, RFX_MESSAGE * message, uint8 * data, int size) +{ + uint32 frameIdx; + uint16 numRegions; + + frameIdx = GET_UINT32(data, 0); /* frameIdx (4 bytes), if codec is in video mode, must be ignored */ + numRegions = GET_UINT16(data, 4); /* numRegions (2 bytes) */ + + DEBUG_RFX("RFX_FRAME_BEGIN: frameIdx:%d numRegions:%d", frameIdx, numRegions); +} + +static void +rfx_process_message_frame_end(RFX_CONTEXT * context, RFX_MESSAGE * message, uint8 * data, int size) +{ + DEBUG_RFX("RFX_FRAME_END"); +} + +static void +rfx_process_message_region(RFX_CONTEXT * context, RFX_MESSAGE * message, uint8 * data, int size) { int i; - message->num_rects = GET_UINT16(data, 3); + /* regionFlags (1 byte) */ + message->num_rects = GET_UINT16(data, 1); /* numRects (2 bytes) */ if (message->num_rects < 1) { @@ -202,26 +333,28 @@ rfx_process_message_region(RFX_CONTEXT * context, RFX_MESSAGE * message, uint8 * else message->rects = (RFX_RECT*) malloc(message->num_rects * sizeof(RFX_RECT)); - data += 5; - data_size -= 5; + data += 3; + size -= 3; - for (i = 0; i < message->num_rects && data_size > 0; i++) + /* rects */ + for (i = 0; i < message->num_rects && size > 0; i++) { - message->rects[i].x = GET_UINT16(data, 0); - message->rects[i].y = GET_UINT16(data, 2); - message->rects[i].width = GET_UINT16(data, 4); - message->rects[i].height = GET_UINT16(data, 6); + /* RFX_RECT */ + message->rects[i].x = GET_UINT16(data, 0); /* x (2 bytes) */ + message->rects[i].y = GET_UINT16(data, 2); /* y (2 bytes) */ + message->rects[i].width = GET_UINT16(data, 4); /* width (2 bytes) */ + message->rects[i].height = GET_UINT16(data, 6); /* height (2 bytes) */ DEBUG_RFX("rect %d (%d %d %d %d).", i, message->rects[i].x, message->rects[i].y, message->rects[i].width, message->rects[i].height); data += 8; - data_size -= 8; + size -= 8; } } static void -rfx_process_message_tile(RFX_CONTEXT * context, RFX_TILE * tile, uint8 * data, int data_size) +rfx_process_message_tile(RFX_CONTEXT * context, RFX_TILE * tile, uint8 * data, int size) { uint8 quantIdxY; uint8 quantIdxCb; @@ -229,14 +362,15 @@ rfx_process_message_tile(RFX_CONTEXT * context, RFX_TILE * tile, uint8 * data, i uint16 xIdx, yIdx; uint16 YLen, CbLen, CrLen; - quantIdxY = GET_UINT8(data, 0); - quantIdxCb = GET_UINT8(data, 1); - quantIdxCr = GET_UINT8(data, 2); - xIdx = GET_UINT16(data, 3); - yIdx = GET_UINT16(data, 5); - YLen = GET_UINT16(data, 7); - CbLen = GET_UINT16(data, 9); - CrLen = GET_UINT16(data, 11); + /* RFX_TILE */ + quantIdxY = GET_UINT8(data, 0); /* quantIdxY (1 byte) */ + quantIdxCb = GET_UINT8(data, 1); /* quantIdxCb (1 byte) */ + quantIdxCr = GET_UINT8(data, 2); /* quantIdxCr (1 byte) */ + xIdx = GET_UINT16(data, 3); /* xIdx (2 bytes) */ + yIdx = GET_UINT16(data, 5); /* yIdx (2 bytes) */ + YLen = GET_UINT16(data, 7); /* YLen (2 bytes) */ + CbLen = GET_UINT16(data, 9); /* CbLen (2 bytes) */ + CrLen = GET_UINT16(data, 11); /* CrLen (2 bytes) */ DEBUG_RFX("quantIdxY:%d quantIdxCb:%d quantIdxCr:%d xIdx:%d yIdx:%d YLen:%d CbLen:%d CrLen:%d", quantIdxY, quantIdxCb, quantIdxCr, xIdx, yIdx, YLen, CbLen, CrLen); @@ -253,14 +387,27 @@ rfx_process_message_tile(RFX_CONTEXT * context, RFX_TILE * tile, uint8 * data, i } static void -rfx_process_message_tileset(RFX_CONTEXT * context, RFX_MESSAGE * message, uint8 * data, int data_size) +rfx_process_message_tileset(RFX_CONTEXT * context, RFX_MESSAGE * message, uint8 * data, int size) { int i; + uint16 subtype; uint32 blockLen; uint32 blockType; - uint32 tileDataSize; + uint32 tilesDataSize; + + subtype = GET_UINT16(data, 0); /* subtype (2 bytes) must be set to CBT_TILESET (0xCAC2) */ + + if (subtype != CBT_TILESET) + { + DEBUG_RFX("invalid subtype, expected CBT_TILESET."); + return; + } - context->num_quants = GET_UINT8(data, 4); + /* idx (2 bytes), must be set to 0x0000 */ + /* properties (2 bytes) */ + + context->num_quants = GET_UINT8(data, 6); /* numQuant (1 byte) */ + /* tileSize (1 byte), must be set to 0x40 */ if (context->num_quants < 1) { @@ -268,7 +415,7 @@ rfx_process_message_tileset(RFX_CONTEXT * context, RFX_MESSAGE * message, uint8 return; } - message->num_tiles = GET_UINT16(data, 6); + message->num_tiles = GET_UINT16(data, 8); /* numTiles (2 bytes) */ if (message->num_tiles < 1) { @@ -276,18 +423,20 @@ rfx_process_message_tileset(RFX_CONTEXT * context, RFX_MESSAGE * message, uint8 return; } - tileDataSize = GET_UINT32(data, 8); + tilesDataSize = GET_UINT32(data, 10); /* tilesDataSize (4 bytes) */ - data += 12; - data_size -= 12; + data += 14; + size -= 14; if (context->quants != NULL) context->quants = (uint32*) realloc((void*) context->quants, context->num_quants * 10 * sizeof(uint32)); else context->quants = (uint32*) malloc(context->num_quants * 10 * sizeof(uint32)); - for (i = 0; i < context->num_quants && data_size > 0; i++) + /* quantVals */ + for (i = 0; i < context->num_quants && size > 0; i++) { + /* RFX_CODEC_QUANT */ context->quants[i * 10] = (data[0] & 0x0F); context->quants[i * 10 + 1] = (data[0] >> 4); context->quants[i * 10 + 2] = (data[1] & 0x0F); @@ -307,36 +456,35 @@ rfx_process_message_tileset(RFX_CONTEXT * context, RFX_MESSAGE * message, uint8 context->quants[i * 10 + 8], context->quants[i * 10 + 9]); data += 5; - data_size -= 5; + size -= 5; } message->tiles = rfx_pool_get_tiles(context->pool, message->num_tiles); - for (i = 0; i < message->num_tiles && data_size > 0; i++) + /* tiles */ + for (i = 0; i < message->num_tiles && size > 0; i++) { - blockType = GET_UINT16(data, 0); - blockLen = GET_UINT32(data, 2); + /* RFX_TILE */ + blockType = GET_UINT16(data, 0); /* blockType (2 bytes), must be set to CBT_TILE (0xCAC3) */ + blockLen = GET_UINT32(data, 2); /* blockLen (4 bytes) */ - switch (blockType) + if (blockType != CBT_TILE) { - case CBT_TILE: - rfx_process_message_tile(context, message->tiles[i], data + 6, blockLen - 6); - break; - - default: - DEBUG_RFX("unknown block type 0x%X", blockType); - break; + DEBUG_RFX("unknown block type 0x%X, expected CBT_TILE (0xCAC3).", blockType); + break; } - data_size -= blockLen; + rfx_process_message_tile(context, message->tiles[i], data + 6, blockLen - 6); + + size -= blockLen; data += blockLen; } } RFX_MESSAGE * -rfx_process_message(RFX_CONTEXT * context, uint8 * data, int data_size) +rfx_process_message(RFX_CONTEXT * context, uint8 * data, int size) { - uint32 subtype; + uint32 offset; uint32 blockLen; uint32 blockType; RFX_MESSAGE * message; @@ -344,50 +492,55 @@ rfx_process_message(RFX_CONTEXT * context, uint8 * data, int data_size) message = (RFX_MESSAGE *) malloc(sizeof(RFX_MESSAGE)); memset(message, 0, sizeof(RFX_MESSAGE)); - while (data_size > 0) + while (size > 0) { - blockType = GET_UINT16(data, 0); - blockLen = GET_UINT32(data, 2); + /* RFX_BLOCKT */ + blockType = GET_UINT16(data, 0); /* blockType (2 bytes) */ + blockLen = GET_UINT32(data, 2); /* blockLen (4 bytes) */ + offset = 6; + DEBUG_RFX("blockType 0x%X blockLen %d", blockType, blockLen); + if (blockType >= WBT_CONTEXT && blockType <= WBT_EXTENSION) + { + /* RFX_CODEC_CHANNELT */ + /* codecId (1 byte) must be set to 0x01 */ + /* channelId (1 byte) must be set to 0x00 */ + offset = 8; + } + switch (blockType) { case WBT_SYNC: - rfx_process_message_sync(context, data + 6, blockLen - 6); + rfx_process_message_sync(context, data + offset, blockLen - offset); break; case WBT_CODEC_VERSIONS: - rfx_process_message_codec_versions(context, data + 6, blockLen - 6); + rfx_process_message_codec_versions(context, data + offset, blockLen - offset); break; case WBT_CHANNELS: - rfx_process_message_channels(context, data + 6, blockLen - 6); + rfx_process_message_channels(context, data + offset, blockLen - offset); break; case WBT_CONTEXT: - rfx_process_message_context(context, data + 6, blockLen - 6); + rfx_process_message_context(context, data + offset, blockLen - offset); break; case WBT_FRAME_BEGIN: + rfx_process_message_frame_begin(context, message, data + offset, blockLen - offset); + break; + case WBT_FRAME_END: - /* Can be ignored. */ + rfx_process_message_frame_end(context, message, data + offset, blockLen - offset); break; case WBT_REGION: - rfx_process_message_region(context, message, data + 6, blockLen - 6); + rfx_process_message_region(context, message, data + offset, blockLen - offset); break; case WBT_EXTENSION: - subtype = GET_UINT16(data, 8); - switch (subtype) - { - case CBT_TILESET: - rfx_process_message_tileset(context, message, data + 10, blockLen - 10); - break; - default: - DEBUG_RFX("unknown subtype 0x%X", subtype); - break; - } + rfx_process_message_tileset(context, message, data + offset, blockLen - offset); break; default: @@ -395,7 +548,7 @@ rfx_process_message(RFX_CONTEXT * context, uint8 * data, int data_size) break; } - data_size -= blockLen; + size -= blockLen; data += blockLen; } @@ -419,3 +572,320 @@ rfx_message_free(RFX_CONTEXT * context, RFX_MESSAGE * message) free(message); } } + +static int +rfx_compose_message_sync(RFX_CONTEXT * context, uint8 * buffer, int buffer_size) +{ + if (buffer_size < 12) + { + printf("rfx_compose_message_sync: buffer size too small.\n"); + return 0; + } + + SET_UINT16(buffer, 0, WBT_SYNC); /* BlockT.blockType */ + SET_UINT32(buffer, 2, 12); /* BlockT.blockLen */ + SET_UINT32(buffer, 6, WF_MAGIC); /* magic */ + SET_UINT16(buffer, 10, WF_VERSION_1_0); /* version */ + + return 12; +} + +static int +rfx_compose_message_codec_versions(RFX_CONTEXT * context, uint8 * buffer, int buffer_size) +{ + if (buffer_size < 10) + { + printf("rfx_compose_message_codec_versions: buffer size too small.\n"); + return 0; + } + + SET_UINT16(buffer, 0, WBT_CODEC_VERSIONS); /* BlockT.blockType */ + SET_UINT32(buffer, 2, 10); /* BlockT.blockLen */ + SET_UINT8(buffer, 6, 1); /* numCodecs */ + SET_UINT8(buffer, 7, 1); /* codecs.codecId */ + SET_UINT16(buffer, 8, WF_VERSION_1_0); /* codecs.version */ + + return 10; +} + +static int +rfx_compose_message_channels(RFX_CONTEXT * context, uint8 * buffer, int buffer_size) +{ + if (buffer_size < 12) + { + printf("rfx_compose_message_channels: buffer size too small.\n"); + return 0; + } + + SET_UINT16(buffer, 0, WBT_CHANNELS); /* BlockT.blockType */ + SET_UINT32(buffer, 2, 12); /* BlockT.blockLen */ + SET_UINT8(buffer, 6, 1); /* numChannels */ + SET_UINT8(buffer, 7, 0); /* Channel.channelId */ + SET_UINT16(buffer, 8, context->width); /* Channel.width */ + SET_UINT16(buffer, 10, context->height); /* Channel.height */ + + return 12; +} + +static int +rfx_compose_message_context(RFX_CONTEXT * context, uint8 * buffer, int buffer_size) +{ + uint16 properties; + + if (buffer_size < 13) + { + printf("rfx_compose_message_context: buffer size too small.\n"); + return 0; + } + + SET_UINT16(buffer, 0, WBT_CONTEXT); /* CodecChannelT.blockType */ + SET_UINT32(buffer, 2, 13); /* CodecChannelT.blockLen */ + SET_UINT8(buffer, 6, 1); /* CodecChannelT.codecId */ + SET_UINT8(buffer, 7, 0); /* CodecChannelT.channelId */ + SET_UINT8(buffer, 8, 0); /* ctxId */ + SET_UINT16(buffer, 9, CT_TILE_64x64); /* tileSize */ + + /* properties */ + properties = context->flags; /* flags */ + properties |= (COL_CONV_ICT << 3); /* cct */ + properties |= (CLW_XFORM_DWT_53_A << 5); /* xft */ + properties |= ((context->mode == RLGR1 ? CLW_ENTROPY_RLGR1 : CLW_ENTROPY_RLGR3) << 9); /* et */ + properties |= (SCALAR_QUANTIZATION << 13); /* qt */ + SET_UINT16(buffer, 11, properties); + context->properties = properties; + + return 13; +} + +int +rfx_compose_message_header(RFX_CONTEXT * context, uint8 * buffer, int buffer_size) +{ + int composed_size; + + composed_size = rfx_compose_message_sync(context, buffer, buffer_size); + composed_size += rfx_compose_message_codec_versions(context, buffer + composed_size, buffer_size - composed_size); + composed_size += rfx_compose_message_channels(context, buffer + composed_size, buffer_size - composed_size); + composed_size += rfx_compose_message_context(context, buffer + composed_size, buffer_size - composed_size); + + return composed_size; +} + +static int +rfx_compose_message_frame_begin(RFX_CONTEXT * context, uint8 * buffer, int buffer_size) +{ + if (buffer_size < 14) + { + printf("rfx_compose_message_frame_begin: buffer size too small.\n"); + return 0; + } + + SET_UINT16(buffer, 0, WBT_FRAME_BEGIN); /* CodecChannelT.blockType */ + SET_UINT32(buffer, 2, 14); /* CodecChannelT.blockLen */ + SET_UINT8(buffer, 6, 1); /* CodecChannelT.codecId */ + SET_UINT8(buffer, 7, 0); /* CodecChannelT.channelId */ + SET_UINT32(buffer, 8, context->frame_idx); /* frameIdx */ + SET_UINT16(buffer, 12, 1); /* numRegions */ + + return 14; +} + +static int +rfx_compose_message_region(RFX_CONTEXT * context, uint8 * buffer, int buffer_size, + const RFX_RECT * rects, int num_rects) +{ + int size; + int i; + + if (buffer_size < 15 + num_rects * 8) + { + printf("rfx_compose_message_region: buffer size too small.\n"); + return 0; + } + + SET_UINT16(buffer, 0, WBT_REGION); /* CodecChannelT.blockType */ + /* set CodecChannelT.blockLen later */ + SET_UINT8(buffer, 6, 1); /* CodecChannelT.codecId */ + SET_UINT8(buffer, 7, 0); /* CodecChannelT.channelId */ + SET_UINT8(buffer, 8, 1); /* regionFlags */ + SET_UINT16(buffer, 9, num_rects); /* numRects */ + size = 11; + + for (i = 0; i < num_rects; i++) + { + SET_UINT16(buffer, size, rects[i].x); + SET_UINT16(buffer, size + 2, rects[i].y); + SET_UINT16(buffer, size + 4, rects[i].width); + SET_UINT16(buffer, size + 6, rects[i].height); + size += 8; + } + + SET_UINT16(buffer, size, CBT_REGION); /* regionType */ + SET_UINT16(buffer, size + 2, 1); /* numTilesets */ + size += 4; + + SET_UINT32(buffer, 2, size); /* CodecChannelT.blockLen */ + return size; +} + +static int +rfx_compose_message_tile(RFX_CONTEXT * context, uint8 * buffer, int buffer_size, + uint8 * tile_data, int tile_width, int tile_height, int rowstride, + const uint32 * quantVals, int quantIdxY, int quantIdxCb, int quantIdxCr, int xIdx, int yIdx) +{ + int YLen = 0; + int CbLen = 0; + int CrLen = 0; + int size; + + if (buffer_size < 19) + { + printf("rfx_compose_message_tile: buffer size too small.\n"); + return 0; + } + + SET_UINT16(buffer, 0, CBT_TILE); /* BlockT.blockType */ + /* set BlockT.blockLen later */ + SET_UINT8(buffer, 6, quantIdxY); /* quantIdxY */ + SET_UINT8(buffer, 7, quantIdxCb); /* quantIdxCb */ + SET_UINT8(buffer, 8, quantIdxCr); /* quantIdxCr */ + SET_UINT16(buffer, 9, xIdx); /* xIdx */ + SET_UINT16(buffer, 11, yIdx); /* yIdx */ + + rfx_encode_rgb(context, tile_data, tile_width, tile_height, rowstride, + quantVals + quantIdxY * 10, quantVals + quantIdxCb * 10, quantVals + quantIdxCr * 10, + buffer + 19, buffer_size - 19, &YLen, &CbLen, &CrLen); + + DEBUG_RFX("xIdx=%d yIdx=%d width=%d height=%d YLen=%d CbLen=%d CrLen=%d", + xIdx, yIdx, tile_width, tile_height, YLen, CbLen, CrLen); + + SET_UINT16(buffer, 13, YLen); /* YLen */ + SET_UINT16(buffer, 15, CbLen); /* CbLen */ + SET_UINT16(buffer, 17, CrLen); /* CrLen */ + size = 19 + YLen + CbLen + CrLen; + SET_UINT32(buffer, 2, size); /* BlockT.blockLen */ + + return size; +} + +static int +rfx_compose_message_tileset(RFX_CONTEXT * context, uint8 * buffer, int buffer_size, + uint8 * image_data, int width, int height, int rowstride) +{ + int size; + int i; + int numQuants; + const uint32 * quantVals; + const uint32 * quantValsPtr; + int quantIdxY; + int quantIdxCb; + int quantIdxCr; + int numTiles; + int numTilesX; + int numTilesY; + int xIdx; + int yIdx; + int tilesDataSize; + + if (context->num_quants == 0) + { + numQuants = 1; + quantVals = rfx_default_quantization_values; + quantIdxY = 0; + quantIdxCb = 0; + quantIdxCr = 0; + } + else + { + numQuants = context->num_quants; + quantVals = context->quants; + quantIdxY = context->quant_idx_y; + quantIdxCb = context->quant_idx_cb; + quantIdxCr = context->quant_idx_cr; + } + + numTilesX = (width + 63) / 64; + numTilesY = (height + 63) / 64; + numTiles = numTilesX * numTilesY; + + if (buffer_size < 22 + numQuants * 5) + { + printf("rfx_compose_message_tileset: buffer size too small.\n"); + return 0; + } + + SET_UINT16(buffer, 0, WBT_EXTENSION); /* CodecChannelT.blockType */ + /* set CodecChannelT.blockLen later */ + SET_UINT8(buffer, 6, 1); /* CodecChannelT.codecId */ + SET_UINT8(buffer, 7, 0); /* CodecChannelT.channelId */ + SET_UINT16(buffer, 8, CBT_TILESET); /* subtype */ + SET_UINT16(buffer, 10, 0); /* idx */ + SET_UINT16(buffer, 12, context->properties); /* properties */ + SET_UINT8(buffer, 14, numQuants); /* numQuants */ + SET_UINT8(buffer, 15, 0x40); /* tileSize */ + SET_UINT16(buffer, 16, numTiles); /* numTiles */ + /* set tilesDataSize later */ + size = 22; + + quantValsPtr = quantVals; + for (i = 0; i < numQuants * 5; i++) + { + SET_UINT8(buffer, size, quantValsPtr[0] + (quantValsPtr[1] << 4)); + quantValsPtr += 2; + size++; + } + + DEBUG_RFX("width:%d height:%d rowstride:%d", width, height, rowstride); + + tilesDataSize = 0; + for (yIdx = 0; yIdx < numTilesY; yIdx++) + { + for (xIdx = 0; xIdx < numTilesX; xIdx++) + { + tilesDataSize += rfx_compose_message_tile(context, + buffer + size + tilesDataSize, buffer_size - size - tilesDataSize, + image_data + yIdx * 64 * rowstride + xIdx * 64 * context->bytes_per_pixel, + xIdx < numTilesX - 1 ? 64 : width - xIdx * 64, + yIdx < numTilesY - 1 ? 64 : height - yIdx * 64, + rowstride, quantVals, quantIdxY, quantIdxCb, quantIdxCr, xIdx, yIdx); + } + } + + size += tilesDataSize; + SET_UINT32(buffer, 2, size); /* CodecChannelT.blockLen */ + SET_UINT32(buffer, 18, tilesDataSize); /* tilesDataSize */ + + return size; +} + +static int +rfx_compose_message_frame_end(RFX_CONTEXT * context, uint8 * buffer, int buffer_size) +{ + if (buffer_size < 8) + { + printf("rfx_compose_message_frame_end: buffer size too small.\n"); + return 0; + } + + SET_UINT16(buffer, 0, WBT_FRAME_END); /* CodecChannelT.blockType */ + SET_UINT32(buffer, 2, 8); /* CodecChannelT.blockLen */ + SET_UINT8(buffer, 6, 1); /* CodecChannelT.codecId */ + SET_UINT8(buffer, 7, 0); /* CodecChannelT.channelId */ + + return 8; +} + +int +rfx_compose_message_data(RFX_CONTEXT * context, uint8 * buffer, int buffer_size, + const RFX_RECT * rects, int num_rects, uint8 * image_data, int width, int height, int rowstride) +{ + int composed_size; + + composed_size = rfx_compose_message_frame_begin(context, buffer, buffer_size); + composed_size += rfx_compose_message_region(context, buffer + composed_size, buffer_size - composed_size, + rects, num_rects); + composed_size += rfx_compose_message_tileset(context, buffer + composed_size, buffer_size - composed_size, + image_data, width, height, rowstride); + composed_size += rfx_compose_message_frame_end(context, buffer + composed_size, buffer_size - composed_size); + + return composed_size; +} diff --git a/libfreerdp-rfx/librfx.h b/libfreerdp-rfx/librfx.h index 3c18474..87ea742 100644 --- a/libfreerdp-rfx/librfx.h +++ b/libfreerdp-rfx/librfx.h @@ -32,6 +32,10 @@ #include "rfx_sse.h" #endif +#ifdef WITH_NEON +#include "rfx_neon.h" +#endif + #ifndef RFX_INIT_SIMD #define RFX_INIT_SIMD(_rfx_context) do { } while (0) #endif diff --git a/libfreerdp-rfx/neon/Makefile.am b/libfreerdp-rfx/neon/Makefile.am new file mode 100644 index 0000000..12d4921 --- /dev/null +++ b/libfreerdp-rfx/neon/Makefile.am @@ -0,0 +1,25 @@ +## Process this file with automake to produce Makefile.in + +# libfreerdp-rfx-neon +noinst_LTLIBRARIES = libfreerdp-rfx-neon.la + +libfreerdp_rfx_neon_la_SOURCES = + +if WITH_NEON +libfreerdp_rfx_neon_la_SOURCES += \ + rfx_neon.c rfx_neon.h +endif + +libfreerdp_rfx_neon_la_CFLAGS = \ + -I$(top_srcdir) \ + -I$(top_srcdir)/include \ + -I.. + +libfreerdp_rfx_neon_la_LDFLAGS = + +libfreerdp_rfx_neon_la_LIBDADD = + +# extra +EXTRA_DIST = + +DISTCLEANFILES = diff --git a/libfreerdp-rfx/neon/rfx_neon.c b/libfreerdp-rfx/neon/rfx_neon.c new file mode 100644 index 0000000..745f9ca --- /dev/null +++ b/libfreerdp-rfx/neon/rfx_neon.c @@ -0,0 +1,372 @@ +/* + FreeRDP: A Remote Desktop Protocol client. + RemoteFX Codec Library - NEON Optimizations + + Copyright 2011 Marc-Andre Moreau <marcandre.moreau@gmail.com> + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <arm_neon.h> + +#include "rfx_neon.h" + +#if defined(ANDROID) +#include <cpu-features.h> +#endif + + +static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +prefetch_data(void * buffer1) +{ + asm(" pld [%0, #64] \t\n" + : // no output + : "r" (buffer1) + ); +} + +void rfx_decode_YCbCr_to_RGB_NEON(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer) +{ + int16x8_t zero = vdupq_n_s16(0); + int16x8_t max = vdupq_n_s16(255); + int16x8_t y_add = vdupq_n_s16(128); + + int16x8_t* y_r_buf = (int16x8_t*)y_r_buffer; + int16x8_t* cb_g_buf = (int16x8_t*)cb_g_buffer; + int16x8_t* cr_b_buf = (int16x8_t*)cr_b_buffer; + + int i; + for (i = 0; i < 4096 / 8; i++) + { + prefetch_data(&y_r_buf[i]); + prefetch_data(&cr_b_buf[i]); + prefetch_data(&cb_g_buf[i]); + + int16x8_t y = vld1q_s16((sint16*)&y_r_buf[i]); + y = vaddq_s16(y, y_add); + + int16x8_t cr = vld1q_s16((sint16*)&cr_b_buf[i]); + + // r = between((y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5)), 0, 255); + int16x8_t r = vaddq_s16(y, cr); + r = vaddq_s16(r, vshrq_n_s16(cr, 2)); + r = vaddq_s16(r, vshrq_n_s16(cr, 3)); + r = vaddq_s16(r, vshrq_n_s16(cr, 5)); + r = vminq_s16(vmaxq_s16(r, zero), max); + vst1q_s16((sint16*)&y_r_buf[i], r); + + // cb = cb_g_buf[i]; + int16x8_t cb = vld1q_s16((sint16*)&cb_g_buf[i]); + + // g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1) - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255); + int16x8_t g = vsubq_s16(y, vshrq_n_s16(cb, 2)); + g = vsubq_s16(g, vshrq_n_s16(cb, 4)); + g = vsubq_s16(g, vshrq_n_s16(cb, 5)); + g = vsubq_s16(g, vshrq_n_s16(cr, 1)); + g = vsubq_s16(g, vshrq_n_s16(cr, 3)); + g = vsubq_s16(g, vshrq_n_s16(cr, 4)); + g = vsubq_s16(g, vshrq_n_s16(cr, 5)); + g = vminq_s16(vmaxq_s16(g, zero), max); + vst1q_s16((sint16*)&cb_g_buf[i], g); + + // b = between((y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6)), 0, 255); + int16x8_t b = vaddq_s16(y, cb); + b = vaddq_s16(b, vshrq_n_s16(cb, 1)); + b = vaddq_s16(b, vshrq_n_s16(cb, 2)); + b = vaddq_s16(b, vshrq_n_s16(cb, 6)); + b = vminq_s16(vmaxq_s16(b, zero), max); + vst1q_s16((sint16*)&cr_b_buf[i], b); + } +} + +static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +rfx_quantization_decode_block_NEON(sint16 * buffer, const int buffer_size, const uint32 factor) +{ + if (factor <= 6) + return; + int16x8_t quantFactors = vdupq_n_s16(factor - 6); + int16x8_t* buf = (int16x8_t*)buffer; + int16x8_t* buf_end = (int16x8_t*)(buffer + buffer_size); + + do + { + prefetch_data(buf); + int16x8_t val = vld1q_s16((sint16*)buf); + val = vshlq_s16(val, quantFactors); + vst1q_s16((sint16*)buf, val); + buf++; + } + while(buf < buf_end); +} + +void +rfx_quantization_decode_NEON(sint16 * buffer, const uint32 * quantization_values) +{ + rfx_quantization_decode_block_NEON(buffer, 1024, quantization_values[8]); /* HL1 */ + rfx_quantization_decode_block_NEON(buffer + 1024, 1024, quantization_values[7]); /* LH1 */ + rfx_quantization_decode_block_NEON(buffer + 2048, 1024, quantization_values[9]); /* HH1 */ + rfx_quantization_decode_block_NEON(buffer + 3072, 256, quantization_values[5]); /* HL2 */ + rfx_quantization_decode_block_NEON(buffer + 3328, 256, quantization_values[4]); /* LH2 */ + rfx_quantization_decode_block_NEON(buffer + 3584, 256, quantization_values[6]); /* HH2 */ + rfx_quantization_decode_block_NEON(buffer + 3840, 64, quantization_values[2]); /* HL3 */ + rfx_quantization_decode_block_NEON(buffer + 3904, 64, quantization_values[1]); /* LH3 */ + rfx_quantization_decode_block_NEON(buffer + 3868, 64, quantization_values[3]); /* HH3 */ + rfx_quantization_decode_block_NEON(buffer + 4032, 64, quantization_values[0]); /* LL3 */ +} + + + +static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +rfx_dwt_2d_decode_block_horiz_NEON(sint16 * l, sint16 * h, sint16 * dst, int subband_width) +{ + int y, n; + sint16 * l_ptr = l; + sint16 * h_ptr = h; + sint16 * dst_ptr = dst; + + for (y = 0; y < subband_width; y++) + { + /* Even coefficients */ + for (n = 0; n < subband_width; n+=8) + { + // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); + int16x8_t l_n = vld1q_s16(l_ptr); + prefetch_data(l_ptr); + + int16x8_t h_n = vld1q_s16(h_ptr); + int16x8_t h_n_m = vld1q_s16(h_ptr - 1); + prefetch_data(h_ptr); + + if (n == 0) + { + int16_t first = vgetq_lane_s16(h_n_m, 1); + h_n_m = vsetq_lane_s16(first, h_n_m, 0); + } + + int16x8_t tmp_n = vaddq_s16(h_n, h_n_m); + tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1)); + tmp_n = vshrq_n_s16(tmp_n, 1); + + int16x8_t dst_n = vsubq_s16(l_n, tmp_n); + + vst1q_s16(l_ptr, dst_n); + + l_ptr+=8; + h_ptr+=8; + } + l_ptr -= subband_width; + h_ptr -= subband_width; + + /* Odd coefficients */ + for (n = 0; n < subband_width; n+=8) + { + // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); + + int16x8_t h_n = vld1q_s16(h_ptr); + prefetch_data(h_ptr); + + h_n = vshlq_n_s16(h_n, 1); + + int16x8x2_t dst_n; + dst_n.val[0] = vld1q_s16(l_ptr); + prefetch_data(l_ptr); + int16x8_t dst_n_p = vld1q_s16(l_ptr + 1); + if (n == subband_width - 8) + { + int16_t last = vgetq_lane_s16(dst_n_p, 6); + dst_n_p = vsetq_lane_s16(last, dst_n_p, 7); + } + + dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]); + dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1); + + dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n); + + vst2q_s16(dst_ptr, dst_n); + prefetch_data(dst_ptr); + + l_ptr+=8; + h_ptr+=8; + dst_ptr+=16; + } + } +} + +static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +rfx_dwt_2d_decode_block_vert_NEON(sint16 * l, sint16 * h, sint16 * dst, int subband_width) +{ + int x, n; + sint16 * l_ptr = l; + sint16 * h_ptr = h; + sint16 * dst_ptr = dst; + + int total_width = subband_width + subband_width; + + /* Even coefficients */ + for (n = 0; n < subband_width; n++) + { + for (x = 0; x < total_width; x+=8) + { + prefetch_data(l_ptr); + prefetch_data(h_ptr); + // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); + + int16x8_t l_n = vld1q_s16(l_ptr); + int16x8_t h_n = vld1q_s16(h_ptr); + + int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));; + if (n == 0) + tmp_n = vaddq_s16(tmp_n, h_n); + else + { + int16x8_t h_n_m = vld1q_s16((h_ptr - total_width)); + tmp_n = vaddq_s16(tmp_n, h_n_m); + } + tmp_n = vshrq_n_s16(tmp_n, 1); + + prefetch_data(dst_ptr); + int16x8_t dst_n = vsubq_s16(l_n, tmp_n); + vst1q_s16(dst_ptr, dst_n); + + l_ptr+=8; + h_ptr+=8; + dst_ptr+=8; + } + dst_ptr+=total_width; + } + + h_ptr = h; + dst_ptr = dst + total_width; + + /* Odd coefficients */ + for (n = 0; n < subband_width; n++) + { + for (x = 0; x < total_width; x+=8) + { + // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); + int16x8_t h_n = vld1q_s16(h_ptr); + int16x8_t dst_n_m = vld1q_s16(dst_ptr - total_width); + + prefetch_data(h_ptr); + prefetch_data(dst_ptr - total_width); + + h_n = vshlq_n_s16(h_n, 1); + + int16x8_t tmp_n = dst_n_m; + if (n == subband_width - 1) + tmp_n = vaddq_s16(tmp_n, dst_n_m); + else + { + int16x8_t dst_n_p = vld1q_s16((dst_ptr + total_width)); + tmp_n = vaddq_s16(tmp_n, dst_n_p); + } + tmp_n = vshrq_n_s16(tmp_n, 1); + + int16x8_t dst_n = vaddq_s16(tmp_n, h_n); + vst1q_s16(dst_ptr, dst_n); + prefetch_data(dst_ptr); + + h_ptr+=8; + dst_ptr+=8; + } + dst_ptr+=total_width; + } +} + +static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +rfx_dwt_2d_decode_block_NEON(sint16 * buffer, sint16 * idwt, int subband_width) +{ + sint16 * hl, * lh, * hh, * ll; + sint16 * l_dst, * h_dst; + + /* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt. */ + /* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */ + /* The lower part L uses LL(3) and HL(0). */ + /* The higher part H uses LH(1) and HH(2). */ + + ll = buffer + subband_width * subband_width * 3; + hl = buffer; + l_dst = idwt; + + rfx_dwt_2d_decode_block_horiz_NEON(ll, hl, l_dst, subband_width); + + lh = buffer + subband_width * subband_width; + hh = buffer + subband_width * subband_width * 2; + h_dst = idwt + subband_width * subband_width * 2; + + rfx_dwt_2d_decode_block_horiz_NEON(lh, hh, h_dst, subband_width); + + /* Inverse DWT in vertical direction, results are stored in original buffer. */ + rfx_dwt_2d_decode_block_vert_NEON(l_dst, h_dst, buffer, subband_width); +} + +void +rfx_dwt_2d_decode_NEON(sint16 * buffer, sint16 * dwt_buffer) +{ + rfx_dwt_2d_decode_block_NEON(buffer + 3840, dwt_buffer, 8); + rfx_dwt_2d_decode_block_NEON(buffer + 3072, dwt_buffer, 16); + rfx_dwt_2d_decode_block_NEON(buffer, dwt_buffer, 32); +} + + + +int isNeonSupported() +{ +#if defined(ANDROID) + if (android_getCpuFamily() != ANDROID_CPU_FAMILY_ARM) + { + DEBUG_RFX("NEON optimization disabled - No ARM CPU found"); + return 0; + } + + uint64_t features = android_getCpuFeatures(); + if ((features & ANDROID_CPU_ARM_FEATURE_ARMv7)) + { + if (features & ANDROID_CPU_ARM_FEATURE_NEON) + { + DEBUG_RFX("NEON optimization enabled!"); + return 1; + } + DEBUG_RFX("NEON optimization disabled - CPU not NEON capable"); + } + else + DEBUG_RFX("NEON optimization disabled - No ARMv7 CPU found"); + + return 0; +#else + return 1; // TODO: set to 1 for development +// return 0; +#endif +} + + +void rfx_init_neon(RFX_CONTEXT * context) +{ + + + if(isNeonSupported()) + { + DEBUG_RFX("Using NEON optimizations"); + + IF_PROFILER(context->prof_rfx_decode_YCbCr_to_RGB->name = "rfx_decode_YCbCr_to_RGB_NEON"); + IF_PROFILER(context->prof_rfx_quantization_decode->name = "rfx_quantization_decode_NEON"); + IF_PROFILER(context->prof_rfx_dwt_2d_decode->name = "rfx_dwt_2d_decode_NEON"); + + context->decode_YCbCr_to_RGB = rfx_decode_YCbCr_to_RGB_NEON; + context->quantization_decode = rfx_quantization_decode_NEON; + context->dwt_2d_decode = rfx_dwt_2d_decode_NEON; + } +} diff --git a/libfreerdp-rfx/neon/rfx_neon.h b/libfreerdp-rfx/neon/rfx_neon.h new file mode 100644 index 0000000..64b702f --- /dev/null +++ b/libfreerdp-rfx/neon/rfx_neon.h @@ -0,0 +1,32 @@ +/* + FreeRDP: A Remote Desktop Protocol client. + RemoteFX Codec Library - NEON Optimizations + + Copyright 2011 Marc-Andre Moreau <marcandre.moreau@gmail.com> + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __RFX_NEON_H +#define __RFX_NEON_H + +#include "librfx.h" +#include <freerdp/rfx.h> + +void rfx_init_neon(RFX_CONTEXT * context); + +#ifndef RFX_INIT_SIMD +#define RFX_INIT_SIMD(_rfx_context) rfx_init_neon(_rfx_context) +#endif + +#endif /* __RFX_NEON_H */ diff --git a/libfreerdp-rfx/rfx_bitstream.c b/libfreerdp-rfx/rfx_bitstream.c index ce741fc..a62ec17 100644 --- a/libfreerdp-rfx/rfx_bitstream.c +++ b/libfreerdp-rfx/rfx_bitstream.c @@ -43,11 +43,11 @@ rfx_bitstream_put_buffer(RFX_BITSTREAM * bs, uint8 * buffer, int nbytes) bs->bits_left = 8; } -uint32 +uint16 rfx_bitstream_get_bits(RFX_BITSTREAM * bs, int nbits) { int b; - uint32 n = 0; + uint16 n = 0; while (bs->byte_pos < bs->nbytes && nbits > 0) { @@ -73,6 +73,30 @@ rfx_bitstream_get_bits(RFX_BITSTREAM * bs, int nbits) return n; } +void +rfx_bitstream_put_bits(RFX_BITSTREAM * bs, uint16 bits, int nbits) +{ + int b; + + while (bs->byte_pos < bs->nbytes && nbits > 0) + { + b = nbits; + + if (b > bs->bits_left) + b = bs->bits_left; + + bs->buffer[bs->byte_pos] |= ((bits >> (nbits - b)) & ((1 << b) - 1)) << (bs->bits_left - b); + bs->bits_left -= b; + nbits -= b; + + if (bs->bits_left == 0) + { + bs->bits_left = 8; + bs->byte_pos++; + } + } +} + int rfx_bitstream_eos(RFX_BITSTREAM * bs) { @@ -88,6 +112,12 @@ rfx_bitstream_left(RFX_BITSTREAM * bs) return (bs->nbytes - bs->byte_pos - 1) * 8 + bs->bits_left; } +int +rfx_bitstream_get_processed_bytes(RFX_BITSTREAM * bs) +{ + return (bs->bits_left < 8 ? bs->byte_pos + 1 : bs->byte_pos); +} + void rfx_bitstream_free(RFX_BITSTREAM * bs) { diff --git a/libfreerdp-rfx/rfx_bitstream.h b/libfreerdp-rfx/rfx_bitstream.h index 93d12b7..191c191 100644 --- a/libfreerdp-rfx/rfx_bitstream.h +++ b/libfreerdp-rfx/rfx_bitstream.h @@ -35,12 +35,16 @@ RFX_BITSTREAM * rfx_bitstream_new(void); void rfx_bitstream_put_buffer(RFX_BITSTREAM * bs, uint8 * buffer, int nbytes); -unsigned int +uint16 rfx_bitstream_get_bits(RFX_BITSTREAM * bs, int nbits); +void +rfx_bitstream_put_bits(RFX_BITSTREAM * bs, uint16 bits, int nbits); int rfx_bitstream_eos(RFX_BITSTREAM * bs); int rfx_bitstream_left(RFX_BITSTREAM * bs); +int +rfx_bitstream_get_processed_bytes(RFX_BITSTREAM * bs); void rfx_bitstream_free(RFX_BITSTREAM * bs); diff --git a/libfreerdp-rfx/rfx_decode.c b/libfreerdp-rfx/rfx_decode.c index 35dc306..8dc8853 100644 --- a/libfreerdp-rfx/rfx_decode.c +++ b/libfreerdp-rfx/rfx_decode.c @@ -27,13 +27,64 @@ #include "rfx_decode.h" +static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +rfx_decode_format_RGB(sint16 * r_buf, sint16 * g_buf, sint16 * b_buf, + RFX_PIXEL_FORMAT pixel_format, uint8 * dst_buf) +{ + sint16 * r = r_buf; + sint16 * g = g_buf; + sint16 * b = b_buf; + uint8 * dst = dst_buf; + int i; + + switch (pixel_format) + { + case RFX_PIXEL_FORMAT_BGRA: + for (i = 0; i < 4096; i++) + { + *dst++ = (uint8) (*b++); + *dst++ = (uint8) (*g++); + *dst++ = (uint8) (*r++); + *dst++ = 0xFF; + } + break; + case RFX_PIXEL_FORMAT_RGBA: + for (i = 0; i < 4096; i++) + { + *dst++ = (uint8) (*r++); + *dst++ = (uint8) (*g++); + *dst++ = (uint8) (*b++); + *dst++ = 0xFF; + } + break; + case RFX_PIXEL_FORMAT_BGR: + for (i = 0; i < 4096; i++) + { + *dst++ = (uint8) (*b++); + *dst++ = (uint8) (*g++); + *dst++ = (uint8) (*r++); + } + break; + case RFX_PIXEL_FORMAT_RGB: + for (i = 0; i < 4096; i++) + { + *dst++ = (uint8) (*r++); + *dst++ = (uint8) (*g++); + *dst++ = (uint8) (*b++); + } + break; + default: + break; + } +} + #define MINMAX(_v,_l,_h) ((_v) < (_l) ? (_l) : ((_v) > (_h) ? (_h) : (_v))) void -rfx_decode_YCbCr_to_RGB(uint32 * y_r_buf, uint32 * cb_g_buf, uint32 * cr_b_buf) +rfx_decode_YCbCr_to_RGB(sint16 * y_r_buf, sint16 * cb_g_buf, sint16 * cr_b_buf) { - int y, cb, cr; - int r, g, b; + sint16 y, cb, cr; + sint16 r, g, b; int i; for (i = 0; i < 4096; i++) @@ -51,29 +102,28 @@ rfx_decode_YCbCr_to_RGB(uint32 * y_r_buf, uint32 * cb_g_buf, uint32 * cr_b_buf) } static void -rfx_decode_component(RFX_CONTEXT * context, const uint32 * quantization_values, int half, - const uint8 * data, int size, uint32 * buffer) +rfx_decode_component(RFX_CONTEXT * context, const uint32 * quantization_values, + const uint8 * data, int size, sint16 * buffer) { - rfx_rlgr_decode(context->mode, data, size, buffer, 4096); - - rfx_differential_decode(buffer + 4032, 64); - - rfx_quantization_decode(buffer, 1024, quantization_values[8]); /* HL1 */ - rfx_quantization_decode(buffer + 1024, 1024, quantization_values[7]); /* LH1 */ - rfx_quantization_decode(buffer + 2048, 1024, quantization_values[9]); /* HH1 */ - rfx_quantization_decode(buffer + 3072, 256, quantization_values[5]); /* HL2 */ - rfx_quantization_decode(buffer + 3328, 256, quantization_values[4]); /* LH2 */ - rfx_quantization_decode(buffer + 3584, 256, quantization_values[6]); /* HH2 */ - rfx_quantization_decode(buffer + 3840, 64, quantization_values[2]); /* HL3 */ - rfx_quantization_decode(buffer + 3904, 64, quantization_values[1]); /* LH3 */ - rfx_quantization_decode(buffer + 3868, 64, quantization_values[3]); /* HH3 */ - rfx_quantization_decode(buffer + 4032, 64, quantization_values[0]); /* LL3 */ - - rfx_dwt_2d_decode(context, (int*) buffer + 3840, 8); - rfx_dwt_2d_decode(context, (int*) buffer + 3072, 16); - - if (!half) - rfx_dwt_2d_decode(context, (int*) buffer, 32); + PROFILER_ENTER(context->prof_rfx_decode_component); + + PROFILER_ENTER(context->prof_rfx_rlgr_decode); + rfx_rlgr_decode(context->mode, data, size, buffer, 4096); + PROFILER_EXIT(context->prof_rfx_rlgr_decode); + + PROFILER_ENTER(context->prof_rfx_differential_decode); + rfx_differential_decode(buffer + 4032, 64); + PROFILER_EXIT(context->prof_rfx_differential_decode); + + PROFILER_ENTER(context->prof_rfx_quantization_decode); + context->quantization_decode(buffer, quantization_values); + PROFILER_EXIT(context->prof_rfx_quantization_decode); + + PROFILER_ENTER(context->prof_rfx_dwt_2d_decode); + context->dwt_2d_decode(buffer, context->dwt_buffer); + PROFILER_EXIT(context->prof_rfx_dwt_2d_decode); + + PROFILER_EXIT(context->prof_rfx_decode_component); } uint8* @@ -82,49 +132,22 @@ rfx_decode_rgb(RFX_CONTEXT * context, const uint8 * cb_data, int cb_size, const uint32 * cb_quants, const uint8 * cr_data, int cr_size, const uint32 * cr_quants, uint8* rgb_buffer) { - int i; - uint8 * dst; - int r, g, b; + PROFILER_ENTER(context->prof_rfx_decode_rgb); - dst = rgb_buffer; - rfx_decode_component(context, y_quants, 0, y_data, y_size, context->y_r_buffer); - rfx_decode_component(context, cb_quants, 0, cb_data, cb_size, context->cb_g_buffer); - rfx_decode_component(context, cr_quants, 0, cr_data, cr_size, context->cr_b_buffer); + rfx_decode_component(context, y_quants, y_data, y_size, context->y_r_buffer); /* YData */ + rfx_decode_component(context, cb_quants, cb_data, cb_size, context->cb_g_buffer); /* CbData */ + rfx_decode_component(context, cr_quants, cr_data, cr_size, context->cr_b_buffer); /* CrData */ - context->decode_YCbCr_to_RGB(context->y_r_buffer, context->cb_g_buffer, context->cr_b_buffer); + PROFILER_ENTER(context->prof_rfx_decode_YCbCr_to_RGB); + context->decode_YCbCr_to_RGB(context->y_r_buffer, context->cb_g_buffer, context->cr_b_buffer); + PROFILER_EXIT(context->prof_rfx_decode_YCbCr_to_RGB); + + PROFILER_ENTER(context->prof_rfx_decode_format_RGB); + rfx_decode_format_RGB(context->y_r_buffer, context->cb_g_buffer, context->cr_b_buffer, + context->pixel_format, rgb_buffer); + PROFILER_EXIT(context->prof_rfx_decode_format_RGB); + + PROFILER_EXIT(context->prof_rfx_decode_rgb); - for (i = 0; i < 4096; i++) - { - r = context->y_r_buffer[i]; - g = context->cb_g_buffer[i]; - b = context->cr_b_buffer[i]; - switch (context->pixel_format) - { - case RFX_PIXEL_FORMAT_BGRA: - *dst++ = (uint8) (b); - *dst++ = (uint8) (g); - *dst++ = (uint8) (r); - *dst++ = 0xFF; - break; - case RFX_PIXEL_FORMAT_RGBA: - *dst++ = (uint8) (r); - *dst++ = (uint8) (g); - *dst++ = (uint8) (b); - *dst++ = 0xFF; - break; - case RFX_PIXEL_FORMAT_BGR: - *dst++ = (uint8) (b); - *dst++ = (uint8) (g); - *dst++ = (uint8) (r); - break; - case RFX_PIXEL_FORMAT_RGB: - *dst++ = (uint8) (r); - *dst++ = (uint8) (g); - *dst++ = (uint8) (b); - break; - default: - break; - } - } return rgb_buffer; } diff --git a/libfreerdp-rfx/rfx_decode.h b/libfreerdp-rfx/rfx_decode.h index 356e232..3b7fd9b 100644 --- a/libfreerdp-rfx/rfx_decode.h +++ b/libfreerdp-rfx/rfx_decode.h @@ -23,7 +23,7 @@ #include <freerdp/rfx.h> void -rfx_decode_YCbCr_to_RGB(uint32 * y_r_buf, uint32 * cb_g_buf, uint32 * cr_b_buf); +rfx_decode_YCbCr_to_RGB(sint16 * y_r_buf, sint16 * cb_g_buf, sint16 * cr_b_buf); unsigned char * rfx_decode_rgb(RFX_CONTEXT * context, diff --git a/libfreerdp-rfx/rfx_differential.c b/libfreerdp-rfx/rfx_differential.c index 10b9dd6..d1b3eb1 100644 --- a/libfreerdp-rfx/rfx_differential.c +++ b/libfreerdp-rfx/rfx_differential.c @@ -23,10 +23,10 @@ #include "rfx_differential.h" void -rfx_differential_decode(uint32 * buffer, int buffer_size) +rfx_differential_decode(sint16 * buffer, int buffer_size) { - uint32 * src; - uint32 * dst; + sint16 * src; + sint16 * dst; for (src = buffer, dst = buffer + 1; buffer_size > 1; src++, dst++, buffer_size--) { @@ -34,3 +34,17 @@ rfx_differential_decode(uint32 * buffer, int buffer_size) } } +void +rfx_differential_encode(sint16 * buffer, int buffer_size) +{ + sint16 n1, n2; + sint16 * dst; + + for (n1 = *buffer, dst = buffer + 1; buffer_size > 1; dst++, buffer_size--) + { + n2 = *dst; + *dst -= n1; + n1 = n2; + } +} + diff --git a/libfreerdp-rfx/rfx_differential.h b/libfreerdp-rfx/rfx_differential.h index fc5deb5..141e8c9 100644 --- a/libfreerdp-rfx/rfx_differential.h +++ b/libfreerdp-rfx/rfx_differential.h @@ -23,7 +23,9 @@ #include <freerdp/rfx.h> void -rfx_differential_decode(uint32 * buffer, int buffer_size); +rfx_differential_decode(sint16 * buffer, int buffer_size); +void +rfx_differential_encode(sint16 * buffer, int buffer_size); #endif diff --git a/libfreerdp-rfx/rfx_dwt.c b/libfreerdp-rfx/rfx_dwt.c index 3ed4e89..7f80975 100644 --- a/libfreerdp-rfx/rfx_dwt.c +++ b/libfreerdp-rfx/rfx_dwt.c @@ -24,32 +24,15 @@ #include "rfx_dwt.h" void -rfx_dwt_2d_decode(RFX_CONTEXT * context, int * buffer, int subband_width) +rfx_dwt_2d_decode_block(sint16 * buffer, sint16 * idwt, int subband_width) { - int idwt_alloc; - int * idwt; - int * dst, * l, * h; - int * l_dst, * h_dst; - int * hl, * lh, * hh, * ll; + sint16 * dst, * l, * h; + sint16 * l_dst, * h_dst; + sint16 * hl, * lh, * hh, * ll; int total_width; int x, y; int n; - switch (subband_width) - { - case 8: - case 16: - case 32: - idwt = (int*) context->idwt_buffers[subband_width >> 3]; - idwt_alloc = 0; - break; - - default: - idwt = (int*) malloc(subband_width * subband_width * 4 * sizeof(int)); - idwt_alloc = 1; - break; - } - total_width = subband_width << 1; /* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt. */ @@ -85,8 +68,8 @@ rfx_dwt_2d_decode(RFX_CONTEXT * context, int * buffer, int subband_width) h_dst[x + 1] = (hh[n] << 1) + ((h_dst[x] + h_dst[x + 2]) >> 1); } x = n << 1; - l_dst[x + 1] = (hl[n] << 1) + ((l_dst[x] + l_dst[x]) >> 1); - h_dst[x + 1] = (hh[n] << 1) + ((h_dst[x] + h_dst[x]) >> 1); + l_dst[x + 1] = (hl[n] << 1) + (l_dst[x]); + h_dst[x + 1] = (hh[n] << 1) + (h_dst[x]); ll += subband_width; hl += subband_width; @@ -120,8 +103,96 @@ rfx_dwt_2d_decode(RFX_CONTEXT * context, int * buffer, int subband_width) dst[total_width] = (*h << 1) + ((dst[0] + dst[n < subband_width - 1 ? 2 * total_width : 0]) >> 1); } } +} - if (idwt_alloc) - free(idwt); +void +rfx_dwt_2d_decode(sint16 * buffer, sint16 * dwt_buffer) +{ + rfx_dwt_2d_decode_block(buffer + 3840, dwt_buffer, 8); + rfx_dwt_2d_decode_block(buffer + 3072, dwt_buffer, 16); + rfx_dwt_2d_decode_block(buffer, dwt_buffer, 32); } +void +rfx_dwt_2d_encode_block(sint16 * buffer, sint16 * dwt, int subband_width) +{ + sint16 * src, * l, * h; + sint16 * l_src, * h_src; + sint16 * hl, * lh, * hh, * ll; + int total_width; + int x, y; + int n; + + total_width = subband_width << 1; + + /* DWT in vertical direction, results in 2 sub-bands in L, H order in tmp buffer dwt. */ + for (x = 0; x < total_width; x++) + { + for (n = 0; n < subband_width; n++) + { + y = n << 1; + l = dwt + n * total_width + x; + h = l + subband_width * total_width; + src = buffer + y * total_width + x; + + /* H */ + *h = (src[total_width] - ((src[0] + src[n < subband_width - 1 ? 2 * total_width : total_width]) >> 1)) >> 1; + + /* L */ + *l = src[0] + (n == 0 ? *h : (*(h - total_width) + *h) >> 1); + } + } + + /* DWT in horizontal direction, results in 4 sub-bands in HL(0), LH(1), HH(2), LL(3) order, stored in original buffer. */ + /* The lower part L generates LL(3) and HL(0). */ + /* The higher part H generates LH(1) and HH(2). */ + + ll = buffer + subband_width * subband_width * 3; + hl = buffer; + l_src = dwt; + + lh = buffer + subband_width * subband_width; + hh = buffer + subband_width * subband_width * 2; + h_src = dwt + subband_width * subband_width * 2; + + for (y = 0; y < subband_width; y++) + { + /* L */ + for (n = 0; n < subband_width; n++) + { + x = n << 1; + + /* HL */ + hl[n] = (l_src[x + 1] - ((l_src[x] + l_src[n < subband_width - 1 ? x + 2 : x]) >> 1)) >> 1; + /* LL */ + ll[n] = l_src[x] + (n == 0 ? hl[n] : (hl[n - 1] + hl[n]) >> 1); + } + + /* H */ + for (n = 0; n < subband_width; n++) + { + x = n << 1; + + /* HH */ + hh[n] = (h_src[x + 1] - ((h_src[x] + h_src[n < subband_width - 1 ? x + 2 : x]) >> 1)) >> 1; + /* LH */ + lh[n] = h_src[x] + (n == 0 ? hh[n] : (hh[n - 1] + hh[n]) >> 1); + } + + ll += subband_width; + hl += subband_width; + l_src += total_width; + + lh += subband_width; + hh += subband_width; + h_src += total_width; + } +} + +void +rfx_dwt_2d_encode(sint16 * buffer, sint16 * dwt_buffer) +{ + rfx_dwt_2d_encode_block(buffer, dwt_buffer, 32); + rfx_dwt_2d_encode_block(buffer + 3072, dwt_buffer, 16); + rfx_dwt_2d_encode_block(buffer + 3840, dwt_buffer, 8); +} diff --git a/libfreerdp-rfx/rfx_dwt.h b/libfreerdp-rfx/rfx_dwt.h index 21140bb..449d61c 100644 --- a/libfreerdp-rfx/rfx_dwt.h +++ b/libfreerdp-rfx/rfx_dwt.h @@ -23,7 +23,9 @@ #include <freerdp/rfx.h> void -rfx_dwt_2d_decode(RFX_CONTEXT * context, int * buffer, int subband_width); +rfx_dwt_2d_decode(sint16 * buffer, sint16 * dwt_buffer); +void +rfx_dwt_2d_encode(sint16 * buffer, sint16 * dwt_buffer); #endif diff --git a/libfreerdp-rfx/rfx_encode.c b/libfreerdp-rfx/rfx_encode.c new file mode 100644 index 0000000..cd20200 --- /dev/null +++ b/libfreerdp-rfx/rfx_encode.c @@ -0,0 +1,182 @@ +/* + FreeRDP: A Remote Desktop Protocol client. + RemoteFX Codec Library - Encode + + Copyright 2011 Vic Lee + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "rfx_rlgr.h" +#include "rfx_differential.h" +#include "rfx_quantization.h" +#include "rfx_dwt.h" + +#include "rfx_encode.h" + +#define MINMAX(_v,_l,_h) ((_v) < (_l) ? (_l) : ((_v) > (_h) ? (_h) : (_v))) + +static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +rfx_encode_format_RGB(const uint8 * rgb_data, int width, int height, int rowstride, + RFX_PIXEL_FORMAT pixel_format, sint16 * r_buf, sint16 * g_buf, sint16 * b_buf) +{ + int x, y; + int x_exceed; + int y_exceed; + const uint8 * src; + + x_exceed = 64 - width; + y_exceed = 64 - height; + for (y = 0; y < height; y++) + { + src = rgb_data + y * rowstride; + + switch (pixel_format) + { + case RFX_PIXEL_FORMAT_BGRA: + for (x = 0; x < width; x++) + { + *b_buf++ = (sint16) (*src++); + *g_buf++ = (sint16) (*src++); + *r_buf++ = (sint16) (*src++); + src++; + } + break; + case RFX_PIXEL_FORMAT_RGBA: + for (x = 0; x < width; x++) + { + *r_buf++ = (sint16) (*src++); + *g_buf++ = (sint16) (*src++); + *b_buf++ = (sint16) (*src++); + src++; + } + break; + case RFX_PIXEL_FORMAT_BGR: + for (x = 0; x < width; x++) + { + *b_buf++ = (sint16) (*src++); + *g_buf++ = (sint16) (*src++); + *r_buf++ = (sint16) (*src++); + } + break; + case RFX_PIXEL_FORMAT_RGB: + for (x = 0; x < width; x++) + { + *r_buf++ = (sint16) (*src++); + *g_buf++ = (sint16) (*src++); + *b_buf++ = (sint16) (*src++); + } + break; + default: + break; + } + /* Fill the horizontal region outside of 64x64 tile size to 0 in order to be better compressed. */ + if (x_exceed > 0) + { + memset(r_buf, 0, x_exceed * sizeof(sint16)); + memset(g_buf, 0, x_exceed * sizeof(sint16)); + memset(b_buf, 0, x_exceed * sizeof(sint16)); + r_buf += x_exceed; + g_buf += x_exceed; + b_buf += x_exceed; + } + } + + /* Fill the vertical region outside of 64x64 tile size to 0 in order to be better compressed. */ + if (y_exceed > 0) + { + memset(r_buf, 0, y_exceed * 64 * sizeof(sint16)); + memset(g_buf, 0, y_exceed * 64 * sizeof(sint16)); + memset(b_buf, 0, y_exceed * 64 * sizeof(sint16)); + } +} + +void +rfx_encode_RGB_to_YCbCr(sint16 * y_r_buf, sint16 * cb_g_buf, sint16 * cr_b_buf) +{ + sint16 y, cb, cr; + sint16 r, g, b; + + int i; + for (i = 0; i < 4096; i++) + { + r = y_r_buf[i]; + g = cb_g_buf[i]; + b = cr_b_buf[i]; + y = ((r >> 2) + (r >> 5) + (r >> 6)) + ((g >> 1) + (g >> 4) + (g >> 6) + (g >> 7)) + ((b >> 4) + (b >> 5) + (b >> 6)); + y_r_buf[i] = MINMAX(y, 0, 255) - 128; + cb = 0 - ((r >> 3) + (r >> 5) + (r >> 7)) - ((g >> 2) + (g >> 4) + (g >> 6)) + (b >> 1); + cb_g_buf[i] = MINMAX(cb, -128, 127); + cr = (r >> 1) - ((g >> 2) + (g >> 3) + (g >> 5) + (g >> 7)) - ((b >> 4) + (b >> 6)); + cr_b_buf[i] = MINMAX(cr, -128, 127); + } +} + +static void +rfx_encode_component(RFX_CONTEXT * context, const uint32 * quantization_values, + sint16 * data, uint8 * buffer, int buffer_size, int * size) +{ + PROFILER_ENTER(context->prof_rfx_encode_component); + + PROFILER_ENTER(context->prof_rfx_dwt_2d_encode); + context->dwt_2d_encode(data, context->dwt_buffer); + PROFILER_EXIT(context->prof_rfx_dwt_2d_encode); + + PROFILER_ENTER(context->prof_rfx_quantization_encode); + context->quantization_encode(data, quantization_values); + PROFILER_EXIT(context->prof_rfx_quantization_encode); + + PROFILER_ENTER(context->prof_rfx_differential_encode); + rfx_differential_encode(data + 4032, 64); + PROFILER_EXIT(context->prof_rfx_differential_encode); + + PROFILER_ENTER(context->prof_rfx_rlgr_encode); + *size = rfx_rlgr_encode(context->mode, data, 4096, buffer, buffer_size); + PROFILER_EXIT(context->prof_rfx_rlgr_encode); + + PROFILER_EXIT(context->prof_rfx_encode_component); +} + +void +rfx_encode_rgb(RFX_CONTEXT * context, const uint8 * rgb_data, int width, int height, int rowstride, + const uint32 * y_quants, const uint32 * cb_quants, const uint32 * cr_quants, + uint8 * ycbcr_buffer, int buffer_size, int * y_size, int * cb_size, int * cr_size) +{ + sint16 * y_r_buffer = context->y_r_buffer; + sint16 * cb_g_buffer = context->cb_g_buffer; + sint16 * cr_b_buffer = context->cr_b_buffer; + + PROFILER_ENTER(context->prof_rfx_encode_rgb); + + PROFILER_ENTER(context->prof_rfx_encode_format_RGB); + rfx_encode_format_RGB(rgb_data, width, height, rowstride, + context->pixel_format, y_r_buffer, cb_g_buffer, cr_b_buffer); + PROFILER_EXIT(context->prof_rfx_encode_format_RGB); + + PROFILER_ENTER(context->prof_rfx_encode_RGB_to_YCbCr); + context->encode_RGB_to_YCbCr(context->y_r_buffer, context->cb_g_buffer, context->cr_b_buffer); + PROFILER_EXIT(context->prof_rfx_encode_RGB_to_YCbCr); + + rfx_encode_component(context, y_quants, context->y_r_buffer, ycbcr_buffer, buffer_size, y_size); + ycbcr_buffer += (*y_size); + buffer_size -= (*y_size); + rfx_encode_component(context, cb_quants, context->cb_g_buffer, ycbcr_buffer, buffer_size, cb_size); + ycbcr_buffer += (*cb_size); + buffer_size -= (*cb_size); + rfx_encode_component(context, cr_quants, context->cr_b_buffer, ycbcr_buffer, buffer_size, cr_size); + + PROFILER_EXIT(context->prof_rfx_encode_rgb); +} diff --git a/libfreerdp-rfx/rfx_encode.h b/libfreerdp-rfx/rfx_encode.h new file mode 100644 index 0000000..2fac0be --- /dev/null +++ b/libfreerdp-rfx/rfx_encode.h @@ -0,0 +1,34 @@ +/* + FreeRDP: A Remote Desktop Protocol client. + RemoteFX Codec Library - Decode + + Copyright 2011 Vic Lee + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __RFX_ENCODE_H +#define __RFX_ENCODE_H + +#include <freerdp/rfx.h> + +void +rfx_encode_RGB_to_YCbCr(sint16 * y_r_buf, sint16 * cb_g_buf, sint16 * cr_b_buf); + +void +rfx_encode_rgb(RFX_CONTEXT * context, const uint8 * rgb_data, int width, int height, int rowstride, + const uint32 * y_quants, const uint32 * cb_quants, const uint32 * cr_quants, + uint8 * ycbcr_buffer, int buffer_size, int * y_size, int * cb_size, int * cr_size); + +#endif + diff --git a/libfreerdp-rfx/rfx_quantization.c b/libfreerdp-rfx/rfx_quantization.c index 9d67c0c..67755c0 100644 --- a/libfreerdp-rfx/rfx_quantization.c +++ b/libfreerdp-rfx/rfx_quantization.c @@ -19,17 +19,63 @@ #include "rfx_quantization.h" +static void +rfx_quantization_decode_block(sint16 * buffer, int buffer_size, uint32 factor) +{ + sint16 * dst; + + if (factor <= 6) + return; + + factor -= 6; + for (dst = buffer; buffer_size > 0; dst++, buffer_size--) + { + *dst <<= factor; + } +} + void -rfx_quantization_decode(uint32 * buffer, int buffer_size, uint32 factor) +rfx_quantization_decode(sint16 * buffer, const uint32 * quantization_values) +{ + rfx_quantization_decode_block(buffer, 1024, quantization_values[8]); /* HL1 */ + rfx_quantization_decode_block(buffer + 1024, 1024, quantization_values[7]); /* LH1 */ + rfx_quantization_decode_block(buffer + 2048, 1024, quantization_values[9]); /* HH1 */ + rfx_quantization_decode_block(buffer + 3072, 256, quantization_values[5]); /* HL2 */ + rfx_quantization_decode_block(buffer + 3328, 256, quantization_values[4]); /* LH2 */ + rfx_quantization_decode_block(buffer + 3584, 256, quantization_values[6]); /* HH2 */ + rfx_quantization_decode_block(buffer + 3840, 64, quantization_values[2]); /* HL3 */ + rfx_quantization_decode_block(buffer + 3904, 64, quantization_values[1]); /* LH3 */ + rfx_quantization_decode_block(buffer + 3868, 64, quantization_values[3]); /* HH3 */ + rfx_quantization_decode_block(buffer + 4032, 64, quantization_values[0]); /* LL3 */ +} + +static void +rfx_quantization_encode_block(sint16 * buffer, int buffer_size, uint32 factor) { - uint32 * dst; + sint16 * dst; if (factor <= 6) return; + factor -= 6; for (dst = buffer; buffer_size > 0; dst++, buffer_size--) { - *dst <<= (factor - 6); + *dst >>= factor; } } +void +rfx_quantization_encode(sint16 * buffer, const uint32 * quantization_values) +{ + rfx_quantization_encode_block(buffer, 1024, quantization_values[8]); /* HL1 */ + rfx_quantization_encode_block(buffer + 1024, 1024, quantization_values[7]); /* LH1 */ + rfx_quantization_encode_block(buffer + 2048, 1024, quantization_values[9]); /* HH1 */ + rfx_quantization_encode_block(buffer + 3072, 256, quantization_values[5]); /* HL2 */ + rfx_quantization_encode_block(buffer + 3328, 256, quantization_values[4]); /* LH2 */ + rfx_quantization_encode_block(buffer + 3584, 256, quantization_values[6]); /* HH2 */ + rfx_quantization_encode_block(buffer + 3840, 64, quantization_values[2]); /* HL3 */ + rfx_quantization_encode_block(buffer + 3904, 64, quantization_values[1]); /* LH3 */ + rfx_quantization_encode_block(buffer + 3868, 64, quantization_values[3]); /* HH3 */ + rfx_quantization_encode_block(buffer + 4032, 64, quantization_values[0]); /* LL3 */ +} + diff --git a/libfreerdp-rfx/rfx_quantization.h b/libfreerdp-rfx/rfx_quantization.h index 53d35b8..6a37f28 100644 --- a/libfreerdp-rfx/rfx_quantization.h +++ b/libfreerdp-rfx/rfx_quantization.h @@ -23,7 +23,9 @@ #include <freerdp/rfx.h> void -rfx_quantization_decode(uint32 * buffer, int buffer_size, uint32 factor); +rfx_quantization_decode(sint16 * buffer, const uint32 * quantization_values); +void +rfx_quantization_encode(sint16 * buffer, const uint32 * quantization_values); #endif diff --git a/libfreerdp-rfx/rfx_rlgr.c b/libfreerdp-rfx/rfx_rlgr.c index c727e7c..2d27907 100644 --- a/libfreerdp-rfx/rfx_rlgr.c +++ b/libfreerdp-rfx/rfx_rlgr.c @@ -56,7 +56,7 @@ nZeroesWritten = buffer_size; \ if (nZeroesWritten > 0) \ { \ - memset(dst, 0, nZeroesWritten * sizeof(int)); \ + memset(dst, 0, nZeroesWritten * sizeof(sint16)); \ dst += nZeroesWritten; \ } \ buffer_size -= (nZeroes); \ @@ -72,10 +72,10 @@ _v >>= 1; \ _nbits++; \ } \ -} \ +} /* Converts from (2 * magnitude - sign) to integer */ -#define GetIntFrom2MagSign(twoMs) (((twoMs) & 1) ? -1 * (int)(((twoMs) + 1) >> 1) : (int)((twoMs) >> 1)) +#define GetIntFrom2MagSign(twoMs) (((twoMs) & 1) ? -1 * (sint16)(((twoMs) + 1) >> 1) : (sint16)((twoMs) >> 1)) /* * Update the passed parameter and clamp it to the range [0, KPMAX] @@ -94,11 +94,11 @@ /* Outputs the Golomb/Rice encoding of a non-negative integer */ #define GetGRCode(krp, kr) rfx_rlgr_get_gr_code(bs, krp, kr) -static uint32 +static uint16 rfx_rlgr_get_gr_code(RFX_BITSTREAM * bs, int * krp, int * kr) { int vk; - uint32 mag; + uint16 mag; /* chew up/count leading 1s and escape 0 */ for (vk = 0; GetBits(1) == 1;) @@ -122,13 +122,13 @@ rfx_rlgr_get_gr_code(RFX_BITSTREAM * bs, int * krp, int * kr) } int -rfx_rlgr_decode(RLGR_MODE mode, const uint8 * data, int data_size, uint32 * buffer, int buffer_size) +rfx_rlgr_decode(RLGR_MODE mode, const uint8 * data, int data_size, sint16 * buffer, int buffer_size) { int k; int kp; int kr; int krp; - uint32 * dst; + sint16 * dst; RFX_BITSTREAM * bs; bs = rfx_bitstream_new(); @@ -230,3 +230,202 @@ rfx_rlgr_decode(RLGR_MODE mode, const uint8 * data, int data_size, uint32 * buff return (dst - buffer); } + +/* Returns the next coefficient (a signed int) to encode, from the input stream */ +#define GetNextInput(_n) \ +{ \ + if (data_size > 0) \ + { \ + _n = *data++; \ + data_size--; \ + } \ + else \ + { \ + _n = 0; \ + } \ +} + +/* Emit bitPattern to the output bitstream */ +#define OutputBits(numBits, bitPattern) rfx_bitstream_put_bits(bs, bitPattern, numBits); + +/* Emit a bit (0 or 1), count number of times, to the output bitstream */ +#define OutputBit(count, bit) \ +{ \ + uint16 _b = (bit ? 0xFFFF : 0); \ + int _c = (count); \ + for (; _c > 0; _c -= 16) \ + rfx_bitstream_put_bits(bs, _b, (_c > 16 ? 16 : _c)); \ +} + +/* Converts the input value to (2 * abs(input) - sign(input)), where sign(input) = (input < 0 ? 1 : 0) and returns it */ +#define Get2MagSign(input) ((input) >= 0 ? 2 * (input) : -2 * (input) - 1) + +/* Outputs the Golomb/Rice encoding of a non-negative integer */ +#define CodeGR(krp, val) rfx_rlgr_code_gr(bs, krp, val) + +static void +rfx_rlgr_code_gr(RFX_BITSTREAM * bs, int * krp, uint16 val) +{ + int kr = *krp >> LSGR; + + /* unary part of GR code */ + + uint16 vk = (val) >> kr; + OutputBit(vk, 1); + OutputBit(1, 0); + + /* remainder part of GR code, if needed */ + if (kr) + { + OutputBits(kr, val & ((1 << kr) - 1)); + } + + /* update krp, only if it is not equal to 1 */ + if (vk == 0) + { + UpdateParam(*krp, -2, kr); + } + else if (vk > 1) + { + UpdateParam(*krp, vk, kr); + } +} + +int +rfx_rlgr_encode(RLGR_MODE mode, const sint16 * data, int data_size, uint8 * buffer, int buffer_size) +{ + int k; + int kp; + int kr; + int krp; + RFX_BITSTREAM * bs; + int processed_size; + + bs = rfx_bitstream_new(); + rfx_bitstream_put_buffer(bs, buffer, buffer_size); + + /* initialize the parameters */ + k = 1; + kp = 1 << LSGR; + kr = 1; + krp = 1 << LSGR; + + /* process all the input coefficients */ + while (data_size > 0) + { + int input; + + if (k) + { + int numZeros; + int runmax; + int mag; + int sign; + + /* RUN-LENGTH MODE */ + + /* collect the run of zeros in the input stream */ + numZeros = 0; + GetNextInput(input); + while (input == 0 && data_size > 0) + { + numZeros++; + GetNextInput(input); + } + + // emit output zeros + runmax = 1 << k; + while (numZeros >= runmax) + { + OutputBit(1, 0); /* output a zero bit */ + numZeros -= runmax; + UpdateParam(kp, UP_GR, k); /* update kp, k */ + runmax = 1 << k; + } + + /* output a 1 to terminate runs */ + OutputBit(1, 1); + + /* output the remaining run length using k bits */ + OutputBits(k, numZeros); + + if (input != 0) + { + /* encode the nonzero value using GR coding */ + mag = (input < 0 ? -input : input); /* absolute value of input coefficient */ + sign = (input < 0 ? 1 : 0); /* sign of input coefficient */ + + OutputBit(1, sign); /* output the sign bit */ + CodeGR(&krp, mag - 1); /* output GR code for (mag - 1) */ + + UpdateParam(kp, -DN_GR, k); + } + } + else + { + /* GOLOMB-RICE MODE */ + + if (mode == RLGR1) + { + uint32 twoMs; + + /* RLGR1 variant */ + + /* convert input to (2*magnitude - sign), encode using GR code */ + GetNextInput(input); + twoMs = Get2MagSign(input); + CodeGR(&krp, twoMs); + + /* update k, kp */ + if (twoMs) + { + UpdateParam(kp, UQ_GR, k); + } + else + { + UpdateParam(kp, -DQ_GR, k); + } + } + else /* mode == RLGR3 */ + { + uint32 twoMs1; + uint32 twoMs2; + uint32 sum2Ms; + uint32 nIdx; + + /* RLGR3 variant */ + + /* convert the next two input values to (2*magnitude - sign) and */ + /* encode their sum using GR code */ + + GetNextInput(input); + twoMs1 = Get2MagSign(input); + GetNextInput(input); + twoMs2 = Get2MagSign(input); + sum2Ms = twoMs1 + twoMs2; + + CodeGR(&krp, sum2Ms); + + /* encode binary representation of the first input (twoMs1). */ + GetMinBits(sum2Ms, nIdx); + OutputBits(nIdx, twoMs1); + + /* update k,kp for the two input values */ + + if (twoMs1 && twoMs2) + { + UpdateParam(kp, -2 * DQ_GR, k); + } + else if (!twoMs1 && !twoMs2) + { + UpdateParam(kp, 2 * UQ_GR, k); + } + } + } + } + + processed_size = rfx_bitstream_get_processed_bytes(bs); + rfx_bitstream_free(bs); + + return processed_size; +} diff --git a/libfreerdp-rfx/rfx_rlgr.h b/libfreerdp-rfx/rfx_rlgr.h index 3f988c4..75aa7e4 100644 --- a/libfreerdp-rfx/rfx_rlgr.h +++ b/libfreerdp-rfx/rfx_rlgr.h @@ -23,7 +23,9 @@ #include <freerdp/rfx.h> int -rfx_rlgr_decode(RLGR_MODE mode, const uint8 * data, int data_size, uint32 * buffer, int buffer_size); +rfx_rlgr_decode(RLGR_MODE mode, const uint8 * data, int data_size, sint16 * buffer, int buffer_size); +int +rfx_rlgr_encode(RLGR_MODE mode, const sint16 * data, int data_size, uint8 * buffer, int buffer_size); #endif diff --git a/libfreerdp-rfx/sse/Makefile.am b/libfreerdp-rfx/sse/Makefile.am index a97edae..faa9796 100644 --- a/libfreerdp-rfx/sse/Makefile.am +++ b/libfreerdp-rfx/sse/Makefile.am @@ -3,9 +3,13 @@ # libfreerdp-rfx-sse noinst_LTLIBRARIES = libfreerdp-rfx-sse.la -libfreerdp_rfx_sse_la_SOURCES = \ +libfreerdp_rfx_sse_la_SOURCES = + +if WITH_SSE +libfreerdp_rfx_sse_la_SOURCES += \ rfx_sse.c rfx_sse.h \ rfx_sse2.c rfx_sse2.h +endif libfreerdp_rfx_sse_la_CFLAGS = \ -I$(top_srcdir) \ diff --git a/libfreerdp-rfx/sse/rfx_sse.c b/libfreerdp-rfx/sse/rfx_sse.c index 7353e37..76a632d 100644 --- a/libfreerdp-rfx/sse/rfx_sse.c +++ b/libfreerdp-rfx/sse/rfx_sse.c @@ -22,11 +22,23 @@ #include <string.h> #include "rfx_sse2.h" - #include "rfx_sse.h" void rfx_init_sse(RFX_CONTEXT * context) { - DEBUG_RFX("Using SSE2 optimizations"); - context->decode_YCbCr_to_RGB = rfx_decode_YCbCr_to_RGB_SSE2; + DEBUG_RFX("Using SSE2 optimizations"); + + IF_PROFILER(context->prof_rfx_decode_YCbCr_to_RGB->name = "rfx_decode_YCbCr_to_RGB_SSE2"); + IF_PROFILER(context->prof_rfx_encode_RGB_to_YCbCr->name = "rfx_encode_RGB_to_YCbCr_SSE2"); + IF_PROFILER(context->prof_rfx_quantization_decode->name = "rfx_quantization_decode_SSE2"); + IF_PROFILER(context->prof_rfx_quantization_encode->name = "rfx_quantization_encode_SSE2"); + IF_PROFILER(context->prof_rfx_dwt_2d_decode->name = "rfx_dwt_2d_decode_SSE2"); + IF_PROFILER(context->prof_rfx_dwt_2d_encode->name = "rfx_dwt_2d_encode_SSE2"); + + context->decode_YCbCr_to_RGB = rfx_decode_YCbCr_to_RGB_SSE2; + context->encode_RGB_to_YCbCr = rfx_encode_RGB_to_YCbCr_SSE2; + context->quantization_decode = rfx_quantization_decode_SSE2; + context->quantization_encode = rfx_quantization_encode_SSE2; + context->dwt_2d_decode = rfx_dwt_2d_decode_SSE2; + context->dwt_2d_encode = rfx_dwt_2d_encode_SSE2; } diff --git a/libfreerdp-rfx/sse/rfx_sse.h b/libfreerdp-rfx/sse/rfx_sse.h index 48b8f29..e5be61c 100644 --- a/libfreerdp-rfx/sse/rfx_sse.h +++ b/libfreerdp-rfx/sse/rfx_sse.h @@ -31,20 +31,12 @@ void rfx_init_sse(RFX_CONTEXT * context); #define RFX_INIT_SIMD(_rfx_context) rfx_init_sse(_rfx_context) #endif -static __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_between_ps (__m128 val, __m128 min, __m128 max) +static __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_between_epi16 (__m128i val, __m128i min, __m128i max) { - __m128 ret; - ret = _mm_max_ps(val, min); - return _mm_min_ps(ret, max); -} - -static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cvtps_epi32_and_store (__m128i * loc, __m128 val) -{ - __m128i tmp; - tmp = _mm_cvtps_epi32(val); - _mm_stream_si128(loc, tmp); + __m128i ret; + ret = _mm_max_epi16(val, min); + return _mm_min_epi16(ret, max); } #endif /* __RFX_SSE_H */ diff --git a/libfreerdp-rfx/sse/rfx_sse2.c b/libfreerdp-rfx/sse/rfx_sse2.c index c796c92..3434ec5 100644 --- a/libfreerdp-rfx/sse/rfx_sse2.c +++ b/libfreerdp-rfx/sse/rfx_sse2.c @@ -25,57 +25,583 @@ #include "rfx_sse2.h" -void rfx_decode_YCbCr_to_RGB_SSE2(uint32 * y_r_buffer, uint32 * cb_g_buffer, uint32 * cr_b_buffer) +#define CACHE_LINE_BYTES 64 + +static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_prefetch_buffer(char * buffer, int num_bytes) { - __m128 y_add = _mm_set_ps1(128.0f); - __m128 r_cr_t = _mm_set_ps1(1.403f); - __m128 g_cb_t = _mm_set_ps1(-0.344f); - __m128 g_cr_t = _mm_set_ps1(-0.714f); - __m128 b_cb_t = _mm_set_ps1(1.77f); + __m128i * buf = (__m128i*) buffer; + int i; + for (i = 0; i < (num_bytes / sizeof(__m128i)); i+=(CACHE_LINE_BYTES / sizeof(__m128i))) + { + _mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA); + } +} + +void +rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer) +{ + __m128i zero = _mm_setzero_si128(); + __m128i max = _mm_set1_epi16(255); + + __m128i * y_r_buf = (__m128i*) y_r_buffer; + __m128i * cb_g_buf = (__m128i*) cb_g_buffer; + __m128i * cr_b_buf = (__m128i*) cr_b_buffer; + + __m128i y; + __m128i cr; + __m128i cb; + __m128i r; + __m128i g; + __m128i b; + + int i; - __m128 min = _mm_set_ps1(0.0f); - __m128 max = _mm_set_ps1(255.0f); + for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i))) + { + _mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA); + _mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA); + _mm_prefetch((char*)(&cr_b_buf[i]), _MM_HINT_NTA); + } + for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++) + { + /* y = y_r_buf[i] + 128; */ + y = _mm_load_si128(&y_r_buf[i]); + y = _mm_add_epi16(y, _mm_set1_epi16(128)); - __m128 y, cb, cr; - __m128 r, g, b, tmp; + /* cr = cr_b_buf[i]; */ + cr = _mm_load_si128(&cr_b_buf[i]); + + /* r = between(y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5), 0, 255); */ + r = _mm_add_epi16(y, cr); + r = _mm_add_epi16(r, _mm_srai_epi16(cr, 2)); + r = _mm_add_epi16(r, _mm_srai_epi16(cr, 3)); + r = _mm_add_epi16(r, _mm_srai_epi16(cr, 5)); + r = _mm_between_epi16(r, zero, max); + _mm_store_si128(&y_r_buf[i], r); + + /* cb = cb_g_buf[i]; */ + cb = _mm_load_si128(&cb_g_buf[i]); + + /* g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1) - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255); */ + g = _mm_sub_epi16(y, _mm_srai_epi16(cb, 2)); + g = _mm_sub_epi16(g, _mm_srai_epi16(cb, 4)); + g = _mm_sub_epi16(g, _mm_srai_epi16(cb, 5)); + g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 1)); + g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 3)); + g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 4)); + g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 5)); + g = _mm_between_epi16(g, zero, max); + _mm_store_si128(&cb_g_buf[i], g); + + /* b = between(y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6), 0, 255); */ + b = _mm_add_epi16(y, cb); + b = _mm_add_epi16(b, _mm_srai_epi16(cb, 1)); + b = _mm_add_epi16(b, _mm_srai_epi16(cb, 2)); + b = _mm_add_epi16(b, _mm_srai_epi16(cb, 6)); + b = _mm_between_epi16(b, zero, max); + _mm_store_si128(&cr_b_buf[i], b); + } +} + +void +rfx_encode_RGB_to_YCbCr_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer) +{ + __m128i min = _mm_set1_epi16(-128); + __m128i max = _mm_set1_epi16(127); __m128i * y_r_buf = (__m128i*) y_r_buffer; __m128i * cb_g_buf = (__m128i*) cb_g_buffer; __m128i * cr_b_buf = (__m128i*) cr_b_buffer; + __m128i y; + __m128i cr; + __m128i cb; + __m128i r; + __m128i g; + __m128i b; + int i; - for (i = 0; i < (4096 / 4); i++) + + for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i))) { - y = _mm_cvtepi32_ps(*y_r_buf); - cb = _mm_cvtepi32_ps(*cb_g_buf); - cr = _mm_cvtepi32_ps(*cr_b_buf); - - /* y = y + 128 */ - y = _mm_add_ps(y, y_add); - - /* r = between(y + (cr * 1.403), 0, 255) */ - r = _mm_mul_ps(cr, r_cr_t); - r = _mm_add_ps(r, y); - r = _mm_between_ps(r, min, max); - _mm_cvtps_epi32_and_store(y_r_buf, r); - - /* g = between(y + (cb * -0.344) + (cr * -0.714), 0, 255) */ - g = _mm_mul_ps(cb, g_cb_t); - tmp = _mm_mul_ps(cr, g_cr_t); - g = _mm_add_ps(g, tmp); - g = _mm_add_ps(g, y); - g = _mm_between_ps(g, min, max); - _mm_cvtps_epi32_and_store(cb_g_buf, g); - - /* b = between(y + (cb * 1.77), 0, 255) */ - b = _mm_mul_ps(cb, b_cb_t); - b = _mm_add_ps(b, y); - b = _mm_between_ps(b, min, max); - _mm_cvtps_epi32_and_store(cr_b_buf, b); - - y_r_buf++; - cb_g_buf++; - cr_b_buf++; + _mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA); + _mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA); + _mm_prefetch((char*)(&cr_b_buf[i]), _MM_HINT_NTA); } + for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++) + { + /* r = y_r_buf[i]; */ + r = _mm_load_si128(&y_r_buf[i]); + + /* g = cb_g_buf[i]; */ + g = _mm_load_si128(&cb_g_buf[i]); + + /* b = cr_b_buf[i]; */ + b = _mm_load_si128(&cr_b_buf[i]); + + /* y = ((r >> 2) + (r >> 5) + (r >> 6)) + ((g >> 1) + (g >> 4) + (g >> 6) + (g >> 7)) + ((b >> 4) + (b >> 5) + (b >> 6)); */ + /* y_r_buf[i] = MINMAX(y, 0, 255) - 128; */ + y = _mm_add_epi16(_mm_srai_epi16(r, 2), _mm_srai_epi16(r, 5)); + y = _mm_add_epi16(y, _mm_srai_epi16(r, 6)); + y = _mm_add_epi16(y, _mm_srai_epi16(g, 1)); + y = _mm_add_epi16(y, _mm_srai_epi16(g, 4)); + y = _mm_add_epi16(y, _mm_srai_epi16(g, 6)); + y = _mm_add_epi16(y, _mm_srai_epi16(g, 7)); + y = _mm_add_epi16(y, _mm_srai_epi16(b, 4)); + y = _mm_add_epi16(y, _mm_srai_epi16(b, 5)); + y = _mm_add_epi16(y, _mm_srai_epi16(b, 6)); + y = _mm_add_epi16(y, min); + y = _mm_between_epi16(y, min, max); + _mm_store_si128(&y_r_buf[i], y); + + /* cb = 0 - ((r >> 3) + (r >> 5) + (r >> 7)) - ((g >> 2) + (g >> 4) + (g >> 6)) + (b >> 1); */ + /* cb_g_buf[i] = MINMAX(cb, -128, 127); */ + cb = _mm_sub_epi16(_mm_srai_epi16(b, 1), _mm_srai_epi16(r, 3)); + cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 5)); + cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 7)); + cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 2)); + cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 4)); + cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 6)); + cb = _mm_between_epi16(cb, min, max); + _mm_store_si128(&cb_g_buf[i], cb); + + /* cr = (r >> 1) - ((g >> 2) + (g >> 3) + (g >> 5) + (g >> 7)) - ((b >> 4) + (b >> 6)); */ + /* cr_b_buf[i] = MINMAX(cr, -128, 127); */ + cr = _mm_sub_epi16(_mm_srai_epi16(r, 1), _mm_srai_epi16(g, 2)); + cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 3)); + cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 5)); + cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 7)); + cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 4)); + cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 6)); + cr = _mm_between_epi16(cr, min, max); + _mm_store_si128(&cr_b_buf[i], cr); + } +} + +static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +rfx_quantization_decode_block_SSE2(sint16 * buffer, const int buffer_size, const uint32 factor) +{ + int shift = factor-6; + if (shift <= 0) + return; + + __m128i a; + __m128i * ptr = (__m128i*) buffer; + __m128i * buf_end = (__m128i*) (buffer + buffer_size); + do + { + a = _mm_load_si128(ptr); + a = _mm_slli_epi16(a, shift); + _mm_store_si128(ptr, a); + + ptr++; + } while(ptr < buf_end); +} + +void +rfx_quantization_decode_SSE2(sint16 * buffer, const uint32 * quantization_values) +{ + _mm_prefetch_buffer((char *) buffer, 4096 * sizeof(sint16)); + + rfx_quantization_decode_block_SSE2(buffer, 1024, quantization_values[8]); /* HL1 */ + rfx_quantization_decode_block_SSE2(buffer + 1024, 1024, quantization_values[7]); /* LH1 */ + rfx_quantization_decode_block_SSE2(buffer + 2048, 1024, quantization_values[9]); /* HH1 */ + rfx_quantization_decode_block_SSE2(buffer + 3072, 256, quantization_values[5]); /* HL2 */ + rfx_quantization_decode_block_SSE2(buffer + 3328, 256, quantization_values[4]); /* LH2 */ + rfx_quantization_decode_block_SSE2(buffer + 3584, 256, quantization_values[6]); /* HH2 */ + rfx_quantization_decode_block_SSE2(buffer + 3840, 64, quantization_values[2]); /* HL3 */ + rfx_quantization_decode_block_SSE2(buffer + 3904, 64, quantization_values[1]); /* LH3 */ + rfx_quantization_decode_block_SSE2(buffer + 3868, 64, quantization_values[3]); /* HH3 */ + rfx_quantization_decode_block_SSE2(buffer + 4032, 64, quantization_values[0]); /* LL3 */ +} + +static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +rfx_quantization_encode_block_SSE2(sint16 * buffer, const int buffer_size, const uint32 factor) +{ + int shift = factor-6; + if (shift <= 0) + return; + + __m128i a; + __m128i * ptr = (__m128i*) buffer; + __m128i * buf_end = (__m128i*) (buffer + buffer_size); + do + { + a = _mm_load_si128(ptr); + a = _mm_srai_epi16(a, shift); + _mm_store_si128(ptr, a); + + ptr++; + } while(ptr < buf_end); +} + +void +rfx_quantization_encode_SSE2(sint16 * buffer, const uint32 * quantization_values) +{ + _mm_prefetch_buffer((char *) buffer, 4096 * sizeof(sint16)); + + rfx_quantization_encode_block_SSE2(buffer, 1024, quantization_values[8]); /* HL1 */ + rfx_quantization_encode_block_SSE2(buffer + 1024, 1024, quantization_values[7]); /* LH1 */ + rfx_quantization_encode_block_SSE2(buffer + 2048, 1024, quantization_values[9]); /* HH1 */ + rfx_quantization_encode_block_SSE2(buffer + 3072, 256, quantization_values[5]); /* HL2 */ + rfx_quantization_encode_block_SSE2(buffer + 3328, 256, quantization_values[4]); /* LH2 */ + rfx_quantization_encode_block_SSE2(buffer + 3584, 256, quantization_values[6]); /* HH2 */ + rfx_quantization_encode_block_SSE2(buffer + 3840, 64, quantization_values[2]); /* HL3 */ + rfx_quantization_encode_block_SSE2(buffer + 3904, 64, quantization_values[1]); /* LH3 */ + rfx_quantization_encode_block_SSE2(buffer + 3868, 64, quantization_values[3]); /* HH3 */ + rfx_quantization_encode_block_SSE2(buffer + 4032, 64, quantization_values[0]); /* LL3 */ } +static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +rfx_dwt_2d_decode_block_horiz_SSE2(sint16 * l, sint16 * h, sint16 * dst, int subband_width) +{ + int y, n; + sint16 * l_ptr = l; + sint16 * h_ptr = h; + sint16 * dst_ptr = dst; + int first; + int last; + __m128i l_n; + __m128i h_n; + __m128i h_n_m; + __m128i tmp_n; + __m128i dst_n; + __m128i dst_n_p; + __m128i dst1; + __m128i dst2; + + for (y = 0; y < subband_width; y++) + { + /* Even coefficients */ + for (n = 0; n < subband_width; n+=8) + { + /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */ + + l_n = _mm_load_si128((__m128i*) l_ptr); + + h_n = _mm_load_si128((__m128i*) h_ptr); + h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - 1)); + if (n == 0) + { + first = _mm_extract_epi16(h_n_m, 1); + h_n_m = _mm_insert_epi16(h_n_m, first, 0); + } + + tmp_n = _mm_add_epi16(h_n, h_n_m); + tmp_n = _mm_add_epi16(tmp_n, _mm_set1_epi16(1)); + tmp_n = _mm_srai_epi16(tmp_n, 1); + + dst_n = _mm_sub_epi16(l_n, tmp_n); + + _mm_store_si128((__m128i*) l_ptr, dst_n); + + l_ptr+=8; + h_ptr+=8; + } + l_ptr -= subband_width; + h_ptr -= subband_width; + + /* Odd coefficients */ + for (n = 0; n < subband_width; n+=8) + { + /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */ + + h_n = _mm_load_si128((__m128i*) h_ptr); + + h_n = _mm_slli_epi16(h_n, 1); + + dst_n = _mm_load_si128((__m128i*) (l_ptr)); + dst_n_p = _mm_loadu_si128((__m128i*) (l_ptr + 1)); + if (n == subband_width - 8) + { + last = _mm_extract_epi16(dst_n_p, 6); + dst_n_p = _mm_insert_epi16(dst_n_p, last, 7); + } + + tmp_n = _mm_add_epi16(dst_n_p, dst_n); + tmp_n = _mm_srai_epi16(tmp_n, 1); + + tmp_n = _mm_add_epi16(tmp_n, h_n); + + dst1 = _mm_unpacklo_epi16(dst_n, tmp_n); + dst2 = _mm_unpackhi_epi16(dst_n, tmp_n); + + _mm_store_si128((__m128i*) dst_ptr, dst1); + _mm_store_si128((__m128i*) (dst_ptr + 8), dst2); + + l_ptr+=8; + h_ptr+=8; + dst_ptr+=16; + } + } +} + +static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +rfx_dwt_2d_decode_block_vert_SSE2(sint16 * l, sint16 * h, sint16 * dst, int subband_width) +{ + int x, n; + sint16 * l_ptr = l; + sint16 * h_ptr = h; + sint16 * dst_ptr = dst; + __m128i l_n; + __m128i h_n; + __m128i tmp_n; + __m128i h_n_m; + __m128i dst_n; + __m128i dst_n_m; + __m128i dst_n_p; + + int total_width = subband_width + subband_width; + + /* Even coefficients */ + for (n = 0; n < subband_width; n++) + { + for (x = 0; x < total_width; x+=8) + { + /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */ + + l_n = _mm_load_si128((__m128i*) l_ptr); + h_n = _mm_load_si128((__m128i*) h_ptr); + + tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1));; + if (n == 0) + tmp_n = _mm_add_epi16(tmp_n, h_n); + else + { + h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - total_width)); + tmp_n = _mm_add_epi16(tmp_n, h_n_m); + } + tmp_n = _mm_srai_epi16(tmp_n, 1); + + dst_n = _mm_sub_epi16(l_n, tmp_n); + _mm_store_si128((__m128i*) dst_ptr, dst_n); + + l_ptr+=8; + h_ptr+=8; + dst_ptr+=8; + } + dst_ptr+=total_width; + } + + h_ptr = h; + dst_ptr = dst + total_width; + + /* Odd coefficients */ + for (n = 0; n < subband_width; n++) + { + for (x = 0; x < total_width; x+=8) + { + /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */ + + h_n = _mm_load_si128((__m128i*) h_ptr); + dst_n_m = _mm_load_si128((__m128i*) (dst_ptr - total_width)); + h_n = _mm_slli_epi16(h_n, 1); + + tmp_n = dst_n_m; + if (n == subband_width - 1) + tmp_n = _mm_add_epi16(tmp_n, dst_n_m); + else + { + dst_n_p = _mm_loadu_si128((__m128i*) (dst_ptr + total_width)); + tmp_n = _mm_add_epi16(tmp_n, dst_n_p); + } + tmp_n = _mm_srai_epi16(tmp_n, 1); + + dst_n = _mm_add_epi16(tmp_n, h_n); + _mm_store_si128((__m128i*) dst_ptr, dst_n); + + h_ptr+=8; + dst_ptr+=8; + } + dst_ptr+=total_width; + } +} + +static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +rfx_dwt_2d_decode_block_SSE2(sint16 * buffer, sint16 * idwt, int subband_width) +{ + sint16 * hl, * lh, * hh, * ll; + sint16 * l_dst, * h_dst; + + _mm_prefetch_buffer((char *) idwt, subband_width * 4 * sizeof(sint16)); + + /* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt. */ + /* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */ + /* The lower part L uses LL(3) and HL(0). */ + /* The higher part H uses LH(1) and HH(2). */ + + ll = buffer + subband_width * subband_width * 3; + hl = buffer; + l_dst = idwt; + + rfx_dwt_2d_decode_block_horiz_SSE2(ll, hl, l_dst, subband_width); + + lh = buffer + subband_width * subband_width; + hh = buffer + subband_width * subband_width * 2; + h_dst = idwt + subband_width * subband_width * 2; + + rfx_dwt_2d_decode_block_horiz_SSE2(lh, hh, h_dst, subband_width); + + /* Inverse DWT in vertical direction, results are stored in original buffer. */ + rfx_dwt_2d_decode_block_vert_SSE2(l_dst, h_dst, buffer, subband_width); +} + +void +rfx_dwt_2d_decode_SSE2(sint16 * buffer, sint16 * dwt_buffer) +{ + _mm_prefetch_buffer((char *) buffer, 4096 * sizeof(sint16)); + + rfx_dwt_2d_decode_block_SSE2(buffer + 3840, dwt_buffer, 8); + rfx_dwt_2d_decode_block_SSE2(buffer + 3072, dwt_buffer, 16); + rfx_dwt_2d_decode_block_SSE2(buffer, dwt_buffer, 32); +} + +static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +rfx_dwt_2d_encode_block_vert_SSE2(sint16 * src, sint16 * l, sint16 * h, int subband_width) +{ + int total_width; + int x; + int n; + __m128i src_2n; + __m128i src_2n_1; + __m128i src_2n_2; + __m128i h_n; + __m128i h_n_m; + __m128i l_n; + + total_width = subband_width << 1; + + for (n = 0; n < subband_width; n++) + { + for (x = 0; x < total_width; x += 8) + { + src_2n = _mm_load_si128((__m128i*) src); + src_2n_1 = _mm_load_si128((__m128i*) (src + total_width)); + if (n < subband_width - 1) + src_2n_2 = _mm_load_si128((__m128i*) (src + 2 * total_width)); + else + src_2n_2 = src_2n_1; + + /* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */ + + h_n = _mm_add_epi16(src_2n, src_2n_2); + h_n = _mm_srai_epi16(h_n, 1); + h_n = _mm_sub_epi16(src_2n_1, h_n); + h_n = _mm_srai_epi16(h_n, 1); + + _mm_store_si128((__m128i*) h, h_n); + + if (n == 0) + h_n_m = h_n; + else + h_n_m = _mm_load_si128((__m128i*) (h - total_width)); + + /* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */ + + l_n = _mm_add_epi16(h_n_m, h_n); + l_n = _mm_srai_epi16(l_n, 1); + l_n = _mm_add_epi16(l_n, src_2n); + + _mm_store_si128((__m128i*) l, l_n); + + src += 8; + l += 8; + h += 8; + } + src += total_width; + } +} + +static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +rfx_dwt_2d_encode_block_horiz_SSE2(sint16 * src, sint16 * l, sint16 * h, int subband_width) +{ + int y; + int n; + int first; + __m128i src_2n; + __m128i src_2n_1; + __m128i src_2n_2; + __m128i h_n; + __m128i h_n_m; + __m128i l_n; + + for (y = 0; y < subband_width; y++) + { + for (n = 0; n < subband_width; n += 8) + { + /* The following 3 Set operations consumes more than half of the total DWT processing time! */ + src_2n = _mm_set_epi16(src[14], src[12], src[10], src[8], src[6], src[4], src[2], src[0]); + src_2n_1 = _mm_set_epi16(src[15], src[13], src[11], src[9], src[7], src[5], src[3], src[1]); + src_2n_2 = _mm_set_epi16(n == subband_width - 8 ? src[15] : src[16], + src[14], src[12], src[10], src[8], src[6], src[4], src[2]); + + /* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */ + + h_n = _mm_add_epi16(src_2n, src_2n_2); + h_n = _mm_srai_epi16(h_n, 1); + h_n = _mm_sub_epi16(src_2n_1, h_n); + h_n = _mm_srai_epi16(h_n, 1); + + _mm_store_si128((__m128i*) h, h_n); + + h_n_m = _mm_loadu_si128((__m128i*) (h - 1)); + if (n == 0) + { + first = _mm_extract_epi16(h_n_m, 1); + h_n_m = _mm_insert_epi16(h_n_m, first, 0); + } + + /* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */ + + l_n = _mm_add_epi16(h_n_m, h_n); + l_n = _mm_srai_epi16(l_n, 1); + l_n = _mm_add_epi16(l_n, src_2n); + + _mm_store_si128((__m128i*) l, l_n); + + src += 16; + l += 8; + h += 8; + } + } +} + +static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +rfx_dwt_2d_encode_block_SSE2(sint16 * buffer, sint16 * dwt, int subband_width) +{ + sint16 * hl, * lh, * hh, * ll; + sint16 * l_src, * h_src; + + _mm_prefetch_buffer((char *) dwt, subband_width * 4 * sizeof(sint16)); + + /* DWT in vertical direction, results in 2 sub-bands in L, H order in tmp buffer dwt. */ + + l_src = dwt; + h_src = dwt + subband_width * subband_width * 2; + + rfx_dwt_2d_encode_block_vert_SSE2(buffer, l_src, h_src, subband_width); + + /* DWT in horizontal direction, results in 4 sub-bands in HL(0), LH(1), HH(2), LL(3) order, stored in original buffer. */ + /* The lower part L generates LL(3) and HL(0). */ + /* The higher part H generates LH(1) and HH(2). */ + + ll = buffer + subband_width * subband_width * 3; + hl = buffer; + + lh = buffer + subband_width * subband_width; + hh = buffer + subband_width * subband_width * 2; + + rfx_dwt_2d_encode_block_horiz_SSE2(l_src, ll, hl, subband_width); + rfx_dwt_2d_encode_block_horiz_SSE2(h_src, lh, hh, subband_width); +} + +void +rfx_dwt_2d_encode_SSE2(sint16 * buffer, sint16 * dwt_buffer) +{ + _mm_prefetch_buffer((char *) buffer, 4096 * sizeof(sint16)); + + rfx_dwt_2d_encode_block_SSE2(buffer, dwt_buffer, 32); + rfx_dwt_2d_encode_block_SSE2(buffer + 3072, dwt_buffer, 16); + rfx_dwt_2d_encode_block_SSE2(buffer + 3840, dwt_buffer, 8); +} diff --git a/libfreerdp-rfx/sse/rfx_sse2.h b/libfreerdp-rfx/sse/rfx_sse2.h index 0c16180..85921da 100644 --- a/libfreerdp-rfx/sse/rfx_sse2.h +++ b/libfreerdp-rfx/sse/rfx_sse2.h @@ -22,6 +22,11 @@ #include <freerdp/rfx.h> -void rfx_decode_YCbCr_to_RGB_SSE2(uint32 * y_r_buffer, uint32 * cb_g_buffer, uint32 * cr_b_buffer); +void rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer); +void rfx_encode_RGB_to_YCbCr_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer); +void rfx_quantization_decode_SSE2(sint16 * buffer, const uint32 * quantization_values); +void rfx_quantization_encode_SSE2(sint16 * buffer, const uint32 * quantization_values); +void rfx_dwt_2d_decode_SSE2(sint16 * buffer, sint16 * dwt_buffer); +void rfx_dwt_2d_encode_SSE2(sint16 * buffer, sint16 * dwt_buffer); #endif /* __RFX_SSE2_H */ diff --git a/libfreerdp-utils/Makefile.am b/libfreerdp-utils/Makefile.am index 6db6df1..d6912fd 100644 --- a/libfreerdp-utils/Makefile.am +++ b/libfreerdp-utils/Makefile.am @@ -11,7 +11,9 @@ libfreerdp_utils_la_SOURCES = \ semaphore.c \ unicode.c \ wait_obj.c \ - chan_plugin.c + chan_plugin.c \ + stopwatch.c \ + profiler.c libfreerdp_utils_la_CFLAGS = \ -I$(top_srcdir) \ diff --git a/libfreerdp-utils/profiler.c b/libfreerdp-utils/profiler.c new file mode 100644 index 0000000..308c78c --- /dev/null +++ b/libfreerdp-utils/profiler.c @@ -0,0 +1,72 @@ +/* + FreeRDP: A Remote Desktop Protocol client. + Profiler Utils + + Copyright 2011 Stephen Erisman + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include <freerdp/utils/profiler.h> + +PROFILER * profiler_create(char * name) +{ + PROFILER * profiler; + + profiler = (PROFILER *) xmalloc(sizeof(PROFILER)); + + profiler->name = name; + profiler->stopwatch = stopwatch_create(); + + return profiler; +} + +void profiler_free(PROFILER * profiler) +{ + stopwatch_free(profiler->stopwatch); + + xfree(profiler); +} + +void profiler_enter(PROFILER * profiler) +{ + stopwatch_start(profiler->stopwatch); +} + +void profiler_exit(PROFILER * profiler) +{ + stopwatch_stop(profiler->stopwatch); +} + +void profiler_print_header() +{ + printf("\n"); + printf(" |-----------------------|\n" ); + printf(" PROFILER | elapsed seconds |\n" ); + printf("|--------------------------------------------|-----------------------|\n" ); + printf("| code section | iterations | total | avg. |\n" ); + printf("|-------------------------------|------------|-----------|-----------|\n" ); +} + +void profiler_print(PROFILER * profiler) +{ + double elapsed_sec = stopwatch_get_elapsed_time_in_seconds(profiler->stopwatch); + double avg_sec = elapsed_sec / (double) profiler->stopwatch->count; + + printf("| %-30.30s| %'10lu | %'9f | %'9f |\n", profiler->name, profiler->stopwatch->count, elapsed_sec, avg_sec); +} + +void profiler_print_footer() +{ + printf("|--------------------------------------------------------------------|\n" ); +} diff --git a/libfreerdp-utils/stopwatch.c b/libfreerdp-utils/stopwatch.c new file mode 100644 index 0000000..c13365c --- /dev/null +++ b/libfreerdp-utils/stopwatch.c @@ -0,0 +1,60 @@ +/* + FreeRDP: A Remote Desktop Protocol client. + Stopwatch Utils + + Copyright 2011 Stephen Erisman + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include <freerdp/utils/stopwatch.h> + +STOPWATCH * stopwatch_create() +{ + STOPWATCH * sw; + + sw = (STOPWATCH *) xmalloc(sizeof(STOPWATCH)); + stopwatch_reset(sw); + + return sw; +} + +void stopwatch_free(STOPWATCH * stopwatch) +{ + xfree(stopwatch); +} + +void stopwatch_start(STOPWATCH * stopwatch) +{ + stopwatch->start = clock(); + stopwatch->count++; +} + +void stopwatch_stop(STOPWATCH * stopwatch) +{ + stopwatch->end = clock(); + stopwatch->elapsed += (stopwatch->end - stopwatch->start); +} + +void stopwatch_reset(STOPWATCH * stopwatch) +{ + stopwatch->start = 0; + stopwatch->end = 0; + stopwatch->elapsed = 0; + stopwatch->count = 0; +} + +double stopwatch_get_elapsed_time_in_seconds(STOPWATCH * stopwatch) +{ + return ((double)stopwatch->elapsed) / CLOCKS_PER_SEC; +} |