diff options
author | Marc-André Moreau <marcandre.moreau@gmail.com> | 2011-06-22 08:23:58 +0400 |
---|---|---|
committer | Marc-André Moreau <marcandre.moreau@gmail.com> | 2011-06-22 08:23:58 +0400 |
commit | 605ec47335f1b3b2d61a2d1c253a1579dc852d05 (patch) | |
tree | b693d696b93b86af4a2e2d3c6badf19dd210c17d | |
parent | 3ccc01ff6a03fdb238cde2e414722013f5f81097 (diff) | |
parent | 8b8194f515c7bf713fbb1855a9caaffbb1b71729 (diff) |
libfreerdp-rfx: merging latest refactoring with remotefx branch
-rw-r--r-- | cunit/test_librfx.c | 56 | ||||
-rw-r--r-- | include/freerdp/rfx.h | 23 | ||||
-rw-r--r-- | include/freerdp/utils/Makefile.am | 2 | ||||
-rw-r--r-- | libfreerdp-rfx/librfx.c | 284 | ||||
-rw-r--r-- | libfreerdp-rfx/rfx_decode.c | 112 | ||||
-rw-r--r-- | libfreerdp-rfx/rfx_encode.c | 146 | ||||
-rw-r--r-- | libfreerdp-rfx/rfx_encode.h | 2 | ||||
-rw-r--r-- | libfreerdp-rfx/rfx_rlgr.c | 15 | ||||
-rw-r--r-- | libfreerdp-rfx/sse/rfx_sse.c | 4 | ||||
-rw-r--r-- | libfreerdp-rfx/sse/rfx_sse2.c | 262 | ||||
-rw-r--r-- | libfreerdp-rfx/sse/rfx_sse2.h | 2 |
11 files changed, 731 insertions, 177 deletions
diff --git a/cunit/test_librfx.c b/cunit/test_librfx.c index 16b8cbf..584ac7a 100644 --- a/cunit/test_librfx.c +++ b/cunit/test_librfx.c @@ -61,7 +61,7 @@ static const uint8 y_data[] = static const uint8 cb_data[] = { - 0x1b, 0x04, 0x7f, 0x04, 0x31, 0x5f, 0xc2, + 0x1b, 0x04, 0x7f, 0x04, 0x31, 0x5f, 0xc2, 0x94, 0xaf, 0x05, 0x29, 0x5e, 0x0a, 0x52, 0xbc, 0x14, 0xa5, 0x78, 0x29, 0x25, 0x78, 0x29, 0x25, 0x78, 0x29, 0x25, 0x68, 0x52, 0x4a, 0xf0, 0x52, 0x4a, 0xf0, 0x52, 0x4a, 0xd0, 0xa4, 0x95, 0xe0, 0xa4, 0x95, 0xe0, 0xa4, 0x95, 0xa1, 0x49, 0x2b, 0xc1, 0x49, 0x2b, 0xc1, 0x49, 0x2b, 0x42, 0x92, @@ -86,7 +86,7 @@ static const uint8 cb_data[] = static const uint8 cr_data[] = { - 0x1b, 0xfc, 0x11, 0xc1, 0x0f, 0x4a, 0xc1, 0x4f, 0x4a, 0xc1, + 0x1b, 0xfc, 0x11, 0xc1, 0x0f, 0x4a, 0xc1, 0x4f, 0x4a, 0xc1, 0x4f, 0x4a, 0xa1, 0x4d, 0x95, 0x42, 0x9e, 0x95, 0x42, 0x9e, 0x95, 0x42, 0x9b, 0x2a, 0x85, 0x3d, 0x2a, 0x85, 0x3d, 0x2a, 0x85, 0x36, 0x55, 0x0a, 0x7a, 0x55, 0x0a, 0x7a, 0x55, 0x0a, 0x6c, 0xaa, 0x14, 0xf4, 0xaa, 0x14, 0xf4, 0xaa, 0x14, 0xd9, 0x54, 0x29, 0xe9, 0x54, 0x29, 0xe9, 0x54, 0x29, @@ -132,6 +132,23 @@ static const uint8 rgb_scanline_data[] = 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, + 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, + + 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, + 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, + 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, + 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, + 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, + 0xFF, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, + 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, + 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, + 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, + 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, + 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, + 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, + 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, + 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, + 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF }; @@ -299,7 +316,7 @@ void test_encode(void) { RFX_CONTEXT * context; - uint8 ycbcr_buffer[16384]; + uint8 ycbcr_buffer[1024000]; int y_size, cb_size, cr_size; int i; uint8 decode_buffer[4096 * 3]; @@ -313,7 +330,7 @@ test_encode(void) context->mode = RLGR3; rfx_context_set_pixel_format(context, RFX_PIXEL_FORMAT_RGB); - rfx_encode_rgb(context, rgb_data, 64 * 3, + rfx_encode_rgb(context, rgb_data, 64, 64, 64 * 3, test_quantization_values, test_quantization_values, test_quantization_values, ycbcr_buffer, sizeof(ycbcr_buffer), &y_size, &cb_size, &cr_size); //dump_buffer(context->cb_g_buffer, 4096); @@ -340,8 +357,15 @@ void test_message(void) { RFX_CONTEXT * context; - uint8 buffer[16384]; + uint8 buffer[1024000]; int size; + int i, j; + RFX_RECT rect = {0, 0, 100, 80}; + RFX_MESSAGE * message; + + rgb_data = (uint8 *) malloc(100 * 80 * 3); + for (i = 0; i < 80; i++) + memcpy(rgb_data + i * 100 * 3, rgb_scanline_data, 100 * 3); context = rfx_context_new(); context->mode = RLGR3; @@ -349,9 +373,27 @@ test_message(void) context->height = 600; rfx_context_set_pixel_format(context, RFX_PIXEL_FORMAT_RGB); - size = rfx_compose_message_header(context, buffer, sizeof(buffer)); + size = rfx_compose_message_header(context, buffer, sizeof(buffer)); /*hexdump(buffer, size);*/ - rfx_process_message(context, buffer, size); + message = rfx_process_message(context, buffer, size); + rfx_message_free(context, message); + + for (i = 0; i < 1000; i++) + { + size = rfx_compose_message_data(context, buffer, sizeof(buffer), + &rect, 1, rgb_data, 100, 80, 100 * 3); + /*hexdump(buffer, size);*/ + message = rfx_process_message(context, buffer, size); + if (i == 0) + { + for (j = 0; j < message->num_tiles; j++) + { + dump_ppm_image(message->tiles[j]->data); + } + } + rfx_message_free(context, message); + } rfx_context_free(context); + free(rgb_data); } diff --git a/include/freerdp/rfx.h b/include/freerdp/rfx.h index 6e2c42f..7cf309c 100644 --- a/include/freerdp/rfx.h +++ b/include/freerdp/rfx.h @@ -41,6 +41,7 @@ extern "C" { #define WBT_FRAME_END 0xCCC5 #define WBT_REGION 0xCCC6 #define WBT_EXTENSION 0xCCC7 +#define CBT_REGION 0xCAC1 #define CBT_TILESET 0xCAC2 #define CBT_TILE 0xCAC3 @@ -125,7 +126,8 @@ typedef struct _RFX_MESSAGE RFX_MESSAGE; struct _RFX_CONTEXT { - int flags; + uint16 flags; + uint16 properties; uint16 width; uint16 height; RLGR_MODE mode; @@ -133,10 +135,15 @@ struct _RFX_CONTEXT uint32 codec_id; uint32 codec_version; RFX_PIXEL_FORMAT pixel_format; + uint8 bytes_per_pixel; /* temporary data within a frame */ + uint32 frame_idx; uint8 num_quants; uint32 * quants; + uint8 quant_idx_y; + uint8 quant_idx_cb; + uint8 quant_idx_cr; /* pre-allocated buffers */ @@ -153,7 +160,6 @@ struct _RFX_CONTEXT sint16 dwt_mem_8[8*8*2*2 + 8]; /* sub-band width 8 */ sint16 dwt_mem_16[16*16*2*2 + 8]; /* sub-band width 16 */ sint16 dwt_mem_32[32*32*2*2 + 8]; /* sub-band width 32 */ - //sint16* dwt_buffers[5]; /* sub-band buffer array */ sint16 * dwt_buffer_8; sint16 * dwt_buffer_16; @@ -163,6 +169,7 @@ struct _RFX_CONTEXT void (* decode_YCbCr_to_RGB)(sint16 * y_r_buf, sint16 * cb_g_buf, sint16 * cr_b_buf); void (* encode_RGB_to_YCbCr)(sint16 * y_r_buf, sint16 * cb_g_buf, sint16 * cr_b_buf); void (* quantization_decode)(sint16 * buffer, const uint32 * quantization_values); + void (* quantization_encode)(sint16 * buffer, const uint32 * quantization_values); void (* dwt_2d_decode)(sint16 * buffer, sint16 * dwt_buffer_8, sint16 * dwt_buffer_16, sint16 * dwt_buffer_32); /* profiler definitions */ @@ -173,6 +180,16 @@ struct _RFX_CONTEXT PROFILER_DEFINE(prof_rfx_quantization_decode); PROFILER_DEFINE(prof_rfx_dwt_2d_decode); PROFILER_DEFINE(prof_rfx_decode_YCbCr_to_RGB); + PROFILER_DEFINE(prof_rfx_decode_format_RGB); + + PROFILER_DEFINE(prof_rfx_encode_rgb); + PROFILER_DEFINE(prof_rfx_encode_component); + PROFILER_DEFINE(prof_rfx_rlgr_encode); + PROFILER_DEFINE(prof_rfx_differential_encode); + PROFILER_DEFINE(prof_rfx_quantization_encode); + PROFILER_DEFINE(prof_rfx_dwt_2d_encode); + PROFILER_DEFINE(prof_rfx_encode_RGB_to_YCbCr); + PROFILER_DEFINE(prof_rfx_encode_format_RGB); }; typedef struct _RFX_CONTEXT RFX_CONTEXT; @@ -185,7 +202,7 @@ void rfx_message_free(RFX_CONTEXT * context, RFX_MESSAGE * message); int rfx_compose_message_header(RFX_CONTEXT * context, uint8 * buffer, int buffer_size); int rfx_compose_message_data(RFX_CONTEXT * context, uint8 * buffer, int buffer_size, - const RFX_RECT * rects, int num_rects, uint8 * image_buffer, int width, int height); + const RFX_RECT * rects, int num_rects, uint8 * image_data, int width, int height, int rowstride); #ifdef __cplusplus } diff --git a/include/freerdp/utils/Makefile.am b/include/freerdp/utils/Makefile.am index eec4969..410babf 100644 --- a/include/freerdp/utils/Makefile.am +++ b/include/freerdp/utils/Makefile.am @@ -6,7 +6,9 @@ include_HEADERS = \ chan_plugin.h \ datablob.h \ memory.h \ + profiler.h \ semaphore.h \ + stopwatch.h \ stream.h \ unicode.h \ wait_obj.h diff --git a/libfreerdp-rfx/librfx.c b/libfreerdp-rfx/librfx.c index 160ea57..b5b046f 100644 --- a/libfreerdp-rfx/librfx.c +++ b/libfreerdp-rfx/librfx.c @@ -33,6 +33,23 @@ #include "librfx.h" +/* + The quantization values control the compression rate and quality. The value + range is between 6 and 15. The higher value, the higher compression rate + and lower quality. + + This is the default values being use by the MS RDP server, and we will also + use it as our default values for the encoder. It can be overrided by setting + the context->num_quants and context->quants member. + + The order of the values are: + LL3, LH3, HL3, HH3, LH2, HL2, HH2, LH1, HL1, HH1 +*/ +static const uint32 rfx_default_quantization_values[] = +{ + 6, 6, 6, 6, 7, 7, 8, 8, 8, 9 +}; + void rfx_profiler_create(RFX_CONTEXT * context) { PROFILER_CREATE(context->prof_rfx_decode_rgb, "rfx_decode_rgb"); @@ -42,6 +59,16 @@ void rfx_profiler_create(RFX_CONTEXT * context) PROFILER_CREATE(context->prof_rfx_quantization_decode, "rfx_quantization_decode"); PROFILER_CREATE(context->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode"); PROFILER_CREATE(context->prof_rfx_decode_YCbCr_to_RGB, "rfx_decode_YCbCr_to_RGB"); + PROFILER_CREATE(context->prof_rfx_decode_format_RGB, "rfx_decode_format_RGB"); + + PROFILER_CREATE(context->prof_rfx_encode_rgb, "rfx_encode_rgb"); + PROFILER_CREATE(context->prof_rfx_encode_component, "rfx_encode_component"); + PROFILER_CREATE(context->prof_rfx_rlgr_encode, "rfx_rlgr_encode"); + PROFILER_CREATE(context->prof_rfx_differential_encode, "rfx_differential_encode"); + PROFILER_CREATE(context->prof_rfx_quantization_encode, "rfx_quantization_encode"); + PROFILER_CREATE(context->prof_rfx_dwt_2d_encode, "rfx_dwt_2d_encode"); + PROFILER_CREATE(context->prof_rfx_encode_RGB_to_YCbCr, "rfx_encode_RGB_to_YCbCr"); + PROFILER_CREATE(context->prof_rfx_encode_format_RGB, "rfx_encode_format_RGB"); } void rfx_profiler_free(RFX_CONTEXT * context) @@ -53,11 +80,22 @@ void rfx_profiler_free(RFX_CONTEXT * context) PROFILER_FREE(context->prof_rfx_quantization_decode); PROFILER_FREE(context->prof_rfx_dwt_2d_decode); PROFILER_FREE(context->prof_rfx_decode_YCbCr_to_RGB); + PROFILER_FREE(context->prof_rfx_decode_format_RGB); + + PROFILER_FREE(context->prof_rfx_encode_rgb); + PROFILER_FREE(context->prof_rfx_encode_component); + PROFILER_FREE(context->prof_rfx_rlgr_encode); + PROFILER_FREE(context->prof_rfx_differential_encode); + PROFILER_FREE(context->prof_rfx_quantization_encode); + PROFILER_FREE(context->prof_rfx_dwt_2d_encode); + PROFILER_FREE(context->prof_rfx_encode_RGB_to_YCbCr); + PROFILER_FREE(context->prof_rfx_encode_format_RGB); } void rfx_profiler_print(RFX_CONTEXT * context) { PROFILER_PRINT_HEADER; + PROFILER_PRINT(context->prof_rfx_decode_rgb); PROFILER_PRINT(context->prof_rfx_decode_component); PROFILER_PRINT(context->prof_rfx_rlgr_decode); @@ -65,6 +103,17 @@ void rfx_profiler_print(RFX_CONTEXT * context) PROFILER_PRINT(context->prof_rfx_quantization_decode); PROFILER_PRINT(context->prof_rfx_dwt_2d_decode); PROFILER_PRINT(context->prof_rfx_decode_YCbCr_to_RGB); + PROFILER_PRINT(context->prof_rfx_decode_format_RGB); + + PROFILER_PRINT(context->prof_rfx_encode_rgb); + PROFILER_PRINT(context->prof_rfx_encode_component); + PROFILER_PRINT(context->prof_rfx_rlgr_encode); + PROFILER_PRINT(context->prof_rfx_differential_encode); + PROFILER_PRINT(context->prof_rfx_quantization_encode); + PROFILER_PRINT(context->prof_rfx_dwt_2d_encode); + PROFILER_PRINT(context->prof_rfx_encode_RGB_to_YCbCr); + PROFILER_PRINT(context->prof_rfx_encode_format_RGB); + PROFILER_PRINT_FOOTER; } @@ -78,6 +127,9 @@ rfx_context_new(void) context->pool = rfx_pool_new(); + /* initialize the default pixel format */ + rfx_context_set_pixel_format(context, RFX_PIXEL_FORMAT_BGRA); + /* align buffers to 16 byte boundary (needed for SSE/SSE2 instructions) */ context->y_r_buffer = (sint16 *)(((uintptr_t)context->y_r_mem + 16) & ~ 0x0F); context->cb_g_buffer = (sint16 *)(((uintptr_t)context->cb_g_mem + 16) & ~ 0x0F); @@ -121,6 +173,20 @@ void rfx_context_set_pixel_format(RFX_CONTEXT * context, RFX_PIXEL_FORMAT pixel_format) { context->pixel_format = pixel_format; + switch (pixel_format) + { + case RFX_PIXEL_FORMAT_BGRA: + case RFX_PIXEL_FORMAT_RGBA: + context->bytes_per_pixel = 4; + break; + case RFX_PIXEL_FORMAT_BGR: + case RFX_PIXEL_FORMAT_RGB: + context->bytes_per_pixel = 3; + break; + default: + context->bytes_per_pixel = 0; + break; + } } static void @@ -204,6 +270,7 @@ rfx_process_message_context(RFX_CONTEXT * context, uint8 * data, int size) DEBUG_RFX("ctxId %d tileSize %d properties 0x%X.", ctxId, tileSize, properties); + context->properties = properties; context->flags = (properties & 0x0007); if (context->flags == CODEC_MODE) @@ -509,6 +576,12 @@ rfx_message_free(RFX_CONTEXT * context, RFX_MESSAGE * message) static int rfx_compose_message_sync(RFX_CONTEXT * context, uint8 * buffer, int buffer_size) { + if (buffer_size < 12) + { + printf("rfx_compose_message_sync: buffer size too small.\n"); + return 0; + } + SET_UINT16(buffer, 0, WBT_SYNC); /* BlockT.blockType */ SET_UINT32(buffer, 2, 12); /* BlockT.blockLen */ SET_UINT32(buffer, 6, WF_MAGIC); /* magic */ @@ -520,6 +593,12 @@ rfx_compose_message_sync(RFX_CONTEXT * context, uint8 * buffer, int buffer_size) static int rfx_compose_message_codec_versions(RFX_CONTEXT * context, uint8 * buffer, int buffer_size) { + if (buffer_size < 10) + { + printf("rfx_compose_message_codec_versions: buffer size too small.\n"); + return 0; + } + SET_UINT16(buffer, 0, WBT_CODEC_VERSIONS); /* BlockT.blockType */ SET_UINT32(buffer, 2, 10); /* BlockT.blockLen */ SET_UINT8(buffer, 6, 1); /* numCodecs */ @@ -532,6 +611,12 @@ rfx_compose_message_codec_versions(RFX_CONTEXT * context, uint8 * buffer, int bu static int rfx_compose_message_channels(RFX_CONTEXT * context, uint8 * buffer, int buffer_size) { + if (buffer_size < 12) + { + printf("rfx_compose_message_channels: buffer size too small.\n"); + return 0; + } + SET_UINT16(buffer, 0, WBT_CHANNELS); /* BlockT.blockType */ SET_UINT32(buffer, 2, 12); /* BlockT.blockLen */ SET_UINT8(buffer, 6, 1); /* numChannels */ @@ -547,6 +632,12 @@ rfx_compose_message_context(RFX_CONTEXT * context, uint8 * buffer, int buffer_si { uint16 properties; + if (buffer_size < 13) + { + printf("rfx_compose_message_context: buffer size too small.\n"); + return 0; + } + SET_UINT16(buffer, 0, WBT_CONTEXT); /* CodecChannelT.blockType */ SET_UINT32(buffer, 2, 13); /* CodecChannelT.blockLen */ SET_UINT8(buffer, 6, 1); /* CodecChannelT.codecId */ @@ -561,6 +652,7 @@ rfx_compose_message_context(RFX_CONTEXT * context, uint8 * buffer, int buffer_si properties |= ((context->mode == RLGR1 ? CLW_ENTROPY_RLGR1 : CLW_ENTROPY_RLGR3) << 9); /* et */ properties |= (SCALAR_QUANTIZATION << 13); /* qt */ SET_UINT16(buffer, 11, properties); + context->properties = properties; return 13; } @@ -581,32 +673,210 @@ rfx_compose_message_header(RFX_CONTEXT * context, uint8 * buffer, int buffer_siz static int rfx_compose_message_frame_begin(RFX_CONTEXT * context, uint8 * buffer, int buffer_size) { - return 0; + if (buffer_size < 14) + { + printf("rfx_compose_message_frame_begin: buffer size too small.\n"); + return 0; + } + + SET_UINT16(buffer, 0, WBT_FRAME_BEGIN); /* CodecChannelT.blockType */ + SET_UINT32(buffer, 2, 14); /* CodecChannelT.blockLen */ + SET_UINT8(buffer, 6, 1); /* CodecChannelT.codecId */ + SET_UINT8(buffer, 7, 0); /* CodecChannelT.channelId */ + SET_UINT32(buffer, 8, context->frame_idx); /* frameIdx */ + SET_UINT16(buffer, 12, 1); /* numRegions */ + + return 14; } static int rfx_compose_message_region(RFX_CONTEXT * context, uint8 * buffer, int buffer_size, const RFX_RECT * rects, int num_rects) { - return 0; + int size; + int i; + + if (buffer_size < 15 + num_rects * 8) + { + printf("rfx_compose_message_region: buffer size too small.\n"); + return 0; + } + + SET_UINT16(buffer, 0, WBT_REGION); /* CodecChannelT.blockType */ + /* set CodecChannelT.blockLen later */ + SET_UINT8(buffer, 6, 1); /* CodecChannelT.codecId */ + SET_UINT8(buffer, 7, 0); /* CodecChannelT.channelId */ + SET_UINT8(buffer, 8, 1); /* regionFlags */ + SET_UINT16(buffer, 9, num_rects); /* numRects */ + size = 11; + + for (i = 0; i < num_rects; i++) + { + SET_UINT16(buffer, size, rects[i].x); + SET_UINT16(buffer, size + 2, rects[i].y); + SET_UINT16(buffer, size + 4, rects[i].width); + SET_UINT16(buffer, size + 6, rects[i].height); + size += 8; + } + + SET_UINT16(buffer, size, CBT_REGION); /* regionType */ + SET_UINT16(buffer, size + 2, 1); /* numTilesets */ + size += 4; + + SET_UINT32(buffer, 2, size); /* CodecChannelT.blockLen */ + return size; +} + +static int +rfx_compose_message_tile(RFX_CONTEXT * context, uint8 * buffer, int buffer_size, + uint8 * tile_data, int tile_width, int tile_height, int rowstride, + const uint32 * quantVals, int quantIdxY, int quantIdxCb, int quantIdxCr, int xIdx, int yIdx) +{ + int YLen = 0; + int CbLen = 0; + int CrLen = 0; + int size; + + if (buffer_size < 19) + { + printf("rfx_compose_message_tile: buffer size too small.\n"); + return 0; + } + + SET_UINT16(buffer, 0, CBT_TILE); /* BlockT.blockType */ + /* set BlockT.blockLen later */ + SET_UINT8(buffer, 6, quantIdxY); /* quantIdxY */ + SET_UINT8(buffer, 7, quantIdxCb); /* quantIdxCb */ + SET_UINT8(buffer, 8, quantIdxCr); /* quantIdxCr */ + SET_UINT16(buffer, 9, xIdx); /* xIdx */ + SET_UINT16(buffer, 11, yIdx); /* yIdx */ + + rfx_encode_rgb(context, tile_data, tile_width, tile_height, rowstride, + quantVals + quantIdxY * 10, quantVals + quantIdxCb * 10, quantVals + quantIdxCr * 10, + buffer + 19, buffer_size - 19, &YLen, &CbLen, &CrLen); + + DEBUG_RFX("xIdx=%d yIdx=%d width=%d height=%d YLen=%d CbLen=%d CrLen=%d", + xIdx, yIdx, tile_width, tile_height, YLen, CbLen, CrLen); + + SET_UINT16(buffer, 13, YLen); /* YLen */ + SET_UINT16(buffer, 15, CbLen); /* CbLen */ + SET_UINT16(buffer, 17, CrLen); /* CrLen */ + size = 19 + YLen + CbLen + CrLen; + SET_UINT32(buffer, 2, size); /* BlockT.blockLen */ + + return size; } static int rfx_compose_message_tileset(RFX_CONTEXT * context, uint8 * buffer, int buffer_size, - uint8 * image_buffer, int width, int height) + uint8 * image_data, int width, int height, int rowstride) { - return 0; + int size; + int i; + int numQuants; + const uint32 * quantVals; + const uint32 * quantValsPtr; + int quantIdxY; + int quantIdxCb; + int quantIdxCr; + int numTiles; + int numTilesX; + int numTilesY; + int xIdx; + int yIdx; + int tilesDataSize; + + if (context->num_quants == 0) + { + numQuants = 1; + quantVals = rfx_default_quantization_values; + quantIdxY = 0; + quantIdxCb = 0; + quantIdxCr = 0; + } + else + { + numQuants = context->num_quants; + quantVals = context->quants; + quantIdxY = context->quant_idx_y; + quantIdxCb = context->quant_idx_cb; + quantIdxCr = context->quant_idx_cr; + } + + numTilesX = (width + 63) / 64; + numTilesY = (height + 63) / 64; + numTiles = numTilesX * numTilesY; + + if (buffer_size < 22 + numQuants * 5) + { + printf("rfx_compose_message_tileset: buffer size too small.\n"); + return 0; + } + + SET_UINT16(buffer, 0, WBT_EXTENSION); /* CodecChannelT.blockType */ + /* set CodecChannelT.blockLen later */ + SET_UINT8(buffer, 6, 1); /* CodecChannelT.codecId */ + SET_UINT8(buffer, 7, 0); /* CodecChannelT.channelId */ + SET_UINT16(buffer, 8, CBT_TILESET); /* subtype */ + SET_UINT16(buffer, 10, 0); /* idx */ + SET_UINT16(buffer, 12, context->properties); /* properties */ + SET_UINT8(buffer, 14, numQuants); /* numQuants */ + SET_UINT8(buffer, 15, 0x40); /* tileSize */ + SET_UINT16(buffer, 16, numTiles); /* numTiles */ + /* set tilesDataSize later */ + size = 22; + + quantValsPtr = quantVals; + for (i = 0; i < numQuants * 5; i++) + { + SET_UINT8(buffer, size, quantValsPtr[0] + (quantValsPtr[1] << 4)); + quantValsPtr += 2; + size++; + } + + DEBUG_RFX("width:%d height:%d rowstride:%d", width, height, rowstride); + + tilesDataSize = 0; + for (yIdx = 0; yIdx < numTilesY; yIdx++) + { + for (xIdx = 0; xIdx < numTilesX; xIdx++) + { + tilesDataSize += rfx_compose_message_tile(context, + buffer + size + tilesDataSize, buffer_size - size - tilesDataSize, + image_data + yIdx * 64 * rowstride + xIdx * 64 * context->bytes_per_pixel, + xIdx < numTilesX - 1 ? 64 : width - xIdx * 64, + yIdx < numTilesY - 1 ? 64 : height - yIdx * 64, + rowstride, quantVals, quantIdxY, quantIdxCb, quantIdxCr, xIdx, yIdx); + } + } + + size += tilesDataSize; + SET_UINT32(buffer, 2, size); /* CodecChannelT.blockLen */ + SET_UINT32(buffer, 18, tilesDataSize); /* tilesDataSize */ + + return size; } static int rfx_compose_message_frame_end(RFX_CONTEXT * context, uint8 * buffer, int buffer_size) { - return 0; + if (buffer_size < 8) + { + printf("rfx_compose_message_frame_end: buffer size too small.\n"); + return 0; + } + + SET_UINT16(buffer, 0, WBT_FRAME_END); /* CodecChannelT.blockType */ + SET_UINT32(buffer, 2, 8); /* CodecChannelT.blockLen */ + SET_UINT8(buffer, 6, 1); /* CodecChannelT.codecId */ + SET_UINT8(buffer, 7, 0); /* CodecChannelT.channelId */ + + return 8; } int rfx_compose_message_data(RFX_CONTEXT * context, uint8 * buffer, int buffer_size, - const RFX_RECT * rects, int num_rects, uint8 * image_buffer, int width, int height) + const RFX_RECT * rects, int num_rects, uint8 * image_data, int width, int height, int rowstride) { int composed_size; @@ -614,7 +884,7 @@ rfx_compose_message_data(RFX_CONTEXT * context, uint8 * buffer, int buffer_size, composed_size += rfx_compose_message_region(context, buffer + composed_size, buffer_size - composed_size, rects, num_rects); composed_size += rfx_compose_message_tileset(context, buffer + composed_size, buffer_size - composed_size, - image_buffer, width, height); + image_data, width, height, rowstride); composed_size += rfx_compose_message_frame_end(context, buffer + composed_size, buffer_size - composed_size); return composed_size; diff --git a/libfreerdp-rfx/rfx_decode.c b/libfreerdp-rfx/rfx_decode.c index 75eab83..2e3180c 100644 --- a/libfreerdp-rfx/rfx_decode.c +++ b/libfreerdp-rfx/rfx_decode.c @@ -27,6 +27,57 @@ #include "rfx_decode.h" +static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +rfx_decode_format_RGB(sint16 * r_buf, sint16 * g_buf, sint16 * b_buf, + RFX_PIXEL_FORMAT pixel_format, uint8 * dst_buf) +{ + sint16 * r = r_buf; + sint16 * g = g_buf; + sint16 * b = b_buf; + uint8 * dst = dst_buf; + int i; + + switch (pixel_format) + { + case RFX_PIXEL_FORMAT_BGRA: + for (i = 0; i < 4096; i++) + { + *dst++ = (uint8) (*b++); + *dst++ = (uint8) (*g++); + *dst++ = (uint8) (*r++); + *dst++ = 0xFF; + } + break; + case RFX_PIXEL_FORMAT_RGBA: + for (i = 0; i < 4096; i++) + { + *dst++ = (uint8) (*r++); + *dst++ = (uint8) (*g++); + *dst++ = (uint8) (*b++); + *dst++ = 0xFF; + } + break; + case RFX_PIXEL_FORMAT_BGR: + for (i = 0; i < 4096; i++) + { + *dst++ = (uint8) (*b++); + *dst++ = (uint8) (*g++); + *dst++ = (uint8) (*r++); + } + break; + case RFX_PIXEL_FORMAT_RGB: + for (i = 0; i < 4096; i++) + { + *dst++ = (uint8) (*r++); + *dst++ = (uint8) (*g++); + *dst++ = (uint8) (*b++); + } + break; + default: + break; + } +} + #define MINMAX(_v,_l,_h) ((_v) < (_l) ? (_l) : ((_v) > (_h) ? (_h) : (_v))) void @@ -81,10 +132,6 @@ rfx_decode_rgb(RFX_CONTEXT * context, const uint8 * cb_data, int cb_size, const uint32 * cb_quants, const uint8 * cr_data, int cr_size, const uint32 * cr_quants, uint8* rgb_buffer) { - int i; - uint8 * dst; - sint16 * r, * g, * b; - PROFILER_ENTER(context->prof_rfx_decode_rgb); dst = rgb_buffer; @@ -96,58 +143,11 @@ rfx_decode_rgb(RFX_CONTEXT * context, context->decode_YCbCr_to_RGB(context->y_r_buffer, context->cb_g_buffer, context->cr_b_buffer); PROFILER_EXIT(context->prof_rfx_decode_YCbCr_to_RGB); - switch (context->pixel_format) - { - case RFX_PIXEL_FORMAT_BGRA: - r = context->y_r_buffer; - g = context->cb_g_buffer; - b = context->cr_b_buffer; - for (i = 0; i < 4096; i++) - { - *dst++ = (uint8) (*b++); - *dst++ = (uint8) (*g++); - *dst++ = (uint8) (*r++); - *dst++ = 0xFF; - } - break; - case RFX_PIXEL_FORMAT_RGBA: - r = context->y_r_buffer; - g = context->cb_g_buffer; - b = context->cr_b_buffer; - for (i = 0; i < 4096; i++) - { - *dst++ = (uint8) (*r++); - *dst++ = (uint8) (*g++); - *dst++ = (uint8) (*b++); - *dst++ = 0xFF; - } - break; - case RFX_PIXEL_FORMAT_BGR: - r = context->y_r_buffer; - g = context->cb_g_buffer; - b = context->cr_b_buffer; - for (i = 0; i < 4096; i++) - { - *dst++ = (uint8) (*b++); - *dst++ = (uint8) (*g++); - *dst++ = (uint8) (*r++); - } - break; - case RFX_PIXEL_FORMAT_RGB: - r = context->y_r_buffer; - g = context->cb_g_buffer; - b = context->cr_b_buffer; - for (i = 0; i < 4096; i++) - { - *dst++ = (uint8) (*r++); - *dst++ = (uint8) (*g++); - *dst++ = (uint8) (*b++); - } - break; - default: - break; - } - + PROFILER_ENTER(context->prof_rfx_decode_format_RGB); + rfx_decode_format_RGB(context->y_r_buffer, context->cb_g_buffer, context->cr_b_buffer, + context->pixel_format, rgb_buffer); + PROFILER_EXIT(context->prof_rfx_decode_format_RGB); + PROFILER_EXIT(context->prof_rfx_decode_rgb); return rgb_buffer; } diff --git a/libfreerdp-rfx/rfx_encode.c b/libfreerdp-rfx/rfx_encode.c index b138120..e458dd1 100644 --- a/libfreerdp-rfx/rfx_encode.c +++ b/libfreerdp-rfx/rfx_encode.c @@ -1,6 +1,6 @@ /* FreeRDP: A Remote Desktop Protocol client. - RemoteFX Codec Library - Decode + RemoteFX Codec Library - Encode Copyright 2011 Vic Lee @@ -29,6 +29,81 @@ #define MINMAX(_v,_l,_h) ((_v) < (_l) ? (_l) : ((_v) > (_h) ? (_h) : (_v))) +static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +rfx_encode_format_RGB(const uint8 * rgb_data, int width, int height, int rowstride, + RFX_PIXEL_FORMAT pixel_format, sint16 * r_buf, sint16 * g_buf, sint16 * b_buf) +{ + int x, y; + int x_exceed; + int y_exceed; + const uint8 * src; + + x_exceed = 64 - width; + y_exceed = 64 - height; + for (y = 0; y < height; y++) + { + src = rgb_data + y * rowstride; + + switch (pixel_format) + { + case RFX_PIXEL_FORMAT_BGRA: + for (x = 0; x < width; x++) + { + *b_buf++ = (sint16) (*src++); + *g_buf++ = (sint16) (*src++); + *r_buf++ = (sint16) (*src++); + src++; + } + break; + case RFX_PIXEL_FORMAT_RGBA: + for (x = 0; x < width; x++) + { + *r_buf++ = (sint16) (*src++); + *g_buf++ = (sint16) (*src++); + *b_buf++ = (sint16) (*src++); + src++; + } + break; + case RFX_PIXEL_FORMAT_BGR: + for (x = 0; x < width; x++) + { + *b_buf++ = (sint16) (*src++); + *g_buf++ = (sint16) (*src++); + *r_buf++ = (sint16) (*src++); + } + break; + case RFX_PIXEL_FORMAT_RGB: + for (x = 0; x < width; x++) + { + *r_buf++ = (sint16) (*src++); + *g_buf++ = (sint16) (*src++); + *b_buf++ = (sint16) (*src++); + } + break; + default: + break; + } + /* Fill the horizontal region outside of 64x64 tile size to 0 in order to be better compressed. */ + if (x_exceed > 0) + { + memset(r_buf, 0, x_exceed * sizeof(sint16)); + memset(g_buf, 0, x_exceed * sizeof(sint16)); + memset(b_buf, 0, x_exceed * sizeof(sint16)); + r_buf += x_exceed; + g_buf += x_exceed; + b_buf += x_exceed; + } + } + + /* Fill the vertical region outside of 64x64 tile size to 0 in order to be better compressed. */ + if (y_exceed > 0) + { + memset(r_buf, 0, y_exceed * 64 * sizeof(sint16)); + memset(g_buf, 0, y_exceed * 64 * sizeof(sint16)); + memset(b_buf, 0, y_exceed * 64 * sizeof(sint16)); + } +} + void rfx_encode_RGB_to_YCbCr(sint16 * y_r_buf, sint16 * cb_g_buf, sint16 * cr_b_buf) { @@ -54,63 +129,46 @@ static void rfx_encode_component(RFX_CONTEXT * context, const uint32 * quantization_values, sint16 * data, uint8 * buffer, int buffer_size, int * size) { - rfx_dwt_2d_encode(data, context->dwt_buffer_8, context->dwt_buffer_16, context->dwt_buffer_32); + PROFILER_ENTER(context->prof_rfx_encode_component); + + PROFILER_ENTER(context->prof_rfx_dwt_2d_encode); + rfx_dwt_2d_encode(data, context->dwt_buffer_8, context->dwt_buffer_16, context->dwt_buffer_32); + PROFILER_EXIT(context->prof_rfx_dwt_2d_encode); - rfx_quantization_encode(data, quantization_values); + PROFILER_ENTER(context->prof_rfx_quantization_encode); + context->quantization_encode(data, quantization_values); + PROFILER_EXIT(context->prof_rfx_quantization_encode); - rfx_differential_encode(data + 4032, 64); + PROFILER_ENTER(context->prof_rfx_differential_encode); + rfx_differential_encode(data + 4032, 64); + PROFILER_EXIT(context->prof_rfx_differential_encode); - *size = rfx_rlgr_encode(context->mode, data, 4096, buffer, buffer_size); + PROFILER_ENTER(context->prof_rfx_rlgr_encode); + *size = rfx_rlgr_encode(context->mode, data, 4096, buffer, buffer_size); + PROFILER_EXIT(context->prof_rfx_rlgr_encode); + + PROFILER_EXIT(context->prof_rfx_encode_component); } void -rfx_encode_rgb(RFX_CONTEXT * context, const uint8 * rgb_buffer, int rowstride, +rfx_encode_rgb(RFX_CONTEXT * context, const uint8 * rgb_data, int width, int height, int rowstride, const uint32 * y_quants, const uint32 * cb_quants, const uint32 * cr_quants, uint8 * ycbcr_buffer, int buffer_size, int * y_size, int * cb_size, int * cr_size) { - int x, y; - const uint8 * src; sint16 * y_r_buffer = context->y_r_buffer; sint16 * cb_g_buffer = context->cb_g_buffer; sint16 * cr_b_buffer = context->cr_b_buffer; - for (y = 0; y < 64; y++) - { - src = rgb_buffer + y * rowstride; + PROFILER_ENTER(context->prof_rfx_encode_rgb); - for (x = 0; x < 64; x++) - { - switch (context->pixel_format) - { - case RFX_PIXEL_FORMAT_BGRA: - *cr_b_buffer++ = (sint16) (*src++); - *cb_g_buffer++ = (sint16) (*src++); - *y_r_buffer++ = (sint16) (*src++); - src++; - break; - case RFX_PIXEL_FORMAT_RGBA: - *y_r_buffer++ = (sint16) (*src++); - *cb_g_buffer++ = (sint16) (*src++); - *cr_b_buffer++ = (sint16) (*src++); - src++; - break; - case RFX_PIXEL_FORMAT_BGR: - *cr_b_buffer++ = (sint16) (*src++); - *cb_g_buffer++ = (sint16) (*src++); - *y_r_buffer++ = (sint16) (*src++); - break; - case RFX_PIXEL_FORMAT_RGB: - *y_r_buffer++ = (sint16) (*src++); - *cb_g_buffer++ = (sint16) (*src++); - *cr_b_buffer++ = (sint16) (*src++); - break; - default: - break; - } - } - } + PROFILER_ENTER(context->prof_rfx_encode_format_RGB); + rfx_encode_format_RGB(rgb_data, width, height, rowstride, + context->pixel_format, y_r_buffer, cb_g_buffer, cr_b_buffer); + PROFILER_EXIT(context->prof_rfx_encode_format_RGB); - context->encode_RGB_to_YCbCr(context->y_r_buffer, context->cb_g_buffer, context->cr_b_buffer); + PROFILER_ENTER(context->prof_rfx_encode_RGB_to_YCbCr); + context->encode_RGB_to_YCbCr(context->y_r_buffer, context->cb_g_buffer, context->cr_b_buffer); + PROFILER_EXIT(context->prof_rfx_encode_RGB_to_YCbCr); rfx_encode_component(context, y_quants, context->y_r_buffer, ycbcr_buffer, buffer_size, y_size); ycbcr_buffer += (*y_size); @@ -119,4 +177,6 @@ rfx_encode_rgb(RFX_CONTEXT * context, const uint8 * rgb_buffer, int rowstride, ycbcr_buffer += (*cb_size); buffer_size -= (*cb_size); rfx_encode_component(context, cr_quants, context->cr_b_buffer, ycbcr_buffer, buffer_size, cr_size); + + PROFILER_EXIT(context->prof_rfx_encode_rgb); } diff --git a/libfreerdp-rfx/rfx_encode.h b/libfreerdp-rfx/rfx_encode.h index 664458c..2fac0be 100644 --- a/libfreerdp-rfx/rfx_encode.h +++ b/libfreerdp-rfx/rfx_encode.h @@ -26,7 +26,7 @@ void rfx_encode_RGB_to_YCbCr(sint16 * y_r_buf, sint16 * cb_g_buf, sint16 * cr_b_buf); void -rfx_encode_rgb(RFX_CONTEXT * context, const uint8 * rgb_buffer, int rowstride, +rfx_encode_rgb(RFX_CONTEXT * context, const uint8 * rgb_data, int width, int height, int rowstride, const uint32 * y_quants, const uint32 * cb_quants, const uint32 * cr_quants, uint8 * ycbcr_buffer, int buffer_size, int * y_size, int * cb_size, int * cr_size); diff --git a/libfreerdp-rfx/rfx_rlgr.c b/libfreerdp-rfx/rfx_rlgr.c index c1cb54d..2d27907 100644 --- a/libfreerdp-rfx/rfx_rlgr.c +++ b/libfreerdp-rfx/rfx_rlgr.c @@ -349,14 +349,17 @@ rfx_rlgr_encode(RLGR_MODE mode, const sint16 * data, int data_size, uint8 * buff /* output the remaining run length using k bits */ OutputBits(k, numZeros); - /* encode the nonzero value using GR coding */ - mag = (input < 0 ? -input : input); /* absolute value of input coefficient */ - sign = (input < 0 ? 1 : 0); /* sign of input coefficient */ + if (input != 0) + { + /* encode the nonzero value using GR coding */ + mag = (input < 0 ? -input : input); /* absolute value of input coefficient */ + sign = (input < 0 ? 1 : 0); /* sign of input coefficient */ - OutputBit(1, sign); /* output the sign bit */ - CodeGR(&krp, mag - 1); /* output GR code for (mag - 1) */ + OutputBit(1, sign); /* output the sign bit */ + CodeGR(&krp, mag - 1); /* output GR code for (mag - 1) */ - UpdateParam(kp, -DN_GR, k); + UpdateParam(kp, -DN_GR, k); + } } else { diff --git a/libfreerdp-rfx/sse/rfx_sse.c b/libfreerdp-rfx/sse/rfx_sse.c index 3ccc9bf..0407323 100644 --- a/libfreerdp-rfx/sse/rfx_sse.c +++ b/libfreerdp-rfx/sse/rfx_sse.c @@ -29,10 +29,14 @@ void rfx_init_sse(RFX_CONTEXT * context) DEBUG_RFX("Using SSE2 optimizations"); IF_PROFILER(context->prof_rfx_decode_YCbCr_to_RGB->name = "rfx_decode_YCbCr_to_RGB_SSE2"); + IF_PROFILER(context->prof_rfx_encode_RGB_to_YCbCr->name = "rfx_encode_RGB_to_YCbCr_SSE2"); IF_PROFILER(context->prof_rfx_quantization_decode->name = "rfx_quantization_decode_SSE2"); + IF_PROFILER(context->prof_rfx_quantization_encode->name = "rfx_quantization_encode_SSE2"); IF_PROFILER(context->prof_rfx_dwt_2d_decode->name = "rfx_dwt_2d_decode_SSE2"); context->decode_YCbCr_to_RGB = rfx_decode_YCbCr_to_RGB_SSE2; + context->encode_RGB_to_YCbCr = rfx_encode_RGB_to_YCbCr_SSE2; context->quantization_decode = rfx_quantization_decode_SSE2; + context->quantization_encode = rfx_quantization_encode_SSE2; context->dwt_2d_decode = rfx_dwt_2d_decode_SSE2; } diff --git a/libfreerdp-rfx/sse/rfx_sse2.c b/libfreerdp-rfx/sse/rfx_sse2.c index 642d725..63dfdcf 100644 --- a/libfreerdp-rfx/sse/rfx_sse2.c +++ b/libfreerdp-rfx/sse/rfx_sse2.c @@ -38,7 +38,8 @@ _mm_prefetch_buffer(char * buffer, int num_bytes) } } -void rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer) +void +rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer) { __m128i zero = _mm_setzero_si128(); __m128i max = _mm_set1_epi16(255); @@ -48,7 +49,7 @@ void rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sin __m128i * cr_b_buf = (__m128i*) cr_b_buffer; int i; - for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i+=(CACHE_LINE_BYTES / sizeof(__m128i))) + for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i))) { _mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA); _mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA); @@ -56,14 +57,14 @@ void rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sin } for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++) { - // y = y_r_buf[i] + 128; + /* y = y_r_buf[i] + 128; */ __m128i y = _mm_load_si128(&y_r_buf[i]); y = _mm_add_epi16(y, _mm_set1_epi16(128)); - - // cr = cr_b_buf[i]; + + /* cr = cr_b_buf[i]; */ __m128i cr = _mm_load_si128(&cr_b_buf[i]); - - // r = between(y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5), 0, 255); + + /* r = between(y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5), 0, 255); */ __m128i r = _mm_add_epi16(y, cr); r = _mm_add_epi16(r, _mm_srai_epi16(cr, 2)); r = _mm_add_epi16(r, _mm_srai_epi16(cr, 3)); @@ -71,10 +72,10 @@ void rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sin r = _mm_between_epi16(r, zero, max); _mm_store_si128(&y_r_buf[i], r); - // cb = cb_g_buf[i]; + /* cb = cb_g_buf[i]; */ __m128i cb = _mm_load_si128(&cb_g_buf[i]); - // g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1) - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255); + /* g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1) - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255); */ __m128i g = _mm_sub_epi16(y, _mm_srai_epi16(cb, 2)); g = _mm_sub_epi16(g, _mm_srai_epi16(cb, 4)); g = _mm_sub_epi16(g, _mm_srai_epi16(cb, 5)); @@ -84,8 +85,8 @@ void rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sin g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 5)); g = _mm_between_epi16(g, zero, max); _mm_store_si128(&cb_g_buf[i], g); - - // b = between(y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6), 0, 255); + + /* b = between(y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6), 0, 255); */ __m128i b = _mm_add_epi16(y, cb); b = _mm_add_epi16(b, _mm_srai_epi16(cb, 1)); b = _mm_add_epi16(b, _mm_srai_epi16(cb, 2)); @@ -95,6 +96,73 @@ void rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sin } } +void +rfx_encode_RGB_to_YCbCr_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer) +{ + __m128i min = _mm_set1_epi16(-128); + __m128i max = _mm_set1_epi16(127); + + __m128i * y_r_buf = (__m128i*) y_r_buffer; + __m128i * cb_g_buf = (__m128i*) cb_g_buffer; + __m128i * cr_b_buf = (__m128i*) cr_b_buffer; + + int i; + for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i))) + { + _mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA); + _mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA); + _mm_prefetch((char*)(&cr_b_buf[i]), _MM_HINT_NTA); + } + for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++) + { + /* r = y_r_buf[i]; */ + __m128i r = _mm_load_si128(&y_r_buf[i]); + + /* g = cb_g_buf[i]; */ + __m128i g = _mm_load_si128(&cb_g_buf[i]); + + /* b = cr_b_buf[i]; */ + __m128i b = _mm_load_si128(&cr_b_buf[i]); + + /* y = ((r >> 2) + (r >> 5) + (r >> 6)) + ((g >> 1) + (g >> 4) + (g >> 6) + (g >> 7)) + ((b >> 4) + (b >> 5) + (b >> 6)); */ + /* y_r_buf[i] = MINMAX(y, 0, 255) - 128; */ + __m128i y = _mm_add_epi16(_mm_srai_epi16(r, 2), _mm_srai_epi16(r, 5)); + y = _mm_add_epi16(y, _mm_srai_epi16(r, 6)); + y = _mm_add_epi16(y, _mm_srai_epi16(g, 1)); + y = _mm_add_epi16(y, _mm_srai_epi16(g, 4)); + y = _mm_add_epi16(y, _mm_srai_epi16(g, 6)); + y = _mm_add_epi16(y, _mm_srai_epi16(g, 7)); + y = _mm_add_epi16(y, _mm_srai_epi16(b, 4)); + y = _mm_add_epi16(y, _mm_srai_epi16(b, 5)); + y = _mm_add_epi16(y, _mm_srai_epi16(b, 6)); + y = _mm_add_epi16(y, min); + y = _mm_between_epi16(y, min, max); + _mm_store_si128(&y_r_buf[i], y); + + /* cb = 0 - ((r >> 3) + (r >> 5) + (r >> 7)) - ((g >> 2) + (g >> 4) + (g >> 6)) + (b >> 1); */ + /* cb_g_buf[i] = MINMAX(cb, -128, 127); */ + __m128i cb = _mm_sub_epi16(_mm_srai_epi16(b, 1), _mm_srai_epi16(r, 3)); + cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 5)); + cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 7)); + cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 2)); + cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 4)); + cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 6)); + cb = _mm_between_epi16(cb, min, max); + _mm_store_si128(&cb_g_buf[i], cb); + + /* cr = (r >> 1) - ((g >> 2) + (g >> 3) + (g >> 5) + (g >> 7)) - ((b >> 4) + (b >> 6)); */ + /* cr_b_buf[i] = MINMAX(cr, -128, 127); */ + __m128i cr = _mm_sub_epi16(_mm_srai_epi16(r, 1), _mm_srai_epi16(g, 2)); + cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 3)); + cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 5)); + cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 7)); + cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 4)); + cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 6)); + cr = _mm_between_epi16(cr, min, max); + _mm_store_si128(&cr_b_buf[i], cr); + } +} + static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) rfx_quantization_decode_block_SSE2(sint16 * buffer, const int buffer_size, const uint32 factor) { @@ -115,26 +183,64 @@ rfx_quantization_decode_block_SSE2(sint16 * buffer, const int buffer_size, const } while(ptr < buf_end); } -void rfx_quantization_decode_SSE2(sint16 * buffer, const uint32 * quantization_values) +void +rfx_quantization_decode_SSE2(sint16 * buffer, const uint32 * quantization_values) { _mm_prefetch_buffer((char *) buffer, 4096 * sizeof(sint16)); - rfx_quantization_decode_block_SSE2(buffer, 1024, quantization_values[8]); // HL1 - rfx_quantization_decode_block_SSE2(buffer + 1024, 1024, quantization_values[7]); // LH1 - rfx_quantization_decode_block_SSE2(buffer + 2048, 1024, quantization_values[9]); // HH1 - rfx_quantization_decode_block_SSE2(buffer + 3072, 256, quantization_values[5]); // HL2 - rfx_quantization_decode_block_SSE2(buffer + 3328, 256, quantization_values[4]); // LH2 - rfx_quantization_decode_block_SSE2(buffer + 3584, 256, quantization_values[6]); // HH2 - rfx_quantization_decode_block_SSE2(buffer + 3840, 64, quantization_values[2]); // HL3 - rfx_quantization_decode_block_SSE2(buffer + 3904, 64, quantization_values[1]); // LH3 - rfx_quantization_decode_block_SSE2(buffer + 3868, 64, quantization_values[3]); // HH3 - rfx_quantization_decode_block_SSE2(buffer + 4032, 64, quantization_values[0]); // LL3 + rfx_quantization_decode_block_SSE2(buffer, 1024, quantization_values[8]); /* HL1 */ + rfx_quantization_decode_block_SSE2(buffer + 1024, 1024, quantization_values[7]); /* LH1 */ + rfx_quantization_decode_block_SSE2(buffer + 2048, 1024, quantization_values[9]); /* HH1 */ + rfx_quantization_decode_block_SSE2(buffer + 3072, 256, quantization_values[5]); /* HL2 */ + rfx_quantization_decode_block_SSE2(buffer + 3328, 256, quantization_values[4]); /* LH2 */ + rfx_quantization_decode_block_SSE2(buffer + 3584, 256, quantization_values[6]); /* HH2 */ + rfx_quantization_decode_block_SSE2(buffer + 3840, 64, quantization_values[2]); /* HL3 */ + rfx_quantization_decode_block_SSE2(buffer + 3904, 64, quantization_values[1]); /* LH3 */ + rfx_quantization_decode_block_SSE2(buffer + 3868, 64, quantization_values[3]); /* HH3 */ + rfx_quantization_decode_block_SSE2(buffer + 4032, 64, quantization_values[0]); /* LL3 */ +} + +static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +rfx_quantization_encode_block_SSE2(sint16 * buffer, const int buffer_size, const uint32 factor) +{ + int shift = factor-6; + if (shift <= 0) + return; + + __m128i a; + __m128i * ptr = (__m128i*) buffer; + __m128i * buf_end = (__m128i*) (buffer + buffer_size); + do + { + a = _mm_load_si128(ptr); + a = _mm_srai_epi16(a, shift); + _mm_store_si128(ptr, a); + + ptr++; + } while(ptr < buf_end); +} + +void +rfx_quantization_encode_SSE2(sint16 * buffer, const uint32 * quantization_values) +{ + _mm_prefetch_buffer((char *) buffer, 4096 * sizeof(sint16)); + + rfx_quantization_encode_block_SSE2(buffer, 1024, quantization_values[8]); /* HL1 */ + rfx_quantization_encode_block_SSE2(buffer + 1024, 1024, quantization_values[7]); /* LH1 */ + rfx_quantization_encode_block_SSE2(buffer + 2048, 1024, quantization_values[9]); /* HH1 */ + rfx_quantization_encode_block_SSE2(buffer + 3072, 256, quantization_values[5]); /* HL2 */ + rfx_quantization_encode_block_SSE2(buffer + 3328, 256, quantization_values[4]); /* LH2 */ + rfx_quantization_encode_block_SSE2(buffer + 3584, 256, quantization_values[6]); /* HH2 */ + rfx_quantization_encode_block_SSE2(buffer + 3840, 64, quantization_values[2]); /* HL3 */ + rfx_quantization_encode_block_SSE2(buffer + 3904, 64, quantization_values[1]); /* LH3 */ + rfx_quantization_encode_block_SSE2(buffer + 3868, 64, quantization_values[3]); /* HH3 */ + rfx_quantization_encode_block_SSE2(buffer + 4032, 64, quantization_values[0]); /* LL3 */ } static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) rfx_dwt_2d_decode_block_horiz_SSE2(sint16 * l, sint16 * h, sint16 * dst, int subband_width) { - int y, n; + int y, n; sint16 * l_ptr = l; sint16 * h_ptr = h; sint16 * dst_ptr = dst; @@ -166,7 +272,7 @@ rfx_dwt_2d_decode_block_horiz_SSE2(sint16 * l, sint16 * h, sint16 * dst, int sub l_ptr+=8; h_ptr+=8; - } + } l_ptr -= subband_width; h_ptr -= subband_width; @@ -206,19 +312,87 @@ rfx_dwt_2d_decode_block_horiz_SSE2(sint16 * l, sint16 * h, sint16 * dst, int sub } static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +rfx_dwt_2d_decode_block_vert_SSE2(sint16 * l, sint16 * h, sint16 * dst, int subband_width) +{ + int x, n; + sint16 * l_ptr = l; + sint16 * h_ptr = h; + sint16 * dst_ptr = dst; + + int total_width = subband_width + subband_width; + + /* Even coefficients */ + for (n = 0; n < subband_width; n++) + { + for (x = 0; x < total_width; x+=8) + { + // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); + + __m128i l_n = _mm_load_si128((__m128i*) l_ptr); + __m128i h_n = _mm_load_si128((__m128i*) h_ptr); + + __m128i tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1));; + if (n == 0) + tmp_n = _mm_add_epi16(tmp_n, h_n); + else + { + __m128i h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - total_width)); + tmp_n = _mm_add_epi16(tmp_n, h_n_m); + } + tmp_n = _mm_srai_epi16(tmp_n, 1); + + __m128i dst_n = _mm_sub_epi16(l_n, tmp_n); + _mm_store_si128((__m128i*) dst_ptr, dst_n); + + l_ptr+=8; + h_ptr+=8; + dst_ptr+=8; + } + dst_ptr+=total_width; + } + + h_ptr = h; + dst_ptr = dst + total_width; + + /* Odd coefficients */ + for (n = 0; n < subband_width; n++) + { + for (x = 0; x < total_width; x+=8) + { + // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); + + __m128i h_n = _mm_load_si128((__m128i*) h_ptr); + __m128i dst_n_m = _mm_load_si128((__m128i*) (dst_ptr - total_width)); + h_n = _mm_slli_epi16(h_n, 1); + + __m128i tmp_n = dst_n_m; + if (n == subband_width - 1) + tmp_n = _mm_add_epi16(tmp_n, dst_n_m); + else + { + __m128i dst_n_p = _mm_loadu_si128((__m128i*) (dst_ptr + total_width)); + tmp_n = _mm_add_epi16(tmp_n, dst_n_p); + } + tmp_n = _mm_srai_epi16(tmp_n, 1); + + __m128i dst_n = _mm_add_epi16(tmp_n, h_n); + _mm_store_si128((__m128i*) dst_ptr, dst_n); + + h_ptr+=8; + dst_ptr+=8; + } + dst_ptr+=total_width; + } +} + +static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) rfx_dwt_2d_decode_block_SSE2(sint16 * buffer, sint16 * idwt, int subband_width) { - sint16 * dst, * l, * h; - sint16 * l_dst, * h_dst; sint16 * hl, * lh, * hh, * ll; - int total_width; - int x, y; - int n; + sint16 * l_dst, * h_dst; _mm_prefetch_buffer((char *) idwt, subband_width * 4 * sizeof(sint16)); - total_width = subband_width + subband_width; - /* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt. */ /* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */ /* The lower part L uses LL(3) and HL(0). */ @@ -237,31 +411,11 @@ rfx_dwt_2d_decode_block_SSE2(sint16 * buffer, sint16 * idwt, int subband_width) rfx_dwt_2d_decode_block_horiz_SSE2(lh, hh, h_dst, subband_width); /* Inverse DWT in vertical direction, results are stored in original buffer. */ - for (x = 0; x < total_width; x++) - { - /* Even coefficients */ - for (n = 0; n < subband_width; n++) - { - y = n << 1; - dst = buffer + y * total_width + x; - l = idwt + n * total_width + x; - h = l + subband_width * total_width; - dst[0] = *l - (((n > 0 ? *(h - total_width) : *h) + (*h) + 1) >> 1); - } - - /* Odd coefficients */ - for (n = 0; n < subband_width; n++) - { - y = n << 1; - dst = buffer + y * total_width + x; - l = idwt + n * total_width + x; - h = l + subband_width * total_width; - dst[total_width] = (*h << 1) + ((dst[0] + dst[n < subband_width - 1 ? 2 * total_width : 0]) >> 1); - } - } + rfx_dwt_2d_decode_block_vert_SSE2(l_dst, h_dst, buffer, subband_width); } -void rfx_dwt_2d_decode_SSE2(sint16 * buffer, sint16 * dwt_buffer_8, sint16 * dwt_buffer_16, sint16 * dwt_buffer_32) +void +rfx_dwt_2d_decode_SSE2(sint16 * buffer, sint16 * dwt_buffer_8, sint16 * dwt_buffer_16, sint16 * dwt_buffer_32) { _mm_prefetch_buffer((char *) buffer, 4096 * sizeof(sint16)); diff --git a/libfreerdp-rfx/sse/rfx_sse2.h b/libfreerdp-rfx/sse/rfx_sse2.h index d1df7db..8f35f7c 100644 --- a/libfreerdp-rfx/sse/rfx_sse2.h +++ b/libfreerdp-rfx/sse/rfx_sse2.h @@ -23,7 +23,9 @@ #include <freerdp/rfx.h> void rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer); +void rfx_encode_RGB_to_YCbCr_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer); void rfx_quantization_decode_SSE2(sint16 * buffer, const uint32 * quantization_values); +void rfx_quantization_encode_SSE2(sint16 * buffer, const uint32 * quantization_values); void rfx_dwt_2d_decode_SSE2(sint16 * buffer, sint16 * dwt_buffer_8, sint16 * dwt_buffer_16, sint16 * dwt_buffer_32); #endif /* __RFX_SSE2_H */ |