diff options
author | Vic Lee <llyzs@163.com> | 2011-06-20 08:39:53 +0400 |
---|---|---|
committer | Vic Lee <llyzs@163.com> | 2011-06-20 08:39:53 +0400 |
commit | e389579b86f3e009e5e47baa141a47f4e6f20865 (patch) | |
tree | 86e4c76d41747368b7d4e5983b503cdfc15df0ca | |
parent | 08e9d3d5e22d10f5ed81e278389b502fefa9af6a (diff) |
libfreerdp-rfx: add SSE2 optimization for RGB-YCbCr encoding.
-rw-r--r-- | libfreerdp-rfx/sse/rfx_sse.c | 2 | ||||
-rw-r--r-- | libfreerdp-rfx/sse/rfx_sse2.c | 72 | ||||
-rw-r--r-- | libfreerdp-rfx/sse/rfx_sse2.h | 1 |
3 files changed, 73 insertions, 2 deletions
diff --git a/libfreerdp-rfx/sse/rfx_sse.c b/libfreerdp-rfx/sse/rfx_sse.c index efd1cc3..4c86fb8 100644 --- a/libfreerdp-rfx/sse/rfx_sse.c +++ b/libfreerdp-rfx/sse/rfx_sse.c @@ -37,10 +37,12 @@ void rfx_init_sse(RFX_CONTEXT * context) DEBUG_RFX("Using SSE2 optimizations"); IF_PROFILER(context->prof_rfx_decode_YCbCr_to_RGB->name = "rfx_decode_YCbCr_to_RGB_SSE2"); + IF_PROFILER(context->prof_rfx_encode_RGB_to_YCbCr->name = "rfx_encode_RGB_to_YCbCr_SSE2"); IF_PROFILER(context->prof_rfx_quantization_decode->name = "rfx_quantization_decode_SSE2"); IF_PROFILER(context->prof_rfx_dwt_2d_decode->name = "rfx_dwt_2d_decode_SSE2"); context->decode_YCbCr_to_RGB = rfx_decode_YCbCr_to_RGB_SSE2; + context->encode_RGB_to_YCbCr = rfx_encode_RGB_to_YCbCr_SSE2; context->quantization_decode = rfx_quantization_decode_SSE2; context->dwt_2d_decode = rfx_dwt_2d_decode_SSE2; } diff --git a/libfreerdp-rfx/sse/rfx_sse2.c b/libfreerdp-rfx/sse/rfx_sse2.c index 44402f3..ef2e6ff 100644 --- a/libfreerdp-rfx/sse/rfx_sse2.c +++ b/libfreerdp-rfx/sse/rfx_sse2.c @@ -38,7 +38,8 @@ _mm_prefetch_buffer(char * buffer, int num_bytes) } } -void rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer) +void +rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer) { __m128i zero = _mm_setzero_si128(); __m128i max = _mm_set1_epi16(255); @@ -48,7 +49,7 @@ void rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sin __m128i * cr_b_buf = (__m128i*) cr_b_buffer; int i; - for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i+=(CACHE_LINE_BYTES / sizeof(__m128i))) + for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i))) { _mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA); _mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA); @@ -95,6 +96,73 @@ void rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sin } } +void +rfx_encode_RGB_to_YCbCr_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer) +{ + __m128i min = _mm_set1_epi16(-128); + __m128i max = _mm_set1_epi16(127); + + __m128i * y_r_buf = (__m128i*) y_r_buffer; + __m128i * cb_g_buf = (__m128i*) cb_g_buffer; + __m128i * cr_b_buf = (__m128i*) cr_b_buffer; + + int i; + for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i))) + { + _mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA); + _mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA); + _mm_prefetch((char*)(&cr_b_buf[i]), _MM_HINT_NTA); + } + for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++) + { + /* r = y_r_buf[i]; */ + __m128i r = _mm_load_si128(&y_r_buf[i]); + + /* g = cb_g_buf[i]; */ + __m128i g = _mm_load_si128(&cb_g_buf[i]); + + /* b = cr_b_buf[i]; */ + __m128i b = _mm_load_si128(&cr_b_buf[i]); + + /* y = ((r >> 2) + (r >> 5) + (r >> 6)) + ((g >> 1) + (g >> 4) + (g >> 6) + (g >> 7)) + ((b >> 4) + (b >> 5) + (b >> 6)); */ + /* y_r_buf[i] = MINMAX(y, 0, 255) - 128; */ + __m128i y = _mm_add_epi16(_mm_srai_epi16(r, 2), _mm_srai_epi16(r, 5)); + y = _mm_add_epi16(y, _mm_srai_epi16(r, 6)); + y = _mm_add_epi16(y, _mm_srai_epi16(g, 1)); + y = _mm_add_epi16(y, _mm_srai_epi16(g, 4)); + y = _mm_add_epi16(y, _mm_srai_epi16(g, 6)); + y = _mm_add_epi16(y, _mm_srai_epi16(g, 7)); + y = _mm_add_epi16(y, _mm_srai_epi16(b, 4)); + y = _mm_add_epi16(y, _mm_srai_epi16(b, 5)); + y = _mm_add_epi16(y, _mm_srai_epi16(b, 6)); + y = _mm_add_epi16(y, min); + y = _mm_between_epi16(y, min, max); + _mm_store_si128(&y_r_buf[i], y); + + /* cb = 0 - ((r >> 3) + (r >> 5) + (r >> 7)) - ((g >> 2) + (g >> 4) + (g >> 6)) + (b >> 1); */ + /* cb_g_buf[i] = MINMAX(cb, -128, 127); */ + __m128i cb = _mm_sub_epi16(_mm_srai_epi16(b, 1), _mm_srai_epi16(r, 3)); + cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 5)); + cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 7)); + cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 2)); + cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 4)); + cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 6)); + cb = _mm_between_epi16(cb, min, max); + _mm_store_si128(&cb_g_buf[i], cb); + + /* cr = (r >> 1) - ((g >> 2) + (g >> 3) + (g >> 5) + (g >> 7)) - ((b >> 4) + (b >> 6)); */ + /* cr_b_buf[i] = MINMAX(cr, -128, 127); */ + __m128i cr = _mm_sub_epi16(_mm_srai_epi16(r, 1), _mm_srai_epi16(g, 2)); + cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 3)); + cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 5)); + cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 7)); + cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 4)); + cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 6)); + cr = _mm_between_epi16(cr, min, max); + _mm_store_si128(&cr_b_buf[i], cr); + } +} + static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) rfx_quantization_decode_block_SSE2(sint16 * buffer, const int buffer_size, const uint32 factor) { diff --git a/libfreerdp-rfx/sse/rfx_sse2.h b/libfreerdp-rfx/sse/rfx_sse2.h index d1df7db..ea87347 100644 --- a/libfreerdp-rfx/sse/rfx_sse2.h +++ b/libfreerdp-rfx/sse/rfx_sse2.h @@ -23,6 +23,7 @@ #include <freerdp/rfx.h> void rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer); +void rfx_encode_RGB_to_YCbCr_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer); void rfx_quantization_decode_SSE2(sint16 * buffer, const uint32 * quantization_values); void rfx_dwt_2d_decode_SSE2(sint16 * buffer, sint16 * dwt_buffer_8, sint16 * dwt_buffer_16, sint16 * dwt_buffer_32); |