Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/FreeRDP/FreeRDP-old.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Vic Lee <llyzs@163.com> 2011-06-20 08:39:53 +0400
committer: Vic Lee <llyzs@163.com> 2011-06-20 08:39:53 +0400
commit: e389579b86f3e009e5e47baa141a47f4e6f20865 (patch)
tree: 86e4c76d41747368b7d4e5983b503cdfc15df0ca
parent: 08e9d3d5e22d10f5ed81e278389b502fefa9af6a (diff)
libfreerdp-rfx: add SSE2 optimization for RGB-YCbCr encoding.
-rw-r--r-- libfreerdp-rfx/sse/rfx_sse.c 2
-rw-r--r-- libfreerdp-rfx/sse/rfx_sse2.c 72
-rw-r--r-- libfreerdp-rfx/sse/rfx_sse2.h 1
3 files changed, 73 insertions, 2 deletions
diff --git a/libfreerdp-rfx/sse/rfx_sse.c b/libfreerdp-rfx/sse/rfx_sse.c
index efd1cc3..4c86fb8 100644
--- a/libfreerdp-rfx/sse/rfx_sse.c
+++ b/libfreerdp-rfx/sse/rfx_sse.c
@@ -37,10 +37,12 @@ void rfx_init_sse(RFX_CONTEXT * context)
DEBUG_RFX("Using SSE2 optimizations");
IF_PROFILER(context->prof_rfx_decode_YCbCr_to_RGB->name = "rfx_decode_YCbCr_to_RGB_SSE2");
+ IF_PROFILER(context->prof_rfx_encode_RGB_to_YCbCr->name = "rfx_encode_RGB_to_YCbCr_SSE2");
IF_PROFILER(context->prof_rfx_quantization_decode->name = "rfx_quantization_decode_SSE2");
IF_PROFILER(context->prof_rfx_dwt_2d_decode->name = "rfx_dwt_2d_decode_SSE2");
context->decode_YCbCr_to_RGB = rfx_decode_YCbCr_to_RGB_SSE2;
+ context->encode_RGB_to_YCbCr = rfx_encode_RGB_to_YCbCr_SSE2;
context->quantization_decode = rfx_quantization_decode_SSE2;
context->dwt_2d_decode = rfx_dwt_2d_decode_SSE2;
}
diff --git a/libfreerdp-rfx/sse/rfx_sse2.c b/libfreerdp-rfx/sse/rfx_sse2.c
index 44402f3..ef2e6ff 100644
--- a/libfreerdp-rfx/sse/rfx_sse2.c
+++ b/libfreerdp-rfx/sse/rfx_sse2.c
@@ -38,7 +38,8 @@ _mm_prefetch_buffer(char * buffer, int num_bytes)
}
}
-void rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer)
+void
+rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer)
{
__m128i zero = _mm_setzero_si128();
__m128i max = _mm_set1_epi16(255);
@@ -48,7 +49,7 @@ void rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sin
__m128i * cr_b_buf = (__m128i*) cr_b_buffer;
int i;
- for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i+=(CACHE_LINE_BYTES / sizeof(__m128i)))
+ for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
{
_mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA);
_mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA);
@@ -95,6 +96,73 @@ void rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sin
}
}
+void
+rfx_encode_RGB_to_YCbCr_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer)
+{ /*
+  * In-place conversion: the three buffers are read as r, g and b and are then
+  * overwritten with the computed y, cb and cr values respectively.
+  *   y_r_buffer  - r on entry, y on return
+  *   cb_g_buffer - g on entry, cb on return
+  *   cr_b_buffer - b on entry, cr on return
+  * Both loops cover 4096 sint16 values per buffer, counted in units of __m128i.
+  * NOTE(review): _mm_load_si128/_mm_store_si128 are the aligned variants, so
+  * the buffers presumably have to be 16-byte aligned -- confirm at the callers.
+  */
+ __m128i min = _mm_set1_epi16(-128); /* lower clamp bound; also added to y as the -128 offset */
+ __m128i max = _mm_set1_epi16(127); /* upper clamp bound */
+
+ __m128i * y_r_buf = (__m128i*) y_r_buffer;
+ __m128i * cb_g_buf = (__m128i*) cb_g_buffer;
+ __m128i * cr_b_buf = (__m128i*) cr_b_buffer;
+
+ int i; /* index in __m128i units, shared by both loops below */
+ for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
+ { /* prefetch pass: one _MM_HINT_NTA prefetch per CACHE_LINE_BYTES of each buffer */
+ _mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA);
+ _mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA);
+ _mm_prefetch((char*)(&cr_b_buf[i]), _MM_HINT_NTA);
+ }
+ for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++)
+ { /* conversion pass: one __m128i from each buffer per iteration; all three
+    * inputs are loaded before any result is stored back */
+ /* r = y_r_buf[i]; */
+ __m128i r = _mm_load_si128(&y_r_buf[i]);
+
+ /* g = cb_g_buf[i]; */
+ __m128i g = _mm_load_si128(&cb_g_buf[i]);
+
+ /* b = cr_b_buf[i]; */
+ __m128i b = _mm_load_si128(&cr_b_buf[i]);
+
+ /* y = ((r >> 2) + (r >> 5) + (r >> 6)) + ((g >> 1) + (g >> 4) + (g >> 6) + (g >> 7)) + ((b >> 4) + (b >> 5) + (b >> 6)); */
+ /* y_r_buf[i] = MINMAX(y, 0, 255) - 128;
+  * The shift fractions sum to 0.296875*r + 0.5859375*g + 0.109375*b, with
+  * each shifted term truncated on its own.  The code applies the "- 128"
+  * before clamping: it adds min (-128) and then limits the result to
+  * [min, max] = [-128, 127].  _mm_between_epi16 is defined outside this hunk;
+  * presumably it clamps each 16-bit lane to the given bounds -- confirm. */
+ __m128i y = _mm_add_epi16(_mm_srai_epi16(r, 2), _mm_srai_epi16(r, 5));
+ y = _mm_add_epi16(y, _mm_srai_epi16(r, 6));
+ y = _mm_add_epi16(y, _mm_srai_epi16(g, 1));
+ y = _mm_add_epi16(y, _mm_srai_epi16(g, 4));
+ y = _mm_add_epi16(y, _mm_srai_epi16(g, 6));
+ y = _mm_add_epi16(y, _mm_srai_epi16(g, 7));
+ y = _mm_add_epi16(y, _mm_srai_epi16(b, 4));
+ y = _mm_add_epi16(y, _mm_srai_epi16(b, 5));
+ y = _mm_add_epi16(y, _mm_srai_epi16(b, 6));
+ y = _mm_add_epi16(y, min);
+ y = _mm_between_epi16(y, min, max);
+ _mm_store_si128(&y_r_buf[i], y);
+
+ /* cb = 0 - ((r >> 3) + (r >> 5) + (r >> 7)) - ((g >> 2) + (g >> 4) + (g >> 6)) + (b >> 1); */
+ /* cb_g_buf[i] = MINMAX(cb, -128, 127);
+  * The code starts from (b >> 1) and subtracts the r and g terms one at a
+  * time; the shift fractions sum to 0.5*b - 0.1640625*r - 0.328125*g. */
+ __m128i cb = _mm_sub_epi16(_mm_srai_epi16(b, 1), _mm_srai_epi16(r, 3));
+ cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 5));
+ cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 7));
+ cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 2));
+ cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 4));
+ cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 6));
+ cb = _mm_between_epi16(cb, min, max);
+ _mm_store_si128(&cb_g_buf[i], cb);
+
+ /* cr = (r >> 1) - ((g >> 2) + (g >> 3) + (g >> 5) + (g >> 7)) - ((b >> 4) + (b >> 6)); */
+ /* cr_b_buf[i] = MINMAX(cr, -128, 127);
+  * The shift fractions sum to 0.5*r - 0.4140625*g - 0.078125*b. */
+ __m128i cr = _mm_sub_epi16(_mm_srai_epi16(r, 1), _mm_srai_epi16(g, 2));
+ cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 3));
+ cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 5));
+ cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 7));
+ cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 4));
+ cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 6));
+ cr = _mm_between_epi16(cr, min, max);
+ _mm_store_si128(&cr_b_buf[i], cr);
+ }
+}
+
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_quantization_decode_block_SSE2(sint16 * buffer, const int buffer_size, const uint32 factor)
{
diff --git a/libfreerdp-rfx/sse/rfx_sse2.h b/libfreerdp-rfx/sse/rfx_sse2.h
index d1df7db..ea87347 100644
--- a/libfreerdp-rfx/sse/rfx_sse2.h
+++ b/libfreerdp-rfx/sse/rfx_sse2.h
@@ -23,6 +23,7 @@
#include <freerdp/rfx.h>
void rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer);
+void rfx_encode_RGB_to_YCbCr_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer);
void rfx_quantization_decode_SSE2(sint16 * buffer, const uint32 * quantization_values);
void rfx_dwt_2d_decode_SSE2(sint16 * buffer, sint16 * dwt_buffer_8, sint16 * dwt_buffer_16, sint16 * dwt_buffer_32);